Use generalized mappings to handle import

This commit is contained in:
Mouse Reeve 2021-11-10 16:49:54 -08:00
parent 0736c7e160
commit 4ccd9fc633
7 changed files with 152 additions and 178 deletions

View file

@@ -7,10 +7,3 @@ class GoodreadsImporter(Importer):
For a more complete example of overriding see librarything_import.py"""
service = "Goodreads"
def parse_fields(self, entry):
"""handle the specific fields in goodreads csvs"""
entry.update({"import_source": self.service})
# add missing 'Date Started' field
entry.update({"Date Started": None})
return entry

View file

@@ -1,5 +1,6 @@
""" handle reading a csv from an external service, defaults are from Goodreads """
import csv
from dataclasses import dataclass
import logging
from django.utils import timezone
@@ -18,30 +19,59 @@ class Importer:
service = "Unknown"
delimiter = ","
encoding = "UTF-8"
mandatory_fields = ["Title", "Author"]
# these are from Goodreads
row_mappings_guesses = {
"id": ["id", "book id"],
"title": ["title"],
"authors": ["author", "authors", "primary author"],
"isbn_13": ["isbn13", "isbn"],
"isbn_10": ["isbn10", "isbn"],
"shelf": ["shelf", "exclusive shelf", "read status"],
"review_name": [],
"review_body": ["my review"],
"rating": ["my rating", "rating", "star rating"],
"date_added": ["date added", "entry date", "added"],
"date_started": ["date started", "started"],
"date_finished": ["date finished", "last date read", "date read", "finished"],
}
def create_job(self, user, csv_file, include_reviews, privacy):
"""check over a csv and creates a database entry for the job"""
csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter)
rows = enumerate(list(csv_reader))
job = ImportJob.objects.create(
user=user, include_reviews=include_reviews, privacy=privacy
user=user,
include_reviews=include_reviews,
privacy=privacy,
mappings=self.create_row_mappings(csv_reader.fieldnames),
)
for index, entry in enumerate(
list(csv.DictReader(csv_file, delimiter=self.delimiter))
):
if not all(x in entry for x in self.mandatory_fields):
raise ValueError("Author and title must be in data.")
entry = self.parse_fields(entry)
self.save_item(job, index, entry)
for index, entry in rows:
print(index, entry)
self.create_item(job, index, entry)
return job
def save_item(self, job, index, data): # pylint: disable=no-self-use
"""creates and saves an import item"""
ImportItem(job=job, index=index, data=data).save()
def create_row_mappings(self, headers):
"""guess what the headers mean"""
mappings = {}
for (key, guesses) in self.row_mappings_guesses.items():
value = [h for h in headers if h.lower() in guesses]
value = value[0] if len(value) else None
if value:
headers.remove(value)
mappings[key] = value
return mappings
def parse_fields(self, entry):
"""updates csv data with additional info"""
entry.update({"import_source": self.service})
return entry
def create_item(self, job, index, data):
"""creates and saves an import item"""
print(data)
normalized = self.normalize_row(data, job.mappings)
ImportItem(job=job, index=index, data=data, normalized_data=normalized).save()
def normalize_row(self, entry, mappings): # pylint: disable=no-self-use
"""use the dataclass to create the formatted row of data"""
return {k: entry.get(v) for k, v in mappings.items()}
def create_retry_job(self, user, original_job, items):
"""retry items that didn't import"""
@@ -49,10 +79,13 @@ class Importer:
user=user,
include_reviews=original_job.include_reviews,
privacy=original_job.privacy,
# TODO: allow users to adjust mappings
mappings=original_job.mappings,
retry=True,
)
for item in items:
self.save_item(job, item.index, item.data)
# this will re-normalize the raw data
self.create_item(job, item.index, item.data)
return job
def start_import(self, job):
@@ -156,3 +189,23 @@ def handle_imported_book(source, user, item, include_reviews, privacy):
)
# only broadcast this review to other bookwyrm instances
review.save(software="bookwyrm")
@dataclass
class ImportEntry:
"""data extracted from a line in a csv"""
title: str
authors: str = None
isbn_13: str = None
isbn_10: str = None
shelf: str = None
review_name: str = None
review_rating: float = None
review_body: str = None
review_cw: str = None
rating: float = None
date_added: str = None
date_started: str = None
date_finished: str = None
import_source: str = "Unknown"

View file

@@ -1,7 +1,4 @@
""" handle reading a csv from librarything """
import re
import math
""" handle reading a tsv from librarything """
from . import Importer
@@ -11,32 +8,3 @@ class LibrarythingImporter(Importer):
service = "LibraryThing"
delimiter = "\t"
encoding = "ISO-8859-1"
# mandatory_fields : fields matching the book title and author
mandatory_fields = ["Title", "Primary Author"]
def parse_fields(self, entry):
"""custom parsing for librarything"""
data = {}
data["import_source"] = self.service
data["Book Id"] = entry["Book Id"]
data["Title"] = entry["Title"]
data["Author"] = entry["Primary Author"]
data["ISBN13"] = entry["ISBN"]
data["My Review"] = entry["Review"]
if entry["Rating"]:
data["My Rating"] = math.ceil(float(entry["Rating"]))
else:
data["My Rating"] = ""
data["Date Added"] = re.sub(r"\[|\]", "", entry["Entry Date"])
data["Date Started"] = re.sub(r"\[|\]", "", entry["Date Started"])
data["Date Read"] = re.sub(r"\[|\]", "", entry["Date Read"])
data["Exclusive Shelf"] = None
if data["Date Read"]:
data["Exclusive Shelf"] = "read"
elif data["Date Started"]:
data["Exclusive Shelf"] = "reading"
else:
data["Exclusive Shelf"] = "to-read"
return data

View file

@@ -1,6 +1,4 @@
""" handle reading a csv from librarything """
import re
""" handle reading a csv from storygraph"""
from . import Importer
@@ -8,26 +6,3 @@ class StorygraphImporter(Importer):
"""csv downloads from librarything"""
service = "Storygraph"
# mandatory_fields : fields matching the book title and author
mandatory_fields = ["Title"]
def parse_fields(self, entry):
"""custom parsing for storygraph"""
data = {}
data["import_source"] = self.service
data["Title"] = entry["Title"]
data["Author"] = entry["Authors"] if "Authors" in entry else entry["Author"]
data["ISBN13"] = entry["ISBN"]
data["My Review"] = entry["Review"]
if entry["Star Rating"]:
data["My Rating"] = float(entry["Star Rating"])
else:
data["My Rating"] = ""
data["Date Added"] = re.sub(r"[/]", "-", entry["Date Added"])
data["Date Read"] = re.sub(r"[/]", "-", entry["Last Date Read"])
data["Exclusive Shelf"] = (
{"read": "read", "currently-reading": "reading", "to-read": "to-read"}
).get(entry["Read Status"], None)
return data

View file

@@ -35,6 +35,7 @@ class ImportJob(models.Model):
created_date = models.DateTimeField(default=timezone.now)
task_id = models.CharField(max_length=100, null=True)
include_reviews = models.BooleanField(default=True)
mappings = models.JSONField()
complete = models.BooleanField(default=False)
privacy = models.CharField(
max_length=255, default="public", choices=PrivacyLevels.choices
@@ -48,6 +49,7 @@ class ImportItem(models.Model):
job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
index = models.IntegerField()
data = models.JSONField()
normalized_data = models.JSONField()
book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
book_guess = models.ForeignKey(
Book,
@@ -98,55 +100,59 @@ class ImportItem(models.Model):
@property
def title(self):
"""get the book title"""
return self.data["title"]
return self.normalized_data["title"]
@property
def author(self):
"""get the book's authors"""
return self.data["authors"]
return self.normalized_data["authors"]
@property
def isbn(self):
"""pulls out the isbn13 field from the csv line data"""
return unquote_string(self.data["isbn_13"])
return unquote_string(self.normalized_data["isbn_13"])
@property
def shelf(self):
"""the goodreads shelf field"""
return self.data.get("shelf")
return self.normalized_data.get("shelf")
@property
def review(self):
"""a user-written review, to be imported with the book data"""
return self.data["review_body"]
return self.normalized_data["review_body"]
@property
def rating(self):
"""x/5 star rating for a book"""
if self.data.get("rating"):
return float(self.data["rating"])
if self.normalized_data.get("rating"):
return float(self.normalized_data["rating"])
return None
@property
def date_added(self):
"""when the book was added to this dataset"""
if self.data.get("date_added"):
return timezone.make_aware(dateutil.parser.parse(self.data["date_added"]))
if self.normalized_data.get("date_added"):
return timezone.make_aware(
dateutil.parser.parse(self.normalized_data["date_added"])
)
return None
@property
def date_started(self):
"""when the book was started"""
if self.data.get("date_started"):
return timezone.make_aware(dateutil.parser.parse(self.data["date_started"]))
if self.normalized_data.get("date_started"):
return timezone.make_aware(
dateutil.parser.parse(self.normalized_data["date_started"])
)
return None
@property
def date_read(self):
"""the date a book was completed"""
if self.data.get("date_finished"):
if self.normalized_data.get("date_finished"):
return timezone.make_aware(
dateutil.parser.parse(self.data["date_finished"])
dateutil.parser.parse(self.normalized_data["date_finished"])
)
return None
@@ -177,8 +183,12 @@ class ImportItem(models.Model):
def __repr__(self):
# pylint: disable=consider-using-f-string
return "<{!r}Item {!r}>".format(self.data["import_source"], self.data["title"])
return "<{!r}Item {!r}>".format(
self.normalized_data["import_source"], self.normalized_data["title"]
)
def __str__(self):
# pylint: disable=consider-using-f-string
return "{} by {}".format(self.data["title"], self.data["authors"])
return "{} by {}".format(
self.normalized_data["title"], self.normalized_data["authors"]
)

View file

@@ -1,5 +1,5 @@
id,title,author,ISBN,rating,shelf,review,added
38,Gideon the Ninth (The Locked Tomb #1),Tamsyn Muir,"9781250313195",,read,,2021-11-10
48,Harrow the Ninth (The Locked Tomb #2),Tamsyn Muir,,3,read,,2021-11-10
id,title,author,ISBN,rating,shelf,review,added,finished
38,Gideon the Ninth,Tamsyn Muir,"9781250313195",,read,,2021-11-10,2021-11-11
48,Harrow the Ninth,Tamsyn Muir,,3,read,,2021-11-10
23,Subcutanean,Aaron A. Reed,,,read,,2021-11-10
10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10
10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10,2021-11-11

Can't render this file because it has a wrong number of fields in line 3.

View file

@@ -1,6 +1,5 @@
""" testing import """
from collections import namedtuple
import csv
import pathlib
from unittest.mock import patch
import datetime
@@ -29,26 +28,7 @@ class GenericImporter(TestCase):
def setUp(self):
"""use a test csv"""
class TestImporter(Importer):
"""basic importer"""
mandatory_fields = ["title", "author"]
def parse_fields(self, entry):
return {
"id": entry["id"],
"Title": entry["title"],
"Author": entry["author"],
"ISBN13": entry["ISBN"],
"Star Rating": entry["rating"],
"My Rating": entry["rating"],
"My Review": entry["review"],
"Exclusive Shelf": entry["shelf"],
"Date Added": entry["added"],
"Date Read": None,
}
self.importer = TestImporter()
self.importer = Importer()
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
self.csv = open(datafile, "r", encoding=self.importer.encoding)
with patch("bookwyrm.suggested_users.rerank_suggestions_task.delay"), patch(
@@ -77,13 +57,24 @@ class GenericImporter(TestCase):
import_items = models.ImportItem.objects.filter(job=import_job).all()
self.assertEqual(len(import_items), 4)
self.assertEqual(import_items[0].index, 0)
self.assertEqual(import_items[0].data["id"], "38")
self.assertEqual(import_items[0].normalized_data["id"], "38")
self.assertEqual(import_items[0].normalized_data["title"], "Gideon the Ninth")
self.assertEqual(import_items[0].normalized_data["authors"], "Tamsyn Muir")
self.assertEqual(import_items[0].normalized_data["isbn_13"], "9781250313195")
self.assertIsNone(import_items[0].normalized_data["isbn_10"])
self.assertEqual(import_items[0].normalized_data["shelf"], "read")
self.assertEqual(import_items[1].index, 1)
self.assertEqual(import_items[1].data["id"], "48")
self.assertEqual(import_items[1].normalized_data["id"], "48")
self.assertEqual(import_items[1].normalized_data["title"], "Harrow the Ninth")
self.assertEqual(import_items[2].index, 2)
self.assertEqual(import_items[2].data["id"], "23")
self.assertEqual(import_items[2].normalized_data["id"], "23")
self.assertEqual(import_items[2].normalized_data["title"], "Subcutanean")
self.assertEqual(import_items[3].index, 3)
self.assertEqual(import_items[3].data["id"], "10")
self.assertEqual(import_items[3].normalized_data["id"], "10")
self.assertEqual(import_items[3].normalized_data["title"], "Patisserie at Home")
def test_create_retry_job(self, *_):
"""trying again with items that didn't import"""
@@ -103,9 +94,9 @@ class GenericImporter(TestCase):
retry_items = models.ImportItem.objects.filter(job=retry).all()
self.assertEqual(len(retry_items), 2)
self.assertEqual(retry_items[0].index, 0)
self.assertEqual(retry_items[0].data["id"], "38")
self.assertEqual(retry_items[0].normalized_data["id"], "38")
self.assertEqual(retry_items[1].index, 1)
self.assertEqual(retry_items[1].data["id"], "48")
self.assertEqual(retry_items[1].normalized_data["id"], "48")
def test_start_import(self, *_):
"""check that a task was created"""
@@ -143,15 +134,12 @@ class GenericImporter(TestCase):
shelf = self.local_user.shelf_set.filter(identifier="read").first()
self.assertIsNone(shelf.books.first())
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
for index, entry in enumerate(list(csv.DictReader(csv_file))):
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=index, data=entry, book=self.book
)
break
import_job = self.importer.create_job(
self.local_user, self.csv, False, "public"
)
import_item = import_job.items.first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
handle_imported_book(
@@ -172,15 +160,12 @@ class GenericImporter(TestCase):
shelved_date=make_date(2020, 2, 2),
)
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
for index, entry in enumerate(list(csv.DictReader(csv_file))):
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=index, data=entry, book=self.book
)
break
import_job = self.importer.create_job(
self.local_user, self.csv, False, "unlisted"
)
import_item = import_job.items.first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
handle_imported_book(
@@ -199,15 +184,12 @@ class GenericImporter(TestCase):
def test_handle_import_twice(self, *_):
"""re-importing books"""
shelf = self.local_user.shelf_set.filter(identifier="read").first()
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
for index, entry in enumerate(list(csv.DictReader(csv_file))):
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=index, data=entry, book=self.book
)
break
import_job = self.importer.create_job(
self.local_user, self.csv, False, "public"
)
import_item = import_job.items.first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
handle_imported_book(
@@ -219,18 +201,15 @@ class GenericImporter(TestCase):
shelf.refresh_from_db()
self.assertEqual(shelf.books.first(), self.book)
self.assertEqual(models.ReadThrough.objects.count(), 1)
@patch("bookwyrm.activitystreams.add_status_task.delay")
def test_handle_imported_book_review(self, *_):
"""review import"""
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
entry = list(csv.DictReader(csv_file))[3]
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=0, data=entry, book=self.book
)
import_job = self.importer.create_job(self.local_user, self.csv, True, "public")
import_item = import_job.items.filter(index=3).first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
with patch("bookwyrm.models.Status.broadcast") as broadcast_mock:
@@ -251,14 +230,12 @@ class GenericImporter(TestCase):
@patch("bookwyrm.activitystreams.add_status_task.delay")
def test_handle_imported_book_rating(self, *_):
"""rating import"""
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
entry = list(csv.DictReader(csv_file))[1]
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=0, data=entry, book=self.book
import_job = self.importer.create_job(
self.local_user, self.csv, False, "public"
)
import_item = import_job.items.filter(index=1).first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
handle_imported_book(
@@ -271,14 +248,12 @@ class GenericImporter(TestCase):
def test_handle_imported_book_reviews_disabled(self, *_):
"""review import"""
import_job = models.ImportJob.objects.create(user=self.local_user)
datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding
entry = list(csv.DictReader(csv_file))[2]
entry = self.importer.parse_fields(entry)
import_item = models.ImportItem.objects.create(
job_id=import_job.id, index=0, data=entry, book=self.book
import_job = self.importer.create_job(
self.local_user, self.csv, False, "unlisted"
)
import_item = import_job.items.filter(index=3).first()
import_item.book = self.book
import_item.save()
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
handle_imported_book(