Store csv in the database and then import via celery.

This commit is contained in:
Adam Kelly 2020-04-21 15:09:21 +01:00
parent 881cc4d64b
commit 0bf73fef24
10 changed files with 293 additions and 133 deletions

View file

@ -1,65 +1,46 @@
''' handle reading a csv from goodreads '''
import re
import csv
import dateutil.parser
from requests import HTTPError
from fedireads import books_manager
from fedireads import outgoing
from fedireads.models import Edition, ReadThrough, User
from fedireads.tasks import app
from fedireads.models import ImportJob, ImportItem
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values found in the csv's 'Exclusive Shelf' column; the `shelf`
# property looks the column value up in this dict.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
# TODO: remove or notify about this in the UI
# Upper bound on how many csv rows are kept from one upload; rows beyond it
# are dropped by the slicing in create_job / async_import.
MAX_ENTRIES = 20
def unquote_string(text):
    ''' strip the ="..." wrapper found around some csv values '''
    quoted = re.match(r'="([^"]*)"', text)
    # Values without the wrapper are passed through untouched.
    return quoted.group(1) if quoted is not None else text
def create_job(user, csv_file):
    ''' store the first MAX_ENTRIES rows of a goodreads csv as an import job

    user: the user the import belongs to.
    csv_file: a text-mode file object containing the goodreads export.

    Returns the saved ImportJob, with one ImportItem saved per stored row.
    Nothing is imported here; the job still has to be queued (start_import).
    '''
    job = ImportJob.objects.create(user=user)
    # zip() asks range() first and stops as soon as it is exhausted, so rows
    # beyond the cap are never read or parsed, however large the upload is.
    for index, entry in zip(range(MAX_ENTRIES), csv.DictReader(csv_file)):
        ImportItem(job=job, index=index, data=entry).save()
    return job
def construct_search_term(title, author):
    ''' build the query string handed to the data connector '''
    # Drop parenthesised text (usually the series name) from the title.
    bare_title = re.sub(r'\s*\([^)]*\)\s*', '', title)
    # Open library doesn't like author initials in the search term.
    bare_author = re.sub(r'(\w\.)+\s*', '', author)
    return ' '.join((bare_title, bare_author))
def async_import(user, csv_file):
    ''' queue a background import of the first MAX_ENTRIES rows of a csv '''
    rows = list(csv.DictReader(csv_file))
    return import_data.delay(user.id, rows[:MAX_ENTRIES])
def start_import(job):
    ''' queue the import task for a job and record the task id on the job '''
    job.task_id = import_data.delay(job.id).id
    job.save()
@app.task
def import_data(user_id, entries):
user = User.objects.get(pk=user_id)
def import_data(job_id):
job = ImportJob.objects.get(id=job_id)
user = job.user
results = []
reviews = []
failures = []
for item in entries:
item = GoodreadsItem(item)
for item in job.items.all():
try:
item.resolve()
except HTTPError:
pass
if item.book:
item.save()
results.append(item)
if item.rating or item.review:
reviews.append(item)
else:
failures.append(item)
item.fail_reason = "Could not match book on OpenLibrary"
item.save()
outgoing.handle_import_books(user, results)
for item in reviews:
@ -73,84 +54,3 @@ def import_data(user_id, entries):
item.review,
item.rating,
)
class GoodreadsItem:
    ''' a processed line in a goodreads csv

    Wraps one csv row (a dict keyed by the goodreads column names) and exposes
    the fields the importer needs as properties. `book` stays None until
    resolve() finds a match.
    '''
    def __init__(self, line):
        self.line = line
        self.book = None

    def resolve(self):
        ''' try various ways to lookup a book '''
        # Local database first, then a remote search by isbn, then a remote
        # search by title and author; the first hit wins.
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # A blank isbn identifies nothing; don't let it match an edition
            # that merely has no isbn recorded.
            return None
        # first() rather than get(): several editions sharing an isbn must
        # not abort the whole import with MultipleObjectsReturned.
        return Edition.objects.filter(isbn=isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # Nothing to search for; resolve() falls back to title/author.
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.line['Title'],
            self.line['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with any ="..." wrapper removed '''
        return unquote_string(self.line['ISBN13'])

    @property
    def shelf(self):
        ''' the goodreads shelf field, or None when it has no equivalent '''
        # .get() so that a blank value, or a shelf missing from the mapping
        # (e.g. a user-defined one), yields None instead of a KeyError.
        return GOODREADS_SHELVES.get(self.line['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the 'My Review' column '''
        return self.line['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' column as an int; 0 when the field is blank '''
        return int(self.line['My Rating'] or 0)

    @property
    def date_added(self):
        ''' parsed 'Date Added' column, or None when it is blank '''
        if self.line['Date Added']:
            return dateutil.parser.parse(self.line['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' column, or None when it is blank '''
        if self.line['Date Read']:
            return dateutil.parser.parse(self.line['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list holding an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # Date added isn't the start date, but it's (perhaps) better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        return "<GoodreadsItem {!r}>".format(self.line['Title'])

    def __str__(self):
        return "{} by {}".format(self.line['Title'], self.line['Author'])

View file

@ -0,0 +1,60 @@
# Generated by Django 3.0.3 on 2020-04-21 13:47
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import fedireads.utils.fields
# Creates the ImportItem and ImportJob tables, links them with foreign keys,
# and re-creates the notification type constraint with 'IMPORT_RESULT' in it.
# NOTE(review): the ImportItem model also declares `index` and `fail_reason`
# fields, which no operation below creates -- confirm another migration adds
# them before this ships.
class Migration(migrations.Migration):
dependencies = [
('fedireads', '0031_readthrough'),
]
operations = [
migrations.CreateModel(
name='ImportItem',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('data', fedireads.utils.fields.JSONField()),
],
),
migrations.CreateModel(
name='ImportJob',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created_date', models.DateTimeField(default=django.utils.timezone.now)),
('task_id', models.CharField(max_length=100, null=True)),
],
),
# The check constraint is removed, the field's choices are altered, and the
# constraint is then added back with a value list that includes
# 'IMPORT_RESULT'.
migrations.RemoveConstraint(
model_name='notification',
name='notification_type_valid',
),
migrations.AlterField(
model_name='notification',
name='notification_type',
field=models.CharField(choices=[('FAVORITE', 'Favorite'), ('REPLY', 'Reply'), ('TAG', 'Tag'), ('FOLLOW', 'Follow'), ('FOLLOW_REQUEST', 'Follow Request'), ('BOOST', 'Boost'), ('IMPORT_RESULT', 'Import Result')], max_length=255),
),
migrations.AddConstraint(
model_name='notification',
constraint=models.CheckConstraint(check=models.Q(notification_type__in=['FAVORITE', 'REPLY', 'TAG', 'FOLLOW', 'FOLLOW_REQUEST', 'BOOST', 'IMPORT_RESULT']), name='notification_type_valid'),
),
# Foreign keys are added as separate operations, after both CreateModel
# operations above.
migrations.AddField(
model_name='importjob',
name='user',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='importitem',
name='book',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='fedireads.Book'),
),
migrations.AddField(
model_name='importitem',
name='job',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='items', to='fedireads.ImportJob'),
),
]

View file

@ -5,3 +5,4 @@ from .status import Status, Review, Comment, Quotation
from .status import Favorite, Boost, Tag, Notification, ReadThrough
from .user import User, UserFollows, UserFollowRequest, UserBlocks
from .user import FederatedServer
from .import_job import ImportJob, ImportItem

View file

@ -0,0 +1,123 @@
import re
import dateutil.parser
from django.db import models
from django.utils import timezone
from fedireads import books_manager
from fedireads.models import Edition, ReadThrough, User, Book
from fedireads.utils.fields import JSONField
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values found in the csv's 'Exclusive Shelf' column; the
# ImportItem.shelf property looks the column value up in this dict.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
def unquote_string(text):
    ''' strip the ="..." wrapper found around some csv values '''
    wrapped = re.match(r'="([^"]*)"', text)
    if wrapped is None:
        # No wrapper present: hand the value back unchanged.
        return text
    return wrapped.group(1)
def construct_search_term(title, author):
    ''' build the query string handed to the data connector '''
    # Drop parenthesised text (usually the series name) from the title.
    cleaned_title = re.sub(r'\s*\([^)]*\)\s*', '', title)
    # Open library doesn't like author initials in the search term.
    cleaned_author = re.sub(r'(\w\.)+\s*', '', author)
    return ' '.join((cleaned_title, cleaned_author))
# One csv import requested by a user; its rows are stored as ImportItem
# records that point back here through their `job` foreign key.
class ImportJob(models.Model):
# Owner of the import; CASCADE removes the job when the user is deleted.
user = models.ForeignKey(User, on_delete=models.CASCADE)
# Defaults to the time the row is created.
created_date = models.DateTimeField(default=timezone.now)
# Id of the background task processing this job; nullable, so a job can
# exist before a task id has been recorded.
task_id = models.CharField(max_length=100, null=True)
class ImportItem(models.Model):
    ''' a single stored csv row belonging to an import job

    `data` holds the raw row keyed by the goodreads column names; `book` is
    filled in by resolve() once a matching book has been found.
    '''
    job = models.ForeignKey(
        ImportJob,
        on_delete=models.CASCADE,
        related_name='items')
    # Position of the row within its job; used to order items for display.
    index = models.IntegerField()
    data = JSONField()
    book = models.ForeignKey(
        Book, on_delete=models.SET_NULL, null=True, blank=True)
    # Free-text explanation of why the item could not be imported, if any.
    fail_reason = models.TextField(null=True)

    def resolve(self):
        ''' try various ways to lookup a book '''
        # Local database first, then a remote search by isbn, then a remote
        # search by title and author; the first hit wins.
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # A blank isbn identifies nothing; don't let it match an edition
            # that merely has no isbn recorded.
            return None
        # first() rather than get(): several editions sharing an isbn must
        # not abort the whole import with MultipleObjectsReturned.
        return Edition.objects.filter(isbn=isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # Nothing to search for; resolve() falls back to title/author.
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.data['Title'],
            self.data['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with any ="..." wrapper removed '''
        return unquote_string(self.data['ISBN13'])

    @property
    def shelf(self):
        ''' the goodreads shelf field, or None when it has no equivalent '''
        # .get() so that a blank value, or a shelf missing from the mapping
        # (e.g. a user-defined one), yields None instead of a KeyError.
        return GOODREADS_SHELVES.get(self.data['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the 'My Review' column '''
        return self.data['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' column as an int; 0 when the field is blank '''
        return int(self.data['My Rating'] or 0)

    @property
    def date_added(self):
        ''' parsed 'Date Added' column, or None when it is blank '''
        if self.data['Date Added']:
            return dateutil.parser.parse(self.data['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' column, or None when it is blank '''
        if self.data['Date Read']:
            return dateutil.parser.parse(self.data['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list holding an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # date_added isn't the start date, but maybe better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        # Report the real class name rather than the old GoodreadsItem one.
        return "<ImportItem {!r}>".format(self.data['Title'])

    def __str__(self):
        return "{} by {}".format(self.data['Title'], self.data['Author'])

View file

@ -1,4 +1,5 @@
{% extends 'layout.html' %}
{% load humanize %}
{% block content %}
<div class="content-container">
<h2>Import Books from GoodReads</h2>
@ -6,7 +7,13 @@
{% csrf_token %}
{{ import_form.as_p }}
<button type="submit">Import</button>
<small>Hang tight, this may take a minute!</small>
</form>
<h2>Recent Imports</h2>
<ul>
{% for job in jobs %}
<li><a href="/import_status/{{ job.id }}">{{ job.created_date | naturaltime }}</a></li>
{% endfor %}
</ul>
</div>
{% endblock %}

View file

@ -1,10 +0,0 @@
{% extends 'layout.html' %}
{# Static confirmation message for an uploaded import; uses no context variables. #}
{% block content %}
<div id="content">
<div>
<h1>Import</h1>
Import uploaded successfully. The import is being processed.
</div>
</div>
{% endblock %}

View file

@ -0,0 +1,62 @@
{% extends 'layout.html' %}
{# Status page for one import job. Context variables used: job, task, items. #}
{% load fr_display %}
{% load humanize %}
{% block content %}
<div id="content">
<div>
<h1>Import Status</h1>
<p>
Import started: {{ job.created_date | naturaltime }}
<p>
{# task.ready covers both outcomes; task.failed adds the failure banner and task.info. #}
{% if task.ready %}
Import completed: {{ task.date_done | naturaltime }}
{% if task.failed %}
<h3><span style="background-color: #ffaaaa;">TASK FAILED</span></h3>
<p>
{{ task.info }}
{% endif %}
{% else %}
Import still in progress.
<p>
(Hit reload to update!)
{% endif %}
{# One table row per item: match indicator, csv title, csv author, linked cover. #}
<table>
<tr>
<th>
</th>
<th>
Title
</th>
<th>
Author
</th>
<th>
Book
</th>
</tr>
{% for item in items %}
<tr>
<td>
{# Check mark only when the item has a book attached. #}
{% if item.book %}✓{% endif %}
</td>
<td>
{{ item.data|dict_key:'Title' }}
</td>
<td>
{{ item.data|dict_key:'Author' }}
</td>
<td>
{% if item.book %}
<a href="{{ item.book.absolute_id }}">
{% include 'snippets/book_cover.html' with book=item.book size='small' %}
</a>
{% endif %}
</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endblock %}

View file

@ -40,6 +40,7 @@ urlpatterns = [
re_path(r'^notifications/?', views.notifications_page),
re_path(r'books/?$', views.books_page),
re_path(r'import/?$', views.import_page),
re_path(r'import_status/(\d+)/?$', views.import_status),
re_path(r'user-edit/?$', views.edit_profile_page),
# should return a ui view or activitypub json blob as requested

View file

@ -418,10 +418,10 @@ def import_data(request):
''' ingest a goodreads csv '''
form = forms.ImportForm(request.POST, request.FILES)
if form.is_valid():
goodreads_import.async_import(
job = goodreads_import.create_job(
request.user,
TextIOWrapper(request.FILES['csv_file'], encoding=request.encoding)
)
return TemplateResponse(request, 'import_results.html', {})
goodreads_import.start_import(job)
return redirect('/import_status/%d' % (job.id,))
return HttpResponseBadRequest()

View file

@ -1,13 +1,15 @@
''' views for pages you can go to in the application '''
from django.contrib.auth.decorators import login_required
from django.db.models import Avg, Q
from django.http import HttpResponseBadRequest, HttpResponseNotFound, \
from django.http import HttpResponseBadRequest, HttpResponseNotFound,\
JsonResponse
from django.core.exceptions import PermissionDenied
from django.template.response import TemplateResponse
from django.views.decorators.csrf import csrf_exempt
from fedireads import activitypub
from fedireads import forms, models, books_manager
from fedireads.tasks import app
def get_user_from_username(username):
@ -158,9 +160,24 @@ def import_page(request):
''' import history from goodreads '''
return TemplateResponse(request, 'import.html', {
'import_form': forms.ImportForm(),
'jobs': models.ImportJob.
objects.filter(user=request.user).order_by('-created_date'),
})
@login_required
def import_status(request, job_id):
    ''' status of an import job

    Responds with 404 when no job has the given id, and raises
    PermissionDenied when the job belongs to a different user.
    '''
    try:
        job = models.ImportJob.objects.get(id=job_id)
    except models.ImportJob.DoesNotExist:
        # An unknown id is the client's mistake, not a server error.
        return HttpResponseNotFound()
    if job.user != request.user:
        raise PermissionDenied
    # Handle on the background task, so the template can report its state.
    task = app.AsyncResult(job.task_id)
    return TemplateResponse(request, 'import_status.html', {
        'job': job,
        'items': job.items.order_by('index'),
        'task': task
    })
def login_page(request):
''' authentication '''
@ -531,4 +548,3 @@ def get_user_shelf_preview(user, shelf_proportions=None):
'size': shelf.books.count(),
})
return shelves