bookwyrm/bookwyrm/models/bookwyrm_import_job.py
2024-01-28 15:56:44 +11:00

463 lines
15 KiB
Python

"""Import a user from another Bookwyrm instance"""
import json
import logging
from django.db.models import FileField, JSONField, CharField
from django.utils import timezone
from django.utils.html import strip_tags
from django.contrib.postgres.fields import ArrayField as DjangoArrayField
from bookwyrm import activitypub
from bookwyrm import models
from bookwyrm.tasks import app, IMPORTS
from bookwyrm.models.job import ParentJob, ParentTask, SubTask
from bookwyrm.utils.tar import BookwyrmTarFile
logger = logging.getLogger(__name__)
class BookwyrmImportJob(ParentJob):
"""entry for a specific request for importing a bookwyrm user backup"""
archive_file = FileField(null=True, blank=True)
import_data = JSONField(null=True)
required = DjangoArrayField(CharField(max_length=50, blank=True), blank=True)
def start_job(self):
"""Start the job"""
start_import_task.delay(job_id=self.id, no_children=True)
@app.task(queue=IMPORTS, base=ParentTask)
def start_import_task(**kwargs):
"""trigger the child import tasks for each user data"""
job = BookwyrmImportJob.objects.get(id=kwargs["job_id"])
archive_file = job.archive_file
# don't start the job if it was stopped from the UI
if job.complete:
return
try:
archive_file.open("rb")
with BookwyrmTarFile.open(mode="r:gz", fileobj=archive_file) as tar:
json_filename = next(
filter(lambda n: n.startswith("archive"), tar.getnames())
)
job.import_data = json.loads(tar.read(json_filename).decode("utf-8"))
if "include_user_profile" in job.required:
update_user_profile(job.user, tar, job.import_data)
if "include_user_settings" in job.required:
update_user_settings(job.user, job.import_data)
if "include_goals" in job.required:
update_goals(job.user, job.import_data.get("goals", []))
if "include_saved_lists" in job.required:
upsert_saved_lists(job.user, job.import_data.get("saved_lists", []))
if "include_follows" in job.required:
upsert_follows(job.user, job.import_data.get("follows", []))
if "include_blocks" in job.required:
upsert_user_blocks(job.user, job.import_data.get("blocks", []))
process_books(job, tar)
job.set_status("complete")
archive_file.close()
except Exception as err: # pylint: disable=broad-except
logger.exception("User Import Job %s Failed with error: %s", job.id, err)
job.set_status("failed")
def process_books(job, tar):
"""
Process user import data related to books
We always import the books even if not assigning
them to shelves, lists etc
"""
books = job.import_data.get("books")
for data in books:
book = get_or_create_edition(data, tar)
if "include_shelves" in job.required:
upsert_shelves(book, job.user, data)
if "include_readthroughs" in job.required:
upsert_readthroughs(data.get("readthroughs"), job.user, book.id)
if "include_comments" in job.required:
upsert_statuses(
job.user, models.Comment, data.get("comments"), book.remote_id
)
if "include_quotations" in job.required:
upsert_statuses(
job.user, models.Quotation, data.get("quotations"), book.remote_id
)
if "include_reviews" in job.required:
upsert_statuses(
job.user, models.Review, data.get("reviews"), book.remote_id
)
if "include_lists" in job.required:
upsert_lists(job.user, data.get("lists"), book.id)
def get_or_create_edition(book_data, tar):
"""Take a JSON string of work and edition data,
find or create the edition and work in the database and
return an edition instance"""
edition = book_data.get("edition")
existing = models.Edition.find_existing(edition)
if existing:
return existing
# make sure we have the authors in the local DB
# replace the old author ids in the edition JSON
edition["authors"] = []
for author in book_data.get("authors"):
parsed_author = activitypub.parse(author)
instance = parsed_author.to_model(
model=models.Author, save=True, overwrite=True
)
edition["authors"].append(instance.remote_id)
# we will add the cover later from the tar
# don't try to load it from the old server
cover = edition.get("cover", {})
cover_path = cover.get("url", None)
edition["cover"] = {}
# first we need the parent work to exist
work = book_data.get("work")
work["editions"] = []
parsed_work = activitypub.parse(work)
work_instance = parsed_work.to_model(model=models.Work, save=True, overwrite=True)
# now we have a work we can add it to the edition
# and create the edition model instance
edition["work"] = work_instance.remote_id
parsed_edition = activitypub.parse(edition)
book = parsed_edition.to_model(model=models.Edition, save=True, overwrite=True)
# set the cover image from the tar
if cover_path:
tar.write_image_to_file(cover_path, book.cover)
return book
def upsert_readthroughs(data, user, book_id):
"""Take a JSON string of readthroughs and
find or create the instances in the database"""
for read_through in data:
obj = {}
keys = [
"progress_mode",
"start_date",
"finish_date",
"stopped_date",
"is_active",
]
for key in keys:
obj[key] = read_through[key]
obj["user_id"] = user.id
obj["book_id"] = book_id
existing = models.ReadThrough.objects.filter(**obj).first()
if not existing:
models.ReadThrough.objects.create(**obj)
def upsert_statuses(user, cls, data, book_remote_id):
"""Take a JSON string of a status and
find or create the instances in the database"""
for status in data:
if is_alias(
user, status["attributedTo"]
): # don't let l33t hax0rs steal other people's posts
# update ids and remove replies
status["attributedTo"] = user.remote_id
status["to"] = update_followers_address(user, status["to"])
status["cc"] = update_followers_address(user, status["cc"])
status[
"replies"
] = (
{}
) # this parses incorrectly but we can't set it without knowing the new id
status["inReplyToBook"] = book_remote_id
parsed = activitypub.parse(status)
if not status_already_exists(
user, parsed
): # don't duplicate posts on multiple import
instance = parsed.to_model(model=cls, save=True, overwrite=True)
for val in [
"progress",
"progress_mode",
"position",
"endposition",
"position_mode",
]:
if status.get(val):
instance.val = status[val]
instance.remote_id = instance.get_remote_id() # update the remote_id
instance.save() # save and broadcast
else:
logger.warning("User does not have permission to import statuses")
def upsert_lists(user, lists, book_id):
"""Take a list of objects each containing
a list and list item as AP objects
Because we are creating new IDs we can't assume the id
will exist or be accurate, so we only use to_model for
adding new items after checking whether they exist .
"""
book = models.Edition.objects.get(id=book_id)
for blist in lists:
booklist = models.List.objects.filter(name=blist["name"], user=user).first()
if not booklist:
blist["owner"] = user.remote_id
parsed = activitypub.parse(blist)
booklist = parsed.to_model(model=models.List, save=True, overwrite=True)
booklist.privacy = blist["privacy"]
booklist.save()
item = models.ListItem.objects.filter(book=book, book_list=booklist).exists()
if not item:
count = booklist.books.count()
models.ListItem.objects.create(
book=book,
book_list=booklist,
user=user,
notes=blist["list_item"]["notes"],
approved=blist["list_item"]["approved"],
order=count + 1,
)
def upsert_shelves(book, user, book_data):
"""Take shelf JSON objects and create
DB entries if they don't already exist"""
shelves = book_data["shelves"]
for shelf in shelves:
book_shelf = models.Shelf.objects.filter(name=shelf["name"], user=user).first()
if not book_shelf:
book_shelf = models.Shelf.objects.create(name=shelf["name"], user=user)
# add the book as a ShelfBook if needed
if not models.ShelfBook.objects.filter(
book=book, shelf=book_shelf, user=user
).exists():
models.ShelfBook.objects.create(
book=book, shelf=book_shelf, user=user, shelved_date=timezone.now()
)
def update_user_profile(user, tar, data):
"""update the user's profile from import data"""
name = data.get("name", None)
username = data.get("preferredUsername")
user.name = name if name else username
user.summary = strip_tags(data.get("summary", None))
user.save(update_fields=["name", "summary"])
if data["icon"].get("url"):
avatar_filename = next(filter(lambda n: n.startswith("avatar"), tar.getnames()))
tar.write_image_to_file(avatar_filename, user.avatar)
def update_user_settings(user, data):
"""update the user's settings from import data"""
update_fields = ["manually_approves_followers", "hide_follows", "discoverable"]
ap_fields = [
("manuallyApprovesFollowers", "manually_approves_followers"),
("hideFollows", "hide_follows"),
("discoverable", "discoverable"),
]
for (ap_field, bw_field) in ap_fields:
setattr(user, bw_field, data[ap_field])
bw_fields = [
"show_goal",
"show_suggested_users",
"default_post_privacy",
"preferred_timezone",
]
for field in bw_fields:
update_fields.append(field)
setattr(user, field, data["settings"][field])
user.save(update_fields=update_fields)
@app.task(queue=IMPORTS, base=SubTask)
def update_user_settings_task(job_id):
"""wrapper task for user's settings import"""
parent_job = BookwyrmImportJob.objects.get(id=job_id)
return update_user_settings(parent_job.user, parent_job.import_data.get("user"))
def update_goals(user, data):
"""update the user's goals from import data"""
for goal in data:
# edit the existing goal if there is one
existing = models.AnnualGoal.objects.filter(
year=goal["year"], user=user
).first()
if existing:
for k in goal.keys():
setattr(existing, k, goal[k])
existing.save()
else:
goal["user"] = user
models.AnnualGoal.objects.create(**goal)
@app.task(queue=IMPORTS, base=SubTask)
def update_goals_task(job_id):
"""wrapper task for user's goals import"""
parent_job = BookwyrmImportJob.objects.get(id=job_id)
return update_goals(parent_job.user, parent_job.import_data.get("goals"))
def upsert_saved_lists(user, values):
"""Take a list of remote ids and add as saved lists"""
for remote_id in values:
book_list = activitypub.resolve_remote_id(remote_id, models.List)
if book_list:
user.saved_lists.add(book_list)
@app.task(queue=IMPORTS, base=SubTask)
def upsert_saved_lists_task(job_id):
"""wrapper task for user's saved lists import"""
parent_job = BookwyrmImportJob.objects.get(id=job_id)
return upsert_saved_lists(
parent_job.user, parent_job.import_data.get("saved_lists")
)
def upsert_follows(user, values):
"""Take a list of remote ids and add as follows"""
for remote_id in values:
followee = activitypub.resolve_remote_id(remote_id, models.User)
if followee:
(follow_request, created,) = models.UserFollowRequest.objects.get_or_create(
user_subject=user,
user_object=followee,
)
if not created:
# this request probably failed to connect with the remote
# and should save to trigger a re-broadcast
follow_request.save()
@app.task(queue=IMPORTS, base=SubTask)
def upsert_follows_task(job_id):
"""wrapper task for user's follows import"""
parent_job = BookwyrmImportJob.objects.get(id=job_id)
return upsert_follows(parent_job.user, parent_job.import_data.get("follows"))
def upsert_user_blocks(user, user_ids):
"""block users"""
for user_id in user_ids:
user_object = activitypub.resolve_remote_id(user_id, models.User)
if user_object:
exists = models.UserBlocks.objects.filter(
user_subject=user, user_object=user_object
).exists()
if not exists:
models.UserBlocks.objects.create(
user_subject=user, user_object=user_object
)
# remove the blocked users's lists from the groups
models.List.remove_from_group(user, user_object)
# remove the blocked user from all blocker's owned groups
models.GroupMember.remove(user, user_object)
@app.task(queue=IMPORTS, base=SubTask)
def upsert_user_blocks_task(job_id):
"""wrapper task for user's blocks import"""
parent_job = BookwyrmImportJob.objects.get(id=job_id)
return upsert_user_blocks(
parent_job.user, parent_job.import_data.get("blocked_users")
)
def update_followers_address(user, field):
"""statuses to or cc followers need to have the followers
address updated to the new local user"""
for i, audience in enumerate(field):
if audience.rsplit("/")[-1] == "followers":
field[i] = user.followers_url
return field
def is_alias(user, remote_id):
"""check that the user is listed as movedTo or also_known_as
in the remote user's profile"""
remote_user = activitypub.resolve_remote_id(
remote_id=remote_id, model=models.User, save=False
)
if remote_user:
if remote_user.moved_to:
return user.remote_id == remote_user.moved_to
if remote_user.also_known_as:
return user in remote_user.also_known_as.all()
return False
def status_already_exists(user, status):
"""check whether this status has already been published
by this user. We can't rely on to_model() because it
only matches on remote_id, which we have to change
*after* saving because it needs the primary key (id)"""
return models.Status.objects.filter(
user=user, content=status.content, published_date=status.published
).exists()