From 2bb9a855917c3c2ecca9cc32efedf30edc87ea12 Mon Sep 17 00:00:00 2001 From: Hugh Rundle Date: Sun, 28 Jan 2024 15:07:55 +1100 Subject: [PATCH] various fixes - use signed url for s3 downloads - re-arrange tar.gz file to match original - delete all working files after tarring - import from s3 export TODO - check local export and import - fix error when avatar missing - deal with multiple s3 storage options (e.g. Azure) --- .env.example | 1 + ...114_0055.py => 0193_auto_20240128_0249.py} | 4 +- bookwyrm/models/bookwyrm_export_job.py | 46 +++++++++++------ bookwyrm/models/bookwyrm_import_job.py | 13 +++-- .../templates/preferences/export-user.html | 50 ++++++++++++------- bookwyrm/templatetags/utilities.py | 22 +++----- bookwyrm/views/preferences/export.py | 34 ++++++++++++- 7 files changed, 114 insertions(+), 56 deletions(-) rename bookwyrm/migrations/{0192_auto_20240114_0055.py => 0193_auto_20240128_0249.py} (96%) diff --git a/.env.example b/.env.example index 20ce8240b..497d05779 100644 --- a/.env.example +++ b/.env.example @@ -81,6 +81,7 @@ AWS_SECRET_ACCESS_KEY= # AWS_S3_CUSTOM_DOMAIN=None # "example-bucket-name.s3.fr-par.scw.cloud" # AWS_S3_REGION_NAME=None # "fr-par" # AWS_S3_ENDPOINT_URL=None # "https://s3.fr-par.scw.cloud" +# S3_ENDPOINT_URL=None # same as AWS_S3_ENDPOINT_URL - needed for non-AWS for user exports # Commented are example values if you use Azure Blob Storage # USE_AZURE=true diff --git a/bookwyrm/migrations/0192_auto_20240114_0055.py b/bookwyrm/migrations/0193_auto_20240128_0249.py similarity index 96% rename from bookwyrm/migrations/0192_auto_20240114_0055.py rename to bookwyrm/migrations/0193_auto_20240128_0249.py index 824439728..c1c0527b9 100644 --- a/bookwyrm/migrations/0192_auto_20240114_0055.py +++ b/bookwyrm/migrations/0193_auto_20240128_0249.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.23 on 2024-01-14 00:55 +# Generated by Django 3.2.23 on 2024-01-28 02:49 import bookwyrm.storage_backends import django.core.serializers.json @@ -9,7 +9,7 @@ import django.db.models.deletion class Migration(migrations.Migration): dependencies = [ - ("bookwyrm", "0191_merge_20240102_0326"), + ("bookwyrm", "0192_sitesettings_user_exports_enabled"), ] operations = [ diff --git a/bookwyrm/models/bookwyrm_export_job.py b/bookwyrm/models/bookwyrm_export_job.py index 4b31b0ddf..384e71701 100644 --- a/bookwyrm/models/bookwyrm_export_job.py +++ b/bookwyrm/models/bookwyrm_export_job.py @@ -5,6 +5,7 @@ import logging from uuid import uuid4 from s3_tar import S3Tar +from storages.backends.s3boto3 import S3Boto3Storage from django.db.models import CASCADE, BooleanField, FileField, ForeignKey, JSONField from django.db.models import Q @@ -57,7 +58,6 @@ class BookwyrmExportJob(ParentJob): if not self.complete and self.has_completed: if not self.json_completed: - try: self.json_completed = True self.save(update_fields=["json_completed"]) @@ -193,8 +193,7 @@ class AddFileToTar(ChildJob): # NOTE we are doing this all in one big job, which has the potential to block a thread # This is because we need to refer to the same s3_job or BookwyrmTarFile whilst writing - # Alternatives using a series of jobs in a loop would be beter - # but Hugh couldn't make that work + # Using a series of jobs in a loop would be better if possible try: export_data = self.parent_export_job.export_data @@ -203,29 +202,41 @@ class AddFileToTar(ChildJob): user = self.parent_export_job.user editions = get_books_for_user(user) + # filenames for later + export_data_original = str(export_data) + filename = str(self.parent_export_job.task_id) + if settings.USE_S3: s3_job = S3Tar( settings.AWS_STORAGE_BUCKET_NAME, - f"exports/{str(self.parent_export_job.task_id)}.tar.gz", + f"exports/{filename}.tar.gz", ) - # TODO: will need to get it to the user - # from this secure part of the bucket - export_data.save("archive.json", ContentFile(json_data.encode("utf-8"))) - + # save json file + export_data.save( + f"archive_{filename}.json", ContentFile(json_data.encode("utf-8")) + ) s3_job.add_file(f"exports/{export_data.name}") - s3_job.add_file(f"images/{user.avatar.name}", folder="avatar") + + # save image file + file_type = user.avatar.name.rsplit(".", maxsplit=1)[-1] + export_data.save(f"avatar_{filename}.{file_type}", user.avatar) + s3_job.add_file(f"exports/{export_data.name}") + for book in editions: if getattr(book, "cover", False): cover_name = f"images/{book.cover.name}" s3_job.add_file(cover_name, folder="covers") s3_job.tar() - # delete export json as soon as it's tarred - # TODO: there is probably a better way to do this - # Currently this merely makes the file empty even though - # we're using save=False - export_data.delete(save=False) + + # delete child files - we don't need them any more + s3_storage = S3Boto3Storage(querystring_auth=True, custom_domain=None) + S3Boto3Storage.delete(s3_storage, f"exports/{export_data_original}") + S3Boto3Storage.delete(s3_storage, f"exports/archive_{filename}.json") + S3Boto3Storage.delete( + s3_storage, f"exports/avatar_{filename}.{file_type}" + ) else: export_data.open("wb") @@ -266,7 +277,14 @@ def start_export_task(**kwargs): # prepare the initial file and base json job.export_data = ContentFile(b"", str(uuid4())) + # BUG: this throws a MISSING class error if there is no avatar + # #3096 may fix it + if not job.user.avatar: + job.user.avatar = "" + job.user.save() + job.export_json = job.user.to_activity() + logger.info(job.export_json) job.save(update_fields=["export_data", "export_json"]) # let's go diff --git a/bookwyrm/models/bookwyrm_import_job.py b/bookwyrm/models/bookwyrm_import_job.py index 9a11fd932..02af25d12 100644 --- a/bookwyrm/models/bookwyrm_import_job.py +++ b/bookwyrm/models/bookwyrm_import_job.py @@ -42,20 +42,23 @@ def start_import_task(**kwargs): try: archive_file.open("rb") with BookwyrmTarFile.open(mode="r:gz", fileobj=archive_file) as tar: - job.import_data = json.loads(tar.read("archive.json").decode("utf-8")) + json_filename = next( + filter(lambda n: n.startswith("archive"), tar.getnames()) + ) + job.import_data = json.loads(tar.read(json_filename).decode("utf-8")) if "include_user_profile" in job.required: update_user_profile(job.user, tar, job.import_data) if "include_user_settings" in job.required: update_user_settings(job.user, job.import_data) if "include_goals" in job.required: - update_goals(job.user, job.import_data.get("goals")) + update_goals(job.user, job.import_data.get("goals", [])) if "include_saved_lists" in job.required: - upsert_saved_lists(job.user, job.import_data.get("saved_lists")) + upsert_saved_lists(job.user, job.import_data.get("saved_lists", [])) if "include_follows" in job.required: - upsert_follows(job.user, job.import_data.get("follows")) + upsert_follows(job.user, job.import_data.get("follows", [])) if "include_blocks" in job.required: - upsert_user_blocks(job.user, job.import_data.get("blocks")) + upsert_user_blocks(job.user, job.import_data.get("blocks", [])) process_books(job, tar) diff --git a/bookwyrm/templates/preferences/export-user.html b/bookwyrm/templates/preferences/export-user.html index cd3119e3e..764d51db9 100644 --- a/bookwyrm/templates/preferences/export-user.html +++ b/bookwyrm/templates/preferences/export-user.html @@ -92,25 +92,25 @@ {% endif %} - {% for job in jobs %} + {% for export in jobs %} - {{ job.updated_date }} + {{ export.job.updated_date }} - {% if job.status %} - {{ job.status }} - {{ job.status_display }} - {% elif job.complete %} + {% if export.job.status %} + {{ export.job.status }} + {{ export.job.status_display }} + {% elif export.job.complete %} {% trans "Complete" %} {% else %} {% trans "Active" %} @@ -118,18 +118,30 @@ - {{ job.export_data|get_file_size }} + {{ export.size|get_file_size }} - {% if job.complete and not job.status == "stopped" and not job.status == "failed" %} -

- - - - {% trans "Download your export" %} - - -

+ {% if export.job.complete and not export.job.status == "stopped" and not export.job.status == "failed" %} + {% if export.url%} +

+ + + + {% trans "Download your export" %} + + +

+ {% else %} +

+ + + + {% trans "Download your export" %} + + +

+ {% endif %} + {% endif %} diff --git a/bookwyrm/templatetags/utilities.py b/bookwyrm/templatetags/utilities.py index 225510085..e04c9f33a 100644 --- a/bookwyrm/templatetags/utilities.py +++ b/bookwyrm/templatetags/utilities.py @@ -130,23 +130,17 @@ def id_to_username(user_id): @register.filter(name="get_file_size") -def get_file_size(file): +def get_file_size(raw_size): """display the size of a file in human readable terms""" try: - # TODO: this obviously isn't a proper solution - # boto storages do not implement 'path' - if not USE_S3: - raw_size = os.stat(file.path).st_size - if raw_size < 1024: - return f"{raw_size} bytes" - if raw_size < 1024**2: - return f"{raw_size/1024:.2f} KB" - if raw_size < 1024**3: - return f"{raw_size/1024**2:.2f} MB" - return f"{raw_size/1024**3:.2f} GB" - - return "" + if raw_size < 1024: + return f"{raw_size} bytes" + if raw_size < 1024**2: + return f"{raw_size/1024:.2f} KB" + if raw_size < 1024**3: + return f"{raw_size/1024**2:.2f} MB" + return f"{raw_size/1024**3:.2f} GB" except Exception as error: # pylint: disable=broad-except print(error) diff --git a/bookwyrm/views/preferences/export.py b/bookwyrm/views/preferences/export.py index 33c90291d..bd32d45ad 100644 --- a/bookwyrm/views/preferences/export.py +++ b/bookwyrm/views/preferences/export.py @@ -13,9 +13,11 @@ from django.views import View from django.utils.decorators import method_decorator from django.shortcuts import redirect +from storages.backends.s3boto3 import S3Boto3Storage + from bookwyrm import models from bookwyrm.models.bookwyrm_export_job import BookwyrmExportJob -from bookwyrm.settings import PAGE_LENGTH +from bookwyrm import settings # pylint: disable=no-self-use,too-many-locals @@ -152,6 +154,34 @@ class ExportUser(View): jobs = BookwyrmExportJob.objects.filter(user=request.user).order_by( "-created_date" ) + + exports = [] + for job in jobs: + export = {"job": job} + + if settings.USE_S3: + # make custom_domain None so we can sign the url (https://github.com/jschneier/django-storages/issues/944) + storage = S3Boto3Storage(querystring_auth=True, custom_domain=None) + + # for s3 we download directly from s3, so we need a signed url + export["url"] = S3Boto3Storage.url( + storage, f"/exports/{job.task_id}.tar.gz", expire=900 + ) # temporarily downloadable file, expires after 5 minutes + + # for s3 we create a new tar file in s3, so we need to check the size of _that_ file + try: + export["size"] = S3Boto3Storage.size( + storage, f"exports/{job.task_id}.tar.gz" + ) + except Exception: + export["size"] = 0 + + else: + # for local storage export_data is the tar file + export["size"] = job.export_data.size if job.export_data else 0 + + exports.append(export) + site = models.SiteSettings.objects.get() hours = site.user_import_time_limit allowed = ( @@ -162,7 +192,7 @@ class ExportUser(View): next_available = ( jobs.first().created_date + timedelta(hours=hours) if not allowed else False ) - paginated = Paginator(jobs, PAGE_LENGTH) + paginated = Paginator(exports, settings.PAGE_LENGTH) page = paginated.get_page(request.GET.get("page")) data = { "jobs": page,