diff --git a/bookwyrm/migrations/0197_author_search_vector.py b/bookwyrm/migrations/0197_author_search_vector.py new file mode 100644 index 000000000..baa540cc0 --- /dev/null +++ b/bookwyrm/migrations/0197_author_search_vector.py @@ -0,0 +1,41 @@ +# Generated by Django 3.2.25 on 2024-03-20 15:15 + +import django.contrib.postgres.indexes +from django.db import migrations +import pgtrigger.compiler +import pgtrigger.migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookwyrm", "0196_merge_pr3134_into_main"), + ] + + operations = [ + migrations.AddIndex( + model_name="author", + index=django.contrib.postgres.indexes.GinIndex( + fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin" + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="author", + trigger=pgtrigger.compiler.Trigger( + name="update_search_vector_on_author_edit", + sql=pgtrigger.compiler.UpsertTriggerSql( + func="new.search_vector := setweight(to_tsvector('simple', new.name), 'A') || setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');RETURN NEW;", + hash="b97919016236d74d0ade51a0769a173ea269da64", + operation='INSERT OR UPDATE OF "name", "aliases", "search_vector"', + pgid="pgtrigger_update_search_vector_on_author_edit_c61cb", + table="bookwyrm_author", + when="BEFORE", + ), + ), + ), + migrations.RunSQL( + # Calculate search vector for all Authors. + sql="UPDATE bookwyrm_author SET search_vector = NULL;", + reverse_sql="UPDATE bookwyrm_author SET search_vector = NULL;", + ), + ] diff --git a/bookwyrm/migrations/0198_book_search_vector_author_aliases.py b/bookwyrm/migrations/0198_book_search_vector_author_aliases.py new file mode 100644 index 000000000..491cb64bb --- /dev/null +++ b/bookwyrm/migrations/0198_book_search_vector_author_aliases.py @@ -0,0 +1,57 @@ +# Generated by Django 3.2.25 on 2024-03-20 15:52 + +from django.db import migrations +import pgtrigger.compiler +import pgtrigger.migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookwyrm", "0197_author_search_vector"), + ] + + operations = [ + pgtrigger.migrations.RemoveTrigger( + model_name="author", + name="reset_search_vector_on_author_edit", + ), + pgtrigger.migrations.RemoveTrigger( + model_name="book", + name="update_search_vector_on_book_edit", + ), + pgtrigger.migrations.AddTrigger( + model_name="author", + trigger=pgtrigger.compiler.Trigger( + name="reset_book_search_vector_on_author_edit", + sql=pgtrigger.compiler.UpsertTriggerSql( + func="WITH updated_books AS (SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id ) UPDATE bookwyrm_book SET search_vector = '' FROM updated_books WHERE id = updated_books.book_id;RETURN NEW;", + hash="68422c0f29879c5802b82159dde45297eff53e73", + operation='UPDATE OF "name", "aliases"', + pgid="pgtrigger_reset_book_search_vector_on_author_edit_a50c7", + table="bookwyrm_author", + when="AFTER", + ), + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="book", + trigger=pgtrigger.compiler.Trigger( + name="update_search_vector_on_book_edit", + sql=pgtrigger.compiler.UpsertTriggerSql( + func="WITH author_names AS (SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases FROM bookwyrm_author LEFT JOIN bookwyrm_book_authors ON bookwyrm_author.id = bookwyrm_book_authors.author_id WHERE bookwyrm_book_authors.book_id = new.id ) SELECT setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(name_and_aliases), ' '), '')), 'C') FROM author_names) || setweight(to_tsvector('english', coalesce(new.series, '')), 'D') INTO new.search_vector;RETURN NEW;", + hash="9324f5ca76a6f5e63931881d62d11da11f595b2c", + operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"', + pgid="pgtrigger_update_search_vector_on_book_edit_bec58", + table="bookwyrm_book", + when="BEFORE", + ), + ), + ), + migrations.RunSQL( + # Recalculate search vector for all Books because it now includes + # Author aliases. + sql="UPDATE bookwyrm_book SET search_vector = NULL;", + reverse_sql="UPDATE bookwyrm_book SET search_vector = NULL;", + ), + ] diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index 154b00ccb..9dc3962ad 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -3,6 +3,7 @@ import re from typing import Tuple, Any from django.db import models +from django.contrib.postgres.indexes import GinIndex import pgtrigger from bookwyrm import activitypub @@ -71,11 +72,29 @@ class Author(BookDataModel): class Meta: """sets up indexes and triggers""" + # pylint: disable=line-too-long + + indexes = (GinIndex(fields=["search_vector"]),) triggers = [ pgtrigger.Trigger( - name="reset_search_vector_on_author_edit", + name="update_search_vector_on_author_edit", + when=pgtrigger.Before, + operation=pgtrigger.Insert + | pgtrigger.UpdateOf("name", "aliases", "search_vector"), + func=format_trigger( + """new.search_vector := + -- author name, with priority A + setweight(to_tsvector('simple', new.name), 'A') || + -- author aliases, with priority B + setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B'); + RETURN new; + """ + ), + ), + pgtrigger.Trigger( + name="reset_book_search_vector_on_author_edit", when=pgtrigger.After, - operation=pgtrigger.UpdateOf("name"), + operation=pgtrigger.UpdateOf("name", "aliases"), func=format_trigger( """WITH updated_books AS ( SELECT book_id @@ -89,7 +108,7 @@ class Author(BookDataModel): RETURN new; """ ), - ) + ), ] activity_serializer = activitypub.Author diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index e167e2138..5dba6532f 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -246,24 +246,34 @@ class Book(BookDataModel): operation=pgtrigger.Insert | pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"), func=format_trigger( - """new.search_vector := - -- title, with priority A (parse in English, default to simple if empty) - setweight(COALESCE(nullif( - to_tsvector('english', new.title), ''), - to_tsvector('simple', new.title)), 'A') || - -- subtitle, with priority B (always in English?) - setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') || - -- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063) - (SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C') - FROM bookwyrm_author - LEFT JOIN bookwyrm_book_authors - ON bookwyrm_author.id = bookwyrm_book_authors.author_id - WHERE bookwyrm_book_authors.book_id = new.id - ) || - --- last: series name, with lowest priority - setweight(to_tsvector('english', COALESCE(new.series, '')), 'D'); - RETURN new; - """ + """ + WITH author_names AS ( + SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases + FROM bookwyrm_author + LEFT JOIN bookwyrm_book_authors + ON bookwyrm_author.id = bookwyrm_book_authors.author_id + WHERE bookwyrm_book_authors.book_id = new.id + ) + SELECT + -- title, with priority A (parse in English, default to simple if empty) + setweight(COALESCE(nullif( + to_tsvector('english', new.title), ''), + to_tsvector('simple', new.title)), 'A') || + + -- subtitle, with priority B (always in English?) + setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') || + + -- list of authors names and aliases (with priority C) + (SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(name_and_aliases), ' '), '')), 'C') + FROM author_names + ) || + + --- last: series name, with lowest priority + setweight(to_tsvector('english', COALESCE(new.series, '')), 'D') + + INTO new.search_vector; + RETURN new; + """ ), ) ] diff --git a/bookwyrm/tests/test_author_search.py b/bookwyrm/tests/test_author_search.py new file mode 100644 index 000000000..e6b20a2c6 --- /dev/null +++ b/bookwyrm/tests/test_author_search.py @@ -0,0 +1,87 @@ +""" test searching for authors """ +from django.test import TestCase + +from django.contrib.postgres.search import SearchRank, SearchQuery +from django.db.models import F + +from bookwyrm import models + + +class AuthorSearch(TestCase): + """look for some authors""" + + @classmethod + def setUpTestData(cls): + """we need basic test data and mocks""" + cls.bob = models.Author.objects.create( + name="Bob", aliases=["Robertus", "Alice"] + ) + cls.alice = models.Author.objects.create(name="Alice") + + def test_search(self): + """search for an author in the db""" + results = self._search("Bob") + self.assertEqual(len(results), 1) + self.assertEqual(results[0], self.bob) + + def test_alias_priority(self): + """aliases should be included, with lower priority than name""" + results = self._search("Alice") + self.assertEqual(len(results), 2) + self.assertEqual(results[0], self.alice) + + def _search_first(self, query): + """wrapper around search_title_author""" + return self._search(query, return_first=True) + + @staticmethod + def _search(query, *, return_first=False): + """author search""" + search_query = SearchQuery(query, config="simple") + min_confidence = 0 + + results = ( + models.Author.objects.filter(search_vector=search_query) + .annotate(rank=SearchRank(F("search_vector"), search_query)) + .filter(rank__gt=min_confidence) + .order_by("-rank") + ) + if return_first: + return results.first() + return results + + +class SearchVectorTest(TestCase): + """check search_vector is computed correctly""" + + def test_search_vector_simple(self): + """simplest search vector""" + author = self._create_author("Mary") + self.assertEqual(author.search_vector, "'mary':1A") + + def test_search_vector_aliases(self): + """author aliases should be included with lower priority""" + author = self._create_author("Mary", aliases=["Maria", "Example"]) + self.assertEqual(author.search_vector, "'example':3B 'maria':2B 'mary':1A") + + def test_search_vector_parse_author(self): + """author name and alias is not stem'd or affected by stop words""" + author = self._create_author("Writes", aliases=["Reads"]) + self.assertEqual(author.search_vector, "'reads':2B 'writes':1A") + + def test_search_vector_on_update(self): + """make sure that search_vector is being set correctly on edit""" + author = self._create_author("Mary") + self.assertEqual(author.search_vector, "'mary':1A") + + author.name = "Example" + author.save(broadcast=False) + author.refresh_from_db() + self.assertEqual(author.search_vector, "'example':1A") + + @staticmethod + def _create_author(name, /, *, aliases=None): + """quickly create an author""" + author = models.Author.objects.create(name=name, aliases=aliases or []) + author.refresh_from_db() + return author diff --git a/bookwyrm/tests/test_book_search.py b/bookwyrm/tests/test_book_search.py index 0721cb142..b8c1ee1d3 100644 --- a/bookwyrm/tests/test_book_search.py +++ b/bookwyrm/tests/test_book_search.py @@ -14,6 +14,13 @@ class BookSearch(TestCase): @classmethod def setUpTestData(self): # pylint: disable=bad-classmethod-argument """we need basic test data and mocks""" + self.first_author = models.Author.objects.create( + name="Author One", aliases=["The First"] + ) + self.second_author = models.Author.objects.create( + name="Author Two", aliases=["The Second"] + ) + self.work = models.Work.objects.create(title="Example Work") self.first_edition = models.Edition.objects.create( @@ -23,6 +30,8 @@ class BookSearch(TestCase): physical_format="Paperback", published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc), ) + self.first_edition.authors.add(self.first_author) + self.second_edition = models.Edition.objects.create( title="Another Edition", parent_work=self.work, @@ -30,19 +39,34 @@ class BookSearch(TestCase): openlibrary_key="hello", pages=150, ) + self.second_edition.authors.add(self.first_author) + self.second_edition.authors.add(self.second_author) + self.third_edition = models.Edition.objects.create( title="Another Edition with annoying ISBN", parent_work=self.work, isbn_10="022222222X", ) + self.third_edition.authors.add(self.first_author) + self.third_edition.authors.add(self.second_author) def test_search(self): """search for a book in the db""" - # title/author + # title results = book_search.search("Example") self.assertEqual(len(results), 1) self.assertEqual(results[0], self.first_edition) + # author + results = book_search.search("One") + self.assertEqual(len(results), 1) + self.assertEqual(results[0], self.first_edition) + + # author alias + results = book_search.search("First") + self.assertEqual(len(results), 1) + self.assertEqual(results[0], self.first_edition) + # isbn results = book_search.search("0000000000") self.assertEqual(len(results), 1) @@ -155,8 +179,17 @@ class SearchVectorTest(TestCase): """search vector with subtitle and series""" # for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the # indexes in the search vector. (priority "D" is the default, and never shown.) - book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch") - self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C") + book = self._create_book( + "Book", + "Mary", + subtitle="Long", + series="Bunch", + author_alias=["Maria", "Mary Ann"], + ) + self.assertEqual( + book.search_vector, + "'ann':6C 'book':1A 'bunch':7 'long':2B 'maria':4C 'mary':3C,5C", + ) def test_search_vector_parse_book(self): """book parts are parsed in english""" @@ -170,8 +203,8 @@ class SearchVectorTest(TestCase): def test_search_vector_parse_author(self): """author name is not stem'd or affected by stop words""" - book = self._create_book("Writing", "Writes") - self.assertEqual(book.search_vector, "'write':1A 'writes':2C") + book = self._create_book("Writing", "Writes", author_alias=["Reads"]) + self.assertEqual(book.search_vector, "'reads':3C 'write':1A 'writes':2C") book = self._create_book("She Is Writing", "She Writes") self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C") @@ -218,6 +251,13 @@ class SearchVectorTest(TestCase): book.refresh_from_db() self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A") + author.aliases = ["Example"] + author.save(broadcast=False) + book.refresh_from_db() + self.assertEqual( + book.search_vector, "'example':5C 'goodby':3A 'jeremy':4C 'long':2A" + ) + def test_search_vector_on_author_delete(self): """update search when an author is deleted""" book = self._create_book("The Long Goodbye", "The Rays") @@ -274,7 +314,7 @@ class SearchVectorUpdates(TestCase): def setUp(self): """we need basic test data and mocks""" self.work = models.Work.objects.create(title="This Work") - self.author = models.Author.objects.create(name="Name") + self.author = models.Author.objects.create(name="Name", aliases=["Alias"]) self.edition = models.Edition.objects.create( title="First Edition of Work", subtitle="Some Extra Words Are Good", @@ -363,13 +403,18 @@ class SearchVectorUpdates(TestCase): def test_search_after_updated_author_name(self): """book found under new author name""" self.assertEqual(self.edition, self._search_first("Name")) + self.assertEqual(self.edition, self._search_first("Alias")) self.assertFalse(self._search("Identifier")) + self.assertFalse(self._search("Another")) self.author.name = "Identifier" + self.author.aliases = ["Another"] self.author.save(broadcast=False) self.assertFalse(self._search("Name")) + self.assertFalse(self._search("Aliases")) self.assertEqual(self.edition, self._search_first("Identifier")) + self.assertEqual(self.edition, self._search_first("Another")) self.assertEqual(self.edition, self._search_first("Work")) def _search_first(self, query): diff --git a/bookwyrm/views/search.py b/bookwyrm/views/search.py index e1598056f..13695a7d4 100644 --- a/bookwyrm/views/search.py +++ b/bookwyrm/views/search.py @@ -2,8 +2,9 @@ import re -from django.contrib.postgres.search import TrigramSimilarity +from django.contrib.postgres.search import TrigramSimilarity, SearchRank, SearchQuery from django.core.paginator import Paginator +from django.db.models import F from django.db.models.functions import Greatest from django.http import JsonResponse from django.template.response import TemplateResponse @@ -94,26 +95,28 @@ def book_search(request): def author_search(request): """search for an author""" - query = request.GET.get("q") - query = query.strip() - data = {"type": "author", "query": query} + query = request.GET.get("q").strip() + search_query = SearchQuery(query, config="simple") + min_confidence = 0 results = ( - models.Author.objects.annotate( - similarity=TrigramSimilarity("name", query), - ) - .filter( - similarity__gt=0.1, - ) - .order_by("-similarity") + models.Author.objects.filter(search_vector=search_query) + .annotate(rank=SearchRank(F("search_vector"), search_query)) + .filter(rank__gt=min_confidence) + .order_by("-rank") ) paginated = Paginator(results, PAGE_LENGTH) page = paginated.get_page(request.GET.get("page")) - data["results"] = page - data["page_range"] = paginated.get_elided_page_range( - page.number, on_each_side=2, on_ends=1 - ) + + data = { + "type": "author", + "query": query, + "results": page, + "page_range": paginated.get_elided_page_range( + page.number, on_each_side=2, on_ends=1 + ), + } return TemplateResponse(request, "search/author.html", data)