From fc3b609adafb57f37f7947bfa448f5c1ec3a635b Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 08:42:48 -0700 Subject: [PATCH 01/17] Use general ratings rather than privacy filtered The original system customized how a rating is displayed to every user based on the privacy settings of the reviews and, relatedly, who the user follows. This is cool, but the query is too complicated to load in sessions, and the initial load, which isn't mitigated by caching, is too much and causes timeouts for many users. Also the cache clearing wasn't working correctly because I put in a wildcard, which does not work. --- bookwyrm/models/status.py | 2 +- bookwyrm/templatetags/rating_tags.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bookwyrm/models/status.py b/bookwyrm/models/status.py index 1e31de774..f37076b5c 100644 --- a/bookwyrm/models/status.py +++ b/bookwyrm/models/status.py @@ -374,7 +374,7 @@ class Review(BookStatus): def save(self, *args, **kwargs): """clear rating caches""" if self.book.parent_work: - cache.delete(f"book-rating-{self.book.parent_work.id}-*") + cache.delete(f"book-rating-{self.book.parent_work.id}") super().save(*args, **kwargs) diff --git a/bookwyrm/templatetags/rating_tags.py b/bookwyrm/templatetags/rating_tags.py index 670599e25..cc23d3308 100644 --- a/bookwyrm/templatetags/rating_tags.py +++ b/bookwyrm/templatetags/rating_tags.py @@ -13,10 +13,10 @@ register = template.Library() def get_rating(book, user): """get the overall rating of a book""" return cache.get_or_set( - f"book-rating-{book.parent_work.id}-{user.id}", - lambda u, b: models.Review.privacy_filter(u) - .filter(book__parent_work__editions=b, rating__gt=0) - .aggregate(Avg("rating"))["rating__avg"] + f"book-rating-{book.parent_work.id}", + lambda u, b: models.Review.objects.filter( + book__parent_work__editions=b, rating__gt=0 + ).aggregate(Avg("rating"))["rating__avg"] or 0, user, book, From fb3c7205aff00dc3f636336dadc2fec8a082b1dc Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 09:17:51 -0700 Subject: [PATCH 02/17] Updates unit tests --- bookwyrm/tests/templatetags/test_rating_tags.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bookwyrm/tests/templatetags/test_rating_tags.py b/bookwyrm/tests/templatetags/test_rating_tags.py index c00f20726..a06ee9402 100644 --- a/bookwyrm/tests/templatetags/test_rating_tags.py +++ b/bookwyrm/tests/templatetags/test_rating_tags.py @@ -40,7 +40,8 @@ class RatingTags(TestCase): @patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async") def test_get_rating(self, *_): - """privacy filtered rating""" + """privacy filtered rating. 
Commented versions are how it ought to work with + subjective ratings, which are currenly not used for performance reasons.""" # follows-only: not included models.ReviewRating.objects.create( user=self.remote_user, @@ -48,7 +49,8 @@ class RatingTags(TestCase): book=self.book, privacy="followers", ) - self.assertEqual(rating_tags.get_rating(self.book, self.local_user), 0) + # self.assertEqual(rating_tags.get_rating(self.book, self.local_user), 0) + self.assertEqual(rating_tags.get_rating(self.book, self.local_user), 5) # public: included models.ReviewRating.objects.create( From 9c03bf782e14a991cf317055d4f91eeb44718fc2 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 10:15:22 -0700 Subject: [PATCH 03/17] Make an async request to all search connectors This is the untest first pass at re-arranging remote search to work in parallel rather than sequence. It moves a couple functions around (raise_not_valid_url, for example, needs to be in connector_manager.py now to avoid circular imports). It adds a function to Connector objects that generates a search result (either to the isbn endpoint or the free text endpoint) based on the query, which was previously done as part of the search. I also lowered the timeout to 8 seconds by default. --- bookwyrm/connectors/abstract_connector.py | 34 ++++----- bookwyrm/connectors/connector_manager.py | 84 +++++++++++++---------- bookwyrm/settings.py | 2 +- requirements.txt | 1 + 4 files changed, 62 insertions(+), 59 deletions(-) diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index 56e273886..6685d5a09 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -1,9 +1,8 @@ """ functionality outline for a book data connector """ from abc import ABC, abstractmethod import imghdr -import ipaddress import logging -from urllib.parse import urlparse +import re from django.core.files.base import ContentFile from django.db import transaction @@ -11,7 +10,7 @@ import requests from requests.exceptions import RequestException from bookwyrm import activitypub, models, settings -from .connector_manager import load_more_data, ConnectorException +from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url from .format_mappings import format_mappings @@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC): for field in self_fields: setattr(self, field, getattr(info, field)) + def get_search_url(self, query): + """ format the query url """ + # Check if the query resembles an ISBN + isbn = re.sub(r"[\W_]", "", query) # removes filler characters + maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13 + if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "": + return f"{self.isbn_search_url}{query}" + + # NOTE: previously, we tried searching isbn and if that produces no results, + # searched as free text. 
This, instead, only searches isbn if it's isbn-y + return f"{self.search_url}{query}" + def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT): """free text search""" params = {} @@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10): # check if the url is blocked raise_not_valid_url(url) - if models.FederatedServer.is_blocked(url): - raise ConnectorException(f"Attempting to load data from blocked url: {url}") - try: resp = requests.get( url, @@ -311,20 +319,6 @@ def get_image(url, timeout=10): return image_content, extension -def raise_not_valid_url(url): - """do some basic reality checks on the url""" - parsed = urlparse(url) - if not parsed.scheme in ["http", "https"]: - raise ConnectorException("Invalid scheme: ", url) - - try: - ipaddress.ip_address(parsed.netloc) - raise ConnectorException("Provided url is an IP address: ", url) - except ValueError: - # it's not an IP address, which is good - pass - - class Mapping: """associate a local database field with a field in an external dataset""" diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 14bb702cb..2b5ab1c9d 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -1,10 +1,11 @@ """ interface with whatever connectors the app has """ -from datetime import datetime +import asyncio import importlib +import ipaddress import logging -import re from urllib.parse import urlparse +import aiohttp from django.dispatch import receiver from django.db.models import signals @@ -21,52 +22,42 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" +async def async_connector_search(query, connectors, params): + """Try a number of requests simultaneously""" + timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) + async with aiohttp.ClientSession(timeout=timeout) as session: + for connector in connectors: + url = connector.get_search_url(query) + raise_not_valid_url(url) + + async with session.get(url, params=params) as response: + print("Status:", response.status) + print(response.ok) + print("Content-type:", response.headers['content-type']) + + raw_response = await response.json() + yield { + "connector": connector, + "results": connector.parse_search_data(raw_response) + } + + def search(query, min_confidence=0.1, return_first=False): """find books based on arbitary keywords""" if not query: return [] results = [] - # Have we got a ISBN ? 
- isbn = re.sub(r"[\W_]", "", query) - maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13 - start_time = datetime.now() - for connector in get_connectors(): - result_set = None - if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "": - # Search on ISBN - try: - result_set = connector.isbn_search(isbn) - except Exception as err: # pylint: disable=broad-except - logger.info(err) - # if this fails, we can still try regular search + connectors = list(get_connectors()) - # if no isbn search results, we fallback to generic search - if not result_set: - try: - result_set = connector.search(query, min_confidence=min_confidence) - except Exception as err: # pylint: disable=broad-except - # we don't want *any* error to crash the whole search page - logger.info(err) - continue - - if return_first and result_set: - # if we found anything, return it - return result_set[0] - - if result_set: - results.append( - { - "connector": connector, - "results": result_set, - } - ) - if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT: - break + # load as many results as we can + params = {"min_confidence": min_confidence} + results = asyncio.run(async_connector_search(query, connectors, params)) if return_first: - return None + # find the best result from all the responses and return that + raise Exception("Not implemented yet") return results @@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs): """create a connector to an external bookwyrm server""" if instance.application_type == "bookwyrm": get_or_create_connector(f"https://{instance.server_name}") + + +def raise_not_valid_url(url): + """do some basic reality checks on the url""" + parsed = urlparse(url) + if not parsed.scheme in ["http", "https"]: + raise ConnectorException("Invalid scheme: ", url) + + try: + ipaddress.ip_address(parsed.netloc) + raise ConnectorException("Provided url is an IP address: ", url) + except ValueError: + # it's not an IP address, which is good + pass + + if models.FederatedServer.is_blocked(url): + raise ConnectorException(f"Attempting to load data from blocked url: {url}") diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py index e16c576e1..dc0d71f30 100644 --- a/bookwyrm/settings.py +++ b/bookwyrm/settings.py @@ -212,7 +212,7 @@ STREAMS = [ # Search configuration # total time in seconds that the instance will spend searching connectors -SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15)) +SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8)) # timeout for a query to an individual connector QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5)) diff --git a/requirements.txt b/requirements.txt index 7614dc421..c3bdaf757 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +aiohttp==3.8.1 celery==5.2.2 colorthief==0.2.1 Django==3.2.13 From 0adda36da7a2de92ca36805b1b848a8eb100c98e Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 10:34:03 -0700 Subject: [PATCH 04/17] Remove search endpoints from Connector Instead of having individual search functions that make individual requests, the connectors will always be searched asynchronously together. The process_seach_response combines the parse and format functions, which could probably be merged into one over-rideable function. The current to-do on this is to remove Inventaire search results that are below the confidence threshhold after search, which used to happen in the `search` function. 
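As a sketch of that to-do, the post-search confidence filter could look like the helper below; this is illustrative only (the function name is made up, and it assumes results are SearchResult objects with a numeric confidence attribute, as elsewhere in this series).

    def drop_low_confidence(results, min_confidence=0.1):
        """post-search confidence filter, like the removed Inventaire override did"""
        return [r for r in results if getattr(r, "confidence", 1) >= min_confidence]
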
--- bookwyrm/connectors/abstract_connector.py | 56 +++++++---------------- bookwyrm/connectors/connector_manager.py | 5 +- bookwyrm/connectors/inventaire.py | 8 ---- 3 files changed, 19 insertions(+), 50 deletions(-) diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index 6685d5a09..fa3624f82 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -39,52 +39,24 @@ class AbstractMinimalConnector(ABC): setattr(self, field, getattr(info, field)) def get_search_url(self, query): - """ format the query url """ + """format the query url""" # Check if the query resembles an ISBN - isbn = re.sub(r"[\W_]", "", query) # removes filler characters - maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13 - if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "": + if maybe_isbn(query) and self.isbn_search_url and self.isbn_search_url != "": return f"{self.isbn_search_url}{query}" # NOTE: previously, we tried searching isbn and if that produces no results, # searched as free text. This, instead, only searches isbn if it's isbn-y return f"{self.search_url}{query}" - def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT): - """free text search""" - params = {} - if min_confidence: - params["min_confidence"] = min_confidence - - data = self.get_search_data( - f"{self.search_url}{query}", - params=params, - timeout=timeout, - ) - results = [] - - for doc in self.parse_search_data(data)[:10]: - results.append(self.format_search_result(doc)) - return results - - def isbn_search(self, query, timeout=settings.QUERY_TIMEOUT): - """isbn search""" - params = {} - data = self.get_search_data( - f"{self.isbn_search_url}{query}", - params=params, - timeout=timeout, - ) - results = [] - - # this shouldn't be returning mutliple results, but just in case - for doc in self.parse_isbn_search_data(data)[:10]: - results.append(self.format_isbn_search_result(doc)) - return results - - def get_search_data(self, remote_id, **kwargs): # pylint: disable=no-self-use - """this allows connectors to override the default behavior""" - return get_data(remote_id, **kwargs) + def process_search_response(self, query, data): + """Format the search results based on the formt of the query""" + # TODO: inventaire min confidence + parser = self.parse_search_data + formatter = self.format_search_result + if maybe_isbn(query): + parser = self.parse_isbn_search_data + formatter = self.format_isbn_search_result + return [formatter(doc) for doc in parser(data)[:10]] @abstractmethod def get_or_create_book(self, remote_id): @@ -360,3 +332,9 @@ def unique_physical_format(format_text): # try a direct match, so saving this would be redundant return None return format_text + + +def maybe_isbn(query): + """check if a query looks like an isbn""" + isbn = re.sub(r"[\W_]", "", query) # removes filler characters + return len(isbn) in [10, 13] # ISBN10 or ISBN13 diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 2b5ab1c9d..3c90ba5fe 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -33,12 +33,12 @@ async def async_connector_search(query, connectors, params): async with session.get(url, params=params) as response: print("Status:", response.status) print(response.ok) - print("Content-type:", response.headers['content-type']) + print("Content-type:", response.headers["content-type"]) raw_response = await response.json() yield 
{ "connector": connector, - "results": connector.parse_search_data(raw_response) + "results": connector.process_search_response(query, raw_response), } @@ -48,7 +48,6 @@ def search(query, min_confidence=0.1, return_first=False): return [] results = [] - connectors = list(get_connectors()) # load as many results as we can diff --git a/bookwyrm/connectors/inventaire.py b/bookwyrm/connectors/inventaire.py index a9aeb94f9..f25796125 100644 --- a/bookwyrm/connectors/inventaire.py +++ b/bookwyrm/connectors/inventaire.py @@ -77,14 +77,6 @@ class Connector(AbstractConnector): **{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]}, } - def search(self, query, min_confidence=None): # pylint: disable=arguments-differ - """overrides default search function with confidence ranking""" - results = super().search(query) - if min_confidence: - # filter the search results after the fact - return [r for r in results if r.confidence >= min_confidence] - return results - def parse_search_data(self, data): return data.get("results") From 9a9cef77665c9c32bf9b7a525564757e678fd549 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 11:16:05 -0700 Subject: [PATCH 05/17] Verify url before async search The database lookup doesn't work during the asyn process, so this change loops through the connectors and grabs the formatted urls before sending it to the async handler. --- bookwyrm/connectors/connector_manager.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 3c90ba5fe..300f1e933 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -22,13 +22,13 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" -async def async_connector_search(query, connectors, params): +async def async_connector_search(query, items, params): """Try a number of requests simultaneously""" timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) async with aiohttp.ClientSession(timeout=timeout) as session: - for connector in connectors: + for url, connector in items: url = connector.get_search_url(query) - raise_not_valid_url(url) + # raise_not_valid_url(url) async with session.get(url, params=params) as response: print("Status:", response.status) @@ -36,7 +36,7 @@ async def async_connector_search(query, connectors, params): print("Content-type:", response.headers["content-type"]) raw_response = await response.json() - yield { + return { "connector": connector, "results": connector.process_search_response(query, raw_response), } @@ -48,11 +48,18 @@ def search(query, min_confidence=0.1, return_first=False): return [] results = [] - connectors = list(get_connectors()) + items = [] + for connector in get_connectors(): + # get the search url from the connector before sending + url = connector.get_search_url(query) + # check the URL is valid + raise_not_valid_url(url) + items.append((url, connector)) # load as many results as we can params = {"min_confidence": min_confidence} - results = asyncio.run(async_connector_search(query, connectors, params)) + results = asyncio.run(async_connector_search(query, items, params)) + raise Exception("Hi") if return_first: # find the best result from all the responses and return that From 5e81ec75fbf04a6b08d80b83ae90889e96f4e7a9 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 11:19:16 -0700 Subject: [PATCH 06/17] Set request headers in async search get 
request Gotta ask for json --- bookwyrm/connectors/connector_manager.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 300f1e933..a94cdbc59 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -12,7 +12,7 @@ from django.db.models import signals from requests import HTTPError from bookwyrm import book_search, models -from bookwyrm.settings import SEARCH_TIMEOUT +from bookwyrm.settings import SEARCH_TIMEOUT, USER_AGENT from bookwyrm.tasks import app logger = logging.getLogger(__name__) @@ -25,12 +25,19 @@ class ConnectorException(HTTPError): async def async_connector_search(query, items, params): """Try a number of requests simultaneously""" timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) + # pylint: disable=line-too-long + headers = { + "Accept": ( + 'application/json, application/activity+json, application/ld+json; profile="https://www.w3.org/ns/activitystreams"; charset=utf-8' + ), + "User-Agent": USER_AGENT, + } async with aiohttp.ClientSession(timeout=timeout) as session: for url, connector in items: url = connector.get_search_url(query) # raise_not_valid_url(url) - async with session.get(url, params=params) as response: + async with session.get(url, headers=headers, params=params) as response: print("Status:", response.status) print(response.ok) print("Content-type:", response.headers["content-type"]) From 45f2199c7120e20d4c098676b0d0b926eadd7e40 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 11:58:39 -0700 Subject: [PATCH 07/17] Gather and wait on async requests This sends out the request tasks all at once and then aggregates the results, instead of just running them one after another asynchronously. 
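Stripped of the connector specifics, this is the standard aiohttp/asyncio fan-out pattern sketched below; fetch_one and fetch_all are placeholder names for illustration, not the functions in this diff.

    import asyncio

    import aiohttp

    async def fetch_one(session, url):
        # one request per connector; the real code also parses and formats the response
        async with session.get(url) as response:
            return await response.json()

    async def fetch_all(urls):
        async with aiohttp.ClientSession() as session:
            # create every task up front, then wait for the whole batch together
            tasks = [asyncio.ensure_future(fetch_one(session, url)) for url in urls]
            return await asyncio.gather(*tasks)
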
--- bookwyrm/connectors/connector_manager.py | 50 +++++++++++++++--------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index a94cdbc59..23cde632b 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -12,7 +12,7 @@ from django.db.models import signals from requests import HTTPError from bookwyrm import book_search, models -from bookwyrm.settings import SEARCH_TIMEOUT, USER_AGENT +from bookwyrm.settings import QUERY_TIMEOUT, USER_AGENT from bookwyrm.tasks import app logger = logging.getLogger(__name__) @@ -22,9 +22,8 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" -async def async_connector_search(query, items, params): - """Try a number of requests simultaneously""" - timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) +async def get_results(session, url, params, query, connector): + """try this specific connector""" # pylint: disable=line-too-long headers = { "Accept": ( @@ -32,21 +31,36 @@ async def async_connector_search(query, items, params): ), "User-Agent": USER_AGENT, } + try: + async with session.get(url, headers=headers, params=params) as response: + try: + raw_data = await response.json() + except aiohttp.client_exceptions.ContentTypeError as err: + logger.exception(err) + return None + + return { + "connector": connector, + "results": connector.process_search_response(query, raw_data), + } + except asyncio.TimeoutError: + logger.exception("Connection timout for url: %s", url) + + +async def async_connector_search(query, items, params): + """Try a number of requests simultaneously""" + timeout = aiohttp.ClientTimeout(total=QUERY_TIMEOUT) async with aiohttp.ClientSession(timeout=timeout) as session: + tasks = [] for url, connector in items: - url = connector.get_search_url(query) - # raise_not_valid_url(url) + tasks.append( + asyncio.ensure_future( + get_results(session, url, params, query, connector) + ) + ) - async with session.get(url, headers=headers, params=params) as response: - print("Status:", response.status) - print(response.ok) - print("Content-type:", response.headers["content-type"]) - - raw_response = await response.json() - return { - "connector": connector, - "results": connector.process_search_response(query, raw_response), - } + results = await asyncio.gather(*tasks) + return results def search(query, min_confidence=0.1, return_first=False): @@ -66,13 +80,13 @@ def search(query, min_confidence=0.1, return_first=False): # load as many results as we can params = {"min_confidence": min_confidence} results = asyncio.run(async_connector_search(query, items, params)) - raise Exception("Hi") if return_first: # find the best result from all the responses and return that raise Exception("Not implemented yet") - return results + # failed requests will return None, so filter those out + return [r for r in results if r] def first_search_result(query, min_confidence=0.1): From 525e2a591d1329c916355ac0ad549691425d7716 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 12:35:17 -0700 Subject: [PATCH 08/17] More error handing Adds logging and error handling for some of the numerous ways a request could fail (the remote site is down, the url is blocked, etc). I also have the results boxes open by default, which makes it more legible imo. 
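Boiled down, the per-request handling added here has roughly the shape below (a sketch with a placeholder name; the real function also guards the JSON decode against ContentTypeError).

    import asyncio
    import logging

    import aiohttp

    logger = logging.getLogger(__name__)

    async def get_json_or_none(session, url):
        """swallow per-connector failures so one bad remote can't break the search"""
        try:
            async with session.get(url) as response:
                if not response.ok:
                    logger.info("Unable to connect to %s: %s", url, response.reason)
                    return None
                return await response.json()
        except asyncio.TimeoutError:
            logger.info("Connection timed out for url: %s", url)
        except aiohttp.ClientError as err:
            logger.exception(err)
        return None
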
--- bookwyrm/connectors/connector_manager.py | 23 ++++++++++++++++------- bookwyrm/templates/search/book.html | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 23cde632b..69d31a7ea 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -12,7 +12,7 @@ from django.db.models import signals from requests import HTTPError from bookwyrm import book_search, models -from bookwyrm.settings import QUERY_TIMEOUT, USER_AGENT +from bookwyrm.settings import SEARCH_TIMEOUT, USER_AGENT from bookwyrm.tasks import app logger = logging.getLogger(__name__) @@ -33,23 +33,29 @@ async def get_results(session, url, params, query, connector): } try: async with session.get(url, headers=headers, params=params) as response: + if not response.ok: + logger.info("Unable to connect to %s: %s", url, response.reason) + return + try: raw_data = await response.json() except aiohttp.client_exceptions.ContentTypeError as err: logger.exception(err) - return None + return return { "connector": connector, "results": connector.process_search_response(query, raw_data), } except asyncio.TimeoutError: - logger.exception("Connection timout for url: %s", url) + logger.info("Connection timed out for url: %s", url) + except aiohttp.ClientError as err: + logger.exception(err) async def async_connector_search(query, items, params): """Try a number of requests simultaneously""" - timeout = aiohttp.ClientTimeout(total=QUERY_TIMEOUT) + timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) async with aiohttp.ClientSession(timeout=timeout) as session: tasks = [] for url, connector in items: @@ -73,8 +79,11 @@ def search(query, min_confidence=0.1, return_first=False): for connector in get_connectors(): # get the search url from the connector before sending url = connector.get_search_url(query) - # check the URL is valid - raise_not_valid_url(url) + try: + raise_not_valid_url(url) + except ConnectorException: + # if this URL is invalid we should skip it and move on + continue items.append((url, connector)) # load as many results as we can @@ -83,7 +92,7 @@ def search(query, min_confidence=0.1, return_first=False): if return_first: # find the best result from all the responses and return that - raise Exception("Not implemented yet") + raise Exception("Not implemented yet") # TODO # failed requests will return None, so filter those out return [r for r in results if r] diff --git a/bookwyrm/templates/search/book.html b/bookwyrm/templates/search/book.html index 7096a55de..d2828eabf 100644 --- a/bookwyrm/templates/search/book.html +++ b/bookwyrm/templates/search/book.html @@ -36,7 +36,7 @@ {% if result_set.results %}
{% if not result_set.connector.local %} -
+
{% endif %} {% if not result_set.connector.local %} From 87fe984462e6a412a09fddaa1508ec7ab1b2c9fa Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 12:52:31 -0700 Subject: [PATCH 09/17] Combines search formatter and parser function The parser was extracting the list of search results from the json object returned by the search endpoint, and the formatter was converting an individual json entry into a SearchResult object. This just merged them into one function, because they are never used separately. --- bookwyrm/connectors/abstract_connector.py | 12 +---- bookwyrm/connectors/bookwyrm_connector.py | 14 ++--- bookwyrm/connectors/inventaire.py | 63 ++++++++++------------- bookwyrm/connectors/openlibrary.py | 56 ++++++++++---------- 4 files changed, 60 insertions(+), 85 deletions(-) diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index fa3624f82..6184a225a 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -52,11 +52,9 @@ class AbstractMinimalConnector(ABC): """Format the search results based on the formt of the query""" # TODO: inventaire min confidence parser = self.parse_search_data - formatter = self.format_search_result if maybe_isbn(query): parser = self.parse_isbn_search_data - formatter = self.format_isbn_search_result - return [formatter(doc) for doc in parser(data)[:10]] + return list(parser(data))[:10] @abstractmethod def get_or_create_book(self, remote_id): @@ -66,18 +64,10 @@ class AbstractMinimalConnector(ABC): def parse_search_data(self, data): """turn the result json from a search into a list""" - @abstractmethod - def format_search_result(self, search_result): - """create a SearchResult obj from json""" - @abstractmethod def parse_isbn_search_data(self, data): """turn the result json from a search into a list""" - @abstractmethod - def format_isbn_search_result(self, search_result): - """create a SearchResult obj from json""" - class AbstractConnector(AbstractMinimalConnector): """generic book data connector""" diff --git a/bookwyrm/connectors/bookwyrm_connector.py b/bookwyrm/connectors/bookwyrm_connector.py index 6dcba7c31..ec45c29b6 100644 --- a/bookwyrm/connectors/bookwyrm_connector.py +++ b/bookwyrm/connectors/bookwyrm_connector.py @@ -11,14 +11,10 @@ class Connector(AbstractMinimalConnector): return activitypub.resolve_remote_id(remote_id, model=models.Edition) def parse_search_data(self, data): - return data - - def format_search_result(self, search_result): - search_result["connector"] = self - return SearchResult(**search_result) + for search_result in data: + search_result["connector"] = self + yield SearchResult(**search_result) def parse_isbn_search_data(self, data): - return data - - def format_isbn_search_result(self, search_result): - return self.format_search_result(search_result) + for search_result in data: + yield self.format_search_result(search_result) diff --git a/bookwyrm/connectors/inventaire.py b/bookwyrm/connectors/inventaire.py index f25796125..7f6191271 100644 --- a/bookwyrm/connectors/inventaire.py +++ b/bookwyrm/connectors/inventaire.py @@ -78,44 +78,37 @@ class Connector(AbstractConnector): } def parse_search_data(self, data): - return data.get("results") - - def format_search_result(self, search_result): - images = search_result.get("image") - cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None - # a deeply messy translation of inventaire's scores - confidence = float(search_result.get("_score", 0.1)) - 
confidence = 0.1 if confidence < 150 else 0.999 - return SearchResult( - title=search_result.get("label"), - key=self.get_remote_id(search_result.get("uri")), - author=search_result.get("description"), - view_link=f"{self.base_url}/entity/{search_result.get('uri')}", - cover=cover, - confidence=confidence, - connector=self, - ) + for search_result in data.get("results"): + images = search_result.get("image") + cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None + # a deeply messy translation of inventaire's scores + confidence = float(search_result.get("_score", 0.1)) + confidence = 0.1 if confidence < 150 else 0.999 + yield SearchResult( + title=search_result.get("label"), + key=self.get_remote_id(search_result.get("uri")), + author=search_result.get("description"), + view_link=f"{self.base_url}/entity/{search_result.get('uri')}", + cover=cover, + confidence=confidence, + connector=self, + ) def parse_isbn_search_data(self, data): """got some daaaata""" - results = data.get("entities") - if not results: - return [] - return list(results.values()) - - def format_isbn_search_result(self, search_result): - """totally different format than a regular search result""" - title = search_result.get("claims", {}).get("wdt:P1476", []) - if not title: - return None - return SearchResult( - title=title[0], - key=self.get_remote_id(search_result.get("uri")), - author=search_result.get("description"), - view_link=f"{self.base_url}/entity/{search_result.get('uri')}", - cover=self.get_cover_url(search_result.get("image")), - connector=self, - ) + results = data.get("entities", []) + for search_result in list(results.values()): + title = search_result.get("claims", {}).get("wdt:P1476", []) + if not title: + continue + yield SearchResult( + title=title[0], + key=self.get_remote_id(search_result.get("uri")), + author=search_result.get("description"), + view_link=f"{self.base_url}/entity/{search_result.get('uri')}", + cover=self.get_cover_url(search_result.get("image")), + connector=self, + ) def is_work_data(self, data): return data.get("type") == "work" diff --git a/bookwyrm/connectors/openlibrary.py b/bookwyrm/connectors/openlibrary.py index 118222a16..fa9aeeb30 100644 --- a/bookwyrm/connectors/openlibrary.py +++ b/bookwyrm/connectors/openlibrary.py @@ -153,38 +153,34 @@ class Connector(AbstractConnector): return f"{self.covers_url}/b/id/{image_name}" def parse_search_data(self, data): - return data.get("docs") - - def format_search_result(self, search_result): - # build the remote id from the openlibrary key - key = self.books_url + search_result["key"] - author = search_result.get("author_name") or ["Unknown"] - cover_blob = search_result.get("cover_i") - cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None - return SearchResult( - title=search_result.get("title"), - key=key, - author=", ".join(author), - connector=self, - year=search_result.get("first_publish_year"), - cover=cover, - ) + for search_result in data.get("docs"): + # build the remote id from the openlibrary key + key = self.books_url + search_result["key"] + author = search_result.get("author_name") or ["Unknown"] + cover_blob = search_result.get("cover_i") + cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None + yield SearchResult( + title=search_result.get("title"), + key=key, + author=", ".join(author), + connector=self, + year=search_result.get("first_publish_year"), + cover=cover, + ) def parse_isbn_search_data(self, data): - return list(data.values()) - - def 
format_isbn_search_result(self, search_result): - # build the remote id from the openlibrary key - key = self.books_url + search_result["key"] - authors = search_result.get("authors") or [{"name": "Unknown"}] - author_names = [author.get("name") for author in authors] - return SearchResult( - title=search_result.get("title"), - key=key, - author=", ".join(author_names), - connector=self, - year=search_result.get("publish_date"), - ) + for search_result in list(data.values()): + # build the remote id from the openlibrary key + key = self.books_url + search_result["key"] + authors = search_result.get("authors") or [{"name": "Unknown"}] + author_names = [author.get("name") for author in authors] + yield SearchResult( + title=search_result.get("title"), + key=key, + author=", ".join(author_names), + connector=self, + year=search_result.get("publish_date"), + ) def load_edition_data(self, olkey): """query openlibrary for editions of a work""" From af19d728d2264592463b6211411d0850c5ad7b40 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 16:16:10 -0700 Subject: [PATCH 10/17] Removes outdated unit tests --- bookwyrm/connectors/bookwyrm_connector.py | 3 +- bookwyrm/connectors/inventaire.py | 6 ++- .../connectors/test_abstract_connector.py | 6 --- .../test_abstract_minimal_connector.py | 46 ----------------- .../connectors/test_bookwyrm_connector.py | 12 ++--- .../connectors/test_connector_manager.py | 28 ----------- .../connectors/test_inventaire_connector.py | 50 +++---------------- bookwyrm/tests/test_book_search.py | 6 --- 8 files changed, 16 insertions(+), 141 deletions(-) diff --git a/bookwyrm/connectors/bookwyrm_connector.py b/bookwyrm/connectors/bookwyrm_connector.py index ec45c29b6..05612d9eb 100644 --- a/bookwyrm/connectors/bookwyrm_connector.py +++ b/bookwyrm/connectors/bookwyrm_connector.py @@ -17,4 +17,5 @@ class Connector(AbstractMinimalConnector): def parse_isbn_search_data(self, data): for search_result in data: - yield self.format_search_result(search_result) + search_result["connector"] = self + yield SearchResult(**search_result) diff --git a/bookwyrm/connectors/inventaire.py b/bookwyrm/connectors/inventaire.py index 7f6191271..0495d8c2f 100644 --- a/bookwyrm/connectors/inventaire.py +++ b/bookwyrm/connectors/inventaire.py @@ -78,7 +78,7 @@ class Connector(AbstractConnector): } def parse_search_data(self, data): - for search_result in data.get("results"): + for search_result in data.get("results", []): images = search_result.get("image") cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None # a deeply messy translation of inventaire's scores @@ -96,7 +96,9 @@ class Connector(AbstractConnector): def parse_isbn_search_data(self, data): """got some daaaata""" - results = data.get("entities", []) + results = data.get("entities") + if not results: + return for search_result in list(results.values()): title = search_result.get("claims", {}).get("wdt:P1476", []) if not title: diff --git a/bookwyrm/tests/connectors/test_abstract_connector.py b/bookwyrm/tests/connectors/test_abstract_connector.py index 26742c05e..eb3a7b01a 100644 --- a/bookwyrm/tests/connectors/test_abstract_connector.py +++ b/bookwyrm/tests/connectors/test_abstract_connector.py @@ -42,15 +42,9 @@ class AbstractConnector(TestCase): generated_remote_link_field = "openlibrary_link" - def format_search_result(self, search_result): - return search_result - def parse_search_data(self, data): return data - def format_isbn_search_result(self, search_result): - return search_result - def 
parse_isbn_search_data(self, data): return data diff --git a/bookwyrm/tests/connectors/test_abstract_minimal_connector.py b/bookwyrm/tests/connectors/test_abstract_minimal_connector.py index a90ce0c7e..06da8741c 100644 --- a/bookwyrm/tests/connectors/test_abstract_minimal_connector.py +++ b/bookwyrm/tests/connectors/test_abstract_minimal_connector.py @@ -1,6 +1,5 @@ """ testing book data connectors """ from django.test import TestCase -import responses from bookwyrm import models from bookwyrm.connectors import abstract_connector @@ -25,18 +24,12 @@ class AbstractConnector(TestCase): class TestConnector(abstract_connector.AbstractMinimalConnector): """nothing added here""" - def format_search_result(self, search_result): - return search_result - def get_or_create_book(self, remote_id): pass def parse_search_data(self, data): return data - def format_isbn_search_result(self, search_result): - return search_result - def parse_isbn_search_data(self, data): return data @@ -54,45 +47,6 @@ class AbstractConnector(TestCase): self.assertIsNone(connector.name) self.assertEqual(connector.identifier, "example.com") - @responses.activate - def test_search(self): - """makes an http request to the outside service""" - responses.add( - responses.GET, - "https://example.com/search?q=a%20book%20title", - json=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"], - status=200, - ) - results = self.test_connector.search("a book title") - self.assertEqual(len(results), 10) - self.assertEqual(results[0], "a") - self.assertEqual(results[1], "b") - self.assertEqual(results[2], "c") - - @responses.activate - def test_search_min_confidence(self): - """makes an http request to the outside service""" - responses.add( - responses.GET, - "https://example.com/search?q=a%20book%20title&min_confidence=1", - json=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"], - status=200, - ) - results = self.test_connector.search("a book title", min_confidence=1) - self.assertEqual(len(results), 10) - - @responses.activate - def test_isbn_search(self): - """makes an http request to the outside service""" - responses.add( - responses.GET, - "https://example.com/isbn?q=123456", - json=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"], - status=200, - ) - results = self.test_connector.isbn_search("123456") - self.assertEqual(len(results), 10) - def test_create_mapping(self): """maps remote fields for book data to bookwyrm activitypub fields""" mapping = Mapping("isbn") diff --git a/bookwyrm/tests/connectors/test_bookwyrm_connector.py b/bookwyrm/tests/connectors/test_bookwyrm_connector.py index 585080e66..b9642a50e 100644 --- a/bookwyrm/tests/connectors/test_bookwyrm_connector.py +++ b/bookwyrm/tests/connectors/test_bookwyrm_connector.py @@ -30,14 +30,11 @@ class BookWyrmConnector(TestCase): result = self.connector.get_or_create_book(book.remote_id) self.assertEqual(book, result) - def test_format_search_result(self): + def test_parse_search_data(self): """create a SearchResult object from search response json""" datafile = pathlib.Path(__file__).parent.joinpath("../data/bw_search.json") search_data = json.loads(datafile.read_bytes()) - results = self.connector.parse_search_data(search_data) - self.assertIsInstance(results, list) - - result = self.connector.format_search_result(results[0]) + result = list(self.connector.parse_search_data(search_data))[0] self.assertIsInstance(result, SearchResult) self.assertEqual(result.title, "Jonathan Strange and Mr Norrell") self.assertEqual(result.key, 
"https://example.com/book/122") @@ -45,10 +42,9 @@ class BookWyrmConnector(TestCase): self.assertEqual(result.year, 2017) self.assertEqual(result.connector, self.connector) - def test_format_isbn_search_result(self): + def test_parse_isbn_search_data(self): """just gotta attach the connector""" datafile = pathlib.Path(__file__).parent.joinpath("../data/bw_search.json") search_data = json.loads(datafile.read_bytes()) - results = self.connector.parse_isbn_search_data(search_data) - result = self.connector.format_isbn_search_result(results[0]) + result = list(self.connector.parse_isbn_search_data(search_data))[0] self.assertEqual(result.connector, self.connector) diff --git a/bookwyrm/tests/connectors/test_connector_manager.py b/bookwyrm/tests/connectors/test_connector_manager.py index c88a8036a..c0c09147e 100644 --- a/bookwyrm/tests/connectors/test_connector_manager.py +++ b/bookwyrm/tests/connectors/test_connector_manager.py @@ -49,39 +49,11 @@ class ConnectorManager(TestCase): self.assertEqual(len(connectors), 1) self.assertIsInstance(connectors[0], BookWyrmConnector) - @responses.activate - def test_search_plaintext(self): - """search all connectors""" - responses.add( - responses.GET, - "http://fake.ciom/search/Example?min_confidence=0.1", - json=[{"title": "Hello", "key": "https://www.example.com/search/1"}], - ) - results = connector_manager.search("Example") - self.assertEqual(len(results), 1) - self.assertEqual(len(results[0]["results"]), 1) - self.assertEqual(results[0]["connector"].identifier, "test_connector_remote") - self.assertEqual(results[0]["results"][0].title, "Hello") - def test_search_empty_query(self): """don't panic on empty queries""" results = connector_manager.search("") self.assertEqual(results, []) - @responses.activate - def test_search_isbn(self): - """special handling if a query resembles an isbn""" - responses.add( - responses.GET, - "http://fake.ciom/isbn/0000000000", - json=[{"title": "Hello", "key": "https://www.example.com/search/1"}], - ) - results = connector_manager.search("0000000000") - self.assertEqual(len(results), 1) - self.assertEqual(len(results[0]["results"]), 1) - self.assertEqual(results[0]["connector"].identifier, "test_connector_remote") - self.assertEqual(results[0]["results"][0].title, "Hello") - def test_first_search_result(self): """only get one search result""" result = connector_manager.first_search_result("Example") diff --git a/bookwyrm/tests/connectors/test_inventaire_connector.py b/bookwyrm/tests/connectors/test_inventaire_connector.py index 55727a600..7b526f224 100644 --- a/bookwyrm/tests/connectors/test_inventaire_connector.py +++ b/bookwyrm/tests/connectors/test_inventaire_connector.py @@ -66,38 +66,14 @@ class Inventaire(TestCase): with self.assertRaises(ConnectorException): self.connector.get_book_data("https://test.url/ok") - @responses.activate - def test_search(self): - """min confidence filtering""" - responses.add( - responses.GET, - "https://inventaire.io/search?q=hi", - json={ - "results": [ - { - "_score": 200, - "label": "hello", - }, - { - "_score": 100, - "label": "hi", - }, - ], - }, - ) - results = self.connector.search("hi", min_confidence=0.5) - self.assertEqual(len(results), 1) - self.assertEqual(results[0].title, "hello") - - def test_format_search_result(self): + def test_parse_search_data(self): """json to search result objs""" search_file = pathlib.Path(__file__).parent.joinpath( "../data/inventaire_search.json" ) search_results = json.loads(search_file.read_bytes()) - results = 
self.connector.parse_search_data(search_results) - formatted = self.connector.format_search_result(results[0]) + formatted = list(self.connector.parse_search_data(search_results))[0] self.assertEqual(formatted.title, "The Stories of Vladimir Nabokov") self.assertEqual( @@ -178,15 +154,14 @@ class Inventaire(TestCase): result = self.connector.resolve_keys(keys) self.assertEqual(result, ["epistolary novel", "crime novel"]) - def test_isbn_search(self): + def test_pase_isbn_search_data(self): """another search type""" search_file = pathlib.Path(__file__).parent.joinpath( "../data/inventaire_isbn_search.json" ) search_results = json.loads(search_file.read_bytes()) - results = self.connector.parse_isbn_search_data(search_results) - formatted = self.connector.format_isbn_search_result(results[0]) + formatted = list(self.connector.parse_isbn_search_data(search_results))[0] self.assertEqual(formatted.title, "L'homme aux cercles bleus") self.assertEqual( @@ -198,25 +173,12 @@ class Inventaire(TestCase): "https://covers.inventaire.io/img/entities/12345", ) - def test_isbn_search_empty(self): + def test_parse_isbn_search_data_empty(self): """another search type""" search_results = {} - results = self.connector.parse_isbn_search_data(search_results) + results = list(self.connector.parse_isbn_search_data(search_results)) self.assertEqual(results, []) - def test_isbn_search_no_title(self): - """another search type""" - search_file = pathlib.Path(__file__).parent.joinpath( - "../data/inventaire_isbn_search.json" - ) - search_results = json.loads(search_file.read_bytes()) - search_results["entities"]["isbn:9782290349229"]["claims"]["wdt:P1476"] = None - - result = self.connector.format_isbn_search_result( - search_results.get("entities") - ) - self.assertIsNone(result) - def test_is_work_data(self): """is it a work""" work_file = pathlib.Path(__file__).parent.joinpath( diff --git a/bookwyrm/tests/test_book_search.py b/bookwyrm/tests/test_book_search.py index 16435ffff..141e3906b 100644 --- a/bookwyrm/tests/test_book_search.py +++ b/bookwyrm/tests/test_book_search.py @@ -102,18 +102,12 @@ class BookSearch(TestCase): class TestConnector(AbstractMinimalConnector): """nothing added here""" - def format_search_result(self, search_result): - return search_result - def get_or_create_book(self, remote_id): pass def parse_search_data(self, data): return data - def format_isbn_search_result(self, search_result): - return search_result - def parse_isbn_search_data(self, data): return data From 83ee5a756f48622243b5e0d0347092ca0564ebfa Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 16:42:37 -0700 Subject: [PATCH 11/17] Filter intentaire results by confidence --- bookwyrm/connectors/abstract_connector.py | 10 ++++------ bookwyrm/connectors/bookwyrm_connector.py | 2 +- bookwyrm/connectors/connector_manager.py | 12 ++++++------ bookwyrm/connectors/inventaire.py | 4 +++- bookwyrm/connectors/openlibrary.py | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index 6184a225a..dc4be4b3d 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -48,20 +48,18 @@ class AbstractMinimalConnector(ABC): # searched as free text. 
This, instead, only searches isbn if it's isbn-y return f"{self.search_url}{query}" - def process_search_response(self, query, data): + def process_search_response(self, query, data, min_confidence): """Format the search results based on the formt of the query""" - # TODO: inventaire min confidence - parser = self.parse_search_data if maybe_isbn(query): - parser = self.parse_isbn_search_data - return list(parser(data))[:10] + return list(self.parse_isbn_search_data(data))[:10] + return list(self.parse_search_data(data, min_confidence))[:10] @abstractmethod def get_or_create_book(self, remote_id): """pull up a book record by whatever means possible""" @abstractmethod - def parse_search_data(self, data): + def parse_search_data(self, data, min_confidence): """turn the result json from a search into a list""" @abstractmethod diff --git a/bookwyrm/connectors/bookwyrm_connector.py b/bookwyrm/connectors/bookwyrm_connector.py index 05612d9eb..e07a0b281 100644 --- a/bookwyrm/connectors/bookwyrm_connector.py +++ b/bookwyrm/connectors/bookwyrm_connector.py @@ -10,7 +10,7 @@ class Connector(AbstractMinimalConnector): def get_or_create_book(self, remote_id): return activitypub.resolve_remote_id(remote_id, model=models.Edition) - def parse_search_data(self, data): + def parse_search_data(self, data, min_confidence): for search_result in data: search_result["connector"] = self yield SearchResult(**search_result) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 69d31a7ea..0488421e9 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -22,7 +22,7 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" -async def get_results(session, url, params, query, connector): +async def get_results(session, url, min_confidence, query, connector): """try this specific connector""" # pylint: disable=line-too-long headers = { @@ -31,6 +31,7 @@ async def get_results(session, url, params, query, connector): ), "User-Agent": USER_AGENT, } + params = {"min_confidence": min_confidence} try: async with session.get(url, headers=headers, params=params) as response: if not response.ok: @@ -45,7 +46,7 @@ async def get_results(session, url, params, query, connector): return { "connector": connector, - "results": connector.process_search_response(query, raw_data), + "results": connector.process_search_response(query, raw_data, min_confidence), } except asyncio.TimeoutError: logger.info("Connection timed out for url: %s", url) @@ -53,7 +54,7 @@ async def get_results(session, url, params, query, connector): logger.exception(err) -async def async_connector_search(query, items, params): +async def async_connector_search(query, items, min_confidence): """Try a number of requests simultaneously""" timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) async with aiohttp.ClientSession(timeout=timeout) as session: @@ -61,7 +62,7 @@ async def async_connector_search(query, items, params): for url, connector in items: tasks.append( asyncio.ensure_future( - get_results(session, url, params, query, connector) + get_results(session, url, min_confidence, query, connector) ) ) @@ -87,8 +88,7 @@ def search(query, min_confidence=0.1, return_first=False): items.append((url, connector)) # load as many results as we can - params = {"min_confidence": min_confidence} - results = asyncio.run(async_connector_search(query, items, params)) + results = asyncio.run(async_connector_search(query, items, min_confidence)) if 
return_first: # find the best result from all the responses and return that diff --git a/bookwyrm/connectors/inventaire.py b/bookwyrm/connectors/inventaire.py index 0495d8c2f..c13f4e3e6 100644 --- a/bookwyrm/connectors/inventaire.py +++ b/bookwyrm/connectors/inventaire.py @@ -77,13 +77,15 @@ class Connector(AbstractConnector): **{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]}, } - def parse_search_data(self, data): + def parse_search_data(self, data, min_confidence): for search_result in data.get("results", []): images = search_result.get("image") cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None # a deeply messy translation of inventaire's scores confidence = float(search_result.get("_score", 0.1)) confidence = 0.1 if confidence < 150 else 0.999 + if confidence < min_confidence: + continue yield SearchResult( title=search_result.get("label"), key=self.get_remote_id(search_result.get("uri")), diff --git a/bookwyrm/connectors/openlibrary.py b/bookwyrm/connectors/openlibrary.py index fa9aeeb30..2b625dffc 100644 --- a/bookwyrm/connectors/openlibrary.py +++ b/bookwyrm/connectors/openlibrary.py @@ -152,7 +152,7 @@ class Connector(AbstractConnector): image_name = f"{cover_id}-{size}.jpg" return f"{self.covers_url}/b/id/{image_name}" - def parse_search_data(self, data): + def parse_search_data(self, data, min_confidence): for search_result in data.get("docs"): # build the remote id from the openlibrary key key = self.books_url + search_result["key"] From 98ed03b6b45701d526b48add7931121683723169 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 17:00:34 -0700 Subject: [PATCH 12/17] Python formatting and test update --- bookwyrm/connectors/connector_manager.py | 4 +++- .../connectors/test_bookwyrm_connector.py | 2 +- .../connectors/test_inventaire_connector.py | 2 +- .../connectors/test_openlibrary_connector.py | 24 +++---------------- 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 0488421e9..fc7849249 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -46,7 +46,9 @@ async def get_results(session, url, min_confidence, query, connector): return { "connector": connector, - "results": connector.process_search_response(query, raw_data, min_confidence), + "results": connector.process_search_response( + query, raw_data, min_confidence + ), } except asyncio.TimeoutError: logger.info("Connection timed out for url: %s", url) diff --git a/bookwyrm/tests/connectors/test_bookwyrm_connector.py b/bookwyrm/tests/connectors/test_bookwyrm_connector.py index b9642a50e..1e369ca01 100644 --- a/bookwyrm/tests/connectors/test_bookwyrm_connector.py +++ b/bookwyrm/tests/connectors/test_bookwyrm_connector.py @@ -34,7 +34,7 @@ class BookWyrmConnector(TestCase): """create a SearchResult object from search response json""" datafile = pathlib.Path(__file__).parent.joinpath("../data/bw_search.json") search_data = json.loads(datafile.read_bytes()) - result = list(self.connector.parse_search_data(search_data))[0] + result = list(self.connector.parse_search_data(search_data, 0))[0] self.assertIsInstance(result, SearchResult) self.assertEqual(result.title, "Jonathan Strange and Mr Norrell") self.assertEqual(result.key, "https://example.com/book/122") diff --git a/bookwyrm/tests/connectors/test_inventaire_connector.py b/bookwyrm/tests/connectors/test_inventaire_connector.py index 7b526f224..3bba9ece3 100644 --- 
a/bookwyrm/tests/connectors/test_inventaire_connector.py +++ b/bookwyrm/tests/connectors/test_inventaire_connector.py @@ -73,7 +73,7 @@ class Inventaire(TestCase): ) search_results = json.loads(search_file.read_bytes()) - formatted = list(self.connector.parse_search_data(search_results))[0] + formatted = list(self.connector.parse_search_data(search_results, 0))[0] self.assertEqual(formatted.title, "The Stories of Vladimir Nabokov") self.assertEqual( diff --git a/bookwyrm/tests/connectors/test_openlibrary_connector.py b/bookwyrm/tests/connectors/test_openlibrary_connector.py index c05eb1a94..69e4f847c 100644 --- a/bookwyrm/tests/connectors/test_openlibrary_connector.py +++ b/bookwyrm/tests/connectors/test_openlibrary_connector.py @@ -122,21 +122,11 @@ class Openlibrary(TestCase): self.assertEqual(result, "https://covers.openlibrary.org/b/id/image-L.jpg") def test_parse_search_result(self): - """extract the results from the search json response""" - datafile = pathlib.Path(__file__).parent.joinpath("../data/ol_search.json") - search_data = json.loads(datafile.read_bytes()) - result = self.connector.parse_search_data(search_data) - self.assertIsInstance(result, list) - self.assertEqual(len(result), 2) - - def test_format_search_result(self): """translate json from openlibrary into SearchResult""" datafile = pathlib.Path(__file__).parent.joinpath("../data/ol_search.json") search_data = json.loads(datafile.read_bytes()) - results = self.connector.parse_search_data(search_data) - self.assertIsInstance(results, list) + result = list(self.connector.parse_search_data(search_data, 0))[0] - result = self.connector.format_search_result(results[0]) self.assertIsInstance(result, SearchResult) self.assertEqual(result.title, "This Is How You Lose the Time War") self.assertEqual(result.key, "https://openlibrary.org/works/OL20639540W") @@ -148,18 +138,10 @@ class Openlibrary(TestCase): """extract the results from the search json response""" datafile = pathlib.Path(__file__).parent.joinpath("../data/ol_isbn_search.json") search_data = json.loads(datafile.read_bytes()) - result = self.connector.parse_isbn_search_data(search_data) - self.assertIsInstance(result, list) + result = list(self.connector.parse_isbn_search_data(search_data)) self.assertEqual(len(result), 1) - def test_format_isbn_search_result(self): - """translate json from openlibrary into SearchResult""" - datafile = pathlib.Path(__file__).parent.joinpath("../data/ol_isbn_search.json") - search_data = json.loads(datafile.read_bytes()) - results = self.connector.parse_isbn_search_data(search_data) - self.assertIsInstance(results, list) - - result = self.connector.format_isbn_search_result(results[0]) + result = result[0] self.assertIsInstance(result, SearchResult) self.assertEqual(result.title, "Les ombres errantes") self.assertEqual(result.key, "https://openlibrary.org/books/OL16262504M") From a053f209619429706f33baeeb2844a48756acce3 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Tue, 31 May 2022 08:20:59 -0700 Subject: [PATCH 13/17] Re-implements return first option Since we get all the results quickly now, this aggregates all the results that came back and sorts them by confidence, and returns the highest confidence result. The confidences aren't great on free text search, but conceptually that's how it should work at least. It may make sense to aggregate the search results in all contexts, but I'll propose that in a separate PR. 
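In sketch form, the aggregation described above flattens the per-connector groups and takes the most confident hit; the empty-list guard here is added for illustration (the diff below indexes [0] directly, and patch 16 later makes that safe).

    def pick_best(result_groups):
        """flatten the per-connector {"connector": ..., "results": [...]} groups and return the top hit"""
        all_results = [r for group in result_groups for r in group["results"]]
        all_results.sort(key=lambda r: r.confidence, reverse=True)
        return all_results[0] if all_results else None
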
---
 bookwyrm/book_search.py                  | 4 ++--
 bookwyrm/connectors/connector_manager.py | 7 +++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/bookwyrm/book_search.py b/bookwyrm/book_search.py
index e42a6d8c3..4b0a6eab9 100644
--- a/bookwyrm/book_search.py
+++ b/bookwyrm/book_search.py
@@ -148,8 +148,8 @@ class SearchResult:
 
     def __repr__(self):
         # pylint: disable=consider-using-f-string
-        return "<SearchResult key={!r} title={!r} author={!r}>".format(
-            self.key, self.title, self.author
+        return "<SearchResult key={!r} title={!r} author={!r} confidence={!r}>".format(
+            self.key, self.title, self.author, self.confidence
         )
 
     def json(self):
diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py
index fc7849249..5c8b43fdb 100644
--- a/bookwyrm/connectors/connector_manager.py
+++ b/bookwyrm/connectors/connector_manager.py
@@ -91,13 +91,16 @@ def search(query, min_confidence=0.1, return_first=False):
 
     # load as many results as we can
     results = asyncio.run(async_connector_search(query, items, min_confidence))
+    results = [r for r in results if r]
 
     if return_first:
         # find the best result from all the responses and return that
-        raise Exception("Not implemented yet")  # TODO
+        all_results = [r for con in results for r in con["results"]]
+        all_results = sorted(all_results, key=lambda r: r.confidence, reverse=True)
+        return all_results[0]
 
     # failed requests will return None, so filter those out
-    return [r for r in results if r]
+    return results
 
 
 def first_search_result(query, min_confidence=0.1):

From 5e99002aad76fd8dcadf8b5b3f6ef9c4bcac262b Mon Sep 17 00:00:00 2001
From: Mouse Reeve
Date: Tue, 31 May 2022 08:25:02 -0700
Subject: [PATCH 14/17] Raise priority for external connectors in initdb

By default, OpenLibrary and Inventaire were prioritized below other BookWyrm
nodes. In practice, people have gotten better search results from these
connectors, hence the change. With the search refactor, this has much less
impact, but it will show these search results higher in the list. If the
results page shows all the connectors' results integrated, this field should
be removed entirely.
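For context, the priority field matters because connectors are loaded in
ascending priority order, so a lower number surfaces that connector's results
earlier. A rough sketch of that assumption (get_connectors and load_connector
here are illustrative stand-ins, not necessarily the exact helpers in
connector_manager.py):

    # assumes: from bookwyrm import models, plus a load_connector() helper
    # that turns a Connector database row into a connector instance
    def get_connectors():
        """yield connectors in display order, lowest priority number first"""
        for info in models.Connector.objects.filter(active=True).order_by("priority"):
            yield load_connector(info)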
---
 bookwyrm/management/commands/initdb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bookwyrm/management/commands/initdb.py b/bookwyrm/management/commands/initdb.py
index 160502ca0..23020a0a6 100644
--- a/bookwyrm/management/commands/initdb.py
+++ b/bookwyrm/management/commands/initdb.py
@@ -89,7 +89,7 @@ def init_connectors():
         covers_url="https://inventaire.io",
         search_url="https://inventaire.io/api/search?types=works&types=works&search=",
         isbn_search_url="https://inventaire.io/api/entities?action=by-uris&uris=isbn%3A",
-        priority=3,
+        priority=1,
     )
 
     models.Connector.objects.create(
@@ -101,7 +101,7 @@ def init_connectors():
         covers_url="https://covers.openlibrary.org",
         search_url="https://openlibrary.org/search?q=",
         isbn_search_url="https://openlibrary.org/api/books?jscmd=data&format=json&bibkeys=ISBN:",
-        priority=3,
+        priority=1,
     )
 

From 05fd30cfcf50338b694e9c8c1e07d2c30ade3317 Mon Sep 17 00:00:00 2001
From: Mouse Reeve
Date: Tue, 31 May 2022 08:37:07 -0700
Subject: [PATCH 15/17] Pylint fixes in connector tests

---
 bookwyrm/tests/connectors/test_abstract_connector.py         | 2 +-
 bookwyrm/tests/connectors/test_abstract_minimal_connector.py | 2 +-
 bookwyrm/tests/test_book_search.py                            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bookwyrm/tests/connectors/test_abstract_connector.py b/bookwyrm/tests/connectors/test_abstract_connector.py
index eb3a7b01a..02ac5c66a 100644
--- a/bookwyrm/tests/connectors/test_abstract_connector.py
+++ b/bookwyrm/tests/connectors/test_abstract_connector.py
@@ -42,7 +42,7 @@ class AbstractConnector(TestCase):
 
             generated_remote_link_field = "openlibrary_link"
 
-            def parse_search_data(self, data):
+            def parse_search_data(self, data, min_confidence):
                 return data
 
             def parse_isbn_search_data(self, data):
diff --git a/bookwyrm/tests/connectors/test_abstract_minimal_connector.py b/bookwyrm/tests/connectors/test_abstract_minimal_connector.py
index 06da8741c..119ca3581 100644
--- a/bookwyrm/tests/connectors/test_abstract_minimal_connector.py
+++ b/bookwyrm/tests/connectors/test_abstract_minimal_connector.py
@@ -27,7 +27,7 @@ class AbstractConnector(TestCase):
             def get_or_create_book(self, remote_id):
                 pass
 
-            def parse_search_data(self, data):
+            def parse_search_data(self, data, min_confidence):
                 return data
 
             def parse_isbn_search_data(self, data):
diff --git a/bookwyrm/tests/test_book_search.py b/bookwyrm/tests/test_book_search.py
index 141e3906b..7274cb49c 100644
--- a/bookwyrm/tests/test_book_search.py
+++ b/bookwyrm/tests/test_book_search.py
@@ -105,7 +105,7 @@ class BookSearch(TestCase):
             def get_or_create_book(self, remote_id):
                 pass
 
-            def parse_search_data(self, data):
+            def parse_search_data(self, data, min_confidence):
                 return data
 
             def parse_isbn_search_data(self, data):

From 969db13ff2c43421357509bbe3d5a562983ba06f Mon Sep 17 00:00:00 2001
From: Mouse Reeve
Date: Tue, 31 May 2022 08:49:23 -0700
Subject: [PATCH 16/17] Safely return None in remote search return_first

---
 bookwyrm/connectors/connector_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py
index 5c8b43fdb..af79b2483 100644
--- a/bookwyrm/connectors/connector_manager.py
+++ b/bookwyrm/connectors/connector_manager.py
@@ -97,7 +97,7 @@ def search(query, min_confidence=0.1, return_first=False):
         # find the best result from all the responses and return that
         all_results = [r for con in results for r in con["results"]]
         all_results = sorted(all_results, key=lambda r: r.confidence, reverse=True)
-        return all_results[0]
+        return all_results[0] if all_results else None
 
     # failed requests will return None, so filter those out
     return results

From c3b35760a285faecc82a465835fc33e0fb006913 Mon Sep 17 00:00:00 2001
From: Mouse Reeve
Date: Tue, 31 May 2022 09:32:32 -0700
Subject: [PATCH 17/17] Updates test mocks for remote search

---
 bookwyrm/connectors/connector_manager.py |  1 +
 bookwyrm/tests/views/test_search.py      | 30 ++++++++++++------------
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py
index af79b2483..86774af56 100644
--- a/bookwyrm/connectors/connector_manager.py
+++ b/bookwyrm/connectors/connector_manager.py
@@ -86,6 +86,7 @@ def search(query, min_confidence=0.1, return_first=False):
             raise_not_valid_url(url)
         except ConnectorException:
             # if this URL is invalid we should skip it and move on
+            logger.info("Request denied to blocked domain: %s", url)
             continue
         items.append((url, connector))
 
diff --git a/bookwyrm/tests/views/test_search.py b/bookwyrm/tests/views/test_search.py
index 2df04f588..51166f2fa 100644
--- a/bookwyrm/tests/views/test_search.py
+++ b/bookwyrm/tests/views/test_search.py
@@ -1,6 +1,5 @@
 """ test for app action functionality """
 import json
-import pathlib
 from unittest.mock import patch
 
 from django.contrib.auth.models import AnonymousUser
@@ -8,9 +7,9 @@
 from django.http import JsonResponse
 from django.template.response import TemplateResponse
 from django.test import TestCase
 from django.test.client import RequestFactory
-import responses
 
 from bookwyrm import models, views
+from bookwyrm.book_search import SearchResult
 from bookwyrm.settings import DOMAIN
 from bookwyrm.tests.validate_html import validate_html
@@ -65,12 +64,11 @@ class Views(TestCase):
         self.assertIsInstance(response, TemplateResponse)
         validate_html(response.render())
 
-    @responses.activate
     def test_search_books(self):
         """searches remote connectors"""
         view = views.Search.as_view()
 
-        models.Connector.objects.create(
+        connector = models.Connector.objects.create(
             identifier="example.com",
             connector_file="openlibrary",
             base_url="https://example.com",
@@ -78,26 +76,24 @@ class Views(TestCase):
             covers_url="https://example.com/covers",
             search_url="https://example.com/search?q=",
         )
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/ol_search.json")
-        search_data = json.loads(datafile.read_bytes())
-        responses.add(
-            responses.GET, "https://example.com/search?q=Test%20Book", json=search_data
-        )
+        mock_result = SearchResult(title="Mock Book", connector=connector, key="hello")
 
         request = self.factory.get("", {"q": "Test Book", "remote": True})
         request.user = self.local_user
         with patch("bookwyrm.views.search.is_api_request") as is_api:
             is_api.return_value = False
-            response = view(request)
+            with patch("bookwyrm.connectors.connector_manager.search") as remote_search:
+                remote_search.return_value = [
+                    {"results": [mock_result], "connector": connector}
+                ]
+                response = view(request)
+
         self.assertIsInstance(response, TemplateResponse)
         validate_html(response.render())
         connector_results = response.context_data["results"]
         self.assertEqual(len(connector_results), 2)
         self.assertEqual(connector_results[0]["results"][0].title, "Test Book")
-        self.assertEqual(
-            connector_results[1]["results"][0].title,
-            "This Is How You Lose the Time War",
-        )
+        self.assertEqual(connector_results[1]["results"][0].title, "Mock Book")
 
         # don't search remote
         request = self.factory.get("", {"q": "Test Book", "remote": True})
@@ -106,7 +102,11 @@ class Views(TestCase):
         request.user = anonymous_user
         with patch("bookwyrm.views.search.is_api_request") as is_api:
             is_api.return_value = False
-            response = view(request)
+            with patch("bookwyrm.connectors.connector_manager.search") as remote_search:
+                remote_search.return_value = [
+                    {"results": [mock_result], "connector": connector}
+                ]
+                response = view(request)
         self.assertIsInstance(response, TemplateResponse)
         validate_html(response.render())
         connector_results = response.context_data["results"]