bookwyrm/bookwyrm/connectors/connector_manager.py

195 lines
6.5 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" interface with whatever connectors the app has """
import asyncio
2021-01-02 16:14:28 +00:00
import importlib
import ipaddress
2021-04-07 15:09:47 +00:00
import logging
from urllib.parse import urlparse
import aiohttp
from django.dispatch import receiver
from django.db.models import signals
from requests import HTTPError
from bookwyrm import book_search, models
from bookwyrm.settings import SEARCH_TIMEOUT, USER_AGENT
from bookwyrm.tasks import app, LOW
2021-04-07 15:09:47 +00:00
logger = logging.getLogger(__name__)
2020-03-07 20:22:28 +00:00
2021-01-02 16:14:28 +00:00
class ConnectorException(HTTPError):
    """Raised when a connector cannot do what was asked of it."""
async def get_results(session, url, min_confidence, query, connector):
    """Query one connector's search endpoint and parse what comes back.

    Args:
        session: the shared aiohttp client session used to send the request.
        url: the search url already built for this connector.
        min_confidence: confidence threshold; sent to the remote server as a
            query parameter and passed on to the connector's response parser.
        query: the original search terms, handed to the connector's parser.
        connector: the connector whose ``process_search_response`` converts
            the raw JSON into results.

    Returns:
        ``{"connector": connector, "results": ...}`` on success, or ``None``
        when the request fails, times out, or the body cannot be decoded as
        JSON. Network and decoding failures are logged rather than raised so
        a single misbehaving server does not abort a search that is fanned
        out over several connectors.
    """
    # pylint: disable=line-too-long
    headers = {
        "Accept": (
            'application/json, application/activity+json, application/ld+json; profile="https://www.w3.org/ns/activitystreams"; charset=utf-8'
        ),
        "User-Agent": USER_AGENT,
    }
    params = {"min_confidence": min_confidence}
    try:
        async with session.get(url, headers=headers, params=params) as response:
            if not response.ok:
                logger.info("Unable to connect to %s: %s", url, response.reason)
                return None

            try:
                raw_data = await response.json()
            except (aiohttp.client_exceptions.ContentTypeError, ValueError) as err:
                # ContentTypeError: the server did not declare a JSON body.
                # ValueError: it did, but the body is not decodable JSON
                # (json.JSONDecodeError and UnicodeDecodeError subclass it).
                logger.exception(err)
                return None

            return {
                "connector": connector,
                "results": connector.process_search_response(
                    query, raw_data, min_confidence
                ),
            }
    except asyncio.TimeoutError:
        logger.info("Connection timed out for url: %s", url)
    except aiohttp.ClientError as err:
        logger.info(err)
    return None
async def async_connector_search(query, items, min_confidence):
    """Send the search to every connector at once and collect the answers.

    ``items`` is an iterable of ``(url, connector)`` pairs. The return value
    is the list produced by ``asyncio.gather``: one entry per pair, in the
    same order, holding whatever ``get_results`` returned for that pair.
    """
    client_timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    async with aiohttp.ClientSession(timeout=client_timeout) as session:
        pending = [
            asyncio.ensure_future(
                get_results(session, search_url, min_confidence, query, connector)
            )
            for search_url, connector in items
        ]
        # wait for every request before the session is closed
        return await asyncio.gather(*pending)
def search(query, min_confidence=0.1, return_first=False):
    """find books based on arbitrary keywords

    Returns an empty list for an empty query. Otherwise returns the list of
    per-connector result dicts, or, when ``return_first`` is set, the single
    result with the highest confidence (``None`` if there are no results).
    """
    if not query:
        return []

    # pair every usable connector with the search url it builds for the query
    targets = []
    for connector in get_connectors():
        search_url = connector.get_search_url(query)
        try:
            raise_not_valid_url(search_url)
        except ConnectorException:
            # this url is not acceptable, so skip the connector and move on
            logger.info("Request denied to blocked domain: %s", search_url)
            continue
        targets.append((search_url, connector))

    # load as many results as we can
    responses = asyncio.run(async_connector_search(query, targets, min_confidence))
    # failed requests come back as None, so filter those out
    successful = [response for response in responses if response]

    if not return_first:
        return successful

    # pool the results of every connector and keep the most confident one
    candidates = []
    for response in successful:
        candidates.extend(response["results"])
    candidates.sort(key=lambda result: result.confidence, reverse=True)
    return candidates[0] if candidates else None
2020-10-29 22:29:23 +00:00
def first_search_result(query, min_confidence=0.1):
    """Return a single match for ``query``: local search first, then remote.

    Falls back to the remote connectors only when the local search finds
    nothing; returns ``None`` when neither produces a result.
    """
    local_match = book_search.search(
        query, min_confidence=min_confidence, return_first=True
    )
    if local_match:
        return local_match

    remote_match = search(query, min_confidence=min_confidence, return_first=True)
    return remote_match if remote_match else None
2020-05-03 22:26:47 +00:00
def get_connectors():
    """Yield a loaded connector for each active Connector, ordered by priority."""
    active_connectors = models.Connector.objects.filter(active=True)
    yield from map(load_connector, active_connectors.order_by("priority").all())
2021-01-02 16:14:28 +00:00
def get_or_create_connector(remote_id):
    """Return the connector for the server that hosts ``remote_id``.

    The network location of ``remote_id`` is used as the connector
    identifier. If no Connector with that identifier exists, one is created
    using the ``bookwyrm_connector`` connector file.

    Raises:
        ValueError: if ``remote_id`` has no network location.
    """
    host = urlparse(remote_id).netloc
    if not host:
        raise ValueError("Invalid remote id")

    connector_table = models.Connector.objects
    try:
        record = connector_table.get(identifier=host)
    except models.Connector.DoesNotExist:
        origin = f"https://{host}"
        record = connector_table.create(
            identifier=host,
            connector_file="bookwyrm_connector",
            base_url=origin,
            books_url=f"{origin}/book",
            covers_url=f"{origin}/images/covers",
            search_url=f"{origin}/search?q=",
            priority=2,
        )
    return load_connector(record)
@app.task(queue=LOW)
def load_more_data(connector_id, book_id):
    """Background task: have a connector expand the data for one book.

    Looks up the Connector and the Book by id, then calls the loaded
    connector's ``expand_book_data`` (think all 10,000 editions of LoTR).
    """
    connector = load_connector(models.Connector.objects.get(id=connector_id))
    book = models.Book.objects.select_subclasses().get(id=book_id)
    connector.expand_book_data(book)
@app.task(queue=LOW)
def create_edition_task(connector_id, work_id, data):
    """Background task: create one edition of a work from ``data``.

    Run separately for each edition (of which LoTR has 10,000). Looks up the
    Connector and the Work by id, then calls the loaded connector's
    ``create_edition_from_data``.
    """
    connector = load_connector(models.Connector.objects.get(id=connector_id))
    work = models.Work.objects.select_subclasses().get(id=work_id)
    connector.create_edition_from_data(work, data)
2021-01-02 16:14:28 +00:00
def load_connector(connector_info):
    """Import the connector module named by ``connector_info`` and build it.

    The module is ``bookwyrm.connectors.<connector_file>``; its ``Connector``
    class is instantiated with the record's identifier.
    """
    module_path = f"bookwyrm.connectors.{connector_info.connector_file}"
    connector_module = importlib.import_module(module_path)
    return connector_module.Connector(connector_info.identifier)
@receiver(signals.post_save, sender="bookwyrm.FederatedServer")
# pylint: disable=unused-argument
def create_connector(sender, instance, created, *args, **kwargs):
    """After a FederatedServer is saved, ensure bookwyrm servers get a connector."""
    if instance.application_type != "bookwyrm":
        # only other bookwyrm instances get a connector
        return
    get_or_create_connector(f"https://{instance.server_name}")
def raise_not_valid_url(url):
    """do some basic reality checks on the url

    Returns ``None`` when the url passes every check.

    Raises:
        ConnectorException: if the scheme is not http or https, if the host
            is a literal IP address, or if ``FederatedServer.is_blocked``
            reports the url as blocked.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ConnectorException(f"Invalid scheme: {url}")

    # Check the hostname rather than the netloc: the netloc still carries any
    # userinfo, port and IPv6 brackets, so "127.0.0.1:8000" or "[::1]" would
    # never parse as an address and would slip past this check.
    try:
        ipaddress.ip_address(parsed.hostname or "")
    except ValueError:
        # it's not an IP address, which is good
        pass
    else:
        raise ConnectorException(f"Provided url is an IP address: {url}")

    if models.FederatedServer.is_blocked(url):
        raise ConnectorException(f"Attempting to load data from blocked url: {url}")