From f182abd6f8f1eac20d19c3e4b4c9800115f2a705 Mon Sep 17 00:00:00 2001 From: Bnyro Date: Mon, 11 Sep 2023 08:22:32 +0200 Subject: [PATCH] [mod] library of congress: fix engine --- docs/dev/engines/online/loc.rst | 13 +++++ searx/engines/loc.py | 91 ++++++++++++++++++++++----------- 2 files changed, 73 insertions(+), 31 deletions(-) create mode 100644 docs/dev/engines/online/loc.rst diff --git a/docs/dev/engines/online/loc.rst b/docs/dev/engines/online/loc.rst new file mode 100644 index 000000000..2ed76cd81 --- /dev/null +++ b/docs/dev/engines/online/loc.rst @@ -0,0 +1,13 @@ +.. _loc engine: + +=================== +Library of Congress +=================== + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.engines.loc + :members: diff --git a/searx/engines/loc.py b/searx/engines/loc.py index 0b2f3a689..5f58eb3dc 100644 --- a/searx/engines/loc.py +++ b/searx/engines/loc.py @@ -1,67 +1,96 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" +"""Library of Congress: query Photo, Print and Drawing from API endpoint_ +``photos``. - Library of Congress : images from Prints and Photographs Online Catalog +.. _endpoint: https://www.loc.gov/apis/json-and-yaml/requests/endpoints/ + +.. note:: + + Besides the ``photos`` endpoint_ there are more endpoints available / we are + looking forward to contributions implementing more endpoints. """ -from json import loads from urllib.parse import urlencode - +from searx.network import raise_for_httperror about = { "website": 'https://www.loc.gov/pictures/', "wikidata_id": 'Q131454', - "official_api_documentation": 'https://www.loc.gov/pictures/api', + "official_api_documentation": 'https://www.loc.gov/api', "use_official_api": True, "require_api_key": False, "results": 'JSON', } categories = ['images'] - paging = True -base_url = 'https://loc.gov/pictures/search/?' 
-search_string = "&sp={page}&{query}&fo=json" - -IMG_SRC_FIXES = { - 'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/', - 'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/', - 'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/', -} +endpoint = 'photos' +base_url = 'https://loc.gov' +search_string = "/{endpoint}/?sp={page}&{query}&fo=json" def request(query, params): - search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno']) - + search_path = search_string.format( + endpoint=endpoint, + query=urlencode({'q': query}), + page=params['pageno'], + ) params['url'] = base_url + search_path - + params['raise_for_httperror'] = False return params def response(resp): + results = [] + json_data = resp.json() - json_data = loads(resp.text) + json_results = json_data.get('results') + if not json_results: + # when a search term has no results, loc sends JSON in an HTTP 404 + # response and the HTTP status code is set in the 'status' element. 
+ if json_data.get('status') == 404: + return results + + raise_for_httperror(resp) + + for result in json_results: + + url = result["item"].get("link") + if not url: + continue + + img_src = result['item'].get('service_medium') + if not img_src or img_src == 'https://memory.loc.gov/pp/grp.gif': + continue + + title = result['title'] + if title.startswith('['): + title = title.strip('[]') + + content_items = [ + result['item'].get('created_published_date'), + result['item'].get('summary', [None])[0], + result['item'].get('notes', [None])[0], + result['item'].get('part_of', [None])[0], + ] + + author = None + if result['item'].get('creators'): + author = result['item']['creators'][0]['title'] - for result in json_data['results']: - img_src = result['image']['full'] - for url_prefix, url_replace in IMG_SRC_FIXES.items(): - if img_src.startswith(url_prefix): - img_src = img_src.replace(url_prefix, url_replace) - break - else: - img_src = result['image']['thumb'] results.append( { - 'url': result['links']['item'], - 'title': result['title'], - 'img_src': img_src, - 'thumbnail_src': result['image']['thumb'], - 'author': result['creator'], 'template': 'images.html', + 'url': url, + 'title': title, + 'content': ' / '.join([i for i in content_items if i]), + 'img_src': img_src, + 'thumbnail_src': result['item'].get('thumb_gallery'), + 'author': author, } )