From e560d7e373a2d083590bb75014f6b1e801775410 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sat, 13 Jan 2024 14:06:26 +0100
Subject: [PATCH] [mod] presearch: add language & region support

In Presearch there are languages for the UI and regions for narrowing down
the search.  With this change the SearXNG engine supports a search by region.
The details can be found in the documentation of the source code.

To test, you can search terms like::

    !presearch bmw :zh-TW
    !presearch bmw :en-CA

1. You should get results corresponding to the region (Taiwan, Canada),
2. in the matching language (Chinese, English).
3. The content of the info box is in the same language.

Exceptions:

1. The region or language is not supported by Presearch, or
2. the SearXNG user did not select a region tag, for example::

       !presearch bmw :en

Signed-off-by: Markus Heiser
---
 docs/dev/engines/online/presearch.rst |  13 +++
 searx/engines/presearch.py            | 117 ++++++++++++++++++++++----
 2 files changed, 113 insertions(+), 17 deletions(-)
 create mode 100644 docs/dev/engines/online/presearch.rst

diff --git a/docs/dev/engines/online/presearch.rst b/docs/dev/engines/online/presearch.rst
new file mode 100644
index 000000000..59332c354
--- /dev/null
+++ b/docs/dev/engines/online/presearch.rst
@@ -0,0 +1,13 @@
+.. _engine presearch:
+
+================
+Presearch Engine
+================
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.presearch
+   :members:

diff --git a/searx/engines/presearch.py b/searx/engines/presearch.py
index 747393829..baf692d60 100644
--- a/searx/engines/presearch.py
+++ b/searx/engines/presearch.py
@@ -1,23 +1,72 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Presearch (general, images, videos, news)
+"""Presearch supports the search types listed in :py:obj:`search_type` (general,
+images, videos, news).
+
+Configured ``presearch`` engines:
+
+.. code:: yaml
+
+  - name: presearch
+    engine: presearch
+    search_type: search
+    categories: [general, web]
+
+  - name: presearch images
+    ...
+    search_type: images
+    categories: [images, web]
+
+  - name: presearch videos
+    ...
+    search_type: videos
+    categories: [general, web]
+
+  - name: presearch news
+    ...
+    search_type: news
+    categories: [news, web]
 
 .. hint::
 
-   The results in the video category are most often links to pages that contain
-   a video, for instance many links from preasearch's video category link
-   content from facebook (aka Meta) or Twitter (aka X). Since these are not
-   real links to video streams SearXNG can't use the video template for this and
-   if SearXNG can't use this template, then the user doesn't want to see these
-   hits in the videos category.
-
-   TL;DR; by default presearch's video category is placed into categories::
+   By default Presearch's video category is intentionally placed into::
 
        categories: [general, web]
 
+
+Search type ``video``
+=====================
+
+The results in the video category are most often links to pages that merely
+contain a video; for instance, many links in Presearch's video category point
+to content on Facebook (aka Meta) or Twitter (aka X).  Since these are not real
+links to video streams, SearXNG cannot use the video template for them, and
+results that do not fit the video template should not show up in the videos
+category.
+
+
+Languages & Regions
+===================
+
+In Presearch there are languages for the UI and regions for narrowing down the
+search.  If we set the region to "auto" in Presearch's web UI and set the cookie
+``use_local_search_results=false``, then the defaults for both (the language and
+the region) are taken from the ``Accept-Language`` header.
+
+Since the region is already "auto" by default, we only need to set the
+``use_local_search_results`` cookie and send the ``Accept-Language`` header.  We
+have to set these values in both requests we send to Presearch: in the first
+request, which fetches the request-ID, and in the final request, which fetches
+the result list (see ``send_accept_language_header``).
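+
+For example, if the SearXNG user selected the region ``en-CA``, the engine
+would send a header roughly like the following (a sketch of the value built in
+``_get_request_id`` below)::
+
+    Accept-Language: en-CA,en;q=0.9,*;q=0.5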
+
+
+Implementations
+===============
+
 """
 
 from urllib.parse import urlencode
+from searx import locales
 from searx.network import get
 from searx.utils import gen_useragent, html_to_text
 
@@ -32,6 +81,7 @@ about = {
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 categories = ["general", "web"]  # general, images, videos, news
 search_type = "search"
 
@@ -46,19 +96,43 @@ def init(_):
         raise ValueError(f'presearch search_type: {search_type}')
 
 
-def _get_request_id(query, page, time_range, safesearch_param):
+def _get_request_id(query, params):
+
     args = {
         "q": query,
-        "page": page,
+        "page": params["pageno"],
     }
-    if time_range:
-        args["time"] = time_range
+
+    if params["time_range"]:
+        args["time"] = params["time_range"]
 
     url = f"{base_url}/{search_type}?{urlencode(args)}"
+
     headers = {
         'User-Agent': gen_useragent(),
-        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
+        'Cookie': (
+            f"b=1;"
+            f" presearch_session=;"
+            f" use_local_search_results=false;"
+            f" use_safe_search={safesearch_map[params['safesearch']]}"
+        ),
     }
+    if params['searxng_locale'] != 'all':
+        l = locales.get_locale(params['searxng_locale'])
+
+        # Presearch narrows down the search by region.  In SearXNG, when the
+        # user does not set a region (e.g. 'en-CA' / Canada) we cannot hand
+        # over a region.
+
+        # We could possibly use searx.locales.get_official_locales to determine
+        # in which regions this language is an official one, but then we still
+        # wouldn't know which region should be given more weight.  Presearch
+        # performs an IP-based geolocation of the user; we don't want that in
+        # SearXNG ;-)
+
+        if l.territory:
+            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
+
     resp_text = get(url, headers=headers).text  # type: ignore
 
     for line in resp_text.split("\n"):
@@ -69,8 +143,7 @@ def request(query, params):
-    request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
-
+    request_id = _get_request_id(query, params)
     params["headers"]["Accept"] = "application/json"
     params["url"] = f"{base_url}/results?id={request_id}"
 
@@ -109,7 +182,17 @@ def parse_search_query(json_results):
         if info:
             attributes = []
             for item in info.get('about', []):
-                label, value = html_to_text(item).split(':', 1)
+
+                text = html_to_text(item)
+                if ':' in text:
+                    # split the text into a key / value pair at the colon
+                    label, value = text.split(':', 1)
+                else:
+                    # In other languages (tested with zh-TW) the colon is a different
+                    # symbol; split at the first space and strip it from the label.
+                    label, value = text.split(' ', 1)
+                    label = label[:-1]
+
                 value = _strip_leading_strings(value)
                 attributes.append({'label': label, 'value': value})
 
         content = []