From e8706fb738da9feb21e596f403dddb40e69c8a7b Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 25 Jun 2023 12:37:31 +0200 Subject: [PATCH] [fix] engine & network issues / documentation and type annotations This patch fixes some quirks and issues related to the engines and the network. Each engine has its own network and this network was broken for the following engines[1]: - archlinux - bing - dailymotion - duckduckgo - google - peertube - startpage - wikipedia Since the files have been touched anyway, the type annotations of the engine modules have also been completed so that error messages from the type checker are no longer reported. Related and (partially) fixed issues: - [1] https://github.com/searxng/searxng/issues/762#issuecomment-1605323861 - [2] https://github.com/searxng/searxng/issues/2513 - [3] https://github.com/searxng/searxng/issues/2515 Signed-off-by: Markus Heiser --- docs/admin/engines/settings.rst | 74 ++++++++++++++++++++++++++------- searx/enginelib/__init__.py | 14 ++++++- searx/enginelib/traits.py | 6 +-- searx/engines/__init__.py | 69 ++++++++++++++++++------------ searx/engines/archlinux.py | 25 +++++------ searx/engines/bing.py | 18 +++++--- searx/engines/dailymotion.py | 20 ++++----- searx/engines/duckduckgo.py | 24 ++++++----- searx/engines/google.py | 18 ++++---- searx/engines/peertube.py | 22 +++++----- searx/engines/startpage.py | 28 +++++++------ searx/engines/wikipedia.py | 6 +-- searx/settings_defaults.py | 2 - 13 files changed, 204 insertions(+), 122 deletions(-) diff --git a/docs/admin/engines/settings.rst b/docs/admin/engines/settings.rst index 118e01efd..250a27461 100644 --- a/docs/admin/engines/settings.rst +++ b/docs/admin/engines/settings.rst @@ -397,14 +397,26 @@ Communication with search engines. Global timeout of the requests made to others engines in seconds.
A bigger timeout will allow to wait for answers from slow engines, but in consequence will slow SearXNG reactivity (the result page may take the time specified in the - timeout to load). Can be override by :ref:`settings engine` + timeout to load). Can be overridden by ``timeout`` in the :ref:`settings engine`. ``useragent_suffix`` : Suffix to the user-agent SearXNG uses to send requests to others engines. If an engine wish to block you, a contact info here may be useful to avoid that. +.. _Pool limit configuration: https://www.python-httpx.org/advanced/#pool-limit-configuration + +``pool_maxsize``: + Number of allowable keep-alive connections, or ``null`` to always allow. The + default is 10. See ``max_keepalive_connections`` `Pool limit configuration`_. + +``pool_connections`` : + Maximum number of allowable connections, or ``null`` for no limits. The + default is 100. See ``max_connections`` `Pool limit configuration`_. + ``keepalive_expiry`` : - Number of seconds to keep a connection in the pool. By default 5.0 seconds. + Number of seconds to keep a connection in the pool. By default 5.0 seconds. + See ``keepalive_expiry`` `Pool limit configuration`_. + .. _httpx proxies: https://www.python-httpx.org/advanced/#http-proxying @@ -429,15 +441,6 @@ Communication with search engines. Number of retry in case of an HTTP error. On each retry, SearXNG uses an different proxy and source ip. -``retry_on_http_error`` : - Retry request on some HTTP status code. - - Example: - - * ``true`` : on HTTP status code between 400 and 599. - * ``403`` : on HTTP status code 403. - * ``[403, 429]``: on HTTP status code 403 and 429. - ``enable_http2`` : Enable by default. Set to ``false`` to disable HTTP/2. @@ -455,6 +458,11 @@ Communication with search engines. ``max_redirects`` : 30 by default. Maximum redirect before it is an error. +``using_tor_proxy`` : + Using tor proxy (``true``) or not (``false``) for all engines.
The default is + ``false`` and can be overwritten in the :ref:`settings engine` + + .. _settings categories_as_tabs: @@ -522,13 +530,14 @@ engine is shown. Most of the options have a default value or even are optional. use_official_api: true require_api_key: true results: HTML - enable_http: false + + # overwrite values from section 'outgoing:' enable_http2: false retries: 1 - retry_on_http_error: true # or 403 or [404, 429] max_connections: 100 max_keepalive_connections: 10 keepalive_expiry: 5.0 + using_tor_proxy: false proxies: http: - http://proxy1:8080 @@ -539,6 +548,11 @@ engine is shown. Most of the options have a default value or even are optional. - socks5://user:password@proxy3:1080 - socks5h://user:password@proxy4:1080 + # other network settings + enable_http: false + retry_on_http_error: true # or 403 or [404, 429] + + ``name`` : Name that will be used across SearXNG to define this engine. In settings, on the result page... @@ -579,7 +593,8 @@ engine is shown. Most of the options have a default value or even are optional. query all search engines in that category (group). ``timeout`` : optional - Timeout of the search with the current search engine. **Be careful, it will + Timeout of the search with the current search engine. Overwrites + ``request_timeout`` from :ref:`settings outgoing`. **Be careful, it will modify the global timeout of SearXNG.** ``api_key`` : optional @@ -615,6 +630,37 @@ engine is shown. Most of the options have a default value or even are optional. - ``ipv4`` set ``local_addresses`` to ``0.0.0.0`` (use only IPv4 local addresses) - ``ipv6`` set ``local_addresses`` to ``::`` (use only IPv6 local addresses) +``enable_http`` : optional + Enable HTTP for this engine (by default only HTTPS is enabled). + +``retry_on_http_error`` : optional + Retry request on some HTTP status code. + + Example: + + * ``true`` : on HTTP status code between 400 and 599. + * ``403`` : on HTTP status code 403. 
+ * ``[403, 429]``: on HTTP status code 403 and 429. + +``proxies`` : + Overwrites proxy settings from :ref:`settings outgoing`. + +``using_tor_proxy`` : + Using tor proxy (``true``) or not (``false``) for this engine. The default is + taken from ``using_tor_proxy`` of the :ref:`settings outgoing`. + +``max_keepalive_connections`` : + `Pool limit configuration`_, overwrites value ``pool_maxsize`` from + :ref:`settings outgoing` for this engine. + +``max_connections`` : + `Pool limit configuration`_, overwrites value ``pool_connections`` from + :ref:`settings outgoing` for this engine. + +``keepalive_expiry`` : + `Pool limit configuration`_, overwrites value ``keepalive_expiry`` from + :ref:`settings outgoing` for this engine. + .. note:: A few more options are possible, but they are pretty specific to some diff --git a/searx/enginelib/__init__.py b/searx/enginelib/__init__.py index 00962e215..fd3019e6c 100644 --- a/searx/enginelib/__init__.py +++ b/searx/enginelib/__init__.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Union, Dict, List, Callable, TYPE_CHECKING +from typing import List, Callable, TYPE_CHECKING if TYPE_CHECKING: from searx.enginelib import traits @@ -134,3 +134,15 @@ class Engine: # pylint: disable=too-few-public-methods require_api_key: true results: HTML """ + + using_tor_proxy: bool + """Using tor proxy (``true``) or not (``false``) for this engine.""" + + send_accept_language_header: bool + """When this option is activated, the language (locale) that is selected by + the user is used to build and send a ``Accept-Language`` header in the + request to the origin search engine.""" + + tokens: List[str] + """A list of secret tokens to make this engine *private*, more details see + :ref:`private engines`.""" diff --git a/searx/enginelib/traits.py b/searx/enginelib/traits.py index ae27d46f1..8a7356ce2 100644 --- a/searx/enginelib/traits.py +++ b/searx/enginelib/traits.py @@ -13,6 +13,7 @@ used.
from __future__ import annotations import json import dataclasses +import types from typing import Dict, Iterable, Union, Callable, Optional, TYPE_CHECKING from typing_extensions import Literal, Self @@ -82,8 +83,7 @@ class EngineTraits: """ custom: Dict[str, Union[Dict[str, Dict], Iterable[str]]] = dataclasses.field(default_factory=dict) - """A place to store engine's custom traits, not related to the SearXNG core - + """A place to store engine's custom traits, not related to the SearXNG core. """ def get_language(self, searxng_locale: str, default=None): @@ -228,7 +228,7 @@ class EngineTraitsMap(Dict[str, EngineTraits]): return obj - def set_traits(self, engine: Engine): + def set_traits(self, engine: Engine | types.ModuleType): """Set traits in a :py:obj:`Engine` namespace. :param engine: engine instance build by :py:func:`searx.engines.load_engine` diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index a2db26816..e9e9f87c9 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -17,7 +17,9 @@ import sys import copy from os.path import realpath, dirname -from typing import TYPE_CHECKING, Dict, Optional +from typing import TYPE_CHECKING, Dict +import types +import inspect from searx import logger, settings from searx.utils import load_module @@ -28,21 +30,23 @@ if TYPE_CHECKING: logger = logger.getChild('engines') ENGINE_DIR = dirname(realpath(__file__)) ENGINE_DEFAULT_ARGS = { + # Common options in the engine module "engine_type": "online", - "inactive": False, - "disabled": False, - "timeout": settings["outgoing"]["request_timeout"], - "shortcut": "-", - "categories": ["general"], "paging": False, - "safesearch": False, "time_range_support": False, + "safesearch": False, + # settings.yml + "categories": ["general"], "enable_http": False, - "using_tor_proxy": False, + "shortcut": "-", + "timeout": settings["outgoing"]["request_timeout"], "display_error_messages": True, + "disabled": False, + "inactive": False, + "about": {}, + 
"using_tor_proxy": False, "send_accept_language_header": False, "tokens": [], - "about": {}, } # set automatically when an engine does not have any tab category DEFAULT_CATEGORY = 'other' @@ -51,7 +55,7 @@ DEFAULT_CATEGORY = 'other' # Defaults for the namespace of an engine module, see :py:func:`load_engine` categories = {'general': []} -engines: Dict[str, Engine] = {} +engines: Dict[str, Engine | types.ModuleType] = {} engine_shortcuts = {} """Simple map of registered *shortcuts* to name of the engine (or ``None``). @@ -63,7 +67,19 @@ engine_shortcuts = {} """ -def load_engine(engine_data: dict) -> Optional[Engine]: +def check_engine_module(module: types.ModuleType): + # probe unintentional name collisions / for example name collisions caused + # by import statements in the engine module .. + + # network: https://github.com/searxng/searxng/issues/762#issuecomment-1605323861 + obj = getattr(module, 'network', None) + if obj and inspect.ismodule(obj): + msg = f'type of {module.__name__}.network is a module ({obj.__name__}), expected a string' + # logger.error(msg) + raise TypeError(msg) + + +def load_engine(engine_data: dict) -> Engine | types.ModuleType | None: """Load engine from ``engine_data``. 
:param dict engine_data: Attributes from YAML ``settings:engines/`` @@ -100,19 +116,20 @@ def load_engine(engine_data: dict) -> Optional[Engine]: engine_data['name'] = engine_name # load_module - engine_module = engine_data.get('engine') - if engine_module is None: + module_name = engine_data.get('engine') + if module_name is None: logger.error('The "engine" field is missing for the engine named "{}"'.format(engine_name)) return None try: - engine = load_module(engine_module + '.py', ENGINE_DIR) + engine = load_module(module_name + '.py', ENGINE_DIR) except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError): - logger.exception('Fatal exception in engine "{}"'.format(engine_module)) + logger.exception('Fatal exception in engine "{}"'.format(module_name)) sys.exit(1) except BaseException: - logger.exception('Cannot load engine "{}"'.format(engine_module)) + logger.exception('Cannot load engine "{}"'.format(module_name)) return None + check_engine_module(engine) update_engine_attributes(engine, engine_data) update_attributes_for_tor(engine) @@ -153,18 +170,18 @@ def set_loggers(engine, engine_name): and not hasattr(module, "logger") ): module_engine_name = module_name.split(".")[-1] - module.logger = logger.getChild(module_engine_name) + module.logger = logger.getChild(module_engine_name) # type: ignore -def update_engine_attributes(engine: Engine, engine_data): +def update_engine_attributes(engine: Engine | types.ModuleType, engine_data): # set engine attributes from engine_data for param_name, param_value in engine_data.items(): if param_name == 'categories': if isinstance(param_value, str): param_value = list(map(str.strip, param_value.split(','))) - engine.categories = param_value + engine.categories = param_value # type: ignore elif hasattr(engine, 'about') and param_name == 'about': - engine.about = {**engine.about, **engine_data['about']} + engine.about = {**engine.about, **engine_data['about']} # type: ignore else: 
setattr(engine, param_name, param_value) @@ -174,10 +191,10 @@ def update_engine_attributes(engine: Engine, engine_data): setattr(engine, arg_name, copy.deepcopy(arg_value)) -def update_attributes_for_tor(engine: Engine) -> bool: +def update_attributes_for_tor(engine: Engine | types.ModuleType): if using_tor_proxy(engine) and hasattr(engine, 'onion_url'): - engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') - engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) + engine.search_url = engine.onion_url + getattr(engine, 'search_path', '') # type: ignore + engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0) # type: ignore def is_missing_required_attributes(engine): @@ -193,12 +210,12 @@ def is_missing_required_attributes(engine): return missing -def using_tor_proxy(engine: Engine): +def using_tor_proxy(engine: Engine | types.ModuleType): """Return True if the engine configuration declares to use Tor.""" return settings['outgoing'].get('using_tor_proxy') or getattr(engine, 'using_tor_proxy', False) -def is_engine_active(engine: Engine): +def is_engine_active(engine: Engine | types.ModuleType): # check if engine is inactive if engine.inactive is True: return False @@ -210,7 +227,7 @@ def is_engine_active(engine: Engine): return True -def register_engine(engine: Engine): +def register_engine(engine: Engine | types.ModuleType): if engine.name in engines: logger.error('Engine config error: ambiguous name: {0}'.format(engine.name)) sys.exit(1) diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index 56c3b447f..17bb1b6c5 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -14,7 +14,6 @@ from urllib.parse import urlencode, urljoin, urlparse import lxml import babel -from searx import network from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex from searx.enginelib.traits import EngineTraits from searx.locales import language_tag @@ -45,13 +44,13 @@ main_wiki 
= 'wiki.archlinux.org' def request(query, params): sxng_lang = params['searxng_locale'].split('-')[0] - netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) - title = traits.custom['title'].get(sxng_lang, 'Special:Search') + netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore + title: str = traits.custom['title'].get(sxng_lang, 'Special:Search') # type: ignore base_url = 'https://' + netloc + '/index.php?' offset = (params['pageno'] - 1) * 20 if netloc == main_wiki: - eng_lang: str = traits.get_language(sxng_lang, 'English') + eng_lang: str = traits.get_language(sxng_lang, 'English') # type: ignore query += ' (' + eng_lang + ')' elif netloc == 'wiki.archlinuxcn.org': base_url = 'https://' + netloc + '/wzh/index.php?' @@ -71,11 +70,11 @@ def request(query, params): def response(resp): results = [] - dom = lxml.html.fromstring(resp.text) + dom = lxml.html.fromstring(resp.text) # type: ignore # get the base URL for the language in which request was made sxng_lang = resp.search_params['searxng_locale'].split('-')[0] - netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) + netloc: str = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki) # type: ignore base_url = 'https://' + netloc + '/index.php?' 
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'): @@ -83,7 +82,7 @@ def response(resp): content = extract_text(result.xpath('.//div[@class="searchresult"]')) results.append( { - 'url': urljoin(base_url, link.get('href')), + 'url': urljoin(base_url, link.get('href')), # type: ignore 'title': extract_text(link), 'content': content, } @@ -114,6 +113,8 @@ def fetch_traits(engine_traits: EngineTraits): }, """ + # pylint: disable=import-outside-toplevel + from searx.network import get # see https://github.com/searxng/searxng/issues/762 engine_traits.custom['wiki_netloc'] = {} engine_traits.custom['title'] = {} @@ -125,11 +126,11 @@ def fetch_traits(engine_traits: EngineTraits): 'zh': 'Special:搜索', } - resp = network.get('https://wiki.archlinux.org/') - if not resp.ok: + resp = get('https://wiki.archlinux.org/') + if not resp.ok: # type: ignore print("ERROR: response from wiki.archlinix.org is not OK.") - dom = lxml.html.fromstring(resp.text) + dom = lxml.html.fromstring(resp.text) # type: ignore for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"): sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-')) @@ -143,9 +144,9 @@ def fetch_traits(engine_traits: EngineTraits): print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag)) continue engine_traits.custom['wiki_netloc'][sxng_tag] = netloc - engine_traits.custom['title'][sxng_tag] = title + engine_traits.custom['title'][sxng_tag] = title # type: ignore eng_tag = extract_text(eval_xpath_list(a, ".//span")) - engine_traits.languages[sxng_tag] = eng_tag + engine_traits.languages[sxng_tag] = eng_tag # type: ignore engine_traits.languages['en'] = 'English' diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 81a0cf6a5..3cd707870 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -38,7 +38,6 @@ import babel import babel.languages from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex -from searx 
import network from searx.locales import language_tag, region_tag from searx.enginelib.traits import EngineTraits @@ -180,6 +179,10 @@ def request(query, params): def response(resp): + # pylint: disable=too-many-locals,import-outside-toplevel + + from searx.network import Request, multi_requests # see https://github.com/searxng/searxng/issues/762 + results = [] result_len = 0 @@ -231,9 +234,9 @@ def response(resp): # resolve all Bing redirections in parallel request_list = [ - network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve + Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve ] - response_list = network.multi_requests(request_list) + response_list = multi_requests(request_list) for i, redirect_response in enumerate(response_list): if not isinstance(redirect_response, Exception): results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location'] @@ -272,16 +275,19 @@ def fetch_traits(engine_traits: EngineTraits): def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str): + # pylint: disable=too-many-locals,import-outside-toplevel + + from searx.network import get # see https://github.com/searxng/searxng/issues/762 # insert alias to map from a language (zh) to a language + script (zh_Hans) engine_traits.languages['zh'] = 'zh-hans' - resp = network.get(url) + resp = get(url) - if not resp.ok: + if not resp.ok: # type: ignore print("ERROR: response from peertube is not OK.") - dom = html.fromstring(resp.text) + dom = html.fromstring(resp.text) # type: ignore map_lang = {'jp': 'ja'} for td in eval_xpath(dom, xpath_language_codes): diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index d734ec3c8..99da9616c 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -18,9 +18,9 @@ from urllib.parse import urlencode import time import babel -from 
searx.exceptions import SearxEngineAPIException -from searx import network +from searx.network import get, raise_for_httperror # see https://github.com/searxng/searxng/issues/762 from searx.utils import html_to_text +from searx.exceptions import SearxEngineAPIException from searx.locales import region_tag, language_tag from searx.enginelib.traits import EngineTraits @@ -106,7 +106,7 @@ def request(query, params): if not query: return False - eng_region = traits.get_region(params['searxng_locale'], 'en_US') + eng_region: str = traits.get_region(params['searxng_locale'], 'en_US') # type: ignore eng_lang = traits.get_language(params['searxng_locale'], 'en') args = { @@ -156,7 +156,7 @@ def response(resp): if 'error' in search_res: raise SearxEngineAPIException(search_res['error'].get('message')) - network.raise_for_httperror(resp) + raise_for_httperror(resp) # parse results for res in search_res.get('list', []): @@ -218,11 +218,11 @@ def fetch_traits(engine_traits: EngineTraits): """ - resp = network.get('https://api.dailymotion.com/locales') - if not resp.ok: + resp = get('https://api.dailymotion.com/locales') + if not resp.ok: # type: ignore print("ERROR: response from dailymotion/locales is not OK.") - for item in resp.json()['list']: + for item in resp.json()['list']: # type: ignore eng_tag = item['locale'] if eng_tag in ('en_EN', 'ar_AA'): continue @@ -241,11 +241,11 @@ def fetch_traits(engine_traits: EngineTraits): locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()] - resp = network.get('https://api.dailymotion.com/languages') - if not resp.ok: + resp = get('https://api.dailymotion.com/languages') + if not resp.ok: # type: ignore print("ERROR: response from dailymotion/languages is not OK.") - for item in resp.json()['list']: + for item in resp.json()['list']: # type: ignore eng_tag = item['code'] if eng_tag in locale_lang_list: sxng_tag = language_tag(babel.Locale.parse(eng_tag)) diff --git a/searx/engines/duckduckgo.py 
b/searx/engines/duckduckgo.py index d37e28c2d..8349ad8e3 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -13,17 +13,17 @@ import babel import lxml.html from searx import ( - network, locales, redislib, external_bang, ) -from searx import redisdb from searx.utils import ( eval_xpath, eval_xpath_getindex, extract_text, ) +from searx.network import get # see https://github.com/searxng/searxng/issues/762 +from searx import redisdb from searx.enginelib.traits import EngineTraits from searx.exceptions import SearxEngineAPIException @@ -95,8 +95,8 @@ def get_vqd(query, headers): return value query_url = 'https://duckduckgo.com/?q={query}&atb=v290-5'.format(query=urlencode({'q': query})) - res = network.get(query_url, headers=headers) - content = res.text + res = get(query_url, headers=headers) + content = res.text # type: ignore if content.find('vqd=\"') == -1: raise SearxEngineAPIException('Request failed') value = content[content.find('vqd=\"') + 5 :] @@ -139,7 +139,9 @@ def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'): params['cookies']['kl'] = eng_region # 'ar-es' """ - return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default)) + return eng_traits.custom['lang_region'].get( # type: ignore + sxng_locale, eng_traits.get_language(sxng_locale, default) + ) ddg_reg_map = { @@ -358,13 +360,13 @@ def fetch_traits(engine_traits: EngineTraits): engine_traits.all_locale = 'wt-wt' # updated from u588 to u661 / should be updated automatically? 
- resp = network.get('https://duckduckgo.com/util/u661.js') + resp = get('https://duckduckgo.com/util/u661.js') - if not resp.ok: + if not resp.ok: # type: ignore print("ERROR: response from DuckDuckGo is not OK.") - pos = resp.text.find('regions:{') + 8 - js_code = resp.text[pos:] + pos = resp.text.find('regions:{') + 8 # type: ignore + js_code = resp.text[pos:] # type: ignore pos = js_code.find('}') + 1 regions = json.loads(js_code[:pos]) @@ -399,8 +401,8 @@ def fetch_traits(engine_traits: EngineTraits): engine_traits.custom['lang_region'] = {} - pos = resp.text.find('languages:{') + 10 - js_code = resp.text[pos:] + pos = resp.text.find('languages:{') + 10 # type: ignore + js_code = resp.text[pos:] # type: ignore pos = js_code.find('}') + 1 js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"') languages = json.loads(js_code) diff --git a/searx/engines/google.py b/searx/engines/google.py index 708068f3a..6aaac2f22 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -23,7 +23,7 @@ import babel.languages from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex from searx.locales import language_tag, region_tag, get_offical_locales -from searx import network +from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.exceptions import SearxEngineCaptchaException from searx.enginelib.traits import EngineTraits @@ -419,11 +419,11 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True): engine_traits.custom['supported_domains'] = {} - resp = network.get('https://www.google.com/preferences') - if not resp.ok: + resp = get('https://www.google.com/preferences') + if not resp.ok: # type: ignore raise RuntimeError("Response from Google's preferences is not OK.") - dom = html.fromstring(resp.text) + dom = html.fromstring(resp.text) # type: ignore # supported language codes @@ -474,18 +474,18 @@ def fetch_traits(engine_traits: EngineTraits, add_domains: bool = 
True): # supported domains if add_domains: - resp = network.get('https://www.google.com/supported_domains') - if not resp.ok: + resp = get('https://www.google.com/supported_domains') + if not resp.ok: # type: ignore raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.") - for domain in resp.text.split(): + for domain in resp.text.split(): # type: ignore domain = domain.strip() if not domain or domain in [ '.google.com', ]: continue region = domain.split('.')[-1].upper() - engine_traits.custom['supported_domains'][region] = 'www' + domain + engine_traits.custom['supported_domains'][region] = 'www' + domain # type: ignore if region == 'HK': # There is no google.cn, we use .com.hk for zh-CN - engine_traits.custom['supported_domains']['CN'] = 'www' + domain + engine_traits.custom['supported_domains']['CN'] = 'www' + domain # type: ignore diff --git a/searx/engines/peertube.py b/searx/engines/peertube.py index 87b386d7a..d0eba6b88 100644 --- a/searx/engines/peertube.py +++ b/searx/engines/peertube.py @@ -13,7 +13,7 @@ from dateutil.relativedelta import relativedelta import babel -from searx import network +from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.locales import language_tag from searx.utils import html_to_text from searx.enginelib.traits import EngineTraits @@ -147,32 +147,30 @@ def fetch_traits(engine_traits: EngineTraits): https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291 """ - resp = network.get( + resp = get( 'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue', # the response from search-index repository is very slow timeout=60, ) - if not resp.ok: + if not resp.ok: # type: ignore print("ERROR: response from peertube is not OK.") return - js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) + js_lang = re.search(r"videoLanguages 
\(\)[^\n]+(.*?)\]", resp.text, re.DOTALL) # type: ignore if not js_lang: print("ERROR: can't determine languages from peertube") return for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)): + eng_tag = lang.group(1) + if eng_tag == 'oc': + # Occitanis not known by babel, its closest relative is Catalan + # but 'ca' is already in the list of engine_traits.languages --> + # 'oc' will be ignored. + continue try: - eng_tag = lang.group(1) - if eng_tag == 'oc': - # Occitanis not known by babel, its closest relative is Catalan - # but 'ca' is already in the list of engine_traits.languages --> - # 'oc' will be ignored. - continue - sxng_tag = language_tag(babel.Locale.parse(eng_tag)) - except babel.UnknownLocaleError: print("ERROR: %s is unknown by babel" % eng_tag) continue diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 2813d0bf3..92d69867a 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -91,8 +91,8 @@ import dateutil.parser import lxml.html import babel -from searx import network from searx.utils import extract_text, eval_xpath, gen_useragent +from searx.network import get # see https://github.com/searxng/searxng/issues/762 from searx.exceptions import SearxEngineCaptchaException from searx.locales import region_tag from searx.enginelib.traits import EngineTraits @@ -211,25 +211,25 @@ def get_sc_code(searxng_locale, params): get_sc_url = base_url + '/?sc=%s' % (sc_code) logger.debug("query new sc time-stamp ... %s", get_sc_url) logger.debug("headers: %s", headers) - resp = network.get(get_sc_url, headers=headers) + resp = get(get_sc_url, headers=headers) # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers) # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg # ?? 
ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21 - if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): + if str(resp.url).startswith('https://www.startpage.com/sp/captcha'): # type: ignore raise SearxEngineCaptchaException( message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha", ) - dom = lxml.html.fromstring(resp.text) + dom = lxml.html.fromstring(resp.text) # type: ignore try: sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0] except IndexError as exc: logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695") raise SearxEngineCaptchaException( - message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, + message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url, # type: ignore ) from exc sc_code_ts = time() @@ -350,7 +350,7 @@ def _response_cat_web(dom): title = extract_text(link) if eval_xpath(result, content_xpath): - content = extract_text(eval_xpath(result, content_xpath)) + content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore else: content = '' @@ -374,7 +374,7 @@ def _response_cat_web(dom): date_string = content[0 : date_pos - 5] # calculate datetime - published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) + published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group())) # type: ignore # fix content string content = content[date_pos:] @@ -399,12 +399,12 @@ def fetch_traits(engine_traits: EngineTraits): 'User-Agent': gen_useragent(), 'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language } - resp = network.get('https://www.startpage.com/do/settings', headers=headers) + resp = get('https://www.startpage.com/do/settings', headers=headers) - if not resp.ok: + if not resp.ok: # type: ignore print("ERROR: response from Startpage is not OK.") - dom = lxml.html.fromstring(resp.text) + dom = 
lxml.html.fromstring(resp.text) # type: ignore # regions @@ -443,8 +443,10 @@ def fetch_traits(engine_traits: EngineTraits): # get the native name of every language known by babel - for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()): - native_name = babel.Locale(lang_code).get_language_name().lower() + for lang_code in filter( + lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers() # type: ignore + ): + native_name = babel.Locale(lang_code).get_language_name().lower() # type: ignore # add native name exactly as it is catalog_engine2code[native_name] = lang_code @@ -478,7 +480,7 @@ def fetch_traits(engine_traits: EngineTraits): eng_tag = option.get('value') if eng_tag in skip_eng_tags: continue - name = extract_text(option).lower() + name = extract_text(option).lower() # type: ignore sxng_tag = catalog_engine2code.get(eng_tag) if sxng_tag is None: diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 98b3d6f9e..b4b70208d 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -61,7 +61,7 @@ import babel from lxml import html from searx import utils -from searx import network +from searx import network as _network from searx import locales from searx.enginelib.traits import EngineTraits @@ -180,7 +180,7 @@ def response(resp): ): return [] - network.raise_for_httperror(resp) + _network.raise_for_httperror(resp) api_result = resp.json() title = utils.html_to_text(api_result.get('titles', {}).get('display') or api_result.get('title')) @@ -267,7 +267,7 @@ def fetch_wikimedia_traits(engine_traits: EngineTraits): for sxng_tag in sxng_tag_list: engine_traits.regions[sxng_tag] = eng_tag - resp = network.get(list_of_wikipedias) + resp = _network.get(list_of_wikipedias) if not resp.ok: print("ERROR: response from Wikipedia is not OK.") diff --git a/searx/settings_defaults.py b/searx/settings_defaults.py index 7f657aa54..5d978d0e0 100644 --- 
a/searx/settings_defaults.py +++ b/searx/settings_defaults.py @@ -209,9 +209,7 @@ SCHEMA = { 'enable_http2': SettingsValue(bool, True), 'verify': SettingsValue((bool, str), True), 'max_request_timeout': SettingsValue((None, numbers.Real), None), - # Magic number kept from previous code 'pool_connections': SettingsValue(int, 100), - # Picked from constructor 'pool_maxsize': SettingsValue(int, 10), 'keepalive_expiry': SettingsValue(numbers.Real, 5.0), # default maximum redirect