From 82fd0dac60993c5c8971655228fe3d67689119d7 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 29 Apr 2024 18:36:26 +0200 Subject: [PATCH] [mod] lower memory footprint by lazy loading JSON data This patch implements lazy loading of the JSON data. Motivation: in most requests not all of the JSON data is needed, yet all of it is loaded. For example, these four JSON files: - currencies.json ~550KB - engine_descriptions.json ~1,3MB - external_bangs.json ~1,3MB - osm_keys_tags.json ~ 2,2MB are most often not used, yet they consume a lot of memory and also extend the time required to instantiate a worker. Signed-off-by: Markus Heiser --- searx/data/__init__.py | 48 ++++++++++++++++------ searx/enginelib/traits.py | 6 +-- searx/engines/annas_archive.py | 4 +- searx/engines/duckduckgo.py | 3 +- searx/engines/duckduckgo_definitions.py | 4 +- searx/engines/openstreetmap.py | 8 ++-- searx/engines/wikidata.py | 4 +- searx/engines/zlibrary.py | 4 +- searx/external_bang.py | 6 +-- searx/external_urls.py | 4 +- searx/plugins/ahmia_filter.py | 4 +- searx/plugins/unit_converter.py | 4 +- searx/search/processors/online_currency.py | 6 +-- searx/utils.py | 8 ++-- searx/webapp.py | 8 ++-- 15 files changed, 73 insertions(+), 48 deletions(-) diff --git a/searx/data/__init__.py b/searx/data/__init__.py index 28a3974fa..fd11e1506 100644 --- a/searx/data/__init__.py +++ b/searx/data/__init__.py @@ -20,13 +20,20 @@ __all__ = [ import json from pathlib import Path +from searx import logger data_dir = Path(__file__).parent +logger = logger.getChild('data') - -def _load(filename): - with open(data_dir / filename, encoding='utf-8') as f: - return json.load(f) +CURRENCIES: dict +USER_AGENTS: dict +EXTERNAL_URLS: dict +WIKIDATA_UNITS: dict +EXTERNAL_BANGS: dict +OSM_KEYS_TAGS: dict +ENGINE_DESCRIPTIONS: dict +ENGINE_TRAITS: dict +LOCALES: dict def ahmia_blacklist_loader(): @@ -42,12 +49,27 @@ def ahmia_blacklist_loader(): return f.read().split() -CURRENCIES = _load('currencies.json') -USER_AGENTS = 
_load('useragents.json') -EXTERNAL_URLS = _load('external_urls.json') -WIKIDATA_UNITS = _load('wikidata_units.json') -EXTERNAL_BANGS = _load('external_bangs.json') -OSM_KEYS_TAGS = _load('osm_keys_tags.json') -ENGINE_DESCRIPTIONS = _load('engine_descriptions.json') -ENGINE_TRAITS = _load('engine_traits.json') -LOCALES = _load('locales.json') +NAME_TO_JSON_FILE = { + 'CURRENCIES': 'currencies.json', + 'USER_AGENTS': 'useragents.json', + 'EXTERNAL_URLS': 'external_urls.json', + 'WIKIDATA_UNITS': 'wikidata_units.json', + 'EXTERNAL_BANGS': 'external_bangs.json', + 'OSM_KEYS_TAGS': 'osm_keys_tags.json', + 'ENGINE_DESCRIPTIONS': 'engine_descriptions.json', + 'ENGINE_TRAITS': 'engine_traits.json', + 'LOCALES': 'locales.json', +} + + +def __getattr__(name: str): + # lazy load of JSON files .. + filename = NAME_TO_JSON_FILE.get(name) + if filename: + filename = data_dir / filename + logger.debug("init global %s from JSON file %s", name, filename) + with open(filename, encoding='utf-8') as f: + globals()[name] = json.load(f) + return globals()[name] + else: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/searx/enginelib/traits.py b/searx/enginelib/traits.py index cab6557dd..3870105ee 100644 --- a/searx/enginelib/traits.py +++ b/searx/enginelib/traits.py @@ -16,7 +16,7 @@ import types from typing import Dict, Literal, Iterable, Union, Callable, Optional, TYPE_CHECKING from searx import locales -from searx.data import data_dir, ENGINE_TRAITS +from searx import data if TYPE_CHECKING: from . 
import Engine @@ -193,7 +193,7 @@ class EngineTraits: class EngineTraitsMap(Dict[str, EngineTraits]): """A python dictionary to map :class:`EngineTraits` by engine name.""" - ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve() + ENGINE_TRAITS_FILE = (data.data_dir / 'engine_traits.json').resolve() """File with persistence of the :py:obj:`EngineTraitsMap`.""" def save_data(self): @@ -205,7 +205,7 @@ class EngineTraitsMap(Dict[str, EngineTraits]): def from_data(cls) -> 'EngineTraitsMap': """Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`""" obj = cls() - for k, v in ENGINE_TRAITS.items(): + for k, v in data.ENGINE_TRAITS.items(): obj[k] = EngineTraits(**v) return obj diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py index d758e4a96..b943638a0 100644 --- a/searx/engines/annas_archive.py +++ b/searx/engines/annas_archive.py @@ -37,9 +37,9 @@ from typing import List, Dict, Any, Optional from urllib.parse import quote from lxml import html +from searx import data from searx.utils import extract_text, eval_xpath, eval_xpath_list from searx.enginelib.traits import EngineTraits -from searx.data import ENGINE_TRAITS # about about: Dict[str, Any] = { @@ -86,7 +86,7 @@ aa_ext: str = '' def init(engine_settings=None): # pylint: disable=unused-argument """Check of engine's settings.""" - traits = EngineTraits(**ENGINE_TRAITS['annas archive']) + traits = EngineTraits(**data.ENGINE_TRAITS['annas archive']) if aa_content and aa_content not in traits.custom['content']: raise ValueError(f'invalid setting content: {aa_content}') diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 8a1dafbcf..69dbb79c9 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -12,6 +12,7 @@ import babel import lxml.html from searx import ( + data, locales, redislib, external_bang, @@ -230,7 +231,7 @@ def quote_ddg_bangs(query): for val in re.split(r'(\s+)', query): if not val.strip(): continue - 
if val.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, val[1:]): + if val.startswith('!') and external_bang.get_node(data.EXTERNAL_BANGS, val[1:]): val = f"'{val}'" query_parts.append(val) return ' '.join(query_parts) diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 59caed8ce..86c83a726 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING from urllib.parse import urlencode, urlparse, urljoin from lxml import html -from searx.data import WIKIDATA_UNITS +from searx import data from searx.utils import extract_text, html_to_text, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom @@ -238,7 +238,7 @@ def unit_to_str(unit): for prefix in WIKIDATA_PREFIX: if unit.startswith(prefix): wikidata_entity = unit[len(prefix) :] - real_unit = WIKIDATA_UNITS.get(wikidata_entity) + real_unit = data.WIKIDATA_UNITS.get(wikidata_entity) if real_unit is None: return unit return real_unit['symbol'] diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 7e3d8f43b..a74f0ac66 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -10,7 +10,7 @@ from functools import partial from flask_babel import gettext -from searx.data import OSM_KEYS_TAGS, CURRENCIES +from searx import data as searx_data from searx.utils import searx_useragent from searx.external_urls import get_external_url from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail @@ -435,7 +435,7 @@ def get_label(labels, lang): def get_tag_label(tag_category, tag_name, lang): """Get tag label from OSM_KEYS_TAGS""" tag_name = '' if tag_name is None else tag_name - tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {}) + tag_labels = searx_data.OSM_KEYS_TAGS['tags'].get(tag_category, 
{}).get(tag_name, {}) return get_label(tag_labels, lang) @@ -449,12 +449,12 @@ def get_key_label(key_name, lang): # https://taginfo.openstreetmap.org/keys/currency#values currency = key_name.split(':') if len(currency) > 1: - o = CURRENCIES['iso4217'].get(currency[1]) + o = searx_data.CURRENCIES['iso4217'].get(currency[1]) if o: return get_label(o, lang).lower() return currency[1] - labels = OSM_KEYS_TAGS['keys'] + labels = searx_data.OSM_KEYS_TAGS['keys'] for k in key_name.split(':') + ['*']: labels = labels.get(k) if labels is None: diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 268da6fa9..30f8d2d68 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -13,7 +13,7 @@ from json import loads from dateutil.parser import isoparse from babel.dates import format_datetime, format_date, format_time, get_datetime_format -from searx.data import WIKIDATA_UNITS +from searx import data from searx.network import post, get from searx.utils import searx_useragent, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom @@ -762,7 +762,7 @@ def debug_explain_wikidata_query(query, method='GET'): def init(engine_settings=None): # pylint: disable=unused-argument # WIKIDATA_PROPERTIES : add unit symbols - WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) + WIKIDATA_PROPERTIES.update(data.WIKIDATA_UNITS) # WIKIDATA_PROPERTIES : add property labels wikidata_property_names = [] diff --git a/searx/engines/zlibrary.py b/searx/engines/zlibrary.py index ba1f474fa..1d9454c2d 100644 --- a/searx/engines/zlibrary.py +++ b/searx/engines/zlibrary.py @@ -40,9 +40,9 @@ from urllib.parse import quote from lxml import html from flask_babel import gettext +from searx import data from searx.utils import extract_text, eval_xpath, eval_xpath_list from searx.enginelib.traits import EngineTraits -from searx.data import ENGINE_TRAITS if TYPE_CHECKING: import httpx @@ -80,7 +80,7 @@ zlib_ext: str = "" def 
init(engine_settings=None) -> None: # pylint: disable=unused-argument """Check of engine's settings.""" - traits: EngineTraits = EngineTraits(**ENGINE_TRAITS["z-library"]) + traits: EngineTraits = EngineTraits(**data.ENGINE_TRAITS["z-library"]) if zlib_ext and zlib_ext not in traits.custom["ext"]: raise ValueError(f"invalid setting ext: {zlib_ext}") diff --git a/searx/external_bang.py b/searx/external_bang.py index fde2f8eb5..b5b69f6e2 100644 --- a/searx/external_bang.py +++ b/searx/external_bang.py @@ -2,7 +2,7 @@ # pylint: disable=missing-module-docstring from urllib.parse import quote_plus, urlparse -from searx.data import EXTERNAL_BANGS +from searx import data LEAF_KEY = chr(16) @@ -56,7 +56,7 @@ def resolve_bang_definition(bang_definition, query): def get_bang_definition_and_autocomplete(bang, external_bangs_db=None): # pylint: disable=invalid-name if external_bangs_db is None: - external_bangs_db = EXTERNAL_BANGS + external_bangs_db = data.EXTERNAL_BANGS bang_definition, bang_ac_list = get_bang_definition_and_ac(external_bangs_db, bang) @@ -90,7 +90,7 @@ def get_bang_url(search_query, external_bangs_db=None): ret_val = None if external_bangs_db is None: - external_bangs_db = EXTERNAL_BANGS + external_bangs_db = data.EXTERNAL_BANGS if search_query.external_bang: bang_definition, _ = get_bang_definition_and_ac(external_bangs_db, search_query.external_bang) diff --git a/searx/external_urls.py b/searx/external_urls.py index 8e243ec47..c8efe4c92 100644 --- a/searx/external_urls.py +++ b/searx/external_urls.py @@ -3,7 +3,7 @@ import math -from searx.data import EXTERNAL_URLS +from searx import data IMDB_PREFIX_TO_URL_ID = { @@ -43,7 +43,7 @@ def get_external_url(url_id, item_id, alternative="default"): elif url_id == 'wikimedia_image': item_id = get_wikimedia_image_id(item_id) - url_description = EXTERNAL_URLS.get(url_id) + url_description = data.EXTERNAL_URLS.get(url_id) if url_description: url_template = url_description["urls"].get(alternative) if url_template is 
not None: diff --git a/searx/plugins/ahmia_filter.py b/searx/plugins/ahmia_filter.py index bbf137103..73f211bc3 100644 --- a/searx/plugins/ahmia_filter.py +++ b/searx/plugins/ahmia_filter.py @@ -2,7 +2,7 @@ # pylint: disable=missing-module-docstring from hashlib import md5 -from searx.data import ahmia_blacklist_loader +from searx import data name = "Ahmia blacklist" description = "Filter out onion results that appear in Ahmia's blacklist. (See https://ahmia.fi/blacklist)" @@ -24,5 +24,5 @@ def init(_app, settings): if not settings['outgoing']['using_tor_proxy']: # disable the plugin return False - ahmia_blacklist = ahmia_blacklist_loader() + ahmia_blacklist = data.ahmia_blacklist_loader() return True diff --git a/searx/plugins/unit_converter.py b/searx/plugins/unit_converter.py index dd515aa72..8fb2e5472 100644 --- a/searx/plugins/unit_converter.py +++ b/searx/plugins/unit_converter.py @@ -4,7 +4,7 @@ from flask_babel import gettext -from searx.data import WIKIDATA_UNITS +from searx import data name = "Unit converter plugin" description = gettext("Convert between units") @@ -38,7 +38,7 @@ def _parse_text_and_convert(search, splitted_query): from_unit = None to_unit = None - for unit in WIKIDATA_UNITS.values(): + for unit in data.WIKIDATA_UNITS.values(): if unit['symbol'] == from_unit_key: from_unit = unit diff --git a/searx/search/processors/online_currency.py b/searx/search/processors/online_currency.py index 7d6811e6b..43134c1a3 100644 --- a/searx/search/processors/online_currency.py +++ b/searx/search/processors/online_currency.py @@ -6,7 +6,7 @@ import unicodedata import re -from searx.data import CURRENCIES +from searx import data from .online import OnlineProcessor parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) 
([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) @@ -20,14 +20,14 @@ def normalize_name(name): def name_to_iso4217(name): name = normalize_name(name) - currency = CURRENCIES['names'].get(name, [name]) + currency = data.CURRENCIES['names'].get(name, [name]) if isinstance(currency, str): return currency return currency[0] def iso4217_to_name(iso4217, language): - return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217) + return data.CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217) class OnlineCurrencyProcessor(OnlineProcessor): diff --git a/searx/utils.py b/searx/utils.py index f50618ea2..b92f476f7 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -21,7 +21,7 @@ from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError from searx import settings -from searx.data import USER_AGENTS, data_dir +from searx import data as searx_data from searx.version import VERSION_TAG from searx.sxng_locales import sxng_locales from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException @@ -81,7 +81,9 @@ def gen_useragent(os_string: Optional[str] = None) -> str: See searx/data/useragents.json """ - return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions'])) + return searx_data.USER_AGENTS['ua'].format( + os=os_string or choice(searx_data.USER_AGENTS['os']), version=choice(searx_data.USER_AGENTS['versions']) + ) class _HTMLTextExtractorException(Exception): @@ -600,7 +602,7 @@ def _get_fasttext_model() -> "fasttext.FastText._FastText": # type: ignore # Monkey patch: prevent fasttext from showing a (useless) warning when loading a model. 
fasttext.FastText.eprint = lambda x: None - _FASTTEXT_MODEL = fasttext.load_model(str(data_dir / 'lid.176.ftz')) + _FASTTEXT_MODEL = fasttext.load_model(str(searx_data.data_dir / 'lid.176.ftz')) return _FASTTEXT_MODEL diff --git a/searx/webapp.py b/searx/webapp.py index 0901af8ea..71da0b5d7 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -58,7 +58,7 @@ from searx import infopage from searx import limiter from searx.botdetection import link_token -from searx.data import ENGINE_DESCRIPTIONS +from searx import data from searx.results import Timing from searx.settings_defaults import OUTPUT_FORMATS from searx.settings_loader import get_default_settings_path @@ -1102,14 +1102,14 @@ def image_proxy(): @app.route('/engine_descriptions.json', methods=['GET']) def engine_descriptions(): locale = get_locale().split('_')[0] - result = ENGINE_DESCRIPTIONS['en'].copy() + result = data.ENGINE_DESCRIPTIONS['en'].copy() if locale != 'en': - for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items(): + for engine, description in data.ENGINE_DESCRIPTIONS.get(locale, {}).items(): result[engine] = description for engine, description in result.items(): if len(description) == 2 and description[1] == 'ref': ref_engine, ref_lang = description[0].split(':') - description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine] + description = data.ENGINE_DESCRIPTIONS[ref_lang][ref_engine] if isinstance(description, str): description = [description, 'wikipedia'] result[engine] = description