data: engine descriptions: use SQLite instead of JSON

To reduce memory usage, use a SQLite database to store the engine descriptions.
A dump of the database is stored in Git to facilitate maintenance,
especially for the pull requests that are made automatically every month.

Related to
* https://github.com/searxng/searxng/discussions/2633
* https://github.com/searxng/searxng/pull/3443
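
For context, the change replaces a module-level JSON load (the whole ENGINE_DESCRIPTIONS dict kept in memory) with small on-demand queries against a read-only SQLite file. A rough sketch of the new access pattern, using only the table and column names visible in the diff below (the path is assumed from a source checkout):

    import sqlite3

    # Open the committed database read-only; only the rows for the requested
    # language are materialized, instead of the whole JSON document.
    con = sqlite3.connect("file:searx/data/engine_descriptions.db?mode=ro", uri=True)
    rows = con.execute(
        "SELECT engine, description, source FROM engine_descriptions WHERE language=?",
        ("en",),
    ).fetchall()
    descriptions = {engine: [description, source] for engine, description, source in rows}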
Alexandre Flament 2024-05-04 08:04:02 +00:00
parent d577817646
commit 259f82e87d
7 changed files with 4123 additions and 7987 deletions

searx/data/__init__.py

@@ -13,15 +13,19 @@ __all__ = [
'WIKIDATA_UNITS',
'EXTERNAL_BANGS',
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'LOCALES',
'ahmia_blacklist_loader',
'fetch_engine_descriptions',
]
import json
import sqlite3
from typing import Dict, List
from threading import local
from pathlib import Path
data_dir = Path(__file__).parent
data_connection_local = local()
def _load(filename):
@@ -29,6 +33,40 @@ def _load(filename):
return json.load(f)
def _get_connection(filename: str) -> sqlite3.Connection:
"""Return a read only SQLite connection to filename.
The filename is relative to searx/data
Multiple calls to this function in the same thread,
already return the same connection.
"""
connection = data_connection_local.__dict__.get(filename)
if connection is not None:
return connection
data_filename = str(data_dir / filename)
# open database in read only mode
data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True)
# https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
data_connection.executescript(
"""
pragma temp_store = memory;
pragma mmap_size = 30000000000;
"""
)
data_connection_local.__dict__[filename] = data_connection
return data_connection
def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
"""Return engine description and source for each engine name."""
res = _get_connection("engine_descriptions.db").execute(
"SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,)
)
return {result[0]: [result[1], result[2]] for result in res.fetchall()}
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
@@ -48,6 +86,5 @@ EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')
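
A hedged usage sketch of the new helper (assuming a SearXNG source tree with searx/data/engine_descriptions.db present): callers now request one language at a time instead of indexing a preloaded dict.

    from searx.data import fetch_engine_descriptions

    # Returns {engine_name: [description, source], ...} for the given language.
    descriptions = fetch_engine_descriptions('en')
    for engine, (description, source) in descriptions.items():
        print(engine, source)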

searx/data/dumps/README.rst

@@ -0,0 +1,3 @@
Dumps of the SQLite files in ``searx.data``.
These files are not used by SearXNG; they are here for reference.
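
The binary database itself is hard to review, so the CSV dump next to it is presumably what the monthly automatic pull requests will show as a readable diff. A small sketch of loading it back for inspection, assuming the header written by write_db() further down (language, engine, description, source):

    import csv
    from pathlib import Path

    # Read the reference dump; column names follow the CSV header in write_db().
    with Path("searx/data/dumps/engine_descriptions.csv").open(encoding="utf8") as f:
        for row in csv.DictReader(f):
            print(row["language"], row["engine"], row["source"])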

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

searx/webapp.py

@@ -58,7 +58,7 @@ from searx import infopage
from searx import limiter
from searx.botdetection import link_token
from searx.data import ENGINE_DESCRIPTIONS
from searx.data import fetch_engine_descriptions
from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path
@@ -1102,17 +1102,10 @@ def image_proxy():
@app.route('/engine_descriptions.json', methods=['GET'])
def engine_descriptions():
locale = get_locale().split('_')[0]
result = ENGINE_DESCRIPTIONS['en'].copy()
result = fetch_engine_descriptions('en')
if locale != 'en':
for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
for engine, description in fetch_engine_descriptions(locale).items():
result[engine] = description
for engine, description in result.items():
if len(description) == 2 and description[1] == 'ref':
ref_engine, ref_lang = description[0].split(':')
description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
if isinstance(description, str):
description = [description, 'wikipedia']
result[engine] = description
# overwrite by about:description (from settings)
for engine_name, engine_mod in engines.items():
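
The route now builds its response by overlaying the locale-specific rows on top of the English defaults; the old in-JSON "ref" indirection is gone because the generator writes full rows. A compact sketch of the merge, assuming fetch_engine_descriptions from searx.data (merged_descriptions is an illustrative name, not a function from the commit):

    from searx.data import fetch_engine_descriptions

    def merged_descriptions(locale: str) -> dict:
        # Start from the English descriptions, then overlay the user's locale.
        result = fetch_engine_descriptions('en')
        if locale != 'en':
            result.update(fetch_engine_descriptions(locale))
        return result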

searxng_extra/update/update_engine_descriptions.py

@@ -9,22 +9,24 @@ Output file: :origin:`searx/data/engine_descriptions.json`.
# pylint: disable=invalid-name, global-statement
import csv
import json
import sqlite3
from urllib.parse import urlparse
from os.path import join
from pathlib import Path
from lxml.html import fromstring
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, searx_useragent
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
import searx.network
from searx.data import data_dir
DATA_FILE = data_dir / 'engine_descriptions.json'
DATABASE_FILE = data_dir / 'engine_descriptions.db'
CSV_FILE = data_dir / 'dumps' / 'engine_descriptions.csv'
set_loggers(wikidata, 'wikidata')
locales_initialize()
@@ -323,37 +325,32 @@ def fetch_website_descriptions():
fetch_website_description(engine_name, website)
def get_engine_descriptions_filename():
return join(join(searx_dir, "data"), "engine_descriptions.json")
def get_output():
def write_db():
"""
From descriptions[engine][language] = [description, source]
To
Erase and write the SQLite database searx/data/engine_descriptions.db:
* create one table engine_descriptions
* write all the values
* output[language][engine] = description_and_source
* description_and_source can be:
* [description, source]
* description (if source = "wikipedia")
* [f"engine:lang", "ref"] (reference to another existing description)
Make a JSON dump of the values into engine_descriptions.json
"""
output = {locale: {} for locale in LOCALE_NAMES}
seen_descriptions = {}
for engine_name, lang_descriptions in descriptions.items():
for language, description in lang_descriptions.items():
if description[0] in seen_descriptions:
ref = seen_descriptions[description[0]]
description = [f'{ref[0]}:{ref[1]}', 'ref']
else:
seen_descriptions[description[0]] = (engine_name, language)
if description[1] == 'wikipedia':
description = description[0]
output.setdefault(language, {}).setdefault(engine_name, description)
return output
data = [
(language, engine_name, description[0], description[1])
for engine_name, lang_descriptions in descriptions.items()
for language, description in lang_descriptions.items()
]
data.sort(key=lambda item: (item[0], item[1]))
Path(DATABASE_FILE).unlink(missing_ok=True)
with sqlite3.connect(DATABASE_FILE) as con:
cur = con.cursor()
cur.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
cur.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", data)
cur.execute("CREATE INDEX index_engine_descriptions ON engine_descriptions('language')")
con.commit()
with CSV_FILE.open('w', encoding="utf8") as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "engine", "description", "source"])
for row in data:
w.writerow(row)
def main():
@@ -361,10 +358,7 @@ def main():
fetch_wikidata_descriptions()
fetch_wikipedia_descriptions()
fetch_website_descriptions()
output = get_output()
with DATA_FILE.open('w', encoding='utf8') as f:
f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
write_db()
if __name__ == "__main__":
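
write_db() recreates the database from scratch on each run (the unlink plus CREATE TABLE above). A quick sanity check after running the updater might look like the following; this is a sketch, not part of the commit:

    import sqlite3

    # Count rows per language in the freshly written database (read-only).
    con = sqlite3.connect("file:searx/data/engine_descriptions.db?mode=ro", uri=True)
    query = "SELECT language, COUNT(*) FROM engine_descriptions GROUP BY language ORDER BY language"
    for language, count in con.execute(query):
        print(language, count)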