data: currencies: use SQLite instead of JSON

This commit is contained in:
Alexandre Flament 2024-05-04 10:41:36 +00:00
parent 259f82e87d
commit cd98409b76
7 changed files with 10158 additions and 15019 deletions

View file

@ -7,7 +7,6 @@
__all__ = [ __all__ = [
'ENGINE_TRAITS', 'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS', 'USER_AGENTS',
'EXTERNAL_URLS', 'EXTERNAL_URLS',
'WIKIDATA_UNITS', 'WIKIDATA_UNITS',
@ -16,11 +15,16 @@ __all__ = [
'LOCALES', 'LOCALES',
'ahmia_blacklist_loader', 'ahmia_blacklist_loader',
'fetch_engine_descriptions', 'fetch_engine_descriptions',
'fetch_iso4217_from_user',
'fetch_name_from_iso4217',
] ]
import re
import unicodedata
import json import json
import sqlite3 import sqlite3
from typing import Dict, List from typing import Dict, List, Optional
from functools import lru_cache
from threading import local from threading import local
from pathlib import Path from pathlib import Path
@ -44,7 +48,7 @@ def _get_connection(filename: str) -> sqlite3.Connection:
if connection is not None: if connection is not None:
return connection return connection
data_filename = str(data_dir / 'engine_descriptions.db') data_filename = str(data_dir / filename)
# open database in read only mode # open database in read only mode
data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True) data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True)
# https://phiresky.github.io/blog/2020/sqlite-performance-tuning/ # https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
@ -67,6 +71,44 @@ def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
return {result[0]: [result[1], result[2]] for result in res.fetchall()} return {result[0]: [result[1], result[2]] for result in res.fetchall()}
def _normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
@lru_cache(10)
def fetch_iso4217_from_user(name: str) -> Optional[str]:
    """Return the ISO 4217 code for a user-supplied currency string.

    The lookup first tries *name* as an ISO 4217 code itself
    (case-insensitive), then as a normalized currency name.  ``None`` is
    returned when the string is unknown or matches more than one
    currency (ambiguous input).
    """
    connection = _get_connection("currencies.db")
    # fast path: the user already typed an ISO 4217 code (e.g. "usd")
    res = connection.execute("SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),))
    row = res.fetchone()
    if row:
        return row[0]
    # fall back to matching the normalized currency names
    name = _normalize_name(name)
    res = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (name,))
    # distinct codes only; don't shadow the list with the row variable
    codes = list(set(row[0] for row in res.fetchall()))
    if len(codes) == 1:
        return codes[0]
    # no match, or ambiguity across several currencies --> return nothing
    return None
@lru_cache(10)
def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
    """Return the currency name for *iso4217* localized to *language*.

    ``None`` is returned unless exactly one matching name exists in the
    ``currencies.db`` table.
    """
    cursor = _get_connection("currencies.db").execute(
        "SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language)
    )
    names = [row[0] for row in cursor.fetchall()]
    return names[0] if len(names) == 1 else None
def ahmia_blacklist_loader(): def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion """Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by:: names. The MD5 values are fetched by::
@ -80,7 +122,6 @@ def ahmia_blacklist_loader():
return f.read().split() return f.read().split()
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json') USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json') EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json') WIKIDATA_UNITS = _load('wikidata_units.json')

BIN
searx/data/currencies.db Normal file

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,7 +10,7 @@ from functools import partial
from flask_babel import gettext from flask_babel import gettext
from searx.data import OSM_KEYS_TAGS, CURRENCIES from searx.data import OSM_KEYS_TAGS, fetch_name_from_iso4217
from searx.utils import searx_useragent from searx.utils import searx_useragent
from searx.external_urls import get_external_url from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@ -449,9 +449,9 @@ def get_key_label(key_name, lang):
# https://taginfo.openstreetmap.org/keys/currency#values # https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':') currency = key_name.split(':')
if len(currency) > 1: if len(currency) > 1:
o = CURRENCIES['iso4217'].get(currency[1]) label = fetch_name_from_iso4217(currency[1], lang)
if o: if label:
return get_label(o, lang).lower() return label
return currency[1] return currency[1]
labels = OSM_KEYS_TAGS['keys'] labels = OSM_KEYS_TAGS['keys']

View file

@ -3,33 +3,14 @@
""" """
import unicodedata
import re import re
from searx.data import CURRENCIES from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
from .online import OnlineProcessor from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
def normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
def name_to_iso4217(name):
name = normalize_name(name)
currency = CURRENCIES['names'].get(name, [name])
if isinstance(currency, str):
return currency
return currency[0]
def iso4217_to_name(iso4217, language):
return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
class OnlineCurrencyProcessor(OnlineProcessor): class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines.""" """Processor class used by ``online_currency`` engines."""
@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
amount = float(amount_str) amount = float(amount_str)
except ValueError: except ValueError:
return None return None
from_currency = name_to_iso4217(from_currency.strip()) from_currency = fetch_iso4217_from_user(from_currency.strip())
to_currency = name_to_iso4217(to_currency.strip()) to_currency = fetch_iso4217_from_user(to_currency.strip())
if from_currency is None or to_currency is None:
return None
params['amount'] = amount params['amount'] = amount
params['from'] = from_currency params['from'] = from_currency
params['to'] = to_currency params['to'] = to_currency
params['from_name'] = iso4217_to_name(from_currency, 'en') params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en') params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
return params return params
def get_default_tests(self): def get_default_tests(self):

View file

@ -9,15 +9,20 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
# pylint: disable=invalid-name # pylint: disable=invalid-name
import csv
import re import re
import unicodedata import unicodedata
import json import sqlite3
from pathlib import Path
from searx.network import set_timeout_for_thread
from searx.locales import LOCALE_NAMES, locales_initialize from searx.locales import LOCALE_NAMES, locales_initialize
from searx.engines import wikidata, set_loggers from searx.engines import wikidata, set_loggers
from searx.data import data_dir from searx.data import data_dir
DATA_FILE = data_dir / 'currencies.json' DATABASE_FILE = data_dir / 'currencies.db'
CSV_FILE = data_dir / 'dumps' / 'currencies.csv'
set_loggers(wikidata, 'wikidata') set_loggers(wikidata, 'wikidata')
locales_initialize() locales_initialize()
@ -75,57 +80,45 @@ def _normalize_name(name):
return name return name
def add_currency_name(db, name, iso4217, normalize_name=True): def add_entry(db, language, iso4217, name, normalize_name=True):
db_names = db['names']
if normalize_name: if normalize_name:
name = _normalize_name(name) name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, []) entry = (language, iso4217, name)
if iso4217 not in iso4217_set: db.add(entry)
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
def wikidata_request_result_iterator(request): def wikidata_request_result_iterator(request):
set_timeout_for_thread(60)
result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)) result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None: if result is not None:
yield from result['results']['bindings'] yield from result['results']['bindings']
def fetch_db(): def fetch_db():
db = { db = set()
'names': {},
'iso4217': {},
}
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST): for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value'] iso4217 = r['iso4217']['value']
article_name = r['article_name']['value'] article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang'] article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217) add_entry(db, article_lang, iso4217, article_name)
add_currency_label(db, article_name, iso4217, article_lang)
for r in wikidata_request_result_iterator(SARQL_REQUEST): for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value'] iso4217 = r['iso4217']['value']
if 'label' in r: if 'label' in r:
label = r['label']['value'] label = r['label']['value']
label_lang = r['label']['xml:lang'] label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217) add_entry(db, label_lang, iso4217, label)
add_currency_label(db, label, iso4217, label_lang)
if 'alias' in r: if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217) add_entry(db, "", iso4217, r['alias']['value'])
if 'unicode' in r: if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False) add_entry(db, "", iso4217, r['unicode']['value'], normalize_name=False)
if 'unit' in r: if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False) add_entry(db, "", iso4217, r['unit']['value'], normalize_name=False)
return db return db
@ -135,22 +128,33 @@ def main():
db = fetch_db() db = fetch_db()
# static # static
add_currency_name(db, "euro", 'EUR') add_entry(db, "", 'EUR', "euro")
add_currency_name(db, "euros", 'EUR') add_entry(db, "", 'EUR', "euros")
add_currency_name(db, "dollar", 'USD') add_entry(db, "", 'USD', "dollar")
add_currency_name(db, "dollars", 'USD') add_entry(db, "", 'USD', "dollars")
add_currency_name(db, "peso", 'MXN') add_entry(
add_currency_name(db, "pesos", 'MXN') db,
"",
'MXN',
"peso",
)
add_entry(db, "", 'MXN', "pesos")
# reduce memory usage: db = list(db)
# replace lists with one item by the item. see db.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
# searx.search.processors.online_currency.name_to_iso4217 Path(DATABASE_FILE).unlink(missing_ok=True)
for name in db['names']: with sqlite3.connect(DATABASE_FILE) as con:
if len(db['names'][name]) == 1: cur = con.cursor()
db['names'][name] = db['names'][name][0] cur.execute("CREATE TABLE currencies(language, iso4217, name)")
cur.executemany("INSERT INTO currencies VALUES(?, ?, ?)", db)
with DATA_FILE.open('w', encoding='utf8') as f: cur.execute("CREATE INDEX index_currencies_iso4217 ON currencies('iso4217')")
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False) cur.execute("CREATE INDEX index_currencies_name ON currencies('name')")
con.commit()
with CSV_FILE.open('w', encoding='utf8') as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "iso4217", "name"])
for row in db:
w.writerow(row)
if __name__ == '__main__': if __name__ == '__main__':