data: currencies: use SQLite instead of JSON

This commit is contained in:
Alexandre Flament 2024-05-04 10:41:36 +00:00
parent 259f82e87d
commit cd98409b76
7 changed files with 10158 additions and 15019 deletions

View file

@ -7,7 +7,6 @@
__all__ = [
'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS',
'EXTERNAL_URLS',
'WIKIDATA_UNITS',
@ -16,11 +15,16 @@ __all__ = [
'LOCALES',
'ahmia_blacklist_loader',
'fetch_engine_descriptions',
'fetch_iso4217_from_user',
'fetch_name_from_iso4217',
]
import re
import unicodedata
import json
import sqlite3
from typing import Dict, List
from typing import Dict, List, Optional
from functools import lru_cache
from threading import local
from pathlib import Path
@ -44,7 +48,7 @@ def _get_connection(filename: str) -> sqlite3.Connection:
if connection is not None:
return connection
data_filename = str(data_dir / 'engine_descriptions.db')
data_filename = str(data_dir / filename)
# open database in read only mode
data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True)
# https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
@ -67,6 +71,44 @@ def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
return {result[0]: [result[1], result[2]] for result in res.fetchall()}
def _normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
@lru_cache(10)
def fetch_iso4217_from_user(name: str) -> Optional[str]:
    """Resolve a user supplied currency string to its ISO 4217 code.

    First tries to match *name* as an ISO 4217 code directly (case
    insensitive); otherwise normalizes it and searches the currency
    names.  Returns ``None`` when the name is unknown or ambiguous
    (i.e. maps to several distinct codes).
    """
    connection = _get_connection("currencies.db")
    # direct match on the ISO 4217 code itself
    row = connection.execute(
        "SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),)
    ).fetchone()
    if row:
        return row[0]
    # fall back to the normalized currency names
    normalized = _normalize_name(name)
    rows = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (normalized,))
    codes = {r[0] for r in rows.fetchall()}
    if len(codes) == 1:
        return codes.pop()
    # zero or several matching codes --> return nothing
    return None
@lru_cache(10)
def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
    """Return the name of currency *iso4217* in *language*.

    ``None`` is returned when the database holds no entry — or more
    than one entry — for that (code, language) pair.
    """
    cursor = _get_connection("currencies.db").execute(
        "SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language)
    )
    names = [row[0] for row in cursor.fetchall()]
    return names[0] if len(names) == 1 else None
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
@ -80,7 +122,6 @@ def ahmia_blacklist_loader():
return f.read().split()
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')

BIN
searx/data/currencies.db Normal file

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -10,7 +10,7 @@ from functools import partial
from flask_babel import gettext
from searx.data import OSM_KEYS_TAGS, CURRENCIES
from searx.data import OSM_KEYS_TAGS, fetch_name_from_iso4217
from searx.utils import searx_useragent
from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@ -449,9 +449,9 @@ def get_key_label(key_name, lang):
# https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':')
if len(currency) > 1:
o = CURRENCIES['iso4217'].get(currency[1])
if o:
return get_label(o, lang).lower()
label = fetch_name_from_iso4217(currency[1], lang)
if label:
return label
return currency[1]
labels = OSM_KEYS_TAGS['keys']

View file

@ -3,33 +3,14 @@
"""
import unicodedata
import re
from searx.data import CURRENCIES
from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
def normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
def name_to_iso4217(name):
name = normalize_name(name)
currency = CURRENCIES['names'].get(name, [name])
if isinstance(currency, str):
return currency
return currency[0]
def iso4217_to_name(iso4217, language):
return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines."""
@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
amount = float(amount_str)
except ValueError:
return None
from_currency = name_to_iso4217(from_currency.strip())
to_currency = name_to_iso4217(to_currency.strip())
from_currency = fetch_iso4217_from_user(from_currency.strip())
to_currency = fetch_iso4217_from_user(to_currency.strip())
if from_currency is None or to_currency is None:
return None
params['amount'] = amount
params['from'] = from_currency
params['to'] = to_currency
params['from_name'] = iso4217_to_name(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en')
params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
return params
def get_default_tests(self):

View file

@ -9,15 +9,20 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
# pylint: disable=invalid-name
import csv
import re
import unicodedata
import json
import sqlite3
from pathlib import Path
from searx.network import set_timeout_for_thread
from searx.locales import LOCALE_NAMES, locales_initialize
from searx.engines import wikidata, set_loggers
from searx.data import data_dir
DATA_FILE = data_dir / 'currencies.json'
DATABASE_FILE = data_dir / 'currencies.db'
CSV_FILE = data_dir / 'dumps' / 'currencies.csv'
set_loggers(wikidata, 'wikidata')
locales_initialize()
@ -75,57 +80,45 @@ def _normalize_name(name):
return name
def add_currency_name(db, name, iso4217, normalize_name=True):
db_names = db['names']
def add_entry(db, language, iso4217, name, normalize_name=True):
if normalize_name:
name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, [])
if iso4217 not in iso4217_set:
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
entry = (language, iso4217, name)
db.add(entry)
def wikidata_request_result_iterator(request):
    """Yield the SPARQL result bindings for *request*.

    The ``%LANGUAGES_SPARQL%`` placeholder is substituted before the
    query is sent; nothing is yielded when the query returns no result.
    """
    set_timeout_for_thread(60)
    query = request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL)
    response = wikidata.send_wikidata_query(query)
    if response is not None:
        yield from response['results']['bindings']
def fetch_db():
db = {
'names': {},
'iso4217': {},
}
db = set()
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value']
article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217)
add_currency_label(db, article_name, iso4217, article_lang)
add_entry(db, article_lang, iso4217, article_name)
for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value']
if 'label' in r:
label = r['label']['value']
label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217)
add_currency_label(db, label, iso4217, label_lang)
add_entry(db, label_lang, iso4217, label)
if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217)
add_entry(db, "", iso4217, r['alias']['value'])
if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
add_entry(db, "", iso4217, r['unicode']['value'], normalize_name=False)
if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
add_entry(db, "", iso4217, r['unit']['value'], normalize_name=False)
return db
@ -135,22 +128,33 @@ def main():
db = fetch_db()
# static
add_currency_name(db, "euro", 'EUR')
add_currency_name(db, "euros", 'EUR')
add_currency_name(db, "dollar", 'USD')
add_currency_name(db, "dollars", 'USD')
add_currency_name(db, "peso", 'MXN')
add_currency_name(db, "pesos", 'MXN')
add_entry(db, "", 'EUR', "euro")
add_entry(db, "", 'EUR', "euros")
add_entry(db, "", 'USD', "dollar")
add_entry(db, "", 'USD', "dollars")
add_entry(
db,
"",
'MXN',
"peso",
)
add_entry(db, "", 'MXN', "pesos")
# reduce memory usage:
# replace lists with one item by the item. see
# searx.search.processors.online_currency.name_to_iso4217
for name in db['names']:
if len(db['names'][name]) == 1:
db['names'][name] = db['names'][name][0]
with DATA_FILE.open('w', encoding='utf8') as f:
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False)
db = list(db)
db.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
Path(DATABASE_FILE).unlink(missing_ok=True)
with sqlite3.connect(DATABASE_FILE) as con:
cur = con.cursor()
cur.execute("CREATE TABLE currencies(language, iso4217, name)")
cur.executemany("INSERT INTO currencies VALUES(?, ?, ?)", db)
cur.execute("CREATE INDEX index_currencies_iso4217 ON currencies('iso4217')")
cur.execute("CREATE INDEX index_currencies_name ON currencies('name')")
con.commit()
with CSV_FILE.open('w', encoding='utf8') as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "iso4217", "name"])
for row in db:
w.writerow(row)
if __name__ == '__main__':