commit 7fc6ac283e
Alexandre Flament 2024-05-07 13:59:11 +02:00, committed by GitHub
GPG key ID: B5690EEEBB952194 (no known key found for this signature in database)
19 changed files with 86697 additions and 79765 deletions

@@ -7,21 +7,29 @@
__all__ = [
'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS',
'EXTERNAL_URLS',
'WIKIDATA_UNITS',
'EXTERNAL_BANGS',
'OSM_KEYS_TAGS',
'ENGINE_DESCRIPTIONS',
'LOCALES',
'ahmia_blacklist_loader',
'fetch_engine_descriptions',
'fetch_iso4217_from_user',
'fetch_name_from_iso4217',
'fetch_osm_key_label',
]
import re
import unicodedata
import json
import sqlite3
from typing import Dict, List, Optional
from functools import lru_cache
from threading import local
from pathlib import Path
data_dir = Path(__file__).parent
data_connection_local = local()
def _load(filename):
@@ -29,6 +37,115 @@ def _load(filename):
return json.load(f)
def _get_connection(filename: str) -> sqlite3.Connection:
"""Return a read only SQLite connection to filename.
The filename is relative to searx/data
Multiple calls to this function in the same thread,
already return the same connection.
"""
connection = data_connection_local.__dict__.get(filename)
if connection is not None:
return connection
data_filename = str(data_dir / filename)
# open database in read only mode
data_connection = sqlite3.connect(f'file:{data_filename}?mode=ro', uri=True)
# https://phiresky.github.io/blog/2020/sqlite-performance-tuning/
data_connection.executescript(
"""
pragma temp_store = memory;
pragma mmap_size = 30000000000;
"""
)
data_connection_local.__dict__[filename] = data_connection
return data_connection
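A minimal sketch (not part of the commit) of the per-thread caching this helper provides; the direct import of the private helper is for illustration only:

import threading
from searx.data import _get_connection  # illustrative import of the private helper

def worker():
    # within a single thread, repeated calls return the identical connection
    a = _get_connection("currencies.db")
    b = _get_connection("currencies.db")
    assert a is b

t = threading.Thread(target=worker)
t.start()
t.join()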
def fetch_engine_descriptions(language) -> Dict[str, List[str]]:
"""Return engine description and source for each engine name."""
res = _get_connection("engine_descriptions.db").execute(
"SELECT engine, description, source FROM engine_descriptions WHERE language=?", (language,)
)
return {result[0]: [result[1], result[2]] for result in res.fetchall()}
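Illustrative use of the new accessor (not part of the commit); the engine name and text below are hypothetical and depend on the shipped engine_descriptions.db:

descriptions = fetch_engine_descriptions('en')
# e.g. {'duckduckgo': ['privacy-focused search engine ...', 'wikipedia'], ...}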
def _normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
@lru_cache(10)
def fetch_iso4217_from_user(name: str) -> Optional[str]:
connection = _get_connection("currencies.db")
# try the ISO 4217 code first
res = connection.execute("SELECT iso4217 FROM currencies WHERE lower(iso4217)=? LIMIT 1", (name.lower(),))
result = res.fetchone()
if result:
return result[0]
# try the currency names
name = _normalize_name(name)
res = connection.execute("SELECT iso4217 FROM currencies WHERE name=?", (name,))
result = list(set(result[0] for result in res.fetchall()))
if len(result) == 1:
return result[0]
# ambiguity --> return nothing
return None
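Illustrative calls (not part of the commit) showing the two-step lookup; actual results depend on currencies.db:

fetch_iso4217_from_user('EUR')    # exact ISO 4217 code -> 'EUR'
fetch_iso4217_from_user('Euros')  # normalized to 'euro', single match -> 'EUR'
fetch_iso4217_from_user('crown')  # hypothetically maps to several codes -> None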
@lru_cache(10)
def fetch_name_from_iso4217(iso4217: str, language: str) -> Optional[str]:
res = _get_connection("currencies.db").execute(
"SELECT name FROM currencies WHERE iso4217=? AND language=?", (iso4217, language)
)
result = [result[0] for result in res.fetchall()]
if len(result) == 1:
return result[0]
return None
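The reverse lookup, sketched with hypothetical data (not part of the commit):

fetch_name_from_iso4217('EUR', 'en')  # hypothetically -> 'euro'
fetch_name_from_iso4217('EUR', 'xx')  # no row for that language -> None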
@lru_cache(100)
def fetch_osm_key_label(key_name: str, language: str) -> Optional[str]:
if key_name.startswith('currency:'):
# currency:EUR --> get the name from the CURRENCIES variable
# see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
# and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
# but there is also currency=EUR (currently not handled)
# https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':')
if len(currency) > 1:
label = fetch_name_from_iso4217(currency[1], language)
if label:
return label
return currency[1]
language = language.lower()
language_short = language.split('-')[0]
res = _get_connection("osm_keys_tags.db").execute(
"SELECT language, label FROM osm_keys WHERE name=? AND language in (?, ?, 'en')",
(key_name, language, language_short),
)
result = {result[0]: result[1] for result in res.fetchall()}
return result.get(language) or result.get(language_short) or result.get('en')
@lru_cache(100)
def fetch_osm_tag_label(tag_key: str, tag_value: str, language: str) -> Optional[str]:
language = language.lower()
language_short = language.split('-')[0]
res = _get_connection("osm_keys_tags.db").execute(
"SELECT language, label FROM osm_tags WHERE tag_key=? AND tag_value=? AND language in (?, ?, 'en')",
(tag_key, tag_value, language, language_short),
)
result = {result[0]: result[1] for result in res.fetchall()}
return result.get(language) or result.get(language_short) or result.get('en')
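Both lookups resolve the most specific available translation; a sketch (not part of the commit) with a hypothetical locale:

# lookup order for 'zh-HK': 'zh-hk', then 'zh', then 'en'
label = fetch_osm_key_label('website', 'zh-HK')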
def ahmia_blacklist_loader():
"""Load data from `ahmia_blacklist.txt` and return a list of MD5 values of onion
names. The MD5 values are fetched by::
@@ -42,12 +159,9 @@ def ahmia_blacklist_loader():
return f.read().split()
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json')
WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')
LOCALES = _load('locales.json')

BIN searx/data/currencies.db (new file; binary content not shown)

File diff suppressed because it is too large.

@@ -0,0 +1,3 @@
Dumps of the SQLite files in ``searx.data``.
These files are not used by SearXNG, they are here for reference.

File diff suppressed because it is too large.

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large.

searx/data/dumps/osm_tags.csv (new file; 70384 additions)

File diff suppressed because it is too large.

Binary file not shown.

File diff suppressed because one or more lines are too long

BIN searx/data/osm_keys_tags.db (new file; binary content not shown)

File diff suppressed because it is too large.

@@ -6,7 +6,7 @@ from time import time
from urllib.parse import urlencode
from searx.network import get as http_get
from searx.engines.openstreetmap import get_key_label
from searx.data import fetch_osm_key_label
about = {
"website": 'https://www.apple.com/maps/',
@@ -72,7 +72,7 @@ def response(resp):
telephone = result['telephone']
links.append(
{
'label': get_key_label('phone', user_language),
'label': fetch_osm_key_label('phone', user_language),
'url': 'tel:' + telephone,
'url_label': telephone,
}
@@ -81,7 +81,7 @@
url = result['urls'][0]
links.append(
{
'label': get_key_label('website', user_language),
'label': fetch_osm_key_label('website', user_language),
'url': url,
'url_label': url,
}

@@ -10,7 +10,7 @@ from functools import partial
from flask_babel import gettext
from searx.data import OSM_KEYS_TAGS, CURRENCIES
from searx.data import fetch_osm_tag_label, fetch_osm_key_label
from searx.utils import searx_useragent
from searx.external_urls import get_external_url
from searx.engines.wikidata import send_wikidata_query, sparql_string_escape, get_thumbnail
@@ -187,14 +187,14 @@ def response(resp):
'template': 'map.html',
'title': title,
'address': address,
'address_label': get_key_label('addr', user_language),
'address_label': fetch_osm_key_label('addr', user_language),
'url': url,
'osm': osm,
'geojson': geojson,
'img_src': img_src,
'links': links,
'data': data,
'type': get_tag_label(result.get('category'), result.get('type', ''), user_language),
'type': fetch_osm_tag_label(result.get('category'), result.get('type', ''), user_language),
'type_icon': result.get('icon'),
'content': '',
'longitude': result['lon'],
@@ -367,7 +367,7 @@ def get_links(result, user_language):
url_label = result.get('wikidata', {}).get('itemLabel') or url_label
links.append(
{
'label': get_key_label(k, user_language),
'label': fetch_osm_key_label(k, user_language),
'url': url,
'url_label': url_label,
}
@@ -389,7 +389,7 @@ def get_data(result, user_language, ignore_keys):
continue
if get_key_rank(k) is None:
continue
k_label = get_key_label(k, user_language)
k_label = fetch_osm_key_label(k, user_language)
if k_label:
data.append(
{
@@ -412,51 +412,3 @@ def get_key_rank(k):
# "payment:*" in KEY_ORDER matches "payment:cash", "payment:debit card", etc...
key_rank = KEY_RANKS.get(k.split(':')[0] + ':*')
return key_rank
def get_label(labels, lang):
"""Get label from labels in OSM_KEYS_TAGS
in OSM_KEYS_TAGS, labels have key == '*'
"""
tag_label = labels.get(lang.lower())
if tag_label is None:
# example: if 'zh-hk' is not found, check 'zh'
tag_label = labels.get(lang.split('-')[0])
if tag_label is None and lang != 'en':
# example: if 'zh' is not found, check 'en'
tag_label = labels.get('en')
if tag_label is None and len(labels.values()) > 0:
# example: if still not found, use the first entry
tag_label = labels.values()[0]
return tag_label
def get_tag_label(tag_category, tag_name, lang):
"""Get tag label from OSM_KEYS_TAGS"""
tag_name = '' if tag_name is None else tag_name
tag_labels = OSM_KEYS_TAGS['tags'].get(tag_category, {}).get(tag_name, {})
return get_label(tag_labels, lang)
def get_key_label(key_name, lang):
"""Get key label from OSM_KEYS_TAGS"""
if key_name.startswith('currency:'):
# currency:EUR --> get the name from the CURRENCIES variable
# see https://wiki.openstreetmap.org/wiki/Key%3Acurrency
# and for example https://taginfo.openstreetmap.org/keys/currency:EUR#values
# but there is also currency=EUR (currently not handled)
# https://taginfo.openstreetmap.org/keys/currency#values
currency = key_name.split(':')
if len(currency) > 1:
o = CURRENCIES['iso4217'].get(currency[1])
if o:
return get_label(o, lang).lower()
return currency[1]
labels = OSM_KEYS_TAGS['keys']
for k in key_name.split(':') + ['*']:
labels = labels.get(k)
if labels is None:
return None
return get_label(labels, lang)

@@ -3,33 +3,14 @@
"""
import unicodedata
import re
from searx.data import CURRENCIES
from searx.data import fetch_iso4217_from_user, fetch_name_from_iso4217
from .online import OnlineProcessor
parser_re = re.compile('.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I)
def normalize_name(name):
name = name.lower().replace('-', ' ').rstrip('s')
name = re.sub(' +', ' ', name)
return unicodedata.normalize('NFKD', name).lower()
def name_to_iso4217(name):
name = normalize_name(name)
currency = CURRENCIES['names'].get(name, [name])
if isinstance(currency, str):
return currency
return currency[0]
def iso4217_to_name(iso4217, language):
return CURRENCIES['iso4217'].get(iso4217, {}).get(language, iso4217)
class OnlineCurrencyProcessor(OnlineProcessor):
"""Processor class used by ``online_currency`` engines."""
@@ -52,14 +33,17 @@ class OnlineCurrencyProcessor(OnlineProcessor):
amount = float(amount_str)
except ValueError:
return None
from_currency = name_to_iso4217(from_currency.strip())
to_currency = name_to_iso4217(to_currency.strip())
from_currency = fetch_iso4217_from_user(from_currency.strip())
to_currency = fetch_iso4217_from_user(to_currency.strip())
if from_currency is None or to_currency is None:
return None
params['amount'] = amount
params['from'] = from_currency
params['to'] = to_currency
params['from_name'] = iso4217_to_name(from_currency, 'en')
params['to_name'] = iso4217_to_name(to_currency, 'en')
params['from_name'] = fetch_name_from_iso4217(from_currency, 'en')
params['to_name'] = fetch_name_from_iso4217(to_currency, 'en')
return params
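A sketch (not part of the commit) of how a query flows through parser_re and the new accessors; the expected values assume currencies.db contains the usual entries:

m = parser_re.match('1 euro to usd')
amount_str, from_currency, to_currency = m.groups()  # ('1', 'euro', 'usd')
fetch_iso4217_from_user(from_currency.strip())       # expected -> 'EUR'
fetch_iso4217_from_user(to_currency.strip())         # expected -> 'USD'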
def get_default_tests(self):

@@ -58,7 +58,7 @@ from searx import infopage
from searx import limiter
from searx.botdetection import link_token
from searx.data import ENGINE_DESCRIPTIONS
from searx.data import fetch_engine_descriptions
from searx.results import Timing
from searx.settings_defaults import OUTPUT_FORMATS
from searx.settings_loader import get_default_settings_path
@@ -1102,17 +1102,10 @@ def image_proxy():
@app.route('/engine_descriptions.json', methods=['GET'])
def engine_descriptions():
locale = get_locale().split('_')[0]
result = ENGINE_DESCRIPTIONS['en'].copy()
result = fetch_engine_descriptions('en')
if locale != 'en':
for engine, description in ENGINE_DESCRIPTIONS.get(locale, {}).items():
for engine, description in fetch_engine_descriptions(locale).items():
result[engine] = description
for engine, description in result.items():
if len(description) == 2 and description[1] == 'ref':
ref_engine, ref_lang = description[0].split(':')
description = ENGINE_DESCRIPTIONS[ref_lang][ref_engine]
if isinstance(description, str):
description = [description, 'wikipedia']
result[engine] = description
# overwrite by about:description (from settings)
for engine_name, engine_mod in engines.items():
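The per-engine loop above is equivalent to a dict merge; a sketch (not part of the commit) of the new behavior:

result = fetch_engine_descriptions('en')
if locale != 'en':
    result.update(fetch_engine_descriptions(locale))  # locale entries override 'en'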

@@ -9,15 +9,20 @@ Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
# pylint: disable=invalid-name
import csv
import re
import unicodedata
import json
import sqlite3
from pathlib import Path
from searx.network import set_timeout_for_thread
from searx.locales import LOCALE_NAMES, locales_initialize
from searx.engines import wikidata, set_loggers
from searx.data import data_dir
DATA_FILE = data_dir / 'currencies.json'
DATABASE_FILE = data_dir / 'currencies.db'
CSV_FILE = data_dir / 'dumps' / 'currencies.csv'
set_loggers(wikidata, 'wikidata')
locales_initialize()
@@ -75,57 +80,45 @@ def _normalize_name(name):
return name
def add_currency_name(db, name, iso4217, normalize_name=True):
db_names = db['names']
def add_entry(db, language, iso4217, name, normalize_name=True):
if normalize_name:
name = _normalize_name(name)
iso4217_set = db_names.setdefault(name, [])
if iso4217 not in iso4217_set:
iso4217_set.insert(0, iso4217)
def add_currency_label(db, label, iso4217, language):
labels = db['iso4217'].setdefault(iso4217, {})
labels[language] = label
entry = (language, iso4217, name)
db.add(entry)
def wikidata_request_result_iterator(request):
set_timeout_for_thread(60)
result = wikidata.send_wikidata_query(request.replace('%LANGUAGES_SPARQL%', LANGUAGES_SPARQL))
if result is not None:
yield from result['results']['bindings']
def fetch_db():
db = {
'names': {},
'iso4217': {},
}
db = set()
for r in wikidata_request_result_iterator(SPARQL_WIKIPEDIA_NAMES_REQUEST):
iso4217 = r['iso4217']['value']
article_name = r['article_name']['value']
article_lang = r['article_name']['xml:lang']
add_currency_name(db, article_name, iso4217)
add_currency_label(db, article_name, iso4217, article_lang)
add_entry(db, article_lang, iso4217, article_name)
for r in wikidata_request_result_iterator(SARQL_REQUEST):
iso4217 = r['iso4217']['value']
if 'label' in r:
label = r['label']['value']
label_lang = r['label']['xml:lang']
add_currency_name(db, label, iso4217)
add_currency_label(db, label, iso4217, label_lang)
add_entry(db, label_lang, iso4217, label)
if 'alias' in r:
add_currency_name(db, r['alias']['value'], iso4217)
add_entry(db, "", iso4217, r['alias']['value'])
if 'unicode' in r:
add_currency_name(db, r['unicode']['value'], iso4217, normalize_name=False)
add_entry(db, "", iso4217, r['unicode']['value'], normalize_name=False)
if 'unit' in r:
add_currency_name(db, r['unit']['value'], iso4217, normalize_name=False)
add_entry(db, "", iso4217, r['unit']['value'], normalize_name=False)
return db
@@ -135,22 +128,33 @@ def main():
db = fetch_db()
# static
add_currency_name(db, "euro", 'EUR')
add_currency_name(db, "euros", 'EUR')
add_currency_name(db, "dollar", 'USD')
add_currency_name(db, "dollars", 'USD')
add_currency_name(db, "peso", 'MXN')
add_currency_name(db, "pesos", 'MXN')
add_entry(db, "", 'EUR', "euro")
add_entry(db, "", 'EUR', "euros")
add_entry(db, "", 'USD', "dollar")
add_entry(db, "", 'USD', "dollars")
add_entry(
db,
"",
'MXN',
"peso",
)
add_entry(db, "", 'MXN', "pesos")
# reduce memory usage:
# replace lists with one item by the item. see
# searx.search.processors.online_currency.name_to_iso4217
for name in db['names']:
if len(db['names'][name]) == 1:
db['names'][name] = db['names'][name][0]
with DATA_FILE.open('w', encoding='utf8') as f:
json.dump(db, f, indent=4, sort_keys=True, ensure_ascii=False)
db = list(db)
db.sort(key=lambda entry: (entry[0], entry[1], entry[2]))
Path(DATABASE_FILE).unlink(missing_ok=True)
with sqlite3.connect(DATABASE_FILE) as con:
cur = con.cursor()
cur.execute("CREATE TABLE currencies(language, iso4217, name)")
cur.executemany("INSERT INTO currencies VALUES(?, ?, ?)", db)
cur.execute("CREATE INDEX index_currencies_iso4217 ON currencies('iso4217')")
cur.execute("CREATE INDEX index_currencies_name ON currencies('name')")
con.commit()
with CSV_FILE.open('w', encoding='utf8') as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "iso4217", "name"])
for row in db:
w.writerow(row)
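A quick read-only sanity check of the generated file (not part of the commit; path assumed relative to the repository root):

import sqlite3
con = sqlite3.connect('file:searx/data/currencies.db?mode=ro', uri=True)
print(con.execute("SELECT count(*) FROM currencies").fetchone()[0])
for row in con.execute("SELECT language, iso4217, name FROM currencies LIMIT 3"):
    print(row)
con.close()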
if __name__ == '__main__':

@@ -9,22 +9,24 @@ Output file: :origin:`searx/data/engine_descriptions.json`.
# pylint: disable=invalid-name, global-statement
import csv
import json
import sqlite3
from urllib.parse import urlparse
from os.path import join
from pathlib import Path
from lxml.html import fromstring
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, searx_useragent
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
import searx.network
from searx.data import data_dir
DATA_FILE = data_dir / 'engine_descriptions.json'
DATABASE_FILE = data_dir / 'engine_descriptions.db'
CSV_FILE = data_dir / 'dumps' / 'engine_descriptions.csv'
set_loggers(wikidata, 'wikidata')
locales_initialize()
@@ -323,37 +325,32 @@ def fetch_website_descriptions():
fetch_website_description(engine_name, website)
def get_engine_descriptions_filename():
return join(join(searx_dir, "data"), "engine_descriptions.json")
def get_output():
def write_db():
"""
From descriptions[engine][language] = [description, source]
To
Erase and write the SQLite database searx/data/engine_descriptions.db :
* create one table engine_descriptions
* dump write all the values
* output[language][engine] = description_and_source
* description_and_source can be:
* [description, source]
* description (if source = "wikipedia")
* [f"engine:lang", "ref"] (reference to another existing description)
Make a JSON dump of the values into engine_descriptions.json
"""
output = {locale: {} for locale in LOCALE_NAMES}
seen_descriptions = {}
for engine_name, lang_descriptions in descriptions.items():
for language, description in lang_descriptions.items():
if description[0] in seen_descriptions:
ref = seen_descriptions[description[0]]
description = [f'{ref[0]}:{ref[1]}', 'ref']
else:
seen_descriptions[description[0]] = (engine_name, language)
if description[1] == 'wikipedia':
description = description[0]
output.setdefault(language, {}).setdefault(engine_name, description)
return output
data = [
(language, engine_name, description[0], description[1])
for engine_name, lang_descriptions in descriptions.items()
for language, description in lang_descriptions.items()
]
data.sort(key=lambda item: (item[0], item[1]))
Path(DATABASE_FILE).unlink(missing_ok=True)
with sqlite3.connect(DATABASE_FILE) as con:
cur = con.cursor()
cur.execute("CREATE TABLE engine_descriptions(language, engine, description, source)")
cur.executemany("INSERT INTO engine_descriptions VALUES(?, ?, ?, ?)", data)
cur.execute("CREATE INDEX index_engine_descriptions ON engine_descriptions('language')")
con.commit()
with CSV_FILE.open('w', encoding="utf8") as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["language", "engine", "description", "source"])
for row in data:
w.writerow(row)
def main():
@@ -361,10 +358,7 @@ def main():
fetch_wikidata_descriptions()
fetch_wikipedia_descriptions()
fetch_website_descriptions()
output = get_output()
with DATA_FILE.open('w', encoding='utf8') as f:
f.write(json.dumps(output, indent=1, separators=(',', ':'), sort_keys=True, ensure_ascii=False))
write_db()
if __name__ == "__main__":

@@ -42,8 +42,9 @@ Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
"""
import json
import collections
import csv
import sqlite3
from pathlib import Path
from searx.network import set_timeout_for_thread
from searx.engines import wikidata, set_loggers
@@ -51,7 +52,9 @@ from searx.sxng_locales import sxng_locales
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
from searx.data import data_dir
DATA_FILE = data_dir / 'osm_keys_tags.json'
DATABASE_FILE = data_dir / 'osm_keys_tags.db'
CSV_KEYS_FILE = data_dir / 'dumps' / 'osm_keys.csv'
CSV_TAGS_FILE = data_dir / 'dumps' / 'osm_tags.csv'
set_loggers(wikidata, 'wikidata')
@@ -78,42 +81,39 @@ ORDER BY ?key ?item ?itemLabel
LANGUAGES = [l[0].lower() for l in sxng_locales]
PRESET_KEYS = {
('wikidata',): {'en': 'Wikidata'},
('wikipedia',): {'en': 'Wikipedia'},
('email',): {'en': 'Email'},
('facebook',): {'en': 'Facebook'},
('fax',): {'en': 'Fax'},
('internet_access', 'ssid'): {'en': 'Wi-Fi'},
}
PRESET_KEYS = [
["wikidata", "en", "Wikidata"],
["wikipedia", "en", "Wikipedia"],
["email", "en", "email"],
["facebook", "en", "facebook"],
["fax", "en", "Fax"],
["internet_access:ssid", "en", "Wi-Fi"],
]
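Each preset row becomes one row of the osm_keys table, so the runtime accessor should resolve it directly (illustrative, not part of the commit):

fetch_osm_key_label('wikidata', 'en')  # expected -> 'Wikidata'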
INCLUDED_KEYS = {('addr',)}
def get_preset_keys():
results = collections.OrderedDict()
for keys, value in PRESET_KEYS.items():
r = results
for k in keys:
r = r.setdefault(k, {})
r.setdefault('*', value)
return results
def get_keys():
results = get_preset_keys()
result_keys = set()
results = PRESET_KEYS.copy()
response = wikidata.send_wikidata_query(SPARQL_KEYS_REQUEST)
for key in response['results']['bindings']:
keys = key['key']['value'].split(':')[1:]
label = key['itemLabel']['value'].lower()
lang = key['itemLabel']['xml:lang']
if lang not in LANGUAGES:
continue
if keys[0] == 'currency' and len(keys) > 1:
# special case in openstreetmap.py
continue
if keys[0] == 'contact' and len(keys) > 1:
# label for the key "contact.email" is "Email"
# whatever the language
r = results.setdefault('contact', {})
r[keys[1]] = {'*': {'en': keys[1]}}
if lang == "en":
# label for the key "contact.email" is "Email"
# whatever the language
results.append((":".join(keys), "en", keys[1]))
continue
if tuple(keys) in PRESET_KEYS:
# skip presets (already set above)
@@ -125,40 +125,46 @@ def get_keys():
):
# keep only keys that will be displayed by openstreetmap.py
continue
label = key['itemLabel']['value'].lower()
lang = key['itemLabel']['xml:lang']
r = results
for k in keys:
r = r.setdefault(k, {})
r = r.setdefault('*', {})
if lang in LANGUAGES:
r.setdefault(lang, label)
entry = (":".join(keys), lang, label)
entry_key = (entry[0], entry[1])
if entry_key not in result_keys:
results.append(entry)
result_keys.add(entry_key)
# special cases
results['delivery']['covid19']['*'].clear()
for k, v in results['delivery']['*'].items():
results['delivery']['covid19']['*'][k] = v + ' (COVID19)'
results = [entry for entry in results if entry[0] != 'delivery:covid19']
results.extend(
[['delivery:covid19', entry[1], entry[2] + ' (COVID19)'] for entry in results if entry[0] == 'delivery']
)
results['opening_hours']['covid19']['*'].clear()
for k, v in results['opening_hours']['*'].items():
results['opening_hours']['covid19']['*'][k] = v + ' (COVID19)'
results = [entry for entry in results if entry[0] != 'opening_hours:covid19']
results.extend(
[
['opening_hours:covid19', entry[1], entry[2] + ' (COVID19)']
for entry in results
if entry[0] == 'opening_hours'
]
)
return results
def get_tags():
results = collections.OrderedDict()
results = []
response = wikidata.send_wikidata_query(SPARQL_TAGS_REQUEST)
for tag in response['results']['bindings']:
tag_names = tag['tag']['value'].split(':')[1].split('=')
if len(tag_names) == 2:
tag_category, tag_type = tag_names
else:
tag_category, tag_type = tag_names[0], ''
try:
tag_key, tag_value = tag['tag']['value'].split('=')
if tag_key.startswith("Tag:"):
tag_key = tag_key[4:]
except ValueError:
print("ignore tag", tag['tag']['value'])
continue
label = tag['itemLabel']['value'].lower()
lang = tag['itemLabel']['xml:lang']
if lang in LANGUAGES:
results.setdefault(tag_category, {}).setdefault(tag_type, {}).setdefault(lang, label)
results.append((tag_key, tag_value, lang, label))
return results
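How the new code splits a Wikidata tag identifier, sketched with a hypothetical value (not part of the commit):

value = 'Tag:amenity=restaurant'
tag_key, tag_value = value.split('=')  # ('Tag:amenity', 'restaurant')
if tag_key.startswith('Tag:'):
    tag_key = tag_key[4:]              # 'amenity'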
@@ -206,9 +212,30 @@ def optimize_keys(data):
if __name__ == '__main__':
set_timeout_for_thread(60)
result = {
'keys': optimize_keys(get_keys()),
'tags': optimize_tags(get_tags()),
}
with DATA_FILE.open('w', encoding="utf8") as f:
json.dump(result, f, indent=4, sort_keys=True, ensure_ascii=False)
osm_keys = get_keys()
osm_tags = get_tags()
osm_keys.sort(key=lambda item: (item[0], item[1]))
osm_tags.sort(key=lambda item: (item[0], item[1]))
Path(DATABASE_FILE).unlink(missing_ok=True)
with sqlite3.connect(DATABASE_FILE) as con:
cur = con.cursor()
cur.execute("CREATE TABLE osm_keys(name, language, label)")
cur.executemany("INSERT INTO osm_keys VALUES(?, ?, ?)", osm_keys)
cur.execute("CREATE INDEX index_osm_keys ON osm_keys('name', 'language')")
cur.execute("CREATE TABLE osm_tags(tag_key, tag_value, language, label)")
cur.executemany("INSERT INTO osm_tags VALUES(?, ?, ?, ?)", osm_tags)
cur.execute("CREATE INDEX index_osm_tags ON osm_tags('tag_key', 'tag_value', 'language')")
con.commit()
with CSV_KEYS_FILE.open('w', encoding="utf8") as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["name", "language", "label"])
for row in osm_keys:
w.writerow(row)
with CSV_TAGS_FILE.open('w', encoding="utf8") as f:
w = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
w.writerow(["tag_key", "tag_value", "language", "label"])
for row in osm_tags:
w.writerow(row)
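The CSV dumps mirror the database tables and can be read back with the standard library (a sketch, not part of the commit; path assumed):

import csv
with open('searx/data/dumps/osm_tags.csv', encoding='utf8') as f:
    for row in csv.DictReader(f):
        # each row: {'tag_key': ..., 'tag_value': ..., 'language': ..., 'label': ...}
        print(row)
        break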