Merge pull request #683 from return42/fix-doc

Document & Pylint scripts in searxng_extra/update
This commit is contained in:
Martin Fischer 2022-01-05 19:46:00 +01:00 committed by GitHub
commit 160f3e022e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 206 additions and 62 deletions

View file

@ -1,14 +1,15 @@
.. _searxng_extra:
======================================================
Tooling box ``searxng_extra`` for developers and users
======================================================
=============================
Tooling box ``searxng_extra``
=============================
In the folder :origin:`searxng_extra/` we maintain some tools useful for
In the folder :origin:`searxng_extra/` we maintain some tools useful for CI and
developers.
.. toctree::
:maxdepth: 2
:caption: Contents
update
standalone_searx.py

View file

@ -0,0 +1,88 @@
=========================
``searxng_extra/update/``
=========================
:origin:`[source] <searxng_extra/update/__init__.py>`
Scripts to update static data in :origin:`searx/data/`
.. _update_ahmia_blacklist.py:
``update_ahmia_blacklist.py``
=============================
:origin:`[source] <searxng_extra/update/update_ahmia_blacklist.py>`
.. automodule:: searxng_extra.update.update_ahmia_blacklist
:members:
``update_currencies.py``
========================
:origin:`[source] <searxng_extra/update/update_currencies.py>`
.. automodule:: searxng_extra.update.update_currencies
:members:
``update_engine_descriptions.py``
=================================
:origin:`[source] <searxng_extra/update/update_engine_descriptions.py>`
.. automodule:: searxng_extra.update.update_engine_descriptions
:members:
``update_external_bangs.py``
============================
:origin:`[source] <searxng_extra/update/update_external_bangs.py>`
.. automodule:: searxng_extra.update.update_external_bangs
:members:
``update_firefox_version.py``
=============================
:origin:`[source] <searxng_extra/update/update_firefox_version.py>`
.. automodule:: searxng_extra.update.update_firefox_version
:members:
``update_languages.py``
=======================
:origin:`[source] <searxng_extra/update/update_languages.py>`
.. automodule:: searxng_extra.update.update_languages
:members:
``update_osm_keys_tags.py``
===========================
:origin:`[source] <searxng_extra/update/update_osm_keys_tags.py>`
.. automodule:: searxng_extra.update.update_osm_keys_tags
:members:
``update_pygments.py``
======================
:origin:`[source] <searxng_extra/update/update_pygments.py>`
.. automodule:: searxng_extra.update.update_pygments
:members:
``update_wikidata_units.py``
============================
:origin:`[source] <searxng_extra/update/update_wikidata_units.py>`
.. automodule:: searxng_extra.update.update_wikidata_units
:members:

View file

@ -1,10 +1,15 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script saves `Ahmia's blacklist`_ for onion sites.
# This script saves Ahmia's blacklist for onion sites.
# More info in https://ahmia.fi/blacklist/
Output file: :origin:`searx/data/ahmia_blacklist.txt` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).
.. _Ahmia's blacklist: https://ahmia.fi/blacklist/
"""
# set path
from os.path import join
import requests
@ -17,15 +22,14 @@ def fetch_ahmia_blacklist():
resp = requests.get(URL, timeout=3.0)
if resp.status_code != 200:
raise Exception("Error fetching Ahmia blacklist, HTTP code " + resp.status_code)
else:
blacklist = resp.text.split()
return blacklist
return resp.text.split()
def get_ahmia_blacklist_filename():
    """Return the path of ``ahmia_blacklist.txt`` inside the ``data`` folder
    located under ``searx_dir``."""
    data_folder = join(searx_dir, "data")
    return join(data_folder, "ahmia_blacklist.txt")
blacklist = fetch_ahmia_blacklist()
with open(get_ahmia_blacklist_filename(), "w") as f:
f.write('\n'.join(blacklist))
if __name__ == '__main__':
blacklist = fetch_ahmia_blacklist()
with open(get_ahmia_blacklist_filename(), "w", encoding='utf-8') as f:
f.write('\n'.join(blacklist))

View file

@ -1,13 +1,22 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch currencies from :origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/currencies.json` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=invalid-name
import re
import unicodedata
import json
# set path
from sys import path
from os.path import realpath, dirname, join
from os.path import join
from searx import searx_dir
from searx.locales import LOCALE_NAMES

View file

@ -1,6 +1,16 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch website description from websites and from
:origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/engine_descriptions.json`.
"""
# pylint: disable=invalid-name, global-statement
import json
from urllib.parse import urlparse
from os.path import join
@ -102,7 +112,7 @@ def get_wikipedia_summary(lang, pageid):
response.raise_for_status()
api_result = json.loads(response.text)
return api_result.get('extract')
except:
except Exception: # pylint: disable=broad-except
return None
@ -134,7 +144,7 @@ def get_website_description(url, lang1, lang2=None):
try:
response = searx.network.get(url, headers=headers, timeout=10)
response.raise_for_status()
except Exception:
except Exception: # pylint: disable=broad-except
return (None, None)
try:

View file

@ -1,17 +1,20 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Update searx/data/external_bangs.json using the duckduckgo bangs.
"""Update :origin:`searx/data/external_bangs.json` using the duckduckgo bangs
(:origin:`CI Update data ... <.github/workflows/data-update.yml>`).
https://duckduckgo.com/newbang loads:
https://duckduckgo.com/newbang loads
* a javascript which provides the bang version ( https://duckduckgo.com/bv1.js )
* a JSON file which contains the bangs ( https://duckduckgo.com/bang.v260.js for example )
This script loads the javascript, then the bangs.
The javascript URL may change in the future ( for example https://duckduckgo.com/bv2.js ),
but most probably it will require updating RE_BANG_VERSION
The javascript URL may change in the future ( for example
https://duckduckgo.com/bv2.js ), but most probably it will require updating
RE_BANG_VERSION
"""
# pylint: disable=C0116

View file

@ -1,21 +1,30 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Fetch firefox useragent signatures
Output file: :origin:`searx/data/useragents.json` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
import json
import requests
import re
from os.path import dirname, join
from os.path import join
from urllib.parse import urlparse, urljoin
from distutils.version import LooseVersion, StrictVersion
from distutils.version import LooseVersion
import requests
from lxml import html
from searx import searx_dir
URL = 'https://ftp.mozilla.org/pub/firefox/releases/'
RELEASE_PATH = '/pub/firefox/releases/'
NORMAL_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?$')
# BETA_REGEX = re.compile('.*[0-9]b([0-9\-a-z]+)$')
# ESR_REGEX = re.compile('^[0-9]+\.[0-9](\.[0-9])?esr$')
NORMAL_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?$')
# BETA_REGEX = re.compile(r'.*[0-9]b([0-9\-a-z]+)$')
# ESR_REGEX = re.compile(r'^[0-9]+\.[0-9](\.[0-9])?esr$')
#
useragents = {
@ -32,20 +41,19 @@ def fetch_firefox_versions():
resp = requests.get(URL, timeout=2.0)
if resp.status_code != 200:
raise Exception("Error fetching firefox versions, HTTP code " + resp.status_code)
else:
dom = html.fromstring(resp.text)
versions = []
dom = html.fromstring(resp.text)
versions = []
for link in dom.xpath('//a/@href'):
url = urlparse(urljoin(URL, link))
path = url.path
if path.startswith(RELEASE_PATH):
version = path[len(RELEASE_PATH) : -1]
if NORMAL_REGEX.match(version):
versions.append(LooseVersion(version))
for link in dom.xpath('//a/@href'):
url = urlparse(urljoin(URL, link))
path = url.path
if path.startswith(RELEASE_PATH):
version = path[len(RELEASE_PATH) : -1]
if NORMAL_REGEX.match(version):
versions.append(LooseVersion(version))
list.sort(versions, reverse=True)
return versions
list.sort(versions, reverse=True)
return versions
def fetch_firefox_last_versions():
@ -66,6 +74,7 @@ def get_useragents_filename():
return join(join(searx_dir, "data"), "useragents.json")
useragents["versions"] = fetch_firefox_last_versions()
with open(get_useragents_filename(), "w") as f:
json.dump(useragents, f, indent=4, ensure_ascii=False)
if __name__ == '__main__':
useragents["versions"] = fetch_firefox_last_versions()
with open(get_useragents_filename(), "w", encoding='utf-8') as f:
json.dump(useragents, f, indent=4, ensure_ascii=False)

View file

@ -1,9 +1,17 @@
#!/usr/bin/env python
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
# This script generates languages.py from intersecting each engine's supported languages.
#
# Output files: searx/data/engines_languages.json and searx/languages.py
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script generates languages.py from intersecting each engine's supported
languages.
Output files: :origin:`searx/data/engines_languages.json` and
:origin:`searx/languages.py` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=invalid-name
import json
from pathlib import Path
@ -24,7 +32,7 @@ languages_file = Path(searx_dir) / 'languages.py'
def fetch_supported_languages():
set_timeout_for_thread(10.0)
engines_languages = dict()
engines_languages = {}
names = list(engines)
names.sort()
@ -32,7 +40,7 @@ def fetch_supported_languages():
if hasattr(engines[engine_name], 'fetch_supported_languages'):
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name))
if type(engines_languages[engine_name]) == list:
if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck
engines_languages[engine_name] = sorted(engines_languages[engine_name])
print("fetched languages from %s engines" % len(engines_languages))
@ -55,7 +63,7 @@ def get_locale(lang_code):
# Join all language lists.
def join_language_lists(engines_languages):
language_list = dict()
language_list = {}
for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]:
@ -91,7 +99,7 @@ def join_language_lists(engines_languages):
'name': language_name,
'english_name': english_name,
'counter': set(),
'countries': dict(),
'countries': {},
}
# add language with country if not in list
@ -119,6 +127,7 @@ def join_language_lists(engines_languages):
def filter_language_list(all_languages):
min_engines_per_lang = 13
min_engines_per_country = 7
# pylint: disable=consider-using-dict-items, consider-iterating-dictionary
main_engines = [
engine_name
for engine_name in engines.keys()
@ -138,7 +147,7 @@ def filter_language_list(all_languages):
}
def _copy_lang_data(lang, country_name=None):
new_dict = dict()
new_dict = {}
new_dict['name'] = all_languages[lang]['name']
new_dict['english_name'] = all_languages[lang]['english_name']
if country_name:
@ -146,10 +155,10 @@ def filter_language_list(all_languages):
return new_dict
# for each language get country codes supported by most engines or at least one country code
filtered_languages_with_countries = dict()
filtered_languages_with_countries = {}
for lang, lang_data in filtered_languages.items():
countries = lang_data['countries']
filtered_countries = dict()
filtered_countries = {}
# get language's country codes with enough supported engines
for lang_country, country_data in countries.items():
@ -211,7 +220,7 @@ def write_languages_file(languages):
language_codes = tuple(language_codes)
with open(languages_file, 'w') as new_file:
with open(languages_file, 'w', encoding='utf-8') as new_file:
file_content = "{file_headers} {language_codes},\n)\n".format(
# fmt: off
file_headers = '\n'.join(file_headers),
@ -224,7 +233,7 @@ def write_languages_file(languages):
if __name__ == "__main__":
load_engines(settings['engines'])
engines_languages = fetch_supported_languages()
all_languages = join_language_lists(engines_languages)
filtered_languages = filter_language_list(all_languages)
write_languages_file(filtered_languages)
_engines_languages = fetch_supported_languages()
_all_languages = join_language_lists(_engines_languages)
_filtered_languages = filter_language_list(_all_languages)
write_languages_file(_filtered_languages)

View file

@ -5,7 +5,10 @@
To get the i18n names, the script uses `Wikidata Query Service`_ instead of for
example `OSM tags API`_ (sidenote: the actual change log from
map.atownsend.org.uk_ might be useful to normalize OSM tags)
map.atownsend.org.uk_ might be useful to normalize OSM tags).
Output file: :origin:`searx/data/osm_keys_tags` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
.. _Wikidata Query Service: https://query.wikidata.org/
.. _OSM tags API: https://taginfo.openstreetmap.org/taginfo/apidoc

View file

@ -3,6 +3,13 @@
# lint: pylint
# pylint: disable=missing-module-docstring
"""Fetch units from :origin:`searx/engines/wikidata.py` engine.
Output file: :origin:`searx/data/wikidata_units.json` (:origin:`CI Update data
... <.github/workflows/data-update.yml>`).
"""
import json
import collections
@ -54,5 +61,6 @@ def get_wikidata_units_filename():
return join(join(searx_dir, "data"), "wikidata_units.json")
with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
json.dump(get_data(), f, indent=4, ensure_ascii=False)
if __name__ == '__main__':
with open(get_wikidata_units_filename(), 'w', encoding="utf8") as f:
json.dump(get_data(), f, indent=4, ensure_ascii=False)