[mod] Google: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Google engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-10-08 11:32:08 +02:00
parent dba8977b09
commit f78f908383
6 changed files with 1366 additions and 17 deletions

File diff suppressed because it is too large Load diff

View file

@ -29,6 +29,9 @@ from urllib.parse import urlencode
from lxml import html
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
@ -373,3 +376,87 @@ def _fetch_supported_languages(resp):
ret_val[code] = {"name": name}
return ret_val
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albanien (sq)
'AZ', # Aserbaidschan (az)
'BD', # Bangladesch (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Äthiopien (am)
'GE', # Georgien (ka, os)
'GL', # Grönland (kl)
'KH', # Kambodscha (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # Nordmazedonien (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolei (mn)
'MV', # Malediven (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tadschikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Usbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel
engine_traits.data_type = 'supported_languages' # deprecated
import babel
import babel.languages
from searx import network
from searx.locales import language_tag, region_tag, get_offical_locales
resp = network.get('https://www.google.com/preferences')
if not resp.ok:
print("ERROR: response from Google is not OK.")
dom = html.fromstring(resp.text)
lang_map = {'no': 'nb'}
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
eng_lang = x.get("value").split('_')[-1]
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = 'country' + eng_country

View file

@ -23,7 +23,7 @@ from searx.engines.google import (
)
# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages
from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
# pylint: enable=unused-import

View file

@ -28,6 +28,7 @@ from searx.utils import (
# pylint: disable=unused-import
from searx.engines.google import (
fetch_traits,
supported_languages_url,
_fetch_supported_languages,
)

View file

@ -31,6 +31,7 @@ from searx.engines.google import (
# pylint: disable=unused-import
from searx.engines.google import (
fetch_traits,
supported_languages_url,
_fetch_supported_languages,
)

View file

@ -38,7 +38,7 @@ from searx.engines.google import (
)
# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages
from searx.engines.google import supported_languages_url, _fetch_supported_languages, fetch_traits
# pylint: enable=unused-import