From f096d68ec6c8ae3efc6656181570791115746d5d Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 21 Jun 2021 16:09:16 +0200 Subject: [PATCH 1/3] [mod] google engine: reduce mobile UI parameters to what is needed Reverse engineering shows that not all of the parameters used by google's mobile UI (aka "more results" button) are needed [1]. [1] https://github.com/searxng/searxng/pull/160#issuecomment-865013625 Signed-off-by: Markus Heiser --- searx/engines/google.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 6ba164814..284209523 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -270,8 +270,7 @@ def request(query, params): additional_parameters = {} if use_mobile_ui: additional_parameters = { - 'asearch': "arc", - 'async': 'arc_id:srp_510,ffilt:all,ve_name:MoreResultsContainer,next_id:srp_5,use_ac:true,_id:arc-srp_510,_pms:qs,_fmt:pc' # pylint: disable=line-too-long + 'async': 'use_ac:true,_fmt:pc', } # https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium From 05e90f2e57043d8ca425e43ed288d45027bdf0ec Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 21 Jun 2021 16:46:08 +0200 Subject: [PATCH 2/3] [fix] google answers: normalize space of the answers. 
Signed-off-by: Markus Heiser --- searx/engines/google.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/searx/engines/google.py b/searx/engines/google.py index 284209523..25adfe4b5 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -311,9 +311,10 @@ def response(resp): dom = html.fromstring(resp.text) # results --> answer - answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') - if answer: - results.append({'answer': ' '.join(answer)}) + answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]') + if answer_list: + answer_list = [_.xpath("normalize-space()") for _ in answer_list] + results.append({'answer': ' '.join(answer_list)}) else: logger.debug("did not find 'answer'") From 0ef6aa51265c41a77d504f858c3c0c13eff448c2 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Mon, 21 Jun 2021 18:15:40 +0200 Subject: [PATCH 3/3] [docs] add documentation from the sources of the google engines Signed-off-by: Markus Heiser --- docs/src/searx.engines.google.rst | 55 +++++++++++++++++++++++++++++++ searx/engines/google.py | 35 +++++++++++++++----- searx/engines/google_images.py | 13 +++----- searx/engines/google_news.py | 9 ++--- searx/engines/google_videos.py | 11 ++----- searx/settings.yml | 1 + 6 files changed, 91 insertions(+), 33 deletions(-) create mode 100644 docs/src/searx.engines.google.rst diff --git a/docs/src/searx.engines.google.rst b/docs/src/searx.engines.google.rst new file mode 100644 index 000000000..2d10b5eea --- /dev/null +++ b/docs/src/searx.engines.google.rst @@ -0,0 +1,55 @@ +.. _google engines: + +============== +Google Engines +============== + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + + +.. _google API: + +google API +========== + +.. _Query Parameter Definitions: + https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions + +For detailed description of the *REST-full* API see: `Query Parameter +Definitions`_. 
Not all parameters can be applied and some engines are *special* +(e.g. :ref:`google news engine`). + +.. _google web engine: + +Google WEB +========== + +.. automodule:: searx.engines.google + :members: + +.. _google images engine: + +Google Images +============= + +.. automodule:: searx.engines.google_images + :members: + +.. _google videos engine: + +Google Videos +============= + +.. automodule:: searx.engines.google_videos + :members: + +.. _google news engine: + +Google News +=========== + +.. automodule:: searx.engines.google_news + :members: diff --git a/searx/engines/google.py b/searx/engines/google.py index 25adfe4b5..2ad9b4ca8 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -1,12 +1,28 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Web) +"""This is the implementation of the google WEB engine. Some of these +implementations are shared by other engines: -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. +- :ref:`google images engine` +- :ref:`google news engine` +- :ref:`google videos engine` + +The google WEB engine itself has a special setup option: + +.. code:: yaml + + - name: google + ... + use_mobile_ui: true + +``use_mobile_ui``: (default: ``true``) + Enables to use *mobile endpoint* to bypass the google blocking (see + :issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More + results` is not affected by Google rate limiting and we can still do requests + while actively blocked by the original Google search. By activating + ``use_mobile_ui`` this behavior is simulated by adding the parameter + ``async=use_ac:true,_fmt:pc`` to the :py:func:`request`. -.. 
_Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring @@ -137,8 +153,9 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): """Composing various language properties for the google engines. - This function is called by the various google engines (google itself, - google-images, -news, -scholar, -videos). + This function is called by the various google engines (:ref:`google web + engine`, :ref:`google images engine`, :ref:`google news engine` and + :ref:`google videos engine`). :param dict param: request parameters of the engine @@ -146,7 +163,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): :py:obj:`ENGINES_LANGUAGES[engine-name] ` :param dict lang_list: custom aliases for non standard language codes - (used when calling :py:func:`searx.utils.match_language) + (used when calling :py:func:`searx.utils.match_language`) :param bool supported_any_language: When a language is not specified, the language interpretation is left up to Google to decide how the search @@ -159,7 +176,7 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): Py-Dictionary with the key/value pairs: language: - Return value from :py:func:`searx.utils.match_language + Return value from :py:func:`searx.utils.match_language` country: The country code (e.g. US, AT, CA, FR, DE ..) diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index e7382a6fe..c17227a64 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -1,19 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Images) +"""This is the implementation of the google images engine. -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. - -.. 
_admonition:: Content-Security-Policy (CSP) +.. admonition:: Content-Security-Policy (CSP) This engine needs to allow images from the `data URLs`_ (prefixed with the - ``data:` scheme).:: + ``data:`` scheme):: - Header set Content-Security-Policy "img-src 'self' data: ;" + Header set Content-Security-Policy "img-src 'self' data: ;" -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index c1c97b700..e6d50855e 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -1,16 +1,11 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (News) - -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. Not all parameters can be appied: +"""This is the implementation of the google news engine. The google news API +ignores some parameters from the common :ref:`google API`: - num_ : the number of search results is ignored - save_ : is ignored / Google-News results are always *SafeSearch* -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions - .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp .. _save: https://developers.google.com/custom-search/docs/xml_results#safesp diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py index c57db4e63..3747e85a5 100644 --- a/searx/engines/google_videos.py +++ b/searx/engines/google_videos.py @@ -1,19 +1,14 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -"""Google (Video) +"""This is the implementation of the google videos engine. -For detailed description of the *REST-full* API see: `Query Parameter -Definitions`_. Not all parameters can be appied. - -.. 
_admonition:: Content-Security-Policy (CSP) +.. admonition:: Content-Security-Policy (CSP) This engine needs to allow images from the `data URLs`_ (prefixed with the - ``data:` scheme).:: + ``data:`` scheme):: Header set Content-Security-Policy "img-src 'self' data: ;" -.. _Query Parameter Definitions: - https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs diff --git a/searx/settings.yml b/searx/settings.yml index 0b5802cae..32450b0b8 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -583,6 +583,7 @@ engines: - name: google engine: google shortcut: go + # see https://searxng.github.io/searxng/src/searx.engines.google.html#module-searx.engines.google use_mobile_ui: true # additional_tests: # android: *test_android