From 460bbe5b8114cb5782a6f3e8cb644bbd29829b64 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sun, 6 Aug 2023 19:35:56 +0200 Subject: [PATCH] [mod] implement brave (WEB) engine to replace XPath configuration Signed-off-by: Markus Heiser --- docs/dev/engines/online/brave.rst | 13 ++ searx/engines/brave.py | 270 ++++++++++++++++++++++++++---- searx/settings.yml | 48 ++---- 3 files changed, 263 insertions(+), 68 deletions(-) create mode 100644 docs/dev/engines/online/brave.rst diff --git a/docs/dev/engines/online/brave.rst b/docs/dev/engines/online/brave.rst new file mode 100644 index 000000000..a1c589b9d --- /dev/null +++ b/docs/dev/engines/online/brave.rst @@ -0,0 +1,13 @@ +.. _brave engine: + +============= +Brave Engines +============= + +.. contents:: Contents + :depth: 2 + :local: + :backlinks: entry + +.. automodule:: searx.engines.brave + :members: diff --git a/searx/engines/brave.py b/searx/engines/brave.py index deb109b8e..c32d0f39e 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -1,10 +1,56 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" - Brave (General, news, videos, images) -""" +# lint: pylint +"""Brave supports the categories listed in :py:obj:`brave_category` (General, +news, videos, images). The support of :py:obj:`paging` and :py:obj:`time range +<time_range_support>` is limited (see remarks). + +Configured ``brave`` engines: + +.. code:: yaml + + - name: brave + engine: brave + ... + brave_category: search + time_range_support: true + paging: true + + - name: brave.images + engine: brave + ... + brave_category: images + + - name: brave.videos + engine: brave + ... + brave_category: videos + + - name: brave.news + engine: brave + ... 
+ brave_category: news + + +Implementations +=============== + +""" +# pylint: disable=fixme + +from urllib.parse import ( + urlencode, + urlparse, + parse_qs, +) -from urllib.parse import urlencode import chompjs +from lxml import html + +from searx.utils import ( + extract_text, + eval_xpath_list, + eval_xpath_getindex, +) about = { "website": 'https://search.brave.com/', @@ -14,41 +60,87 @@ about = { "require_api_key": False, "results": 'HTML', } + base_url = "https://search.brave.com/" +categories = [] +brave_category = 'search' +"""Brave supports common web search as well as video, image and news search. + +- ``search``: Common WEB search +- ``videos``: search for videos +- ``images``: search for images +- ``news``: search for news +""" + +brave_spellcheck = False +"""Brave supports some kind of spell checking. When activated, Brave tries to +fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``. In +the UI of Brave the user gets warned about this; since we cannot warn the user +in SearXNG, the spellchecking is disabled by default. 
+""" + +send_accept_language_header = True paging = False -categories = ['images', 'videos', 'news'] # images, videos, news +"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI +category All).""" + +safesearch = True +safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'} # cookie: safesearch=off + +time_range_support = False +"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI +category All).""" + +time_range_map = { + 'day': 'pd', + 'week': 'pw', + 'month': 'pm', + 'year': 'py', +} def request(query, params): + + # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787 + params['headers']['Accept-Encoding'] = 'gzip, deflate' + args = { 'q': query, - 'spellcheck': 1, } - params["url"] = f"{base_url}{categories[0]}?{urlencode(args)}" + if brave_spellcheck: + args['spellcheck'] = '1' + if brave_category == 'search': + if params.get('pageno', 1) - 1: + args['offset'] = params.get('pageno', 1) - 1 + if time_range_map.get(params['time_range']): + args['tf'] = time_range_map.get(params['time_range']) -def get_video_results(json_data): - results = [] + params["url"] = f"{base_url}{brave_category}?{urlencode(args)}" - for result in json_data: - results.append( - { - 'template': 'videos.html', - 'url': result['url'], - 'thumbnail_src': result['thumbnail']['src'], - 'img_src': result['properties']['url'], - 'content': result['description'], - 'title': result['title'], - 'source': result['source'], - 'duration': result['video']['duration'], - } - ) + # set preferences in cookie + params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off') - return results + # ToDo: we need a fetch_traits(..) 
+ implementation / the ui_lang values of Brave are + # limited and the country handling has its quirks + + eng_locale = params.get('searxng_locale') + params['cookies']['useLocation'] = '0' # the useLocation is IP based, we use 'country' + params['cookies']['summarizer'] = '0' + + if not eng_locale or eng_locale == 'all': + params['cookies']['country'] = 'all' # country=all + else: + params['cookies']['country'] = eng_locale.split('-')[-1].lower() + params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower() + + # logger.debug("cookies %s", params['cookies']) def response(resp): - results = [] + + if brave_category == 'search': + return _parse_search(resp) datastr = "" for line in resp.text.split("\n"): @@ -57,10 +149,81 @@ def response(resp): break json_data = chompjs.parse_js_object(datastr) - json_resp = json_data[1]['data']['body']['response'] - if categories[0] == 'news': + + if brave_category == 'news': json_resp = json_resp['news'] + return _parse_news(json_resp) + + if brave_category == 'images': + return _parse_images(json_resp) + if brave_category == 'videos': + return _parse_videos(json_resp) + + return [] + + +def _parse_search(resp): + + result_list = [] + dom = html.fromstring(resp.text) + + answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None) + if answer_tag: + result_list.append({'answer': extract_text(answer_tag)}) + + # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]' + xpath_results = '//div[contains(@class, "snippet")]' + + for result in eval_xpath_list(dom, xpath_results): + + url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None) + title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None) + if not (url and title_tag): + continue + + content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='') + img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='') + + item = { + 
+ 'url': url, + 'title': extract_text(title_tag), + 'content': extract_text(content_tag), + 'img_src': img_src, + } + + video_tag = eval_xpath_getindex( + result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None + ) + if video_tag: + + # In my tests a video tag in the WEB search was most often not a + # video, except the ones from YouTube. + + iframe_src = _get_iframe_src(url) + if iframe_src: + item['iframe_src'] = iframe_src + item['template'] = 'videos.html' + item['thumbnail'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') + else: + item['img_src'] = eval_xpath_getindex(video_tag, './/img/@src', 0, default='') + + result_list.append(item) + + return result_list + + +def _get_iframe_src(url): + parsed_url = urlparse(url) + if parsed_url.path == '/watch' and parsed_url.query: + video_id = parse_qs(parsed_url.query).get('v', []) # type: ignore + if video_id: + return 'https://www.youtube-nocookie.com/embed/' + video_id[0] # type: ignore + return None + + +def _parse_news(json_resp): + result_list = [] for result in json_resp["results"]: item = { @@ -68,18 +231,53 @@ def response(resp): 'title': result['title'], 'content': result['description'], } + if result['thumbnail'] != "null": + item['img_src'] = result['thumbnail']['src'] + result_list.append(item) + + return result_list + + +def _parse_images(json_resp): + result_list = [] + + for result in json_resp["results"]: + item = { + 'url': result['url'], + 'title': result['title'], + 'content': result['description'], + 'template': 'images.html', + 'img_format': result['properties']['format'], + 'source': result['source'], + 'img_src': result['properties']['url'], + } + result_list.append(item) + + return result_list + + +def _parse_videos(json_resp): + result_list = [] + + for result in json_resp["results"]: + + url = result['url'] + item = { + 'url': url, + 'title': result['title'], + 'content': result['description'], + 'template': 'videos.html', + 'length': 
result['video']['duration'], + 'duration': result['video']['duration'], + } + if result['thumbnail'] != "null": item['thumbnail'] = result['thumbnail']['src'] - if categories[0] == 'images': - item['template'] = 'images.html' - item['img_format'] = result['properties']['format'] - item['source'] = result['source'] - item['img_src'] = result['properties']['url'] - elif categories[0] == 'videos': - item['template'] = 'videos.html' - item['length'] = result['video']['duration'] + iframe_src = _get_iframe_src(url) + if iframe_src: + item['iframe_src'] = iframe_src - results.append(item) + result_list.append(item) - return results + return result_list diff --git a/searx/settings.yml b/searx/settings.yml index 87bf381eb..04ec3e466 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1816,50 +1816,34 @@ engines: timeout: 9.0 - name: brave - shortcut: brave - engine: xpath - paging: true + engine: brave + shortcut: br time_range_support: true - first_page_num: 0 - time_range_url: "&tf={time_range_val}" - search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range} - url_xpath: //a[@class="result-header"]/@href - title_xpath: //span[@class="snippet-title"] - content_xpath: //p[1][@class="snippet-description"] - suggestion_xpath: //div[@class="text-gray h6"]/a - time_range_map: - day: 'pd' - week: 'pw' - month: 'pm' - year: 'py' + paging: true categories: [general, web] - disabled: true - headers: - Accept-Encoding: gzip, deflate - about: - website: https://brave.com/search/ - wikidata_id: Q107355971 - use_official_api: false - require_api_key: false - results: HTML + brave_category: search + # brave_spellcheck: true - name: brave.images - shortcut: braveimg engine: brave - categories: images - disabled: true + network: brave + shortcut: brimg + categories: [images, web] + brave_category: images - name: brave.videos - shortcut: bravevid engine: brave - categories: videos - disabled: true + network: brave + shortcut: brvid + 
categories: [videos, web] + brave_category: videos - name: brave.news - shortcut: bravenews engine: brave + network: brave + shortcut: brnews categories: news - disabled: true + brave_category: news - name: petalsearch shortcut: pts