# SPDX-License-Identifier: AGPL-3.0-or-later """Google (News) For detailed description of the *REST-full* API see: `Query Parameter Definitions`_. Not all parameters can be appied, e.g. num_ (the number of search results to return) is ignored. .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _num: https://developers.google.com/custom-search/docs/xml_results#numsp """ # pylint: disable=invalid-name, missing-function-docstring import binascii import re from urllib.parse import urlencode from base64 import b64decode from lxml import html from searx import logger from searx.utils import ( eval_xpath, eval_xpath_list, eval_xpath_getindex, extract_text, ) # pylint: disable=unused-import from searx.engines.google import ( supported_languages_url, _fetch_supported_languages, detect_google_sorry, ) # pylint: enable=unused-import from searx.engines.google import ( get_lang_country, filter_mapping, ) # about about = { "website": 'https://news.google.com', "wikidata_id": 'Q12020', "official_api_documentation": None, "use_official_api": False, "require_api_key": False, "results": 'HTML', } logger = logger.getChild('google news') # compared to other google engines google-news has a different time range # support. The time range is included in the search term. time_range_dict = { 'day': 'when:1d', 'week': 'when:7d', 'month': 'when:1m', 'year': 'when:1y', } # engine dependent config categories = ['news'] paging = False language_support = True use_locale_domain = True time_range_support = True safesearch = True # not really, but it is not generated by google def request(query, params): """Google-News search request""" language, country, lang_country = get_lang_country( # pylint: disable=undefined-variable params, supported_languages, language_aliases ) subdomain = 'news.google.com' if params['time_range']: # in time_range_dict: query += ' ' + time_range_dict[params['time_range']] query_url = 'https://'+ subdomain + '/search' + "?" + urlencode({ 'q': query, 'hl': language, 'lr': "lang_" + language, 'ie': "utf8", 'oe': "utf8", 'ceid' : "%s:%s" % (country, language), 'gl' : country, }) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) params['url'] = query_url logger.debug("query_url --> %s", query_url) # en-US,en;q=0.8,en;q=0.5 params['headers']['Accept-Language'] = ( lang_country + ',' + language + ';q=0.8,' + language + ';q=0.5' ) logger.debug("HTTP header Accept-Language --> %s", params['headers']['Accept-Language']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) # hl=en redirect to hl=en-US / en-CA ... params['soft_max_redirects'] = 1 #params['google_subdomain'] = subdomain return params def response(resp): """Get response from google's search request""" results = [] detect_google_sorry(resp) # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) for result in eval_xpath_list(dom, '//div[@class="xrnccd"]'): # The first tag in the
contains the link to the # article The href attribute of the is a google internal link, # we can't use. The real link is hidden in the jslog attribute: # # jslog = eval_xpath_getindex(result, './article/a/@jslog', 0) url = re.findall('http[^;]*', jslog) if url: url = url[0] else: # The real URL is base64 encoded in the json attribute: # jslog="95014; 5:W251bGwsbnVsbCxudW...giXQ==; track:click" jslog = jslog.split(";")[1].split(':')[1].strip() try: padding = (4 -(len(jslog) % 4)) * "=" jslog = b64decode(jslog + padding) except binascii.Error: # URL cant be read, skip this result continue # now we have : b'[null, ... null,"https://www.cnn.com/.../index.html"]' url = re.findall('http[^;"]*', str(jslog))[0] # the first

tag in the
contains the title of the link title = extract_text(eval_xpath(result, './article/h3[1]')) # the first
tag in the
contains the content of the link content = extract_text(eval_xpath(result, './article/div[1]')) # the second
tag contains origin publisher and the publishing date pub_date = extract_text(eval_xpath(result, './article/div[2]//time')) pub_origin = extract_text(eval_xpath(result, './article/div[2]//a')) pub_info = [] if pub_origin: pub_info.append(pub_origin) if pub_date: # The pub_date is mostly a string like 'yesertday', not a real # timezone date or time. Therefore we can't use publishedDate. pub_info.append(pub_date) pub_info = ', '.join(pub_info) if pub_info: content = pub_info + ': ' + content # The image URL is located in a preceding sibling tag, e.g.: # "https://lh3.googleusercontent.com/DjhQh7DMszk.....z=-p-h100-w100" # These URL are long but not personalized (double checked via tor). img_src = extract_text(result.xpath('preceding-sibling::a/figure/img/@src')) results.append({ 'url': url, 'title': title, 'content': content, 'img_src': img_src, }) # return results return results