From a95b607273cd6d3ee7ec0c04190e8660003e732d Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 4 May 2024 08:45:42 +0200 Subject: [PATCH] [fix] startpage engine: XPath expressions adapted for new HTML layout Startpage has changed its HTML layout: classes like ``w-gl__result__main`` no longer exist and the result items have been slightly changed in their structure. Signed-off-by: Markus Heiser --- searx/engines/startpage.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index aa594f0dc..d538a22e4 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -142,9 +142,6 @@ search_url = base_url + '/sp/search' # specific xpath variables # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"] # not ads: div[@class="result"] are the direct childs of div[@id="results"] -results_xpath = '//div[@class="w-gl__result__main"]' -link_xpath = './/a[@class="w-gl__result-title result-link"]' -content_xpath = './/p[@class="w-gl__description"]' search_form_xpath = '//form[@id="search"]' """XPath of Startpage's origin search form @@ -334,8 +331,8 @@ def _response_cat_web(dom): results = [] # parse results - for result in eval_xpath(dom, results_xpath): - links = eval_xpath(result, link_xpath) + for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'): + links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]') if not links: continue link = links[0] @@ -349,12 +346,9 @@ def _response_cat_web(dom): if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue - title = extract_text(link) - - if eval_xpath(result, content_xpath): - content: str = extract_text(eval_xpath(result, content_xpath)) # type: ignore - else: - content = '' + title = extract_text(eval_xpath(link, 'h2')) + content = eval_xpath(result, './/p[contains(@class, "description")]') + content = 
extract_text(content, allow_none=True) or '' published_date = None