From a95b607273cd6d3ee7ec0c04190e8660003e732d Mon Sep 17 00:00:00 2001
From: Markus Heiser <markus.heiser@darmarit.de>
Date: Sat, 4 May 2024 08:45:42 +0200
Subject: [PATCH] [fix] startpage engine: XPath expressions adapted for new
 HTML layout

Startpage has changed its HTML layout: classes like ``w-gl__result__main`` no
longer exist, and the result items have been slightly changed in their
structure.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
---
 searx/engines/startpage.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index aa594f0dc..d538a22e4 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -142,9 +142,6 @@ search_url = base_url + '/sp/search'
 # specific xpath variables
 # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
 # not ads: div[@class="result"] are the direct childs of div[@id="results"]
-results_xpath = '//div[@class="w-gl__result__main"]'
-link_xpath = './/a[@class="w-gl__result-title result-link"]'
-content_xpath = './/p[@class="w-gl__description"]'
 search_form_xpath = '//form[@id="search"]'
 """XPath of Startpage's origin search form
 
@@ -334,8 +331,8 @@ def _response_cat_web(dom):
     results = []
 
     # parse results
-    for result in eval_xpath(dom, results_xpath):
-        links = eval_xpath(result, link_xpath)
+    for result in eval_xpath(dom, '//div[@class="w-gl"]/div[contains(@class, "result")]'):
+        links = eval_xpath(result, './/a[contains(@class, "result-title result-link")]')
         if not links:
             continue
         link = links[0]
@@ -349,12 +346,9 @@ def _response_cat_web(dom):
         if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
             continue
 
-        title = extract_text(link)
-
-        if eval_xpath(result, content_xpath):
-            content: str = extract_text(eval_xpath(result, content_xpath))  # type: ignore
-        else:
-            content = ''
+        title = extract_text(eval_xpath(link, 'h2'))
+        content = eval_xpath(result, './/p[contains(@class, "description")]')
+        content = extract_text(content, allow_none=True) or ''
 
         published_date = None