From 1540891561d9b24c960c239981f35eaf380879c4 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 5 Jul 2022 22:02:29 +0200 Subject: [PATCH] [fix] engine tineye: handle 422 response of not supported img format Closes: https://github.com/searxng/searxng/issues/1449 Signed-off-by: Markus Heiser --- searx/engines/tineye.py | 178 +++++++++++++++++++++++++++++++++------- 1 file changed, 150 insertions(+), 28 deletions(-) diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py index fe5b60393..6c5ff134c 100644 --- a/searx/engines/tineye.py +++ b/searx/engines/tineye.py @@ -17,6 +17,7 @@ billion images `[tineye.com] `_. from urllib.parse import urlencode from datetime import datetime +from flask_babel import gettext about = { "website": 'https://tineye.com', @@ -28,20 +29,41 @@ about = { } engine_type = 'online_url_search' +""":py:obj:`searx.search.processors.online_url_search`""" + categories = ['general'] paging = True safesearch = False base_url = 'https://tineye.com' search_string = '/result_json/?page={page}&{query}' +FORMAT_NOT_SUPPORTED = gettext( + "Could not read that image url. This may be due to an unsupported file" + " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP." +) +"""TinEye error message""" + +NO_SIGNATURE_ERROR = gettext( + "The image is too simple to find matches. TinEye requires a basic level of" + " visual detail to successfully identify matches." 
+) +"""TinEye error message""" + +DOWNLOAD_ERROR = gettext("The image could not be downloaded.") +"""TinEye error message""" + def request(query, params): + """Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`.""" + + params['raise_for_httperror'] = False if params['search_urls']['data:image']: query = params['search_urls']['data:image'] elif params['search_urls']['http']: query = params['search_urls']['http'] + logger.debug("query URL: %s", query) query = urlencode({'url': query}) # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py @@ -59,45 +81,145 @@ def request(query, params): return params +def parse_tineye_match(match_json): + """Takes parsed JSON from the API server and turns it into a :py:obj:`dict` + object. + + Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__ + + - `image_url`, link to the result image. + - `domain`, domain this result was found on. + - `score`, a number (0 to 100) that indicates how closely the images match. + - `width`, image width in pixels. + - `height`, image height in pixels. + - `size`, image area in pixels. + - `format`, image format. + - `filesize`, image size in bytes. + - `overlay`, overlay URL. + - `tags`, whether this match belongs to a collection or stock domain. + + - `backlinks`, a list of Backlink objects pointing to the original websites + and image URLs. List items are instances of :py:obj:`dict`, (`Backlink + <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__): + + - `url`, the image URL to the image. + - `backlink`, the original website URL. + - `crawl_date`, the date the image was crawled. 
+ + """ + + # HINT: there exists an alternative backlink dict in the domains list / e.g.:: + # + # match_json['domains'][0]['backlinks'] + + backlinks = [] + if "backlinks" in match_json: + + for backlink_json in match_json["backlinks"]: + if not isinstance(backlink_json, dict): + continue + + crawl_date = backlink_json.get("crawl_date") + if crawl_date: + crawl_date = datetime.fromisoformat(crawl_date[:-3]) + else: + crawl_date = datetime.min + + backlinks.append( + { + 'url': backlink_json.get("url"), + 'backlink': backlink_json.get("backlink"), + 'crawl_date': crawl_date, + 'image_name': backlink_json.get("image_name"), + } + ) + + return { + 'image_url': match_json.get("image_url"), + 'domain': match_json.get("domain"), + 'score': match_json.get("score"), + 'width': match_json.get("width"), + 'height': match_json.get("height"), + 'size': match_json.get("size"), + 'image_format': match_json.get("format"), + 'filesize': match_json.get("filesize"), + 'overlay': match_json.get("overlay"), + 'tags': match_json.get("tags"), + 'backlinks': backlinks, + } + + def response(resp): + """Parse HTTP response from TinEye.""" results = [] - # Define wanted results - json_data = resp.json() - number_of_results = json_data['num_matches'] + try: + json_data = resp.json() + except Exception as exc: # pylint: disable=broad-except + msg = "can't parse JSON response // %s" % exc + logger.error(msg) + json_data = {'error': msg} - for i in json_data['matches']: - image_format = i['format'] - width = i['width'] - height = i['height'] - thumbnail_src = i['image_url'] - backlink = i['domains'][0]['backlinks'][0] - url = backlink['backlink'] - source = backlink['url'] - title = backlink['image_name'] - img_src = backlink['url'] + # handle error codes from Tineye - # Get and convert published date - api_date = backlink['crawl_date'][:-3] - publishedDate = datetime.fromisoformat(api_date) + if resp.is_error: + if resp.status_code in (400, 422): - # Append results + message = 'HTTP status: 
%s' % resp.status_code + error = json_data.get('error') + s_key = json_data.get('suggestions', {}).get('key', '') + + if error and s_key: + message = "%s (%s)" % (error, s_key) + elif error: + message = error + + if s_key == "Invalid image URL": + # test https://docs.searxng.org/_static/searxng-wordmark.svg + message = FORMAT_NOT_SUPPORTED + elif s_key == 'NO_SIGNATURE_ERROR': + # test https://pngimg.com/uploads/dot/dot_PNG4.png + message = NO_SIGNATURE_ERROR + elif s_key == 'Download Error': + # test https://notexists + message = DOWNLOAD_ERROR + + # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023 + # results.append({'answer': message}) + logger.error(message) + + return results + + resp.raise_for_status() + + # append results from matches + + for match_json in json_data['matches']: + + tineye_match = parse_tineye_match(match_json) + if not tineye_match['backlinks']: + continue + + backlink = tineye_match['backlinks'][0] results.append( { 'template': 'images.html', - 'url': url, - 'thumbnail_src': thumbnail_src, - 'source': source, - 'title': title, - 'img_src': img_src, - 'format': image_format, - 'widht': width, - 'height': height, - 'publishedDate': publishedDate, + 'url': backlink['backlink'], + 'thumbnail_src': tineye_match['image_url'], + 'source': backlink['url'], + 'title': backlink['image_name'], + 'img_src': backlink['url'], + 'format': tineye_match['image_format'], + 'widht': tineye_match['width'], + 'height': tineye_match['height'], + 'publishedDate': backlink['crawl_date'], } ) - # Append number of results - results.append({'number_of_results': number_of_results}) + # append number of results + + number_of_results = json_data.get('num_matches') + if number_of_results: + results.append({'number_of_results': number_of_results}) return results