From 16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sat, 24 Sep 2022 14:26:07 +0200 Subject: [PATCH] [mod] core.ac.uk: try multiple ways to get url If the result has no entry in `urls`, fall back, in this order, to: * the DOI * the downloadUrl * the ARK id --- searx/engines/core.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/searx/engines/core.py b/searx/engines/core.py index c95fa1d28..a997343f2 100644 --- a/searx/engines/core.py +++ b/searx/engines/core.py @@ -41,7 +41,6 @@ def request(query, params): ) params['url'] = base_url + search_path - logger.debug("query_url --> %s", params['url']) return params @@ -51,17 +50,39 @@ def response(resp): for result in json_data['data']: source = result['_source'] - if not source['urls']: + url = None + if source.get('urls'): + url = source['urls'][0].replace('http://', 'https://', 1) + + if url is None and source.get('doi'): + # use the DOI reference + url = 'https://doi.org/' + source['doi'] + + if url is None and source.get('downloadUrl'): + # use the downloadUrl + url = source['downloadUrl'] + + if url is None and source.get('identifiers'): + # try to find an ark id, see + # https://www.wikidata.org/wiki/Property:P8091 + # and https://en.wikipedia.org/wiki/Archival_Resource_Key + arkids = [ + identifier[5:] # 5 is the length of "ark:/" + for identifier in source.get('identifiers') + if isinstance(identifier, str) and identifier.startswith('ark:/') + ] + if len(arkids) > 0: + url = 'https://n2t.net/' + arkids[0] + + if url is None: continue time = source['publishedDate'] or source['depositedDate'] if time: publishedDate = datetime.fromtimestamp(time / 1000) - journals = [] - if source['journals']: - for j in source['journals']: - journals.append(j['title']) + # sometimes the 'title' is None / filter None values + journals = [j['title'] for j in (source.get('journals') or []) if j['title']] publisher = source['publisher'] if publisher: @@ -71,8 +92,8 
@@ def response(resp): { 'template': 'paper.html', 'title': source['title'], - 'url': source['urls'][0].replace('http://', 'https://', 1), - 'content': source['description'], + 'url': url, + 'content': source['description'] or '', # 'comments': '', 'tags': source['topics'], 'publishedDate': publishedDate,