diff --git a/.gitignore b/.gitignore
index e56a575ab..b1286ea66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ setup.cfg
 *.pyc
 */*.pyc
 *~
+*.swp
 
 /node_modules
 
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 9cdca47b7..00be89412 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -142,6 +142,17 @@ def load_engine(engine_data):
         engine.stats['page_load_time'] = 0
         engine.stats['page_load_count'] = 0
 
+    # tor related settings
+    if settings['outgoing'].get('using_tor_proxy'):
+        # use onion url if using tor.
+        if hasattr(engine, 'onion_url'):
+            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
+    elif 'onions' in engine.categories:
+        # exclude onion engines if not using tor.
+        return None
+
+    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)
+
     for category_name in engine.categories:
         categories.setdefault(category_name, []).append(engine)
 
@@ -252,8 +263,9 @@ def get_engines_stats(preferences):
 
 
 def load_engines(engine_list):
-    global engines
+    global engines, engine_shortcuts
     engines.clear()
+    engine_shortcuts.clear()
     for engine_data in engine_list:
         engine = load_engine(engine_data)
         if engine is not None:
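For readers skimming the diff: the new block in load_engine() rewrites an engine's search_url and widens its timeout when Tor is enabled, and drops onion-only engines when it is not. A minimal sketch of that behaviour with made-up engine attributes (SimpleNamespace stands in for a real engine module; nothing below is part of the patch):

    from types import SimpleNamespace

    # Hypothetical engine module attributes, mirroring an onion-capable engine.
    engine = SimpleNamespace(
        categories=['general'],
        search_url='https://example.com/search?q={query}',
        onion_url='http://examplesearch.onion',   # made-up onion address
        search_path='/search?q={query}',
        timeout=3.0,
    )
    outgoing = {'using_tor_proxy': True, 'extra_proxy_timeout': 10.0}

    # Same logic as the patch: prefer the onion URL when Tor is in use ...
    if outgoing.get('using_tor_proxy') and hasattr(engine, 'onion_url'):
        engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
    # ... and widen the timeout to absorb the proxy's latency.
    engine.timeout += outgoing.get('extra_proxy_timeout', 0)

    print(engine.search_url)  # http://examplesearch.onion/search?q={query}
    print(engine.timeout)     # 13.0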
diff --git a/searx/engines/ahmia.py b/searx/engines/ahmia.py
new file mode 100644
index 000000000..d9fcc6ca7
--- /dev/null
+++ b/searx/engines/ahmia.py
@@ -0,0 +1,82 @@
+"""
+ Ahmia (Onions)
+
+ @website      http://msydqstlz2kzerdg.onion
+ @provide-api  no
+
+ @using-api    no
+ @results      HTML
+ @stable       no
+ @parse        url, title, content
+"""
+
+from urllib.parse import urlencode, urlparse, parse_qs
+from lxml.html import fromstring
+from searx.engines.xpath import extract_url, extract_text
+
+# engine config
+categories = ['onions']
+paging = True
+page_size = 10
+
+# search url
+search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
+time_range_support = True
+time_range_dict = {'day': 1,
+                   'week': 7,
+                   'month': 30}
+
+# xpaths
+results_xpath = '//li[@class="result"]'
+url_xpath = './h4/a/@href'
+title_xpath = './h4/a[1]'
+content_xpath = './/p[1]'
+correction_xpath = '//*[@id="didYouMean"]//a'
+number_of_results_xpath = '//*[@id="totalResults"]'
+
+
+def request(query, params):
+    params['url'] = search_url.format(query=urlencode({'q': query}))
+
+    if params['time_range'] in time_range_dict:
+        params['url'] += '&' + urlencode({'d': time_range_dict[params['time_range']]})
+
+    return params
+
+
+def response(resp):
+    results = []
+    dom = fromstring(resp.text)
+
+    # Ahmia returns every match on a single page; slice out only the requested page
+    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
+    all_results = dom.xpath(results_xpath)
+    trimmed_results = all_results[first_result_index:first_result_index + page_size]
+
+    # get results
+    for result in trimmed_results:
+        # unwrap Ahmia's redirect link and extract the result's actual url
+        raw_url = extract_url(result.xpath(url_xpath), search_url)
+        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
+
+        title = extract_text(result.xpath(title_xpath))
+        content = extract_text(result.xpath(content_xpath))
+
+        results.append({'url': cleaned_url,
+                        'title': title,
+                        'content': content,
+                        'is_onion': True})
+
+    # get spelling corrections
+    for correction in dom.xpath(correction_xpath):
+        results.append({'correction': extract_text(correction)})
+
+    # get number of results
+    number_of_results = dom.xpath(number_of_results_xpath)
+    if number_of_results:
+        try:
+            results.append({'number_of_results': int(extract_text(number_of_results))})
+        except ValueError:
+            pass
+
+    return results
diff --git a/searx/engines/not_evil.py b/searx/engines/not_evil.py
new file mode 100644
index 000000000..e84f153bd
--- /dev/null
+++ b/searx/engines/not_evil.py
@@ -0,0 +1,64 @@
+"""
+ not Evil (Onions)
+
+ @website     http://hss3uro2hsxfogfq.onion
+ @provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content
+"""
+
+from urllib.parse import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+
+# engine dependent config
+categories = ['onions']
+paging = True
+page_size = 20
+
+# search-url
+base_url = 'http://hss3uro2hsxfogfq.onion/'
+search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'
+
+# specific xpath variables
+results_xpath = '//*[@id="content"]/div/p'
+url_xpath = './span[1]'
+title_xpath = './a[1]'
+content_xpath = './text()'
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * page_size
+
+    params['url'] = base_url + search_url.format(pageno=offset,
+                                                 query=urlencode({'q': query}),
+                                                 page_size=page_size)
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    # needed because requests would otherwise guess a wrong encoding
+    resp.encoding = 'utf8'
+    dom = html.fromstring(resp.text)
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        url = extract_text(result.xpath(url_xpath)[0])
+        title = extract_text(result.xpath(title_xpath)[0])
+        content = extract_text(result.xpath(content_xpath))
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'is_onion': True})
+
+    return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index a269253d7..81c2747fb 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -10,6 +10,8 @@ thumbnail_xpath = False
 paging = False
 suggestion_xpath = ''
 results_xpath = ''
+cached_xpath = ''
+cached_url = ''
 
 # parameters for engines with paging support
 #
@@ -36,6 +38,8 @@ def request(query, params):
 def response(resp):
     results = []
     dom = html.fromstring(resp.text)
+    is_onion = 'onions' in categories
+
     if results_xpath:
         for result in eval_xpath(dom, results_xpath):
             url = extract_url(eval_xpath(result, url_xpath), search_url)
@@ -49,15 +53,33 @@ def response(resp):
             if len(thumbnail_xpath_result) > 0:
                 tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
 
+            # add alternative cached url if available
+            if cached_xpath:
+                tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))
+
+            if is_onion:
+                tmp_result['is_onion'] = True
+
             results.append(tmp_result)
     else:
-        for url, title, content in zip(
-            (extract_url(x, search_url) for
-             x in eval_xpath(dom, url_xpath)),
-            map(extract_text, eval_xpath(dom, title_xpath)),
-            map(extract_text, eval_xpath(dom, content_xpath))
-        ):
-            results.append({'url': url, 'title': title, 'content': content})
+        if cached_xpath:
+            for url, title, content, cached in zip(
+                (extract_url(x, search_url) for
+                 x in dom.xpath(url_xpath)),
+                map(extract_text, dom.xpath(title_xpath)),
+                map(extract_text, dom.xpath(content_xpath)),
+                map(extract_text, dom.xpath(cached_xpath))
+            ):
+                results.append({'url': url, 'title': title, 'content': content,
+                                'cached_url': cached_url + cached, 'is_onion': is_onion})
+        else:
+            for url, title, content in zip(
+                (extract_url(x, search_url) for
+                 x in dom.xpath(url_xpath)),
+                map(extract_text, dom.xpath(title_xpath)),
+                map(extract_text, dom.xpath(content_xpath))
+            ):
+                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
 
     if not suggestion_xpath:
         return results
diff --git a/searx/settings.yml b/searx/settings.yml
index b23f48b45..54352bbfc 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -60,8 +60,10 @@ outgoing: # communication with search engines
 # see http://docs.python-requests.org/en/latest/user/advanced/#proxies
 # SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
 #    proxies :
-#        http : http://127.0.0.1:8080
-#        https: http://127.0.0.1:8080
+#        http : socks5h://127.0.0.1:9050
+#        https: socks5h://127.0.0.1:9050
+#    using_tor_proxy : True
+#    extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
 # uncomment below section only if you have more than one network interface
 # which can be the source of outgoing search requests
 #    source_ips:
@@ -89,6 +91,12 @@ engines:
     shortcut: apkm
     disabled: True
 
+# Requires Tor
+  - name : ahmia
+    engine : ahmia
+    categories : onions
+    shortcut : ah
+
   - name : arch linux wiki
     engine : archlinux
     shortcut : al
@@ -185,7 +193,7 @@ engines:
   - name : deviantart
     engine : deviantart
     shortcut : da
-    timeout: 3.0
+    timeout : 3.0
 
   - name : ddg definitions
     engine : duckduckgo_definitions
@@ -514,6 +522,11 @@ engines:
     timeout: 5.0
     shortcut : npm
 
+# Requires Tor
+  - name : not evil
+    engine : not_evil
+    shortcut : ne
+
   - name : nyaa
     engine : nyaa
     shortcut : nt
@@ -698,6 +711,18 @@ engines:
     url: https://torrentz2.eu/
     timeout : 3.0
 
+# Requires Tor
+  - name : torch
+    engine : xpath
+    paging : True
+    search_url : http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
+    results_xpath : //table//tr
+    url_xpath : ./td[2]/a
+    title_xpath : ./td[2]/b
+    content_xpath : ./td[2]/small
+    categories : onions
+    shortcut : tch
+
   - name : twitter
     engine : twitter
     shortcut : tw
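Since the outgoing section now documents a Tor setup, here is a minimal, illustrative sketch of what that configuration means at the HTTP level: with the optional requests[socks] extra installed and a Tor daemon listening on 127.0.0.1:9050, traffic routed through a socks5h:// proxy has hostnames resolved by the proxy, which is what .onion addresses require. This is only a demo under those assumptions, not code from the patch:

    import requests  # needs the optional SOCKS extra: pip install requests[socks]

    proxies = {
        'http': 'socks5h://127.0.0.1:9050',   # socks5h = resolve hostnames on the proxy,
        'https': 'socks5h://127.0.0.1:9050',  # which is required for .onion addresses
    }

    # Ahmia's onion front page, fetched through Tor (fails if Tor is not running).
    resp = requests.get('http://msydqstlz2kzerdg.onion/', proxies=proxies, timeout=10.0)
    print(resp.status_code)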

{% if "icon_"~result.engine~".ico" in favicons %}{{result.engine}}{% endif %}{{ result.title|safe }}

-

{{ result.pretty_url }}‎ {{ _('cached') }} +

{{ result.pretty_url }}‎ + {% if result.cached_url %} + {{ _('cached') }} + {% elif not result.is_onion %} + {{ _('cached') }} + {% endif %} {% if result.publishedDate %}{{ result.publishedDate }}{% endif %}

{% if result.img_src %}{% endif %}{% if result.content %}{{ result.content|safe }}
{% endif %}
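Both this legacy template and the Oscar macros below apply the same fallback: prefer an engine-provided cache link, fall back to the Wayback Machine for clearnet results, and show nothing for onion results. The same rule expressed as a small hypothetical Python helper (names are illustrative, not part of the patch):

    def cached_link(result):
        """Return the URL the 'cached' link should point to, or None to hide it."""
        if result.get('cached_url'):
            return result['cached_url']          # engine supplied its own cache link
        if not result.get('is_onion'):
            return 'https://web.archive.org/web/' + result['url']   # Wayback fallback
        return None                              # onion results get no archive link

    print(cached_link({'url': 'http://example.onion/', 'is_onion': True}))  # None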

diff --git a/searx/templates/oscar/macros.html b/searx/templates/oscar/macros.html
index f52d9713c..57a90aaa2 100644
--- a/searx/templates/oscar/macros.html
+++ b/searx/templates/oscar/macros.html
@@ -32,7 +32,11 @@
         {{ engine }}
     {%- endfor -%}
     {%- if result.url -%}
-        {{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}
+        {% if result.cached_url %}
+            {{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}
+        {% elif not result.is_onion %}
+            {{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}
+        {% endif %}
     {%- endif -%}
     {%- if proxify -%}
         {{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}
@@ -50,7 +54,11 @@
         {{ engine }}
     {%- endfor %}
     {%- if result.url -%}
-        {{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}
+        {% if result.cached_url %}
+            {{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}
+        {% elif not result.is_onion %}
+            {{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}
+        {% endif %}
     {%- endif -%}
     {% if proxify -%}
         {{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}
diff --git a/searx/webapp.py b/searx/webapp.py
index cf9a09778..609669b85 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -146,6 +146,7 @@ _category_names = (gettext('files'),
                    gettext('it'),
                    gettext('news'),
                    gettext('map'),
+                   gettext('onions'),
                    gettext('science'))
 
 outgoing_proxies = settings['outgoing'].get('proxies') or None
diff --git a/tests/unit/engines/test_xpath.py b/tests/unit/engines/test_xpath.py
new file mode 100644
index 000000000..963a44a25
--- /dev/null
+++ b/tests/unit/engines/test_xpath.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import xpath
+from searx.testing import SearxTestCase
+
+
+class TestXpathEngine(SearxTestCase):
+
+    def test_request(self):
+        xpath.search_url = 'https://url.com/{query}'
+        xpath.categories = []
+        xpath.paging = False
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        params = xpath.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertEqual('https://url.com/test_query', params['url'])
+
+        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
+        xpath.paging = True
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        params = xpath.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])
+
+    def test_response(self):
+        # without results_xpath
+        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
+        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
+        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'
+
+        self.assertRaises(AttributeError, xpath.response, None)
+        self.assertRaises(AttributeError, xpath.response, [])
+        self.assertRaises(AttributeError, xpath.response, '')
+        self.assertRaises(AttributeError, xpath.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(xpath.response(response), [])
+
+        html = u"""
+        <div>
+            <div class="search_result">
+                <a class="result" href="https://result1.com/">Result 1</a>
+                <p class="content">Content 1</p>
+                <a class="cached" href="https://cachedresult1.com">Cache</a>
+            </div>
+            <div class="search_result">
+                <a class="result" href="https://result2.com/">Result 2</a>
+                <p class="content">Content 2</p>
+                <a class="cached" href="https://cachedresult2.com">Cache</a>
+            </div>
+        </div>
+ """ + response = mock.Mock(text=html) + results = xpath.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], 'Result 1') + self.assertEqual(results[0]['url'], 'https://result1.com/') + self.assertEqual(results[0]['content'], 'Content 1') + self.assertEqual(results[1]['title'], 'Result 2') + self.assertEqual(results[1]['url'], 'https://result2.com/') + self.assertEqual(results[1]['content'], 'Content 2') + + # with cached urls, without results_xpath + xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href' + results = xpath.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com') + self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com') + self.assertFalse(results[0].get('is_onion', False)) + + # results are onion urls (no results_xpath) + xpath.categories = ['onions'] + results = xpath.response(response) + self.assertTrue(results[0]['is_onion']) + + # with results_xpath + xpath.results_xpath = '//div[@class="search_result"]' + xpath.url_xpath = './/a[@class="result"]/@href' + xpath.title_xpath = './/a[@class="result"]' + xpath.content_xpath = './/p[@class="content"]' + xpath.cached_xpath = None + xpath.categories = [] + + self.assertRaises(AttributeError, xpath.response, None) + self.assertRaises(AttributeError, xpath.response, []) + self.assertRaises(AttributeError, xpath.response, '') + self.assertRaises(AttributeError, xpath.response, '[]') + + response = mock.Mock(text='') + self.assertEqual(xpath.response(response), []) + + response = mock.Mock(text=html) + results = xpath.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['title'], 'Result 1') + self.assertEqual(results[0]['url'], 'https://result1.com/') + self.assertEqual(results[0]['content'], 'Content 1') + self.assertEqual(results[1]['title'], 'Result 2') + self.assertEqual(results[1]['url'], 'https://result2.com/') + self.assertEqual(results[1]['content'], 'Content 2') + + # with cached urls, with results_xpath + xpath.cached_xpath = './/a[@class="cached"]/@href' + results = xpath.response(response) + self.assertEqual(type(results), list) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com') + self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com') + self.assertFalse(results[0].get('is_onion', False)) + + # results are onion urls (with results_xpath) + xpath.categories = ['onions'] + results = xpath.response(response) + self.assertTrue(results[0]['is_onion']) diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py new file mode 100644 index 000000000..cf4d50309 --- /dev/null +++ b/tests/unit/test_engines_init.py @@ -0,0 +1,44 @@ +from searx.testing import SearxTestCase +from searx import settings, engines + + +class TestEnginesInit(SearxTestCase): + + @classmethod + def tearDownClass(cls): + settings['outgoing']['using_tor_proxy'] = False + settings['outgoing']['extra_proxy_timeout'] = 0 + + def test_initialize_engines_default(self): + engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'}, + {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}] + + engines.initialize_engines(engine_list) + self.assertEqual(len(engines.engines), 2) + self.assertIn('engine1', engines.engines) + self.assertIn('engine2', 
+
+    def test_initialize_engines_exclude_onions(self):
+        settings['outgoing']['using_tor_proxy'] = False
+        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general'},
+                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]
+
+        engines.initialize_engines(engine_list)
+        self.assertEqual(len(engines.engines), 1)
+        self.assertIn('engine1', engines.engines)
+        self.assertNotIn('onions', engines.categories)
+
+    def test_initialize_engines_include_onions(self):
+        settings['outgoing']['using_tor_proxy'] = True
+        settings['outgoing']['extra_proxy_timeout'] = 100.0
+        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general',
+                        'timeout': 20.0, 'onion_url': 'http://engine1.onion'},
+                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]
+
+        engines.initialize_engines(engine_list)
+        self.assertEqual(len(engines.engines), 2)
+        self.assertIn('engine1', engines.engines)
+        self.assertIn('engine2', engines.engines)
+        self.assertIn('onions', engines.categories)
+        self.assertIn('http://engine1.onion', engines.engines['engine1'].search_url)
+        self.assertEqual(engines.engines['engine1'].timeout, 120.0)
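Operational note (illustration only, not part of the patch): before turning on using_tor_proxy, it is worth confirming that a Tor SOCKS listener is actually reachable at the address used in settings.yml, otherwise every onion engine will simply time out. A minimal check, assuming the default 127.0.0.1:9050 from the example configuration:

    import socket

    def tor_socks_reachable(host='127.0.0.1', port=9050, timeout=3.0):
        # Just a TCP connect; it proves a listener is there, not that it is Tor.
        try:
            with socket.create_connection((host, port), timeout=timeout):
                return True
        except OSError:
            return False

    print(tor_socks_reachable())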