From a4bc089091c65a502974519846823cfb2a726f51 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 9 Jan 2022 16:05:25 +0100
Subject: [PATCH] [fix] startpage engine: fetch CAPTCHA & issues related to PR-695

In case of CAPTCHA raise a SearxEngineCaptchaException and suspend for 7 days.

When get_sc_code() fails raise a SearxEngineResponseException and suspend for 7
days.

[1] https://github.com/searxng/searxng/pull/695

Signed-off-by: Markus Heiser
---
 searx/engines/startpage.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index e64bf5c5..7d4f8dc5 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -18,6 +18,11 @@ from babel.localedata import locale_identifiers
 
 from searx import network
 from searx.utils import extract_text, eval_xpath, match_language
+from searx.exceptions import (
+    SearxEngineResponseException,
+    SearxEngineCaptchaException,
+)
+
 
 # about
 about = {
@@ -54,6 +59,13 @@ sc_code_ts = 0
 sc_code = ''
 
 
+def raise_captcha(resp):
+    """Raise SearxEngineCaptchaException if resp.url is startpage's CAPTCHA page."""
+    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
+        # CAPTCHA page was served: suspend for 7 days
+        raise SearxEngineCaptchaException(suspended_time=7 * 24 * 3600)
+
+
 def get_sc_code(headers):
     """Get an actual `sc` argument from startpage's home page.
 
@@ -73,10 +85,17 @@ def get_sc_code(headers):
         logger.debug("query new sc time-stamp ...")
 
         resp = network.get(base_url, headers=headers)
+        raise_captcha(resp)
         dom = html.fromstring(resp.text)
 
-        # href --> '/?sc=adrKJMgF8xwp20'
-        href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
+        try:
+            # href --> '/?sc=adrKJMgF8xwp20'
+            href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
+        except IndexError as exc:
+            # suspend startpage API --> https://github.com/searxng/searxng/pull/695
+            raise SearxEngineResponseException(
+                suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
+            ) from exc
 
         sc_code = href[5:]
         sc_code_ts = time()