Merge pull request #2350 from dalf/mod-engines-report-captcha

[mod] stackoverflow & yandex: detect CAPTCHA response
This commit is contained in:
Alexandre Flament 2020-12-03 13:50:12 +01:00 committed by GitHub
commit cec73b5dcf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 2 deletions

View File

@@ -10,9 +10,10 @@
@parse url, title, content @parse url, title, content
""" """
from urllib.parse import urlencode, urljoin from urllib.parse import urlencode, urljoin, urlparse
from lxml import html from lxml import html
from searx.utils import extract_text from searx.utils import extract_text
from searx.exceptions import SearxEngineCaptchaException
# engine dependent config # engine dependent config
categories = ['it'] categories = ['it']
@@ -37,6 +38,10 @@ def request(query, params):
# get response from search-request # get response from search-request
def response(resp): def response(resp):
resp_url = urlparse(resp.url)
if resp_url.path.startswith('/nocaptcha'):
raise SearxEngineCaptchaException()
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)

View File

@@ -9,9 +9,10 @@
@parse url, title, content @parse url, title, content
""" """
from urllib.parse import urlencode from urllib.parse import urlencode, urlparse
from lxml import html from lxml import html
from searx import logger from searx import logger
from searx.exceptions import SearxEngineCaptchaException
logger = logger.getChild('yandex engine') logger = logger.getChild('yandex engine')
@@ -47,6 +48,10 @@ def request(query, params):
# get response from search-request # get response from search-request
def response(resp): def response(resp):
resp_url = urlparse(resp.url)
if resp_url.path.startswith('/showcaptcha'):
raise SearxEngineCaptchaException()
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
results = [] results = []