mirror of
https://github.com/searx/searx
synced 2024-12-11 16:35:20 +01:00
[fix] Get an actual sc
argument from startpage's home page.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
a6184ac32c
commit
1076d7e52e
@ -5,6 +5,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from time import time
|
||||||
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from unicodedata import normalize, combining
|
from unicodedata import normalize, combining
|
||||||
@ -15,6 +16,7 @@ from lxml import html
|
|||||||
from babel import Locale
|
from babel import Locale
|
||||||
from babel.localedata import locale_identifiers
|
from babel.localedata import locale_identifiers
|
||||||
|
|
||||||
|
from searx import network
|
||||||
from searx.utils import extract_text, eval_xpath, match_language
|
from searx.utils import extract_text, eval_xpath, match_language
|
||||||
|
|
||||||
# about
|
# about
|
||||||
@ -47,6 +49,41 @@ results_xpath = '//div[@class="w-gl__result__main"]'
|
|||||||
link_xpath = './/a[@class="w-gl__result-title result-link"]'
|
link_xpath = './/a[@class="w-gl__result-title result-link"]'
|
||||||
content_xpath = './/p[@class="w-gl__description"]'
|
content_xpath = './/p[@class="w-gl__description"]'
|
||||||
|
|
||||||
|
# timestamp of the last fetch of 'sc' code
|
||||||
|
sc_code_ts = 0
|
||||||
|
sc_code = ''
|
||||||
|
|
||||||
|
|
||||||
|
def get_sc_code(headers):
|
||||||
|
"""Get an actual `sc` argument from startpage's home page.
|
||||||
|
|
||||||
|
Startpage puts a `sc` argument on every link. Without this argument
|
||||||
|
startpage considers the request is from a bot. We do not know what is
|
||||||
|
encoded in the value of the `sc` argument, but it seems to be a kind of a
|
||||||
|
*time-stamp*. This *time-stamp* is valid for a few hours.
|
||||||
|
|
||||||
|
This function scrap a new *time-stamp* from startpage's home page every hour
|
||||||
|
(3000 sec).
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
global sc_code_ts, sc_code # pylint: disable=global-statement
|
||||||
|
|
||||||
|
if time() > (sc_code_ts + 3000):
|
||||||
|
logger.debug("query new sc time-stamp ...")
|
||||||
|
|
||||||
|
resp = network.get(base_url, headers=headers)
|
||||||
|
dom = html.fromstring(resp.text)
|
||||||
|
|
||||||
|
# href --> '/?sc=adrKJMgF8xwp20'
|
||||||
|
href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
|
||||||
|
|
||||||
|
sc_code = href[5:]
|
||||||
|
sc_code_ts = time()
|
||||||
|
logger.debug("new value is: %s", sc_code)
|
||||||
|
|
||||||
|
return sc_code
|
||||||
|
|
||||||
|
|
||||||
# do search-request
|
# do search-request
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
@ -56,7 +93,7 @@ def request(query, params):
|
|||||||
'page': params['pageno'],
|
'page': params['pageno'],
|
||||||
'cat': 'web',
|
'cat': 'web',
|
||||||
# 'abp': "-1",
|
# 'abp': "-1",
|
||||||
'sc': 'Mj4jZy61QETj20',
|
'sc': get_sc_code(params['headers']),
|
||||||
}
|
}
|
||||||
|
|
||||||
# set language if specified
|
# set language if specified
|
||||||
|
Loading…
Reference in New Issue
Block a user