mirror of
https://github.com/searx/searx
synced 2024-12-11 16:35:20 +01:00
use Wikipedia's REST v1 API
This commit is contained in:
parent
d0f9778c2a
commit
ab20ca182c
@ -1,7 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Wikipedia (Web)
|
Wikipedia (Web)
|
||||||
|
|
||||||
@website https://{language}.wikipedia.org
|
@website https://en.wikipedia.org/api/rest_v1/
|
||||||
@provide-api yes
|
@provide-api yes
|
||||||
|
|
||||||
@using-api yes
|
@using-api yes
|
||||||
@ -12,21 +12,11 @@
|
|||||||
|
|
||||||
from json import loads
|
from json import loads
|
||||||
from lxml.html import fromstring
|
from lxml.html import fromstring
|
||||||
from searx.url_utils import quote, urlencode
|
from searx.url_utils import quote
|
||||||
from searx.utils import match_language
|
from searx.utils import match_language, searx_useragent
|
||||||
|
|
||||||
# search-url
|
# search-url
|
||||||
base_url = u'https://{language}.wikipedia.org/'
|
search_url = u'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
|
||||||
search_url = base_url + u'w/api.php?'\
|
|
||||||
'action=query'\
|
|
||||||
'&format=json'\
|
|
||||||
'&{query}'\
|
|
||||||
'&prop=extracts|pageimages|pageprops'\
|
|
||||||
'&ppprop=disambiguation'\
|
|
||||||
'&exintro'\
|
|
||||||
'&explaintext'\
|
|
||||||
'&pithumbsize=300'\
|
|
||||||
'&redirects'
|
|
||||||
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
|
||||||
|
|
||||||
|
|
||||||
@ -41,51 +31,37 @@ def url_lang(lang):
|
|||||||
# do search-request
|
# do search-request
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
if query.islower():
|
if query.islower():
|
||||||
query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8')
|
query = query.title()
|
||||||
|
|
||||||
params['url'] = search_url.format(query=urlencode({'titles': query}),
|
params['url'] = search_url.format(title=quote(query),
|
||||||
language=url_lang(params['language']))
|
language=url_lang(params['language']))
|
||||||
|
|
||||||
|
params['headers']['User-Agent'] = searx_useragent()
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
# get response from search-request
|
# get response from search-request
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
if not resp.ok:
|
||||||
|
|
||||||
search_result = loads(resp.text)
|
|
||||||
|
|
||||||
# wikipedia article's unique id
|
|
||||||
# first valid id is assumed to be the requested article
|
|
||||||
if 'pages' not in search_result['query']:
|
|
||||||
return results
|
|
||||||
|
|
||||||
for article_id in search_result['query']['pages']:
|
|
||||||
page = search_result['query']['pages'][article_id]
|
|
||||||
if int(article_id) > 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
if int(article_id) < 0 or 'disambiguation' in page.get('pageprops', {}):
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
title = page.get('title')
|
results = []
|
||||||
|
api_result = loads(resp.text)
|
||||||
|
|
||||||
image = page.get('thumbnail')
|
# skip disambiguation pages
|
||||||
if image:
|
if api_result['type'] != 'standard':
|
||||||
image = image.get('source')
|
return []
|
||||||
|
|
||||||
summary = page.get('extract', '').split('\n')[0].replace('()', '')
|
title = api_result['title']
|
||||||
|
wikipedia_link = api_result['content_urls']['desktop']['page']
|
||||||
# link to wikipedia article
|
|
||||||
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \
|
|
||||||
+ 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
|
|
||||||
|
|
||||||
results.append({'url': wikipedia_link, 'title': title})
|
results.append({'url': wikipedia_link, 'title': title})
|
||||||
|
|
||||||
results.append({'infobox': title,
|
results.append({'infobox': title,
|
||||||
'id': wikipedia_link,
|
'id': wikipedia_link,
|
||||||
'content': summary,
|
'content': api_result.get('extract', ''),
|
||||||
'img_src': image,
|
'img_src': api_result.get('thumbnail', {}).get('source'),
|
||||||
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
|
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
Loading…
Reference in New Issue
Block a user