[enh] infobox for wolframalpha

TODO:
    - infobox styles
    - unit tests

ISSUES:
    - The no_api version needs to make additional requests to the server for
      extra pods, such as plots, so it is even slower than before. If
      requests reach the timeout, either comment out the part that calls
      get_async_pod or increase the timeout in settings.yml.
This commit is contained in:
a01200356 2016-02-27 19:06:44 -06:00
parent 45b5073f3e
commit 78d3f3d6b1
3 changed files with 150 additions and 46 deletions

View File

@ -1,40 +1,56 @@
# Wolfram Alpha (Maths) # Wolfram Alpha (Science)
# #
# @website http://www.wolframalpha.com # @website https://www.wolframalpha.com
# @provide-api yes (http://api.wolframalpha.com/v2/) # @provide-api yes (https://api.wolframalpha.com/v2/)
# #
# @using-api yes # @using-api yes
# @results XML # @results XML
# @stable yes # @stable yes
# @parse result # @parse url, infobox
from urllib import urlencode from urllib import urlencode
from lxml import etree from lxml import etree
from re import search
# search-url # search-url
base_url = 'http://api.wolframalpha.com/v2/query' search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
search_url = base_url + '?appid={api_key}&{query}&format=plaintext' site_url = 'https://www.wolframalpha.com/input/?{query}'
site_url = 'http://www.wolframalpha.com/input/?{query}'
api_key = '' # defined in settings.yml api_key = '' # defined in settings.yml
# xpath variables # xpath variables
failure_xpath = '/queryresult[attribute::success="false"]' failure_xpath = '/queryresult[attribute::success="false"]'
answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext' answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext' input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
pods_xpath = '//pod'
subpods_xpath = './subpod'
pod_title_xpath = './@title'
plaintext_xpath = './plaintext'
image_xpath = './img'
img_src_xpath = './@src'
img_alt_xpath = './@alt'
# pods to display as image in infobox
# these pods do return plaintext, but they look better and are more useful as images
image_pods = {'Visual representation',
'Manipulatives illustration'}
# do search-request # do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(query=urlencode({'input': query}), params['url'] = search_url.format(query=urlencode({'input': query}),
api_key=api_key) api_key=api_key)
params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
return params return params
# replace private user area characters to make text legible # replace private user area characters to make text legible
def replace_pua_chars(text): def replace_pua_chars(text):
pua_chars = {u'\uf74c': 'd', pua_chars = {u'\uf522': u'\u2192',
u'\uf7b1': u'\u2115',
u'\uf7b4': u'\u211a',
u'\uf7b5': u'\u211d',
u'\uf7bd': u'\u2124',
u'\uf74c': 'd',
u'\uf74d': u'\u212f', u'\uf74d': u'\u212f',
u'\uf74e': 'i', u'\uf74e': 'i',
u'\uf7d9': '='} u'\uf7d9': '='}
@ -55,23 +71,45 @@ def response(resp):
if search_results.xpath(failure_xpath): if search_results.xpath(failure_xpath):
return [] return []
# parse answers infobox_title = search_results.xpath(input_xpath)
answers = search_results.xpath(answer_xpath) if infobox_title:
if answers: infobox_title = replace_pua_chars(infobox_title[0].text)
for answer in answers:
answer = replace_pua_chars(answer.text)
results.append({'answer': answer}) pods = search_results.xpath(pods_xpath)
result_chunks = []
for pod in pods:
pod_title = replace_pua_chars(pod.xpath(pod_title_xpath)[0])
# if there's no input section in search_results, check if answer has the input embedded (before their "=" sign) subpods = pod.xpath(subpods_xpath)
try: if not subpods:
query_input = search_results.xpath(input_xpath)[0].text continue
except IndexError:
query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1) for subpod in subpods:
content = subpod.xpath(plaintext_xpath)[0].text
image = subpod.xpath(image_xpath)
if content and pod_title not in image_pods:
content = replace_pua_chars(content)
result_chunks.append({'label': pod_title, 'value': content})
# if there's no input pod, infobox_title is content of first pod
if not infobox_title:
infobox_title = content
elif image:
result_chunks.append({'label': pod_title,
'image': {'src': image[0].xpath(img_src_xpath)[0],
'alt': image[0].xpath(img_alt_xpath)[0]}})
if not result_chunks:
return []
results.append({'infobox': infobox_title,
'attributes': result_chunks,
'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
# append link to site # append link to site
result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')})) results.append({'url': resp.request.headers['Referer'],
results.append({'url': result_url, 'title': 'Wolfram|Alpha',
'title': query_input + " - Wolfram|Alpha"}) 'content': infobox_title})
return results return results

View File

@ -1,23 +1,23 @@
# WolframAlpha (Maths) # Wolfram|Alpha (Science)
# #
# @website http://www.wolframalpha.com/ # @website https://www.wolframalpha.com/
# @provide-api yes (http://api.wolframalpha.com/v2/) # @provide-api yes (https://api.wolframalpha.com/v2/)
# #
# @using-api no # @using-api no
# @results HTML # @results JSON
# @stable no # @stable no
# @parse answer # @parse url, infobox
from cgi import escape from cgi import escape
from json import loads from json import loads
from time import time from time import time
from urllib import urlencode from urllib import urlencode
from lxml.etree import XML
from searx.poolrequests import get as http_get from searx.poolrequests import get as http_get
# search-url # search-url
url = 'https://www.wolframalpha.com/' url = 'https://www.wolframalpha.com/'
search_url = url + 'input/?{query}'
search_url = url + 'input/json.jsp'\ search_url = url + 'input/json.jsp'\
'?async=true'\ '?async=true'\
@ -33,13 +33,25 @@ search_url = url + 'input/json.jsp'\
'&sponsorcategories=true'\ '&sponsorcategories=true'\
'&statemethod=deploybutton' '&statemethod=deploybutton'
# xpath variables referer_url = url + 'input/?{query}'
scripts_xpath = '//script'
title_xpath = '//title'
failure_xpath = '//p[attribute::class="pfail"]'
token = {'value': '', token = {'value': '',
'last_updated': None} 'last_updated': None}
# xpath variables
success_xpath = '/pod[attribute::error="false"]'
plaintext_xpath = './plaintext'
title_xpath = './@title'
image_xpath = './img'
img_src_xpath = './img/@src'
img_alt_xpath = './img/@alt'
# pods to display as image in infobox
# these pods do return plaintext, but they look better and are more useful as images
image_pods = {'Visual representation',
'Manipulatives illustration',
'Symbol'}
# seems, wolframalpha resets its token in every hour # seems, wolframalpha resets its token in every hour
def obtain_token(): def obtain_token():
@ -62,13 +74,42 @@ def request(query, params):
if time() - token['last_updated'] > 3600: if time() - token['last_updated'] > 3600:
obtain_token() obtain_token()
params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
return params return params
# get additional pod
# NOTE: this makes an additional request to the server, so the response will
# take longer and might reach the timeout
def get_async_pod(url, timeout=2.0):
    """Fetch an asynchronously loaded pod and collect its subpods.

    url     -- address of the pod's XML document
    timeout -- seconds to wait for the extra request; defaults to 2.0, the
               value that used to be hard-coded

    Returns a dict of the form ``{'subpods': [...]}``. Every entry carries
    'title' and 'plaintext'; an entry that has no plaintext but does have an
    image carries an empty 'plaintext' plus 'img' with 'src' and 'alt'.
    The list is empty when the request fails, the body is not parseable XML,
    or the document does not match ``success_xpath``.
    """
    pod = {'subpods': []}

    # Best effort: a failed or malformed extra request must not break the
    # whole search response, so fall back to the empty pod. Only the network
    # call and the parsing are guarded, and KeyboardInterrupt/SystemExit are
    # no longer swallowed.
    try:
        resp = http_get(url, timeout=timeout)
        resp_pod = XML(resp.content)
    except Exception:
        return pod

    if not resp_pod.xpath(success_xpath):
        return pod

    for subpod in resp_pod:
        title = _first_or_empty(subpod.xpath(title_xpath))

        # a child without a <plaintext> element is treated like one with
        # empty text instead of aborting the remaining subpods
        plaintext_nodes = subpod.xpath(plaintext_xpath)
        plaintext = plaintext_nodes[0].text if plaintext_nodes else None

        if plaintext:
            pod['subpods'].append({'title': title,
                                   'plaintext': plaintext})
        elif subpod.xpath(image_xpath):
            pod['subpods'].append({'title': title,
                                   'plaintext': '',
                                   'img': {'src': _first_or_empty(subpod.xpath(img_src_xpath)),
                                           'alt': _first_or_empty(subpod.xpath(img_alt_xpath))}})

    return pod


# return the first xpath match, or '' when the query matched nothing
def _first_or_empty(matches):
    return matches[0] if matches else ''
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = []
resp_json = loads(resp.text) resp_json = loads(resp.text)
if not resp_json['queryresult']['success']: if not resp_json['queryresult']['success']:
@ -76,20 +117,45 @@ def response(resp):
# TODO handle resp_json['queryresult']['assumptions'] # TODO handle resp_json['queryresult']['assumptions']
result_chunks = [] result_chunks = []
infobox_title = None
for pod in resp_json['queryresult']['pods']: for pod in resp_json['queryresult']['pods']:
pod_title = pod.get('title', '') pod_title = pod.get('title', '')
if 'subpods' not in pod: if 'subpods' not in pod:
continue # comment this section if your requests always reach timeout
if pod['async']:
result = get_async_pod(pod['async'])
if result:
pod = result
else:
continue
# infobox title is input or text content on first pod
if pod_title.startswith('Input') or not infobox_title:
try:
infobox_title = pod['subpods'][0]['plaintext']
except:
infobox_title = ''
pass
for subpod in pod['subpods']: for subpod in pod['subpods']:
if 'img' in subpod: if subpod['plaintext'] != '' and pod_title not in image_pods:
result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>' # append unless it's not an actual answer
.format(escape(pod_title or subpod['img']['alt']), if subpod['plaintext'] != '(requires interactivity)':
escape(subpod['img']['src']), result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
escape(subpod['img']['alt'])))
elif 'img' in subpod:
result_chunks.append({'label': pod_title, 'image': subpod['img']})
if not result_chunks: if not result_chunks:
return [] return []
return [{'url': resp.request.headers['Referer'].decode('utf-8'), results.append({'infobox': infobox_title,
'title': 'Wolframalpha', 'attributes': result_chunks,
'content': ''.join(result_chunks)}] 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
results.append({'url': resp.request.headers['Referer'],
'title': 'Wolfram|Alpha',
'content': infobox_title})
return results

View File

@ -310,10 +310,10 @@ engines:
shortcut : wa shortcut : wa
# You can use the engine using the official stable API, but you need an API key # You can use the engine using the official stable API, but you need an API key
# See : http://products.wolframalpha.com/api/ # See : http://products.wolframalpha.com/api/
# engine : wolframalpha_api # engine : wolframalpha_api
# api_key: 'apikey' # required! # api_key: '5952JX-X52L3VKWT8' # required!
engine : wolframalpha_noapi engine : wolframalpha_noapi
timeout: 6.0 timeout: 10.0
categories : science categories : science
#The blekko technology and team have joined IBM Watson! -> https://blekko.com/ #The blekko technology and team have joined IBM Watson! -> https://blekko.com/