fix yahoo engines and add comments

Thomas Pointhuber 2014-09-01 16:17:29 +02:00
parent 90dcfc1ddd
commit 03db970e6a
2 changed files with 73 additions and 18 deletions

searx/engines/yahoo.py View File

@@ -1,64 +1,99 @@
-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api no (because pricing)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
-paging = True
 
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
+
     end = min(endpositions)
+
     return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
 
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results
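
For readers following the change, here is a minimal, self-contained sketch of the two reworked pieces: the tracking-url stripper and the new language-aware search URL. It is Python 2 to match the engine's urllib/urlparse imports, and the sample tracked URL is hypothetical, modeled on Yahoo's /RU=.../RK=.../RS= redirect pattern.

# Minimal sketch (Python 2); the sample tracked URL is hypothetical.
from urllib import urlencode
from urlparse import unquote

search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'


def parse_url(url_string):
    # the real target URL is percent-encoded between '/RU=' and '/RK' or '/RS'
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=')+1)
    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)
    end = min(endpositions)  # raises ValueError if neither ending is found
    return unquote(url_string[start:end])

tracked = ('https://r.search.yahoo.com/_ylt=Abc/RU='
           'https%3a%2f%2fexample.org%2fpage/RK=0/RS=xyz')
print parse_url(tracked)
# -> https://example.org/page

# page 2 of results for 'searx' in English:
print search_url.format(query=urlencode({'p': 'searx'}),
                        offset=(2 - 1) * 10 + 1,
                        lang='en')
# -> https://search.yahoo.com/search?p=searx&b=11&fl=1&vl=lang_en

Note that when neither /RS nor /RK is present, min() raises ValueError on the empty list, which is why response() wraps the url/title extraction in try/except and skips the result.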

searx/engines/yahoo_news.py View File

@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api no (because pricing)
+# @results HTML (using search portal)
+# @stable no (HTML can change)
+# @parse url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
-paging = True
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results
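
The hunks above elide most of the news engine's timestamp handling: only the "N minute(s) ago" regex branch and the year-1900 guard are visible. The sketch below fills the gap with an assumed fallback to dateutil, so everything between those two visible pieces is an assumption rather than the engine's exact code.

# Hedged sketch (Python 2): only the regex branch and the 1900 guard
# appear in the diff; the dateutil fallback in between is an assumption.
import re
from datetime import datetime, timedelta
from dateutil import parser


def parse_published(text):
    m = re.match("^([0-9]+) minute(s|) ago$", text)
    if m:
        # relative timestamp, e.g. '5 minutes ago'
        return datetime.now() - timedelta(minutes=int(m.group(1)))

    # assumed fallback: let dateutil parse absolute timestamps
    publishedDate = parser.parse(text)

    # Yahoo sometimes omits the year on older items; the engine guards
    # against a 1900 default by substituting the current year
    if publishedDate.year == 1900:
        publishedDate = publishedDate.replace(year=datetime.now().year)
    return publishedDate

print parse_published('5 minutes ago')
print parse_published('Sep 1 04:30 PM')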