better in the news
This commit is contained in:
parent
5100b5195a
commit
2c1bc5a937
@ -3,7 +3,7 @@
|
|||||||
import urllib.request
|
import urllib.request
|
||||||
import re
|
import re
|
||||||
import json
|
import json
|
||||||
from html.parser import HTMLParser
|
from newspaper import Article
|
||||||
|
|
||||||
url = "https://en.wikipedia.org/wiki/Main_Page"
|
url = "https://en.wikipedia.org/wiki/Main_Page"
|
||||||
response = urllib.request.urlopen(url)
|
response = urllib.request.urlopen(url)
|
||||||
@ -15,21 +15,20 @@ articles_li = re.findall("<li>([\S\n\t\v ]+?)<\/li>", text_li)
|
|||||||
# Iterate all articles and generate JSON feed entries.
|
# Iterate all articles and generate JSON feed entries.
|
||||||
wiki_base_url = "https://en.wikipedia.org"
|
wiki_base_url = "https://en.wikipedia.org"
|
||||||
|
|
||||||
class HTMLFilter(HTMLParser):
|
|
||||||
text = ""
|
|
||||||
def handle_data(self, data):
|
|
||||||
self.text += data
|
|
||||||
|
|
||||||
json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}"
|
json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}"
|
||||||
items = list()
|
items = list()
|
||||||
|
|
||||||
for article in articles_li:
|
for article in articles_li:
|
||||||
article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1))
|
article_url = wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)
|
||||||
f = HTMLFilter()
|
|
||||||
f.feed(article)
|
f = Article(article_url, keep_article_html = True)
|
||||||
f.text
|
f.download()
|
||||||
article_title = json.dumps(f.text)
|
f.parse()
|
||||||
article_html = json.dumps("<div>{}</div>".format(article))
|
|
||||||
|
article_url = json.dumps(article_url)
|
||||||
|
article_title = json.dumps(f.title)
|
||||||
|
article_html = json.dumps(f.article_html)
|
||||||
items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title,
|
items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title,
|
||||||
html=article_html,
|
html=article_html,
|
||||||
url=article_url))
|
url=article_url))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user