Make scripts work for JSON guessing too.
This commit is contained in:
parent
da6cb6ea3a
commit
b78ebd1d2a
@ -30,7 +30,7 @@
|
|||||||
<url type="donation">https://martinrotter.github.io/donate/</url>
|
<url type="donation">https://martinrotter.github.io/donate/</url>
|
||||||
<content_rating type="oars-1.1" />
|
<content_rating type="oars-1.1" />
|
||||||
<releases>
|
<releases>
|
||||||
<release version="3.8.4" date="2021-02-04"/>
|
<release version="3.8.4" date="2021-02-05"/>
|
||||||
</releases>
|
</releases>
|
||||||
<content_rating type="oars-1.0">
|
<content_rating type="oars-1.0">
|
||||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||||
|
@ -78,6 +78,8 @@ RSS Guard offers placeholder `%data%` which is automatically replaced with full
|
|||||||
|
|
||||||
Also, working directory of process executing the script is set to RSS Guard's user data folder.
|
Also, working directory of process executing the script is set to RSS Guard's user data folder.
|
||||||
|
|
||||||
|
There are some examples of website scrapers [here](https://github.com/martinrotter/rssguard/tree/master/resources/scripts/scrapers), most of them are written in Python 3, thus their execution line is `python.exe#script.py`.
|
||||||
|
|
||||||
After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take **raw source data as input** and must produce processed valid feed data to **standard output** while printing all error messages to **error output**.
|
After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take **raw source data as input** and must produce processed valid feed data to **standard output** while printing all error messages to **error output**.
|
||||||
|
|
||||||
Format of post-process script execution line is the same as above.
|
Format of post-process script execution line is the same as above.
|
||||||
|
39
resources/scripts/scrapers/wiki-inthenews.py
Executable file
39
resources/scripts/scrapers/wiki-inthenews.py
Executable file
@ -0,0 +1,39 @@
|
|||||||
|
# Obtains Wikipedia's "In the news" today's articles and prints them to
# standard output as a JSON feed, suitable for RSS Guard's custom-script
# feed source.

import urllib.request
import re
import json
from html.parser import HTMLParser

# Base URL used to resolve the relative article links found on the page.
WIKI_BASE_URL = "https://en.wikipedia.org"
MAIN_PAGE_URL = WIKI_BASE_URL + "/wiki/Main_Page"


class HTMLFilter(HTMLParser):
    # Accumulates the plain-text content of every data node fed to it.
    text = ""

    def handle_data(self, data):
        self.text += data


def strip_tags(fragment):
    """Return the plain text of an HTML fragment (tags removed)."""
    f = HTMLFilter()
    f.feed(fragment)
    return f.text


def extract_news_items(page_html):
    """Return the raw <li>...</li> fragments of the "In the news" box.

    Raw strings keep the regex escapes intact -- the original non-raw
    literals relied on deprecated "\\S" / "\\/" string-escape behaviour.
    """
    news_list = re.search(r"In the news[\S\n\t\v ]+?<ul>([\S\n\t\v ]+?)</ul>",
                          page_html).group(1)
    return re.findall(r"<li>([\S\n\t\v ]+?)</li>", news_list)


def build_feed(articles_li):
    """Convert <li> fragments into a JSON feed string.

    json.dumps performs all quoting/escaping, so titles or markup that
    contain quotes or backslashes cannot corrupt the produced feed.
    """
    items = []
    for article in articles_li:
        relative_url = re.search(r"^.+?href=\"(.+?)\"", article).group(1)
        items.append({
            "title": strip_tags(article),
            "content_html": "<div>{}</div>".format(article),
            "url": WIKI_BASE_URL + relative_url,
        })
    return json.dumps({"title": "Wikipedia - In the news", "items": items})


if __name__ == "__main__":
    # Network fetch only happens when executed as a script, so the helpers
    # above stay importable/testable without hitting the network.
    response = urllib.request.urlopen(MAIN_PAGE_URL)
    page_html = response.read().decode("utf-8")
    print(build_feed(extract_news_items(page_html)))
|
@ -261,7 +261,8 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type,
|
|||||||
|
|
||||||
StandardFeed* feed = nullptr;
|
StandardFeed* feed = nullptr;
|
||||||
|
|
||||||
if (content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive)) {
|
if (content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive) ||
|
||||||
|
feed_contents.startsWith('{')) {
|
||||||
feed = new StandardFeed();
|
feed = new StandardFeed();
|
||||||
|
|
||||||
// We have JSON feed.
|
// We have JSON feed.
|
||||||
@ -693,8 +694,15 @@ QString StandardFeed::runScriptProcess(const QStringList& cmd_args, const QStrin
|
|||||||
process.closeWriteChannel();
|
process.closeWriteChannel();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (process.waitForFinished(run_timeout)) {
|
if (process.waitForFinished(run_timeout) && process.exitStatus() == QProcess::ExitStatus::NormalExit) {
|
||||||
auto raw_output = process.readAllStandardOutput();
|
auto raw_output = process.readAllStandardOutput();
|
||||||
|
auto raw_error = process.readAllStandardError();
|
||||||
|
|
||||||
|
if (!raw_error.simplified().isEmpty()) {
|
||||||
|
qWarningNN << LOGSEC_CORE
|
||||||
|
<< "Received error output from custom script even if it reported that it exited normally:"
|
||||||
|
<< QUOTE_W_SPACE_DOT(raw_error);
|
||||||
|
}
|
||||||
|
|
||||||
return raw_output;
|
return raw_output;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user