From b78ebd1d2a74eba73a1478325b398df36becacb4 Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Fri, 5 Feb 2021 08:57:38 +0100 Subject: [PATCH] Make scripts work for JSON guessing too. --- .../desktop/com.github.rssguard.appdata.xml | 2 +- resources/docs/Feed-formats.md | 2 + resources/scripts/scrapers/wiki-inthenews.py | 39 +++++++++++++++++++ .../services/standard/standardfeed.cpp | 12 +++++- 4 files changed, 52 insertions(+), 3 deletions(-) create mode 100755 resources/scripts/scrapers/wiki-inthenews.py diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml index 0743bf1c4..937f5e74d 100644 --- a/resources/desktop/com.github.rssguard.appdata.xml +++ b/resources/desktop/com.github.rssguard.appdata.xml @@ -30,7 +30,7 @@ https://martinrotter.github.io/donate/ - + none diff --git a/resources/docs/Feed-formats.md b/resources/docs/Feed-formats.md index 52502c689..0092976f4 100755 --- a/resources/docs/Feed-formats.md +++ b/resources/docs/Feed-formats.md @@ -78,6 +78,8 @@ RSS Guard offers placeholder `%data%` which is automatically replaced with full Also, working directory of process executing the script is set to RSS Guard's user data folder. +There are some examples of website scrapers [here](https://github.com/martinrotter/rssguard/tree/master/resources/scripts/scrapers), most of them are written in Python 3, thus their execution line is `python.exe#script.py`. + After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take **raw source data as input** and must produce processed valid feed data to **standard output** while printing all error messages to **error output**. Format of post-process script execution line is the same as above. 
diff --git a/resources/scripts/scrapers/wiki-inthenews.py b/resources/scripts/scrapers/wiki-inthenews.py new file mode 100755 index 000000000..d95847f54 --- /dev/null +++ b/resources/scripts/scrapers/wiki-inthenews.py @@ -0,0 +1,39 @@ +# Obtains Wikipedia's "In the news" today's articles. + +import urllib.request +import re +import json +from html.parser import HTMLParser + +url = "https://en.wikipedia.org/wiki/Main_Page" +response = urllib.request.urlopen(url) +text = response.read().decode("utf-8") + +text_li = re.search("In the news[\S\n\t\v ]+?<ul>([\S\n\t\v ]+?)<\/ul>", text).group(1) +articles_li = re.findall("<li>([\S\n\t\v ]+?)<\/li>", text_li) + +# Iterate all articles and generate JSON feed entries. +wiki_base_url = "https://en.wikipedia.org" + +class HTMLFilter(HTMLParser): + text = "" + def handle_data(self, data): + self.text += data + +json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}" +items = list() + +for article in articles_li: + article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)) + f = HTMLFilter() + f.feed(article) + f.text + article_title = json.dumps(f.text) + article_html = json.dumps("<div>{}</div>".format(article)) + items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title, + html=article_html, + url=article_url)) + +json_feed = json_feed.format(items=", ".join(items)) + +print(json_feed) \ No newline at end of file diff --git a/src/librssguard/services/standard/standardfeed.cpp b/src/librssguard/services/standard/standardfeed.cpp index 9af4aa02f..299d1f636 100644 --- a/src/librssguard/services/standard/standardfeed.cpp +++ b/src/librssguard/services/standard/standardfeed.cpp @@ -261,7 +261,8 @@ StandardFeed* StandardFeed::guessFeed(StandardFeed::SourceType source_type, StandardFeed* feed = nullptr; - if (content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive)) { + if (content_type.contains(QSL("json"), Qt::CaseSensitivity::CaseInsensitive) || + feed_contents.startsWith('{')) { feed = new StandardFeed(); // We have JSON feed. @@ -693,8 +694,15 @@ QString StandardFeed::runScriptProcess(const QStringList& cmd_args, const QStrin process.closeWriteChannel(); } - if (process.waitForFinished(run_timeout)) { + if (process.waitForFinished(run_timeout) && process.exitStatus() == QProcess::ExitStatus::NormalExit) { auto raw_output = process.readAllStandardOutput(); + auto raw_error = process.readAllStandardError(); + + if (!raw_error.simplified().isEmpty()) { + qWarningNN << LOGSEC_CORE + << "Received error output from custom script even if it reported that it exited normally:" + << QUOTE_W_SPACE_DOT(raw_error); + } return raw_output; }