diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml index 0743bf1c4..937f5e74d 100644 --- a/resources/desktop/com.github.rssguard.appdata.xml +++ b/resources/desktop/com.github.rssguard.appdata.xml @@ -30,7 +30,7 @@ https://martinrotter.github.io/donate/ - + none diff --git a/resources/docs/Feed-formats.md b/resources/docs/Feed-formats.md index 52502c689..0092976f4 100755 --- a/resources/docs/Feed-formats.md +++ b/resources/docs/Feed-formats.md @@ -78,6 +78,8 @@ RSS Guard offers placeholder `%data%` which is automatically replaced with full Also, working directory of process executing the script is set to RSS Guard's user data folder. +There are some examples of website scrapers [here](https://github.com/martinrotter/rssguard/tree/master/resources/scripts/scrapers), most of them are written in Python 3, thus their execution line is `python.exe#script.py`. + After your source feed data are downloaded either via URL or custom script, you can optionally post-process the data with one more custom script, which will take **raw source data as input** and must produce processed valid feed data to **standard output** while printing all error messages to **error output**. Format of post-process script execution line is the same as above. diff --git a/resources/scripts/scrapers/wiki-inthenews.py b/resources/scripts/scrapers/wiki-inthenews.py new file mode 100755 index 000000000..d95847f54 --- /dev/null +++ b/resources/scripts/scrapers/wiki-inthenews.py @@ -0,0 +1,39 @@ +# Obtains Wikipedia's "In the news" today's articles. + +import urllib.request +import re +import json +from html.parser import HTMLParser + +url = "https://en.wikipedia.org/wiki/Main_Page" +response = urllib.request.urlopen(url) +text = response.read().decode("utf-8") + +text_li = re.search("In the news[\S\n\t\v ]+?