metacritic filtering script

2022-01-04 12:18:43 +01:00 · 2022-01-04 12:18:43 +01:00 · 5990ff2b27
parent 4de3db8000
commit 5990ff2b27
2 changed files with 67 additions and 1 deletions
--- a/resources/desktop/com.github.rssguard.appdata.xml
+++ b/resources/desktop/com.github.rssguard.appdata.xml
@ -26,7 +26,7 @@
  <url type="donation">https://github.com/sponsors/martinrotter</url>
  <content_rating type="oars-1.1" />
  <releases>
-    <release version="4.0.4" date="2022-01-03"/>
+    <release version="4.0.4" date="2022-01-04"/>
  </releases>
  <content_rating type="oars-1.0">
    <content_attribute id="violence-cartoon">none</content_attribute>
--- a/resources/scripts/scrapers/metacritic.py
+++ b/resources/scripts/scrapers/metacritic.py
@ -0,0 +1,66 @@
+# Filters the entries of a Metacritic RSS feed by score and appends the score to each kept title.
+#
+# Example input feed is: https://www.metacritic.com/rss/tv
+#
+# This script expects raw RSS 2.0 feed data on standard input and can be invoked from the
+# command line like this:
+#   curl 'https://www.metacritic.com/rss/tv' | python ./metacritic.py "<MINIMUM-SCORE>"
+#
+# Replace <MINIMUM-SCORE> with the minimum numerical score an article must have, or pass nothing
+# to apply no minimum; articles whose score cannot be determined are removed either way.
+
+import json
+import sys
+import urllib.request
+import xml.etree.ElementTree as ET
+import re
+import ssl
+
+def get_score_of_url(article_url):
+  # Download HTML of article.
+  req = urllib.request.Request(article_url)
+  req.add_header("Accept", "*/*")
+  req.add_header("User-Agent", "curl/7.55.1")
+  response = urllib.request.urlopen(req, context=ssl.SSLContext())
+  text = response.read().decode("utf-8")
+  score = int(re.search(r'metascore_w larger tvshow positive">(\d{1,2})', text).group(1))
+  return score
+
+def main():
+  minimal_score = int(sys.argv[1]) if len(sys.argv) >= 2 else -1
+
+  # Read RSS 2.0 feed data from input.
+  sys.stdin.reconfigure(encoding="utf-8")
+
+  #req = urllib.request.Request("https://www.metacritic.com/rss/tv")
+  #req.add_header("Accept", "*/*")
+  #req.add_header("User-Agent", "curl/7.55.1")
+  #feed_data = urllib.request.urlopen(req, context=ssl.SSLContext()).read()
+
+  feed_data = sys.stdin.read()
+  feed_document = ET.fromstring(feed_data)
+
+  # Process articles one by one.
+  feed_channel = feed_document.find(".//channel")
+  feed_articles = feed_channel.findall("item")
+
+  for article in feed_articles:
+    try:
+      article_score = get_score_of_url(article.find("link").text)
+    except:
+      article_score = minimal_score - 1
+      pass
+
+    if article_score < minimal_score:
+      feed_channel.remove(article)
+    else:
+      article_title = article.find("title")
+      article_title.text += " - {}".format(article_score)
+
+  out_xml = ET.tostring(feed_document)
+  out_decoded_xml = out_xml.decode()
+  
+  print(out_decoded_xml)
+
+if __name__ == '__main__':
+  main()