From 2eb016c895d46a3a42f4143203a7b0c4578fb010 Mon Sep 17 00:00:00 2001 From: Martin Rotter Date: Tue, 26 Apr 2022 08:01:42 +0200 Subject: [PATCH] =?UTF-8?q?update=20translation=20scraper=20=C2=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../desktop/com.github.rssguard.appdata.xml | 2 +- resources/scripts/scrapers/translate-feed.py | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml index 06d0d23f1..c69521035 100644 --- a/resources/desktop/com.github.rssguard.appdata.xml +++ b/resources/desktop/com.github.rssguard.appdata.xml @@ -26,7 +26,7 @@ https://github.com/sponsors/martinrotter - + none diff --git a/resources/scripts/scrapers/translate-feed.py b/resources/scripts/scrapers/translate-feed.py index fc5d26fa0..7c6845a6b 100644 --- a/resources/scripts/scrapers/translate-feed.py +++ b/resources/scripts/scrapers/translate-feed.py @@ -1,9 +1,9 @@ -# Translates entries of RSS 2.0 feed into different locale. +# Translates entries of RSS 2.0 (or ATOM) feed into different locale. +# +# Requires Python 3.10+. # # Make sure to have all dependencies installed: -# pip3 install googletrans -# pip3 install asyncio (if using parallel version of the script) -# pip3 install hyper (for HTTP/2 support, much faster than default) +# pip3 install googletrans-py lxml bs4 httpx httpcore asyncio --upgrade # # You must provide raw RSS 2.0 (or ATOM) UTF-8 feed XML data as input, for example with curl: # curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true" @@ -11,14 +11,10 @@ # You must provide three command line arguments: # translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)] -import json -import re import io import sys import time -import html -import requests -import distutils.util +import setuptools._distutils.util import xml.etree.ElementTree as ET import itertools as IT from googletrans import Translator @@ -26,7 +22,7 @@ from bs4 import BeautifulSoup lang_from = sys.argv[1] lang_to = sys.argv[2] -parallel = bool(distutils.util.strtobool(sys.argv[3])) +parallel = bool(setuptools._distutils.util.strtobool(sys.argv[3])) if (len(sys.argv) >= 5): src_enc = sys.argv[4] @@ -40,8 +36,6 @@ if parallel: sys.stdin.reconfigure(encoding = src_enc) rss_data = sys.stdin.read() -#print(rss_data) - try: rss_document = ET.fromstring(rss_data) except ET.ParseError as err: