rssguard/resources/scripts/scrapers/translate-feed.py

123 lines
3.3 KiB
Python
Raw Normal View History

2022-04-26 08:01:42 +02:00
# Translates the entries of an RSS 2.0 (or Atom) feed into a different language.
#
# Requires Python 3.10+.
2021-03-31 09:59:33 +02:00
#
# Make sure to have all dependencies installed:
2022-04-26 08:01:42 +02:00
# pip3 install googletrans-py lxml bs4 httpx httpcore asyncio --upgrade
2021-03-31 09:59:33 +02:00
#
2022-04-06 06:23:33 +02:00
# You must provide raw RSS 2.0 (or ATOM) UTF-8 feed XML data as input, for example with curl:
# curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
2021-03-31 09:59:33 +02:00
#
2021-03-31 10:20:18 +02:00
# You must provide three command line arguments (a fourth one, the feed encoding, is optional):
2021-09-23 11:26:43 +02:00
# translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)]
2021-03-31 09:59:33 +02:00
2021-09-23 11:25:19 +02:00
import io
2021-03-31 09:59:33 +02:00
import sys
import time
2022-04-26 08:01:42 +02:00
import setuptools._distutils.util
2021-03-31 09:59:33 +02:00
import xml.etree.ElementTree as ET
2021-09-23 11:25:19 +02:00
import itertools as IT
2021-03-31 09:59:33 +02:00
from googletrans import Translator
from bs4 import BeautifulSoup
2021-03-31 09:59:33 +02:00
def _parse_bool(value):
    """Convert a textual truth value ("true", "0", "yes", ...) to a bool.

    Accepts the same spellings as distutils' ``strtobool`` and raises
    ``ValueError`` for anything else. It is implemented locally so that
    parsing the flag does not go through a private setuptools module.
    """
    normalized = value.lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return True
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value {!r}".format(value))


# Three arguments are mandatory; fail with a readable usage line instead of
# an IndexError traceback when some of them are missing.
if len(sys.argv) < 4:
    sys.exit(
        "usage: translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] "
        "[RUN-PARALLEL] [FEED-ENCODING (optional)]"
    )

lang_from = sys.argv[1]
lang_to = sys.argv[2]
parallel = _parse_bool(sys.argv[3])

# Optional fourth argument: encoding of the feed data arriving on stdin.
if len(sys.argv) >= 5:
    src_enc = sys.argv[4]
else:
    src_enc = "utf-8"

# The thread pool is only needed when articles are translated in parallel.
if parallel:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

# Decode standard input with the requested encoding.
sys.stdin.reconfigure(encoding = src_enc)
2021-03-31 09:59:33 +02:00
# Read the whole feed from standard input and parse it.
rss_data = sys.stdin.read()

try:
    rss_document = ET.fromstring(rss_data)
except ET.ParseError as err:
    # Extend the error message with the offending source line and a caret
    # marker, then re-raise the same exception.
    lineno, column = err.position
    source_lines = rss_data.split("\n")

    # The reported line number is 1-based. Fall back to an empty line when
    # it is out of range so that the original ParseError is never masked.
    if 0 < lineno <= len(source_lines):
        line = source_lines[lineno - 1]
    else:
        line = ""

    # The reported column is a 0-based offset, so the marker has to be
    # (column + 1) characters wide for the caret to sit under that column.
    caret = '{:=>{}}'.format('^', column + 1)
    err.msg = '{}\n{}\n{}'.format(err, line, caret)
    raise
2021-03-31 09:59:33 +02:00
# Translator instance used by translate_string() for every translation call.
translator = Translator()
# Prefix-to-URI mapping passed to find()/findall() when looking up Atom elements.
atom_ns = {"ns": "http://www.w3.org/2005/Atom"}
2021-03-31 09:59:33 +02:00
def translate_string(to_translate):
    """Translate text from ``lang_from`` to ``lang_to``.

    The input is returned unchanged when it is ``None``, when it is empty or
    whitespace-only, or when the translation attempt raises an exception, so
    one failed request does not abort processing of the whole feed.
    """
    # Nothing to translate: skip the translator call (and the delay) entirely.
    if to_translate is None:
        return to_translate
    if isinstance(to_translate, str) and not to_translate.strip():
        return to_translate

    try:
        translated_text = translator.translate(to_translate, src = lang_from, dest = lang_to)

        # Sequential mode pauses briefly after each call (presumably to
        # throttle requests to the translation service).
        if not parallel:
            time.sleep(0.2)

        return translated_text.text
    except Exception:
        # Best effort: keep the original text. Unlike a bare "except:", this
        # lets KeyboardInterrupt and SystemExit propagate.
        return to_translate
2021-03-31 09:59:33 +02:00
def process_article(article):
    """Translate the title and the body of one feed entry in place.

    ``article`` is an RSS ``item`` or Atom ``entry`` element. Its title text
    is replaced with the translation. Its body (RSS ``description`` or Atom
    ``content``) is reduced to plain text, translated, and written back with
    line breaks turned into ``<br/>``. Nothing is returned.
    """
    # Title: plain RSS element first, then the namespaced Atom one.
    title = article.find("title")
    if title is None:
        title = article.find("ns:title", atom_ns)
    if title is not None:
        title.text = translate_string(title.text)

    # Body: RSS <description>, falling back to Atom <content>.
    contents = article.find("description")
    if contents is None:
        contents = article.find("ns:content", atom_ns)

    # An element without text has nothing to translate. Formatting None into
    # the wrapper below would otherwise put the literal string "None" into
    # the feed.
    if contents is None or not contents.text:
        return

    # Wrap the markup in a single root element and extract its plain text.
    htmll = "<div>{}</div>".format(contents.text)
    soup = BeautifulSoup(htmll, features = "lxml")
    translated = translate_string(soup.get_text())
    contents.text = translated.replace("\n", "<br/>")
2021-03-31 09:59:33 +02:00
2021-03-31 10:40:44 +02:00
# Translate the title of the feed itself.
# RSS keeps it under <channel>; Atom keeps it directly under the root element.
channel = rss_document.find("channel")
title = channel.find("title") if channel is not None else None

if title is None:
    # No RSS title found, try the Atom one.
    title = rss_document.find("ns:title", atom_ns)

if title is not None:
    title.text = translate_string(title.text)
2021-03-31 10:40:44 +02:00
# Translate articles: RSS <item> elements first, then Atom <entry> elements.
articles = rss_document.findall(".//item")
articles += rss_document.findall(".//ns:entry", atom_ns)

if not parallel:
    for article in articles:
        process_article(article)
else:
    with ThreadPoolExecutor(max_workers = 2) as executor:
        futures = [executor.submit(process_article, article) for article in articles]

        # Wait for every task; result() re-raises an exception from a worker.
        for future in futures:
            future.result()
2021-03-31 09:59:33 +02:00
2021-09-23 11:25:19 +02:00
# Serialize the translated document and write it to standard output.
print(ET.tostring(rss_document).decode())