rssguard/resources/scripts/scrapers/translate-feed.py
2021-09-23 11:26:43 +02:00

129 lines
3.4 KiB
Python

# Translates the entries of an RSS 2.0 or Atom feed into a different language.
#
# Make sure to have all dependencies installed:
# pip3 install googletrans
# pip3 install beautifulsoup4 lxml requests (also imported or used by this script)
# pip3 install asyncio (if using parallel version of the script)
# pip3 install hyper (for HTTP/2 support, much faster than default)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
#
# You must provide three command line arguments; the fourth one is optional (defaults to "utf-8"):
# translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)]
import json
import re
import io
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
import itertools as IT
from googletrans import Translator
from bs4 import BeautifulSoup
# Source and target language codes handed to the translator.
lang_from, lang_to = sys.argv[1], sys.argv[2]

# Third argument switches the thread-pool based processing on or off; any
# spelling understood by strtobool ("true", "yes", "1", "false", ...) works.
# NOTE(review): distutils was removed from the standard library in
# Python 3.12 (PEP 632) - confirm which interpreter versions must be supported.
parallel = bool(distutils.util.strtobool(sys.argv[3]))

# Optional fourth argument: encoding of the feed data arriving on stdin.
src_enc = sys.argv[4] if len(sys.argv) >= 5 else "utf-8"

# The concurrency helpers are imported only when parallel mode is requested.
if parallel:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor
# Decode stdin with the encoding selected on the command line and read the
# whole feed at once.
sys.stdin.reconfigure(encoding=src_enc)
rss_data = sys.stdin.read()

try:
    rss_document = ET.fromstring(rss_data)
except ET.ParseError as err:
    # Re-raise with the offending source line and a caret marker appended to
    # the message, so that a malformed feed is easier to diagnose.
    lineno, column = err.position

    # Line numbers reported by the parser are 1-based, so the lineno - 1
    # preceding lines have to be skipped to reach the reported one. The ""
    # default keeps the ParseError itself visible when the input has no such
    # line (e.g. empty stdin) instead of masking it with StopIteration.
    remaining_lines = IT.islice(io.StringIO(rss_data), max(lineno - 1, 0), None)
    line = next(remaining_lines, "")

    caret = '{:=>{}}'.format('^', column)

    # Strip the line terminator so the caret sits directly below the line.
    err.msg = '{}\n{}\n{}'.format(err, line.rstrip("\r\n"), caret)
    raise
# Prefix -> namespace URI map used when querying Atom elements ("ns:...").
atom_ns = dict(ns="http://www.w3.org/2005/Atom")

# Single translator instance used by translate_string().
translator = Translator()
def translate_string(to_translate):
    """Translate a piece of text from ``lang_from`` to ``lang_to``.

    Translation is best effort: when the input is ``None`` or empty, or when
    the translation fails for any reason, the input is returned unchanged.

    Args:
        to_translate: Text to translate; may be ``None``.

    Returns:
        The translated text, or ``to_translate`` itself as the fallback.
    """
    # Nothing to translate: skip the translator call (and the pause that
    # follows it in sequential mode) altogether.
    if not to_translate:
        return to_translate

    try:
        translated = translator.translate(to_translate, src=lang_from, dest=lang_to)

        # Sequential mode pauses after every request - presumably to avoid
        # flooding the translation service.
        if not parallel:
            time.sleep(0.2)

        return translated.text
    except Exception:
        # Deliberate best-effort fallback. Unlike a bare "except:", this lets
        # KeyboardInterrupt and SystemExit propagate, so the script can still
        # be aborted while a translation is in progress.
        return to_translate
def process_article(article):
    """Translate the title and the body of one feed article in place.

    Handles RSS ``<item>`` elements (``title`` / ``description``) as well as
    Atom ``<entry>`` elements (``title`` / ``content`` in the Atom namespace).
    The body is reduced to plain text before it is translated and its line
    breaks are written back as ``<br/>``.

    Args:
        article: ``xml.etree.ElementTree.Element`` of the article; it is
            modified in place.
    """
    # RSS element first, Atom element as the fallback.
    title = article.find("title")
    if title is None:
        title = article.find("ns:title", atom_ns)
    if title is not None:
        title.text = translate_string(title.text)

    # RSS element first, Atom element as the fallback.
    contents = article.find("description")
    if contents is None:
        contents = article.find("ns:content", atom_ns)

    # An element without text (e.g. <description/>) is left alone: formatting
    # None into the wrapper below would turn the literal word "None" into the
    # article body.
    if contents is None or not contents.text:
        return

    # Wrap the markup in a single root element and strip the tags.
    wrapped = "<div>{}</div>".format(contents.text)
    plain_text = BeautifulSoup(wrapped, features="lxml").get_text()

    contents.text = translate_string(plain_text).replace("\n", "<br/>")
# Translate the feed title. RSS keeps it inside <channel>; Atom keeps it
# directly below the root element, in the Atom namespace.
channel = rss_document.find("channel")
title = channel.find("title") if channel is not None else None

if title is None:
    # No RSS title found, try the Atom one.
    title = rss_document.find("ns:title", atom_ns)

if title is not None:
    title.text = translate_string(title.text)
# Translate articles: RSS <item> elements first, then Atom <entry> elements.
article_queries = ((".//item", None), (".//ns:entry", atom_ns))

if parallel:
    with ThreadPoolExecutor(max_workers=2) as executor:
        # Queue one job per article.
        futures = [
            executor.submit(process_article, article)
            for query, namespaces in article_queries
            for article in rss_document.findall(query, namespaces)
        ]

        # Wait for every job; result() re-raises an exception raised by a
        # worker.
        for future in futures:
            future.result()
else:
    for query, namespaces in article_queries:
        for article in rss_document.findall(query, namespaces):
            process_article(article)
# Serialize the translated tree and write it to stdout. tostring() is called
# with its default encoding, so the result is bytes that are decoded to text
# before printing.
print(ET.tostring(rss_document).decode())