# Translates entries of RSS 2.0 feed into different locale.
#
# Make sure to have all dependencies installed:
#   pip3 install googletrans
#   pip3 install beautifulsoup4 lxml (used to strip markup from article bodies)
#   pip3 install asyncio (if using parallel version of the script)
#   pip3 install hyper (for HTTP/2 support, much faster than default)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
#   curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
#
# You must provide three command line arguments, optionally followed by a fourth:
#   translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)]

import json
import re
import io
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
import itertools as IT

from googletrans import Translator
from bs4 import BeautifulSoup

# Fail with a readable usage message instead of an IndexError traceback.
if len(sys.argv) < 4:
    sys.exit(
        "usage: translate-feed.py FROM-LANGUAGE TO-LANGUAGE RUN-PARALLEL "
        "[FEED-ENCODING]"
    )

lang_from = sys.argv[1]
lang_to = sys.argv[2]
parallel = bool(distutils.util.strtobool(sys.argv[3]))

# Optional fourth argument: encoding of the feed data arriving on stdin.
src_enc = sys.argv[4] if len(sys.argv) >= 5 else "utf-8"

if parallel:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding=src_enc)
rss_data = sys.stdin.read()

try:
    rss_document = ET.fromstring(rss_data)
except ET.ParseError as err:
    lineno, column = err.position
    # The reported line number is 1-based, so skip (lineno - 1) lines to reach
    # the offending one; fall back to an empty string if it is out of range.
    line = next(IT.islice(io.StringIO(rss_data), max(lineno - 1, 0), None), "")
    caret = '{:=>{}}'.format('^', column)
    # Drop the line terminator so the caret is printed directly under the line.
    err.msg = '{}\n{}\n{}'.format(err, line.rstrip("\r\n"), caret)
    raise

translator = Translator()
atom_ns = {"ns": "http://www.w3.org/2005/Atom"}


def translate_string(to_translate):
    """Translate text from ``lang_from`` to ``lang_to``, best effort.

    Returns the translated text. ``None`` and blank strings are returned
    unchanged without contacting the translator. If the translation fails for
    any reason, a warning is written to stderr and the original text is
    returned, so a single failure does not abort the whole feed.
    """
    if to_translate is None or not to_translate.strip():
        return to_translate

    try:
        translated_text = translator.translate(
            to_translate, src=lang_from, dest=lang_to
        )

        if not parallel:
            # Pause between consecutive requests in sequential mode.
            time.sleep(0.2)

        return translated_text.text
    except Exception as exc:
        # stdout carries the resulting XML, so diagnostics go to stderr.
        print("warning: translation failed: {!r}".format(exc), file=sys.stderr)
        return to_translate


def process_article(article):
    """Translate the title and the body of one feed entry in place.

    ``article`` is an RSS ``item`` element or an Atom ``entry`` element. The
    title is looked up as ``title`` and then as the Atom-namespaced ``title``;
    the body as ``description`` and then as the Atom-namespaced ``content``.
    Missing elements and empty bodies are left untouched.
    """
    # RSS.
    title = article.find("title")

    if title is None:
        # ATOM.
        title = article.find("ns:title", atom_ns)

    if title is not None:
        title.text = translate_string(title.text)

    # RSS.
    contents = article.find("description")

    if contents is None:
        # ATOM.
        contents = article.find("ns:content", atom_ns)

    # An element without text would otherwise be formatted as the literal
    # string "None" and that word would end up in the translated feed.
    if contents is None or contents.text is None:
        return

    # NOTE(review): the wrapper literal below and the replacement target further
    # down are reproduced exactly as they appear in this copy of the file (bare
    # line breaks). They look like HTML markup lost in extraction, which would
    # make the replace() a no-op here - confirm against the upstream script.
    htmll = "\n{}\n".format(contents.text)

    # Extract plain text from the (possibly HTML) body before translating it.
    soup = BeautifulSoup(htmll, features="lxml")
    contents.text = translate_string(soup.get_text())
    contents.text = contents.text.replace("\n", "\n")


# Translate title.
# RSS.
channel = rss_document.find("channel")
title = channel.find("title") if channel is not None else None

if title is None:
    # ATOM.
    title = rss_document.find("ns:title", atom_ns)

if title is not None:
    title.text = translate_string(title.text)

# Translate articles: RSS items first, then Atom entries.
articles = rss_document.findall(".//item")
articles += rss_document.findall(".//ns:entry", atom_ns)

if parallel:
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(process_article, article) for article in articles
        ]

        # Collect the results so that an exception raised in a worker thread
        # is re-raised here instead of being lost.
        for future in futures:
            future.result()
else:
    for article in articles:
        process_article(article)

out_xml = ET.tostring(rss_document)
out_decoded_xml = out_xml.decode()

print(out_decoded_xml)