# Translates entries of an RSS 2.0 (or Atom) feed into a different locale.
#
# Make sure to have all dependencies installed:
#   pip3 install googletrans
#   pip3 install beautifulsoup4 lxml
#   pip3 install asyncio (if using parallel version of the script)
#   pip3 install hyper (for HTTP/2 support, much faster than default)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
#   curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
#
# You must provide three command line arguments:
#   translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL]
#
# An optional fourth argument names the encoding of the input data
# (defaults to "utf-8").

import json
import re
import sys
import time
import html
import requests
import xml.etree.ElementTree as ET
from googletrans import Translator
from bs4 import BeautifulSoup

try:
    import distutils.util
    strtobool = distutils.util.strtobool
except ImportError:
    # distutils is not available on newer Python versions (removed in 3.12),
    # so fall back to a local equivalent of distutils.util.strtobool.
    def strtobool(value):
        """Convert a textual truth value to 1 or 0.

        Accepts y/yes/t/true/on/1 and n/no/f/false/off/0 (case-insensitive)
        and raises ValueError for anything else.
        """
        lowered = value.lower()
        if lowered in ("y", "yes", "t", "true", "on", "1"):
            return 1
        if lowered in ("n", "no", "f", "false", "off", "0"):
            return 0
        raise ValueError("invalid truth value {!r}".format(lowered))

# Fail with a readable message instead of an IndexError traceback when the
# mandatory arguments are missing.
if len(sys.argv) < 4:
    sys.exit(
        "Usage: translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] "
        "[RUN-PARALLEL] [SOURCE-ENCODING]"
    )

lang_from = sys.argv[1]
lang_to = sys.argv[2]
parallel = bool(strtobool(sys.argv[3]))

# Optional fourth argument: encoding of the data arriving on stdin.
if len(sys.argv) >= 5:
    src_enc = sys.argv[4]
else:
    src_enc = "utf-8"

if parallel:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

# Decode stdin with the requested encoding and parse the whole feed.
sys.stdin.reconfigure(encoding=src_enc)
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

translator = Translator()
atom_ns = {"ns": "http://www.w3.org/2005/Atom"}


def translate_string(to_translate):
    """Translate a string from ``lang_from`` to ``lang_to``.

    Returns ``to_translate`` unchanged when it is ``None`` or when the
    translation fails for any reason (deliberate best-effort: an entry that
    cannot be translated keeps its original text).
    """
    if to_translate is None:
        return to_translate

    try:
        translated_text = translator.translate(
            to_translate, src=lang_from, dest=lang_to
        )

        # In sequential mode, pause briefly between requests.
        if not parallel:
            time.sleep(0.2)

        return translated_text.text
    except Exception:
        # Not a bare "except:", so KeyboardInterrupt and SystemExit still
        # propagate and the script can be interrupted.
        return to_translate


def process_article(article):
    """Translate the title and the body of one feed entry in place.

    ``article`` is an RSS ``item`` or an Atom ``entry`` element. Its title
    and its ``description`` (RSS) or ``content`` (Atom) child are replaced
    with their translations. The body is reduced to plain text with
    BeautifulSoup before it is translated.
    """
    # RSS title first, then the Atom one.
    title = article.find("title")

    if title is None:
        title = article.find("ns:title", atom_ns)

    if title is not None:
        title.text = translate_string(title.text)

    # RSS.
    contents = article.find("description")

    if contents is None:
        # ATOM.
        contents = article.find("ns:content", atom_ns)

    # An element without text has text == None; formatting that would inject
    # the literal string "None" into the feed, so such elements are skipped.
    if contents is not None and contents.text is not None:
        # NOTE(review): as found in this file, the wrapper literal and the
        # replacement literal below each contain only a line break, which
        # makes the replace() call a no-op. Markup (e.g. an HTML wrapper and
        # a line-break tag) may have been lost from these literals - confirm
        # against the upstream script.
        htmll = "\n{}\n".format(contents.text)
        soup = BeautifulSoup(htmll, features="lxml")
        contents.text = translate_string(soup.get_text())
        contents.text = contents.text.replace("\n", "\n")


# Translate title.
# RSS.
channel = rss_document.find("channel")
title = channel.find("title") if channel is not None else None

if title is None:
    # ATOM.
    title = rss_document.find("ns:title", atom_ns)

if title is not None:
    title.text = translate_string(title.text)

# Translate articles: RSS items first, then Atom entries.
articles = rss_document.findall(".//item")
articles += rss_document.findall(".//ns:entry", atom_ns)

if parallel:
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(process_article, article) for article in articles
        ]

        # Wait for every job and re-raise any exception from a worker.
        for future in futures:
            future.result()
else:
    for article in articles:
        process_article(article)

print(ET.tostring(rss_document, encoding="unicode"))