# Translates entries of RSS 2.0 feed into different locale. # # Make sure to have all dependencies installed: # pip3 install googletrans # pip3 install asyncio (if using parallel version of the script) # pip3 install hyper (for HTTP/2 support, much faster than default) # # You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl: # curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true" # # You must provide three command line arguments: # translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] import json import re import sys import time import html import requests import distutils.util import xml.etree.ElementTree as ET from googletrans import Translator from bs4 import BeautifulSoup lang_from = sys.argv[1] lang_to = sys.argv[2] parallel = bool(distutils.util.strtobool(sys.argv[3])) if (len(sys.argv) >= 5): src_enc = sys.argv[4] else: src_enc = "utf-8" if parallel: import asyncio from concurrent.futures import ThreadPoolExecutor sys.stdin.reconfigure(encoding = src_enc) rss_data = sys.stdin.read() rss_document = ET.fromstring(rss_data) translator = Translator() atom_ns = {"ns": "http://www.w3.org/2005/Atom"} def translate_string(to_translate): try: if to_translate is None: return to_translate translated_text = translator.translate(to_translate, src = lang_from, dest = lang_to) if not parallel: time.sleep(0.2) return translated_text.text except: return to_translate def process_article(article): title = article.find("title") if title is None: title = article.find("ns:title", atom_ns) if title is not None: title.text = translate_string(title.text) # RSS. contents = article.find("description") if contents is None: # ATOM. contents = article.find("ns:content", atom_ns) if contents is not None: htmll = "