update translation scraper
This commit is contained in:
parent
fe02722662
commit
2eb016c895
@ -26,7 +26,7 @@
|
|||||||
<url type="donation">https://github.com/sponsors/martinrotter</url>
|
<url type="donation">https://github.com/sponsors/martinrotter</url>
|
||||||
<content_rating type="oars-1.1" />
|
<content_rating type="oars-1.1" />
|
||||||
<releases>
|
<releases>
|
||||||
<release version="4.2.1" date="2022-04-22"/>
|
<release version="4.2.1" date="2022-04-26"/>
|
||||||
</releases>
|
</releases>
|
||||||
<content_rating type="oars-1.0">
|
<content_rating type="oars-1.0">
|
||||||
<content_attribute id="violence-cartoon">none</content_attribute>
|
<content_attribute id="violence-cartoon">none</content_attribute>
|
||||||
|
@ -1,9 +1,9 @@
|
|||||||
# Translates entries of RSS 2.0 feed into different locale.
|
# Translates entries of an RSS 2.0 (or ATOM) feed into a different locale.
|
||||||
|
#
|
||||||
|
# Requires Python 3.10+.
|
||||||
#
|
#
|
||||||
# Make sure to have all dependencies installed:
|
# Make sure to have all dependencies installed:
|
||||||
# pip3 install googletrans
|
# pip3 install googletrans-py lxml bs4 httpx httpcore asyncio --upgrade
|
||||||
# pip3 install asyncio (if using parallel version of the script)
|
|
||||||
# pip3 install hyper (for HTTP/2 support, much faster than default)
|
|
||||||
#
|
#
|
||||||
# You must provide raw RSS 2.0 (or ATOM) UTF-8 feed XML data as input, for example with curl:
|
# You must provide raw RSS 2.0 (or ATOM) UTF-8 feed XML data as input, for example with curl:
|
||||||
# curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
|
# curl 'https://phys.org/rss-feed/' | python ./translate-feed.py "en" "pt_BR" "true"
|
||||||
@ -11,14 +11,10 @@
|
|||||||
# You must provide three command line arguments:
|
# You must provide three command line arguments (a fourth, the feed encoding, is optional):
|
||||||
# translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)]
|
# translate-feed.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] [FEED-ENCODING (optional)]
|
||||||
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
import io
|
import io
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import html
|
import setuptools._distutils.util
|
||||||
import requests
|
|
||||||
import distutils.util
|
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
import itertools as IT
|
import itertools as IT
|
||||||
from googletrans import Translator
|
from googletrans import Translator
|
||||||
@ -26,7 +22,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
lang_from = sys.argv[1]
|
lang_from = sys.argv[1]
|
||||||
lang_to = sys.argv[2]
|
lang_to = sys.argv[2]
|
||||||
parallel = bool(distutils.util.strtobool(sys.argv[3]))
|
parallel = bool(setuptools._distutils.util.strtobool(sys.argv[3]))
|
||||||
|
|
||||||
if (len(sys.argv) >= 5):
|
if (len(sys.argv) >= 5):
|
||||||
src_enc = sys.argv[4]
|
src_enc = sys.argv[4]
|
||||||
@ -40,8 +36,6 @@ if parallel:
|
|||||||
sys.stdin.reconfigure(encoding = src_enc)
|
sys.stdin.reconfigure(encoding = src_enc)
|
||||||
rss_data = sys.stdin.read()
|
rss_data = sys.stdin.read()
|
||||||
|
|
||||||
#print(rss_data)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
rss_document = ET.fromstring(rss_data)
|
rss_document = ET.fromstring(rss_data)
|
||||||
except ET.ParseError as err:
|
except ET.ParseError as err:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user