2024-08-24 02:16:17 +02:00
|
|
|
#!/usr/bin/env python3
|
2024-09-21 17:13:19 +02:00
|
|
|
# Frontmatter key compared between a source document and its translation to
# decide whether the translation is still current.
ModificationMetadataKey = "lastmod"

# Frontmatter keys whose whole line is copied verbatim from the source document
# instead of being taken from the translator output.
KeepOriginalMetadata = [
    "draft",
    "date",
    "lastmod",
    "slug",
]

# Languages every document gets translated into (its own source language is skipped).
DestinationLanguages = [
    "it",
    "en",
    "es",
    "fr",
] # "de", "eo"

# Sub-paths, appended to "../content/<language>", that are scanned for documents.
IncludePaths = [
    "/",
]

# Language-independent document paths (or folder prefixes) that are never translated.
ExcludePaths = [
    "/miscellanea/Devlogs.md",
    "/miscellanea/Percent-Encoding.md",
    "/admin.md",
] # "/miscellanea/PicoBlog.md"

# Literal search -> replacement pairs applied, in this order, to every translated text.
TranslationFixes = {
    "{{<assetsRoot>}}_/": "{{< assetsRoot>}}/",
    "{{< assetsRoot >}}_/": "{{< assetsRoot >}}/",
    " ``` _": " ``` ",
}
|
2024-08-24 02:16:17 +02:00
|
|
|
|
2024-09-02 18:44:31 +02:00
|
|
|
import subprocess
|
2024-08-24 02:16:17 +02:00
|
|
|
from os import getcwd, listdir
|
|
|
|
from os.path import dirname, realpath, isfile, isdir
|
|
|
|
from pathlib import Path
|
|
|
|
from translate_shell.translate import translate
|
|
|
|
|
2024-09-21 17:13:19 +02:00
|
|
|
# TODO somehow handle overriding frontmatter data for some translations (title, slug, ...) (via other files or commented metadata lines?)
|
2024-08-24 02:16:17 +02:00
|
|
|
# TODO handle deleted files? (it should probably be done in another sh script, not here)
|
|
|
|
|
|
|
|
def printf(*objects):
    """Print the given objects on the current line and flush stdout immediately.

    Unlike a plain ``print`` no newline is appended, so successive calls keep
    extending the same output line.
    """
    print(*objects, sep=' ', end='', flush=True)
|
2024-08-24 02:16:17 +02:00
|
|
|
|
|
|
|
def get_source_language(document_path):
    """Return the first '/'-separated component of a document path.

    Document paths in this script are shaped like "<language>/<rest>", so the
    first component is the language the document is written in.
    """
    language, _, _ = document_path.partition('/')
    return language
|
|
|
|
|
2024-09-21 02:33:49 +02:00
|
|
|
def read_original_document(document_path):
    """Return the full text of a source document.

    Args:
        document_path: Path of the document relative to "../content/"
            (for example "it/blog/post.md").

    Returns:
        The file content as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(("../content/" + document_path), 'r') as document_file:
        return document_file.read()
|
|
|
|
|
2024-08-24 02:16:17 +02:00
|
|
|
def make_destination_path(document_path, destination_language):
    """Build the output path for a document's translation.

    The first component of ``document_path`` (its source language) is dropped
    and the remainder is placed under "./translate/<destination_language>/".
    """
    _, _, language_independent_path = document_path.partition('/')
    return f"./translate/{destination_language}/{language_independent_path}"
|
|
|
|
|
|
|
|
def is_translation_uptodate(source_path, destination_path):
    """Tell whether an existing translation still matches its source document.

    The frontmatter of both files is compared line by line at the same index:
    the first source line whose key equals ``ModificationMetadataKey`` decides
    the result. The translation is up to date when that line is identical in
    the translated file.

    Args:
        source_path: Source document path, relative to the content folder.
        destination_path: Filesystem path of the already written translation.

    Returns:
        False when the modification line differs; True when it matches or when
        no such line can be compared (missing key, missing frontmatter, or a
        translated frontmatter shorter than the source one).
    """
    original_tokens = split_with_frontmatter(read_original_document(source_path))
    # Close the translated file deterministically instead of leaking the handle.
    with open(destination_path, 'r') as translated_file:
        translated_tokens = split_with_frontmatter(translated_file.read())
    # split_with_frontmatter() returns None for documents without frontmatter;
    # treat that as an empty frontmatter instead of crashing on None[1].
    original_lines = original_tokens[1].splitlines() if original_tokens else []
    translated_lines = translated_tokens[1].splitlines() if translated_tokens else []
    # zip() stops at the shorter list, so only lines present in both are compared.
    for original_line, translated_line in zip(original_lines, translated_lines):
        line_key = original_line.split('=')[0]
        if line_key.strip().lower() == ModificationMetadataKey:
            # Only the first modification line is considered.
            return original_line == translated_line
    return True
|
|
|
|
|
|
|
|
# TODO handle when the same document is available in multiple source languages?
|
|
|
|
def needs_translation(source_path, destination_language=None):
    """Decide whether a source document has to be translated.

    Without ``destination_language`` this only checks that the document is not
    excluded and not blank. With a language it additionally returns False when
    a translation file already exists and is still up to date.
    """
    # Language-independent form of the path, e.g. "it/blog/a.md" -> "/blog/a.md".
    document_path = '/' + source_path.partition('/')[2]
    is_excluded = any(
        document_path == exclude_path
        or document_path.startswith(exclude_path.rstrip('/') + '/')
        for exclude_path in ExcludePaths
    )
    if is_excluded:
        return False
    # Blank documents are never translated.
    if not read_original_document(source_path).strip():
        return False
    if not destination_language:
        return True
    destination_path = make_destination_path(source_path, destination_language)
    already_translated = (
        isfile(destination_path)
        and is_translation_uptodate(source_path, destination_path)
    )
    return not already_translated
|
|
|
|
|
|
|
|
def find_documents(folder_path):
    """Scan a folder and map each translatable document to its pending languages.

    Every regular file matching ``*.*`` below ``folder_path`` is considered.
    The first two '/'-separated components of each found path are dropped
    (callers pass paths starting with "../content/"), leaving
    "<language>/<rest>".

    Returns:
        A dict whose keys are the documents that need translation at all and
        whose values are lists of destination languages still missing or
        outdated (possibly empty).
    """
    candidates = [
        '/'.join(str(found_path).split('/')[2:])
        for found_path in Path(folder_path).rglob('*.*')
        if isfile(found_path)
    ]
    documents = {}
    for document in candidates:
        print(f"* {document},", flush=True)
        if not needs_translation(document):
            continue
        # Never translate a document into its own source language.
        other_languages = set(DestinationLanguages) - {get_source_language(document)}
        documents[document] = [
            language
            for language in other_languages
            if needs_translation(document, language)
        ]
    return documents
|
|
|
|
|
2024-09-21 17:13:19 +02:00
|
|
|
def split_with_frontmatter(document_text):
    """Split a document into frontmatter delimiters, frontmatter and body.

    The first non-blank line must be exactly "---" or "+++" (after stripping).
    The text is then split on that delimiter wherever it occurs.

    Returns:
        ``[delimiter, frontmatter, delimiter, body]`` when the document starts
        with a frontmatter delimiter, otherwise None. Delimiter occurrences
        inside the body are preserved.

    Raises:
        IndexError: If ``document_text`` is empty or only whitespace.
    """
    delimiter = document_text.strip().splitlines()[0].strip()
    if delimiter not in ("---", "+++"):
        return None
    # Anything before the opening delimiter is discarded; the pieces after the
    # closing one are glued back together to rebuild the body.
    _, frontmatter, *body_pieces = document_text.split(delimiter)
    return [delimiter, frontmatter, delimiter, delimiter.join(body_pieces)]
|
|
|
|
|
2024-09-02 18:44:31 +02:00
|
|
|
def fix_frontmatter(translated_text, reference_text):
    """Repair translated frontmatter using the original frontmatter as reference.

    Lines are matched by index. For every non-blank translated line without
    leading whitespace, the key (the text before the first '=') is taken from
    the reference line. If that key is listed in ``KeepOriginalMetadata`` the
    whole reference line is used; otherwise the translated value is kept.
    Blank and indented lines pass through unchanged.

    Returns:
        The repaired frontmatter, each line terminated by a newline.

    Raises:
        IndexError: If a processed translated line has no reference line at
            the same index.
    """
    reference_lines = reference_text.splitlines()
    fixed_lines = []
    for index, translated_line in enumerate(translated_text.splitlines()):
        is_blank = not translated_line.strip()
        is_indented = translated_line.lstrip() != translated_line
        if is_blank or is_indented:
            fixed_lines.append(translated_line)
            continue
        reference_line = reference_lines[index]
        line_key = reference_line.partition('=')[0]
        if line_key.strip().lower() in KeepOriginalMetadata:
            fixed_lines.append(reference_line)
            continue
        # Keep the original key (the translator may have altered it) and the
        # translated value, if any.
        line_value = translated_line.partition('=')[2]
        fixed_lines.append((line_key + '=' + line_value) if line_value else line_key)
    return ''.join(fixed_line + '\n' for fixed_line in fixed_lines)
|
|
|
|
|
2024-09-21 17:13:19 +02:00
|
|
|
# <https://stackoverflow.com/a/18815890>
|
|
|
|
def ascii_to_number(text:str) -> int:
    """Encode a string as an integer by concatenating its characters' bits.

    Each character contributes its code point in binary, left-padded to at
    least 8 bits; the concatenated bit string is read as a base-2 number.

    Raises:
        ValueError: If ``text`` is empty.
    """
    bit_string = ''.join(f"{ord(character):08b}" for character in text)
    return int(bit_string, 2)
|
|
|
|
|
|
|
|
# <https://stackoverflow.com/a/699891>, <https://stackoverflow.com/a/40559005>
|
|
|
|
def number_to_ascii(number:int) -> str:
    """Decode an integer back into the string whose 8-bit characters it encodes.

    This is the inverse of encoding each character as 8 bits and reading the
    concatenation as a base-2 number. The number is rendered with the minimal
    number of whole bytes, so no spurious leading NUL characters are produced
    (the previous padding always added 1 to 8 zero bits and forced a 16-bit
    minimum width).

    Args:
        number: A non-negative integer, or anything ``int()`` accepts.

    Returns:
        One character per byte of the number, most significant byte first.
        Zero decodes to a single NUL character.

    Raises:
        ValueError: If the number is negative or cannot be converted by ``int()``.
    """
    value = int(number)
    if value < 0:
        raise ValueError("number must be non-negative")
    # At least one byte, so that 0 still maps to one character.
    byte_count = max(1, (value.bit_length() + 7) // 8)
    # latin-1 maps every byte 0..255 to the character with the same code point.
    return value.to_bytes(byte_count, 'big').decode('latin-1')
|
|
|
|
|
|
|
|
# TODO add checks for number-strings to ensure they aren't already in the literal text
|
2024-12-03 01:14:46 +01:00
|
|
|
# TODO handle .notranslate HTML elements
|
|
|
|
# TODO fix strange bugs, including auto-inserted trailing underscores '_' and broken HTML closing tags
|
2024-09-21 17:13:19 +02:00
|
|
|
def wrap_for_translation(original_text):
    """Replace spans that must not be translated with numeric placeholders.

    Hugo shortcode delimiters and Markdown code fence markers are first
    rewritten into "{{@ ... @}}" markers. Each marked span is then recorded
    and swapped for a placeholder made of ``TranslationMagic``, "__", a digit
    string derived from the span (with "_" inserted after every 1 and 9) and
    a closing "__".

    Returns:
        A tuple ``(wrapped_text, external_tokens)`` where ``external_tokens``
        lists the protected spans in order of appearance.
    """
    marked_text = (original_text
        .replace("{{%", "{{@%").replace("%}}", "%@}}") # Hugo shortcodes
        .replace("{{<", "{{@<").replace(">}}", ">@}}")
        .replace("```" , "{{@```" ) # Markdown fenced code blocks
        .replace(" {{@``` ", " ``` @}}"))
    leading_text, *marked_chunks = marked_text.split("{{@")
    external_tokens = []
    wrapped_parts = [leading_text]
    for chunk in marked_chunks:
        # Text up to the first "@}}" is the protected span; the remaining
        # pieces are rejoined without their "@}}" separators.
        protected_body, *following_pieces = chunk.split("@}}")
        protected_token = "{{@" + protected_body + "@}}"
        external_tokens.append(protected_token)
        encoded_digits = str(ascii_to_number(protected_token)).replace("1", "1_").replace("9", "9_")
        placeholder = f"{TranslationMagic}__" + encoded_digits + "__"
        wrapped_parts.append(placeholder + ''.join(following_pieces))
    return (''.join(wrapped_parts), external_tokens)
|
2024-09-21 17:13:19 +02:00
|
|
|
|
2025-02-15 17:50:42 +01:00
|
|
|
def unwrap_from_translation(translated_text, external_tokens):
    """Put the protected spans back in place of their placeholders.

    The text is split on the placeholder prefix (``TranslationMagic`` + "__").
    In each following chunk, everything up to the first "__" is replaced by
    the next entry popped from the front of ``external_tokens``. The
    "{{@ ... @}}" markers are finally turned back into their original
    delimiters.

    Note:
        ``external_tokens`` is consumed (mutated) by this call.

    Raises:
        IndexError: If there are more placeholders than tokens, or a
            placeholder is not followed by "__".
    """
    leading_text, *placeholder_chunks = translated_text.split(f"{TranslationMagic}__")
    restored_parts = [leading_text]
    for chunk in placeholder_chunks:
        # The first piece is the placeholder's digit string; it is discarded.
        _, *after_placeholder = chunk.split("__")
        protected_token = external_tokens.pop(0)
        if after_placeholder[0].startswith('_'):
            # Extra underscore insertion workaround
            after_placeholder[0] = after_placeholder[0][1:]
        restored_parts.append(protected_token + "__".join(after_placeholder))
    restored_text = ''.join(restored_parts)
    return (restored_text
        .replace("{{@%", "{{%").replace("%@}}", "%}}") # Hugo shortcodes
        .replace("{{@<", "{{<").replace(">@}}", ">}}")
        .replace(" ``` @}}", " ``` ") # Markdown fenced code blocks
        .replace("{{@```" , "```" ))
|
|
|
|
#.replace("{{@```" , "```" )
|
|
|
|
#.replace(" ``` @}}", " ``` ")
|
|
|
|
#.replace(" {{@``` ", " ``` "))
|
2024-09-21 17:13:19 +02:00
|
|
|
|
2024-08-24 02:16:17 +02:00
|
|
|
def translate_document(document_path, documents):
    """Translate one document into each of its pending languages and save the results.

    For every language the Python translator is tried first; if it raises or
    returns no results, the bundled translate-shell bash script is run on a
    temporary copy of the wrapped text. The translated text then gets its
    frontmatter repaired, an automatic-translation notice inserted, the
    protected spans restored and the ``TranslationFixes`` applied before it is
    written to the destination path.

    Progress is printed on a single line; a failure for one language is
    reported and does not prevent the remaining languages from being processed.

    Args:
        document_path: Source document path ("<language>/<rest>").
        documents: Mapping of document paths to their pending destination languages.
    """
    printf(f"* {document_path} ->")
    for destination_language in documents[document_path]:
        source_language = get_source_language(document_path)
        original_text, external_tokens = wrap_for_translation(read_original_document(document_path))
        printf('', destination_language)
        try:
            is_python_translator = True
            translated = translate(original_text, destination_language, source_language)
            if not len(translated.results):
                raise Exception("Unhandled error")
        except Exception as exception:
            printf('❌', exception)
            # Fallback: run the translate-shell script on a temporary file.
            try:
                is_python_translator = False
                temporary_path = ("./tmp/" + document_path)
                Path('/'.join(temporary_path.split('/')[:-1])).mkdir(parents=True, exist_ok=True)
                # The file must be flushed and closed before the subprocess
                # reads it, so do not leave that to the garbage collector.
                with open(temporary_path, 'w') as temporary_file:
                    temporary_file.write(original_text)
                translated = subprocess.run(
                    ("bash", "../Scripts/Lib/translate-shell.bash", "-brief", "-no-autocorrect",
                        "-t", destination_language, "-s", source_language,
                        ("file://" + temporary_path)),
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                # Any stderr output is treated as a failure.
                if translated.stderr:
                    raise Exception(translated.stderr.decode())
            except Exception as exception:
                printf('❌', exception)
                continue
        try:
            translated_text = (translated.results[0].paraphrase
                if is_python_translator else translated.stdout.decode())
            translated_preamble = ("\n\n{{< noticeAutomaticTranslation " + source_language + " >}}\n\n")
            if (translated_tokens := split_with_frontmatter(translated_text)):
                # Repair the translated frontmatter against the original one.
                translated_tokens[1] = fix_frontmatter(translated_tokens[1], original_text.split(translated_tokens[0])[1])
                # Only documents with a non-blank body get the notice, placed
                # right after the closing frontmatter delimiter.
                if translated_tokens[3].strip():
                    translated_tokens.insert(3, translated_preamble)
                translated_text = ''.join(translated_tokens)
            elif translated_text.strip():
                translated_text = (translated_preamble + translated_text)
            translated_text = unwrap_from_translation(translated_text, external_tokens)
            for replacement in TranslationFixes:
                translated_text = translated_text.replace(replacement, TranslationFixes[replacement])
            destination_path = make_destination_path(document_path, destination_language)
            Path('/'.join(destination_path.split('/')[:-1])).mkdir(parents=True, exist_ok=True)
            # Close the output file explicitly so the translation is fully
            # written before success is reported.
            with open(destination_path, 'w') as destination_file:
                destination_file.write(translated_text)
            printf('✅')
        except Exception as exception:
            printf('❌', exception)
            continue
    printf('\n')
|
|
|
|
|
|
|
|
def main():
    """Translate every pending document of every source language.

    Each entry of "../content" is treated as a source language folder; within
    it, every path from ``IncludePaths`` is scanned and the documents that
    still have pending destination languages are translated.
    """
    for source_language in listdir("../content"):
        for folder_path in IncludePaths:
            documents = find_documents("../content/" + source_language + folder_path)
            for document_path, pending_languages in documents.items():
                # Documents with nothing left to translate are skipped.
                if pending_languages:
                    translate_document(document_path, documents)
|
|
|
|
|
2024-08-25 17:18:20 +02:00
|
|
|
def read_from_scripts(relative_path:str):
    """Return the text of a file located relative to this script's parent folder.

    Args:
        relative_path: Path appended to "<folder of this file>/../".

    Returns:
        The file content as a string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open((dirname(realpath(__file__)) + "/../" + relative_path), 'r') as script_file:
        return script_file.read()
|
2024-08-25 17:18:20 +02:00
|
|
|
|
2025-02-15 17:50:42 +01:00
|
|
|
# Prefix of every placeholder that stands in for a protected span while the
# text is being translated: "__" followed by the numeric encoding of "sito.octt".
TranslationMagic = f"__{ascii_to_number('sito.octt')}"
|
2024-09-21 17:13:19 +02:00
|
|
|
|
2024-08-24 02:16:17 +02:00
|
|
|
if __name__ == "__main__":
    # NOTE(review): both files below are executed as Python code via exec();
    # this is only acceptable because they are trusted files of this project.
    globals_text = read_from_scripts("Lib/Globals.sh")
    # The text after "#!" on the first line of Globals.sh is used as a marker:
    # only the part of the file before the first "#<marker>!" is executed.
    exec(
        globals_text.split(
            '#' + globals_text.splitlines()[0].split('#!')[1] + '!'
        )[0]
    )
    # SiteProps.toml is executed as-is, i.e. it is expected to also be valid Python.
    exec(read_from_scripts("../assets/SiteProps.toml"))
    main()
|