In the previous article, "[Python] Let's automatically translate English PDFs (but not limited to them) with DeepL or Google Translate and save the result as a text file", I wrote the translation result to a text file. But wouldn't it be convenient if you could compare the translation side by side with the text before translation? There is still room for improvement, but I implemented this with HTML. Paper used as the example
The code is messy because I kept rearranging it as I went, but please forgive me (it was messy from the beginning).
For the HTML display, I added ・ **a highlight function and a jump function to the corresponding English or Japanese text** ・ **a dark mode**. Also, **the color scheme in non-dark mode is now gentler on the eyes**.
**Reverse translation (Japanese → English) is now supported**.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
# Location of the ChromeDriver executable handed to webdriver.Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch flags shared by every browser session this script starts.
options = Options()
for _flag in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_flag)
def parse_merge(text, n=4900, m=1, inv=False):
    """Split ``text`` into chunks of whole sentences for translation.

    Line breaks are replaced by spaces, the text is split on the sentence
    terminator, and consecutive sentences are regrouped into chunks.

    Args:
        text: The raw text to split.
        n: Soft upper bound on the number of characters per chunk. A single
            sentence longer than ``n`` is still emitted as its own chunk.
        m: Maximum number of sentences per chunk. Values below 1 are
            treated as 1.
        inv: ``False`` splits on ``". "`` (English source); ``True`` splits
            on ``"。"`` (Japanese source).

    Returns:
        A list of non-empty chunk strings. Every sentence ends with exactly
        one terminator, and English sentences inside a chunk are separated
        by a single space.
    """
    separator = "。" if inv else ". "
    terminator = "。" if inv else "."
    joiner = "" if inv else " "
    per_chunk = max(1, m)

    chunks = []
    current = ""
    in_current = 0  # sentences already placed in ``current``
    for fragment in " ".join(text.splitlines()).split(separator):
        fragment = fragment.strip()
        if fragment in ("", "."):
            continue
        # The final fragment usually keeps its own terminator because the
        # separator (". ") does not match at the very end of the text.
        if not fragment.endswith(terminator):
            fragment += terminator
        # Only real sentences count towards ``m``; skipped fragments don't.
        chunk_is_full = in_current >= per_chunk
        too_long = len(current) + len(joiner) + len(fragment) > n
        if current and (chunk_is_full or too_long):
            chunks.append(current)
            current = ""
            in_current = 0
        current = current + joiner + fragment if current else fragment
        in_current += 1
    if current:
        chunks.append(current)
    return chunks
def TranslateFromClipboard(tool, write, filename, isPrint, html, title,
                           sentence_cnt, inv):
    """Translate the clipboard text through a browser and export the result.

    The clipboard text is split with ``parse_merge``; each chunk is pasted
    into the translator's input box and the translated text is polled from
    the result element once per loop until it is non-empty.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print each translated chunk as it arrives.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading shown at the top of the HTML page.
        sentence_cnt: Number of sentences per chunk, passed to
            ``parse_merge`` as ``m``.
        inv: Passed to ``parse_merge`` and used to pick the Google Translate
            target language (``en`` when true, ``ja`` otherwise). The DeepL
            URL does not depend on it.

    Raises:
        ValueError: If ``tool`` is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Local imports: the ``html`` flag shadows the stdlib module of that name.
    from html import escape
    from selenium.common.exceptions import (NoSuchElementException,
                                            StaleElementReferenceException)

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path/chrome_options and find_element_by_* are
    # the Selenium 3 spellings; confirm the installed Selenium still has them.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')

        en = parse_merge(ppc.paste(), m=sentence_cnt, inv=inv)
        ja = []
        for sentence in en:
            if sentence == "":
                ja.append("")
                continue
            # Paste via the clipboard, then put the user's clipboard back
            # even if sending the keystroke fails.
            cbText = ppc.paste()
            ppc.copy(sentence)
            try:
                textarea.send_keys(Keys.CONTROL, "v")
            finally:
                ppc.copy(cbText)

            transtext = ""
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value") or ""
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except (NoSuchElementException,
                            StaleElementReferenceException):
                        # Result element not rendered yet: poll again.
                        pass
            if isPrint:
                print(transtext)
            ja.append(transtext)
            # Clear the input box before the next chunk.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Always close the browser, including on errors and Ctrl-C.
        driver.quit()

    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        # Each anchor links to, and highlights on hover, its counterpart in
        # the other column. Text is escaped so markup characters in the
        # source or translation cannot break the page.
        row = ('<br><a id="{own}" href="#{other}" '
               'onmouseover="over(\'{other}\')" '
               'onmouseout="out(\'{other}\')">{text}</a><br>')
        eng_rows = []
        jpn_rows = []
        for i, (source_text, translated_text) in enumerate(zip(en, ja)):
            eng_rows.append(
                row.format(own=f"e{i}", other=f"j{i}",
                           text=escape(source_text)))
            jpn_rows.append(
                row.format(own=f"j{i}", other=f"e{i}",
                           text=escape(translated_text)))
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        page_tail = '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                + page_tail)
if __name__ == "__main__":
    # Defaults; each one may be overridden by the prompts below.
    tool = "DeepL"
    write = False
    # No extension here: TranslateFromClipboard appends ".txt" / ".html".
    filename = "translated_text"
    is_print = True
    as_html = False
    title = "EN ↔ JP"
    sentence_cnt = 1
    inv = False

    if input("1.English → Japanese 2.Japanese → English").strip() == "2":
        inv = True
    if input("1. DeepL 2.GoogleTranslate ").strip() == "2":
        tool = "GT"
    if input("Do you want to export the translation result? Y/n "
             ).strip().lower() == "y":
        # answer -> (write .txt, write .html, extension shown in the prompt)
        export_modes = {
            "1": (True, False, ".txt"),
            "2": (False, True, ".html"),
            "3": (True, True, ".txt/.html"),
        }
        case = input("1. txt 2. HTML 3. both ").strip()
        # Ask again on anything else instead of failing on an unset format.
        while case not in export_modes:
            case = input("1. txt 2. HTML 3. both ").strip()
        write, as_html, format_ = export_modes[case]
        entered_name = input(
            f"Enter a name for the output file (default is'translated_text{format_}') ")
        if entered_name:
            filename = entered_name.replace(" ", "_")
        if as_html:
            entered_title = input("Please enter the title (of the dissertation)")
            if entered_title:
                title = entered_title
    try:
        requested = int(
            input("How many sentences do you want to translate? (The default is one sentence at a time. The smaller the value, the cleaner the output, and the larger the value, the faster.)"))
    except ValueError:
        # Blank or non-numeric answer: keep the default.
        requested = sentence_cnt
    if requested >= 1:
        sentence_cnt = requested
    if input("Would you like to see the translation progress here? Y/n "
             ).strip().lower() == "n":
        is_print = False
    input("Press Enter when ready")
    TranslateFromClipboard(tool, write, filename, is_print, as_html, title,
                           sentence_cnt, inv)
Readability has improved considerably, but translating a whole document takes a long time because it is translated sentence by sentence (you can also translate several sentences at once by choosing the number at runtime).
It takes a little more work, but there is a way to get a clean translation more easily. Open the PDF in Word, copy the text, and translate it with the script below, in which only the function that splits the text has been slightly changed. As you would expect from Word, it neatly reflows sentences that straddle line breaks.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
# Location of the ChromeDriver executable handed to webdriver.Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch flags shared by every browser session this script starts.
options = Options()
for _flag in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_flag)
def parse_merge(text, n=4900):
    """Return the lines of ``text``, one entry per line.

    Lines that are exactly empty, a single space, or a single period are
    dropped. ``n`` is accepted but not used by this variant.
    """
    ignorable = ("", " ", ".")
    return [line for line in text.splitlines() if line not in ignorable]
def TranslateFromClipboard(tool, write, filename, isPrint, html, title, inv):
    """Translate the clipboard text through a browser and export the result.

    The clipboard text is split with ``parse_merge``; each entry is pasted
    into the translator's input box and the translated text is polled from
    the result element once per loop until it is non-empty.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print each translated entry as it arrives.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading shown at the top of the HTML page.
        inv: Picks the Google Translate target language (``en`` when true,
            ``ja`` otherwise). The DeepL URL does not depend on it.

    Raises:
        ValueError: If ``tool`` is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Local imports: the ``html`` flag shadows the stdlib module of that name.
    from html import escape
    from selenium.common.exceptions import (NoSuchElementException,
                                            StaleElementReferenceException)

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path/chrome_options and find_element_by_* are
    # the Selenium 3 spellings; confirm the installed Selenium still has them.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')

        en = parse_merge(ppc.paste())
        ja = []
        for sentence in en:
            if sentence == "":
                ja.append("")
                continue
            # Paste via the clipboard, then put the user's clipboard back
            # even if sending the keystroke fails.
            cbText = ppc.paste()
            ppc.copy(sentence)
            try:
                textarea.send_keys(Keys.CONTROL, "v")
            finally:
                ppc.copy(cbText)

            transtext = ""
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value") or ""
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except (NoSuchElementException,
                            StaleElementReferenceException):
                        # Result element not rendered yet: poll again.
                        pass
            if isPrint:
                print(transtext)
            ja.append(transtext)
            # Clear the input box before the next entry.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Always close the browser, including on errors and Ctrl-C.
        driver.quit()

    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        # Each anchor links to, and highlights on hover, its counterpart in
        # the other column. Text is escaped so markup characters in the
        # source or translation cannot break the page.
        row = ('<br><a id="{own}" href="#{other}" '
               'onmouseover="over(\'{other}\')" '
               'onmouseout="out(\'{other}\')">{text}</a><br>')
        eng_rows = []
        jpn_rows = []
        for i, (source_text, translated_text) in enumerate(zip(en, ja)):
            eng_rows.append(
                row.format(own=f"e{i}", other=f"j{i}",
                           text=escape(source_text)))
            jpn_rows.append(
                row.format(own=f"j{i}", other=f"e{i}",
                           text=escape(translated_text)))
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        page_tail = '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                + page_tail)
if __name__ == "__main__":
    # Defaults; each one may be overridden by the prompts below.
    tool = "DeepL"
    write = False
    # No extension here: TranslateFromClipboard appends ".txt" / ".html".
    filename = "translated_text"
    is_print = True
    as_html = False
    title = "EN ↔ JP"
    inv = False

    if input("1.English → Japanese 2.Japanese → English").strip() == "2":
        inv = True
    if input("1. DeepL 2.GoogleTranslate ").strip() == "2":
        tool = "GT"
    if input("Do you want to export the translation result? Y/n "
             ).strip().lower() == "y":
        # answer -> (write .txt, write .html, extension shown in the prompt)
        export_modes = {
            "1": (True, False, ".txt"),
            "2": (False, True, ".html"),
            "3": (True, True, ".txt/.html"),
        }
        case = input("1. txt 2. HTML 3. both ").strip()
        # Ask again on anything else instead of failing on an unset format.
        while case not in export_modes:
            case = input("1. txt 2. HTML 3. both ").strip()
        write, as_html, format_ = export_modes[case]
        entered_name = input(
            f"Enter a name for the output file (default is'translated_text{format_}') ")
        if entered_name:
            filename = entered_name.replace(" ", "_")
        if as_html:
            entered_title = input("Please enter the title (of the dissertation)")
            if entered_title:
                title = entered_title
    if input("Would you like to see the translation progress here? Y/n "
             ).strip().lower() == "n":
        is_print = False
    input("Press Enter when ready")
    TranslateFromClipboard(tool, write, filename, is_print, as_html, title,
                           inv)
I made it possible to split the text into paragraphs to a reasonable extent without going through Word. Since it translates (roughly) paragraph by paragraph, translation is much faster than translating sentence by sentence.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import pyperclip as ppc
import re
# Location of the ChromeDriver executable handed to webdriver.Chrome.
DRIVER_PATH = 'chromedriver.exe'

# Chrome launch flags shared by every browser session this script starts.
options = Options()
for _flag in (
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
):
    options.add_argument(_flag)
def textParser(text, n=30, braketDetect=True):
    """Heuristically regroup hard-wrapped lines of ``text`` into entries.

    Whitespace-only lines are skipped. For every other line except the last:

    * a line made only of ``A-Z`` and whitespace becomes its own entry,
      after any buffered text has been emitted;
    * a line followed by an empty line, or starting with a numbering
      pattern (1-2 digits or Roman-style ``I``/``V``/``X`` runs followed by
      punctuation and whitespace), closes the current entry;
    * a line is buffered and joined to what follows when it does not end in
      a terminator and its length is within ``n`` of the next line (or the
      buffer already outgrows a next line that ends a sentence or starts
      with a capital), when the next line starts with a lowercase letter or
      ``)``, or when more opening than closing brackets have been seen;
    * anything else closes the current entry.

    Lines are joined without inserting a separator.

    Args:
        text: Raw text, typically copied from a PDF.
        n: Maximum length difference between two consecutive lines for them
            to be considered part of the same wrapped paragraph.
        braketDetect: When true, keep running counts of opening and closing
            brackets and keep joining while an opening one is unmatched.

    Returns:
        A tuple ``(count, entries)`` where ``count == len(entries)``.
    """
    # NOTE(review): the "?" entries in the character classes and terminator
    # tuples below look like mis-encoded full-width punctuation; confirm
    # against the original script before relying on them.
    numbered = r"(\d{1,2}[\.,?]\s?|I{1,3}V{0,1}X{0,1}[\.,?]|V{0,1}X{0,1}I{1,3}[\.,?])+\s"
    terminators = ("?", ".", "?")

    lines = text.splitlines()
    last = len(lines) - 1
    sentences = []
    t = ""  # text buffered from earlier lines of the current entry
    bra_cnt, ket_cnt = 0, 0
    for i, line in enumerate(lines):
        if not re.search(r"\S", line):
            continue
        if braketDetect:
            bra_cnt += len(re.findall(r"[\(?]", line))
            ket_cnt += len(re.findall(r"[\)?]", line))
        if i == last:
            # Emit the buffer together with the final line so text carried
            # over from earlier lines is not lost.
            sentences.append(t + line)
            t = ""
            continue
        nxt = lines[i + 1]
        if re.fullmatch(r"[A-Z\s]+", line):
            if t != "":
                sentences.append(t)
            t = ""
            sentences.append(line)
        elif nxt == "" or re.match(numbered, line):
            sentences.append(t + line)
            t = ""
        elif ((line[-1] not in terminators and
               (abs(len(line) - len(nxt)) < n or
                (len(t + line) > len(nxt) and
                 (nxt[-1] in terminators
                  or bool(re.match("[A-Z]", nxt[0]))))))
              or bool(re.match(r"[a-z]|\)", nxt[0]))
              or bra_cnt > ket_cnt):
            t += line
        else:
            sentences.append(t + line)
            t = ""
    if t != "":
        # The text ended on skipped blank lines while an entry was open.
        sentences.append(t)
    return len(sentences), sentences
def TranslateFromClipboard(tool, write, filename, isPrint, html, title, inv):
    """Translate the clipboard text through a browser and export the result.

    The clipboard text is split with ``textParser``; each entry is pasted
    into the translator's input box and the translated text is polled from
    the result element once per loop until it is non-empty.

    Args:
        tool: ``"DeepL"`` or ``"GT"`` (Google Translate).
        write: When true, write the translations to ``filename + ".txt"``.
        filename: Output path without extension.
        isPrint: When true, print each source entry, its translation and a
            progress line.
        html: When true, write a side-by-side page to ``filename + ".html"``.
        title: Heading shown at the top of the HTML page.
        inv: Picks the Google Translate target language (``en`` when true,
            ``ja`` otherwise). The DeepL URL does not depend on it.

    Raises:
        ValueError: If ``tool`` is neither ``"DeepL"`` nor ``"GT"``.
    """
    # Local imports: the ``html`` flag shadows the stdlib module of that name.
    from html import escape
    from selenium.common.exceptions import (NoSuchElementException,
                                            StaleElementReferenceException)

    if tool not in ("DeepL", "GT"):
        raise ValueError(f"unknown translation tool: {tool!r}")

    # NOTE(review): executable_path/chrome_options and find_element_by_* are
    # the Selenium 3 spellings; confirm the installed Selenium still has them.
    driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                              chrome_options=options)
    try:
        url = 'https://www.deepl.com/ja/translator' if tool == "DeepL" else f'https://translate.google.co.jp/?hl=ja&tab=TT&authuser=0#view=home&op=translate&sl=auto&tl={"en" if inv else "ja"}'
        driver.get(url)
        if tool == "DeepL":
            textarea = driver.find_element_by_css_selector(
                '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
        else:
            textarea = driver.find_element_by_id('source')

        length, en = textParser(ppc.paste())
        ja = []
        for i, sentence in enumerate(en):
            if sentence == "":
                ja.append("")
                continue
            # Paste via the clipboard, then put the user's clipboard back
            # even if sending the keystroke fails.
            cbText = ppc.paste()
            ppc.copy(sentence)
            try:
                textarea.send_keys(Keys.CONTROL, "v")
            finally:
                ppc.copy(cbText)

            transtext = ""
            cnt = 0
            while transtext == "":
                time.sleep(1)
                if tool == "DeepL":
                    transtext = driver.find_element_by_css_selector(
                        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
                    ).get_property("value") or ""
                else:
                    time.sleep(1)
                    try:
                        transtext = driver.find_element_by_css_selector(
                            '.tlid-translation.translation').text
                    except (NoSuchElementException,
                            StaleElementReferenceException):
                        # Result element not rendered yet: poll again.
                        pass
                cnt += 1
                # Every 10th poll, type a period into the input box;
                # presumably to nudge the site into translating again.
                if cnt % 10 == 0:
                    textarea.send_keys(".")
            if isPrint:
                print(sentence)
                print(transtext)
                print(f"\n{i+1}/{length} {int(100*(i+1)/length)}% done\n")
            ja.append(transtext)
            # Clear the input box before the next entry.
            textarea.send_keys(Keys.CONTROL, "a")
            textarea.send_keys(Keys.BACKSPACE)
    finally:
        # Always close the browser, including on errors and Ctrl-C.
        driver.quit()

    if write:
        with open(filename + ".txt", "w", encoding='UTF-8') as f:
            f.write("\n".join(ja))
    if html:
        # Each anchor links to, and highlights on hover, its counterpart in
        # the other column. Text is escaped so markup characters in the
        # source or translation cannot break the page.
        row = ('<br><a id="{own}" href="#{other}" '
               'onmouseover="over(\'{other}\')" '
               'onmouseout="out(\'{other}\')">{text}</a><br>')
        eng_rows = []
        jpn_rows = []
        for i, (source_text, translated_text) in enumerate(zip(en, ja)):
            eng_rows.append(
                row.format(own=f"e{i}", other=f"j{i}",
                           text=escape(source_text)))
            jpn_rows.append(
                row.format(own=f"j{i}", other=f"e{i}",
                           text=escape(translated_text)))
        eng = "".join(eng_rows)
        jpn = "".join(jpn_rows)
        page_tail = '<style>\n:root {\n--main-text: #452b15;\n--main-bg: #f8f1e2;\n--highlight-text: #db8e3c;\n}\n:root[theme="dark"] {\n--main-text: #b0b0b0;\n--main-bg: #121212;\n--highlight-text: #fd8787;\n}\nh1 {\ncolor: var(--main-text);\n}\ninput {\nposition: absolute;\ntop: 1%;\nright: 1%;\n}\n#en {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: left;\nborder-right:1px solid #ccc;\nmargin: 1%;\noverflow: auto;\n}\n#ja {\nwidth: 43%;\nheight: 90%;\npadding: 0 2%;\nfloat: right;\nmargin: 1%;\noverflow: auto;\n}\na,\na:hover,\na:visited,\na:link,\na:active {\ncolor: var(--main-text);\ntext-decoration: none;\n}\nbody {\nbackground-color: var(--main-bg);\n}\n</style>\n<script>\nvar a = document.getElementsByTagName("a");\nfunction over(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--highlight-text");\n}\nfunction out(e) {\ndocument.getElementById(e).style.color = getComputedStyle(document.getElementById(e)).getPropertyValue("--main-text");\n}\nconst btn = document.querySelector("#btn-mode");\nbtn.addEventListener("change", () => {\nif (btn.checked == true) {\ndocument.documentElement.setAttribute("theme", "dark");\n} else {\ndocument.documentElement.setAttribute("theme", "light");\n}\nfor (var i = 0; i < a.length; i++) {\na[i].style.color = getComputedStyle(a[i]).getPropertyValue("--main-text");\n}\n});\n</script>\n</body>'
        with open(filename + ".html", "w", encoding='UTF-8') as f:
            f.write(
                f'<h1 align="center">{escape(title)}</h1>\n<input id="btn-mode" type="checkbox">\n<hr>\n<body>\n<div class="parent">\n<div id="en">\n{eng}\n</div>\n<div id="ja">\n{jpn}\n</div>\n</div>'
                + page_tail)
if __name__ == "__main__":
    # Defaults; each one may be overridden by the prompts below.
    tool = "DeepL"
    write = False
    # No extension here: TranslateFromClipboard appends ".txt" / ".html".
    filename = "translated_text"
    is_print = True
    as_html = False
    title = "ORIGINAL ↔ TRANSLATED"
    inv = False

    if input("1.English → Japanese 2.Japanese → English").strip() == "2":
        inv = True
    if input("1. DeepL 2.GoogleTranslate ").strip() == "2":
        tool = "GT"
    if input("Do you want to export the translation result? Y/n "
             ).strip().lower() == "y":
        # answer -> (write .txt, write .html, extension shown in the prompt)
        export_modes = {
            "1": (True, False, ".txt"),
            "2": (False, True, ".html"),
            "3": (True, True, ".txt/.html"),
        }
        case = input("1. txt 2. HTML 3. both ").strip()
        # Ask again on anything else instead of failing on an unset format.
        while case not in export_modes:
            case = input("1. txt 2. HTML 3. both ").strip()
        write, as_html, format_ = export_modes[case]
        entered_name = input(
            f"Enter a name for the output file (default is'translated_text{format_}') ")
        if entered_name:
            filename = entered_name.replace(" ", "_")
        if as_html:
            entered_title = input("Please enter the title (of the dissertation)")
            if entered_title:
                title = entered_title
    if input("Would you like to see the translation progress here? Y/n "
             ).strip().lower() == "n":
        is_print = False
    input("Press Enter when ready")
    TranslateFromClipboard(tool, write, filename, is_print, as_html, title,
                           inv)
I am an amateur at HTML and CSS, so if you have any suggestions along the lines of "it would be even better if you did it this way", I would be grateful if you could tell me.
Recommended Posts