Input to DeepL no longer goes through the clipboard; the text is now set with JavaScript. As a result, Selenium can also be run in headless mode.
Fixed the layout collapsing around tables. When an image is embedded in the same paragraph as text (for example, a formula rendered as an image), replacing the text makes the image disappear, so such paragraphs are excluded from translation until a solution is found.
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
from math import ceil
from threading import Thread
# ChromeDriver binary, looked up relative to the working directory.
DRIVER_PATH = 'chromedriver.exe'
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# Chrome start-up flags shared by every driver instance, added in this order.
options = Options()
for _flag in (
        f'--user-agent={user_agent}',
        '--disable-gpu',
        '--disable-extensions',
        '--proxy-server="direct://"',
        '--proxy-bypass-list=*',
        '--start-maximized',
        '--headless',  # Remove this entry to cancel headless mode (Chrome is displayed)
):
    options.add_argument(_flag)
del _flag
def Deeptrans(t, driver, timeout=120):
    """Translate one worker's slice of paragraphs on an open DeepL page.

    Worker ``t`` handles indices ``t * unit`` up to (but not including)
    ``min((t + 1) * unit, length)`` of the module-level ``sourse_texts``
    list. Each translation is appended to the module-level
    ``translated_texts`` list as
    ``{"index": <1-based paragraph number>, "text": <translation>}``.

    Args:
        t: Zero-based worker number that selects the slice.
        driver: Selenium WebDriver that has already loaded the DeepL
            translator page.
        timeout: Maximum number of seconds to wait for one translation.
            When it expires the paragraph is skipped (left untranslated)
            instead of blocking the worker forever.
    """
    global translated_texts
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')
    for i in range(t * unit, min((t + 1) * unit, length)):
        sourse_text = sourse_texts[i]
        # Falsy entries (None or empty) and whitespace-only text are skipped.
        if not sourse_text or not sourse_text.strip():
            continue
        # Hand the text over as a script argument rather than splicing
        # repr(text) into the script: Python's repr() is not guaranteed to be
        # a valid JavaScript string literal (e.g. "\U0001F600" escapes).
        driver.execute_script(
            '$(".lmt__source_textarea").val(arguments[0]);', sourse_text)
        # NOTE(review): the key press presumably makes the page notice the
        # programmatic value change -- confirm against the DeepL page.
        stextarea.send_keys(Keys.RIGHT)
        translated_text = ""
        deadline = time.monotonic() + timeout
        while not translated_text and time.monotonic() < deadline:
            time.sleep(1)
            translated_text = ttextarea.get_property("value")
        # Clear the source box for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        if translated_text:
            translated_texts.append({"index": i + 1, "text": translated_text})
def runDriver(t):
    """Start a Chrome instance, translate worker ``t``'s slice, then quit.

    Args:
        t: Zero-based worker number passed through to ``Deeptrans``.
    """
    driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
    try:
        url = 'https://www.deepl.com/ja/translator'
        driver.get(url)
        Deeptrans(t, driver)
    finally:
        # Always shut the browser down, even when the translation fails, so
        # that no orphaned Chrome/ChromeDriver processes are left behind.
        driver.quit()
def multiThreadTranslate(file_path, font):
    """Translate a PDF paragraph by paragraph using parallel Chrome workers.

    The file is opened in Word, the text of every translatable paragraph is
    collected, ``n`` threads each run ``runDriver`` on one slice of the
    paragraphs, the translations are written back into the document, and
    the result is saved as ``<name>_jp.pdf``.

    Paragraphs styled "TableGrid", paragraphs directly followed by a
    "TableGrid" paragraph, and paragraphs containing inline shapes are left
    untouched.

    Args:
        file_path: Absolute path of the PDF to translate.
        font: Name of the font applied to the translated paragraphs.
    """
    global length, unit, sourse_texts, translated_texts
    app = win32com.client.Dispatch("Word.Application")
    #app.Visible = True
    try:
        doc = app.Documents.Open(file_path)
        try:
            try:
                doc.Paragraphs(1).Range.Font.Name = font
            except Exception:
                print('The specified font does not exist')
                return
            length = doc.Paragraphs.Count
            n = 9
            unit = ceil(length / n)
            sourse_texts = []
            for i in range(length):
                rng = doc.Paragraphs(i + 1).Range
                translatable = (
                    str(rng.Style) != "TableGrid"
                    and str(doc.Paragraphs(min(length, i + 2)).Range.Style)
                    != "TableGrid"
                    # Replacing the text of a paragraph makes an image
                    # embedded in it disappear, so only paragraphs WITHOUT
                    # inline shapes are translated.
                    and rng.InlineShapes.Count == 0)
                sourse_texts.append(rng.Text if translatable else None)
            translated_texts = []
            threads = []
            for t in range(n):
                thread = Thread(target=runDriver, args=(t, ))
                thread.daemon = True
                thread.start()
                threads.append(thread)
            for thread in threads:
                thread.join()
            # Write back from the last paragraph to the first: a replacement
            # that changes the number of paragraphs (e.g. a translation with
            # line breaks, which become paragraph marks) only shifts indices
            # after it, and those have already been handled.
            for translated_text in sorted(translated_texts,
                                          key=lambda item: item["index"],
                                          reverse=True):
                index = translated_text["index"]
                doc.Paragraphs(index).Range.Text = translated_text[
                    "text"].replace('\n', '\r')
                doc.Paragraphs(index).Range.Font.Name = font
            # FileFormat=17 is Word's wdFormatPDF.
            doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf",
                                        file_path),
                        FileFormat=17)
        finally:
            # Close without saving changes to the opened document itself.
            doc.Close(SaveChanges=0)
    finally:
        # Never leave a hidden Word process behind, whatever happened above.
        app.Quit()
    print('Process is completed.')
if __name__ == '__main__':
    # A path dropped onto the console may arrive wrapped in double quotes;
    # strip them (and stray whitespace) so the path can be opened.
    file_path = input(
        'Enter the absolute path of the PDF (drag and drop is also possible): '
    ).strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = ' '.join(", ".join(item) for item in fonts.items()) + ": "
    choice = input(menu).strip()
    # Ask again instead of crashing with a KeyError on an unknown number.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name: ')
    multiThreadTranslate(file_path, font=font)
- The above problem has been solved, for the time being, in the single-threaded version. - Fixed the font size being changed automatically and becoming uneven, and fixed unwanted indentation being inserted.
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
from tqdm import tqdm
# ChromeDriver binary, looked up relative to the working directory.
DRIVER_PATH = 'chromedriver.exe'

# Chrome start-up flags, added in this order.
options = Options()
for _flag in ('--disable-gpu',
              '--disable-extensions',
              '--proxy-server="direct://"',
              '--proxy-bypass-list=*',
              '--start-maximized'):
    options.add_argument(_flag)
del _flag
def _deepl_translate(driver, stextarea, text, timeout=120):
    """Put ``text`` into the DeepL source box and return the translation.

    Returns an empty string if no translation shows up within ``timeout``
    seconds. The source box is cleared again before returning.
    """
    # Pass the text as a script argument rather than splicing repr(text) into
    # the script: Python's repr() is not guaranteed to be a valid JavaScript
    # string literal.
    driver.execute_script(
        '$(".lmt__source_textarea").val(arguments[0]);', text)
    stextarea.send_keys(Keys.RIGHT)
    translated_text = ""
    deadline = time.monotonic() + timeout
    while not translated_text and time.monotonic() < deadline:
        time.sleep(1)
        # The target box is looked up again on every poll, as before.
        translated_text = driver.find_element_by_css_selector(
            '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style'
        ).get_property("value")
    stextarea.send_keys(Keys.CONTROL, "a")
    stextarea.send_keys(Keys.BACKSPACE)
    return translated_text


def _translate_mixed_paragraph(doc, i, font, translate):
    """Translate the text runs of paragraph ``i + 1``, which has inline shapes.

    Words containing "/" are treated as shape positions and kept in place
    (NOTE(review): presumably that is how an inline shape shows up in the
    paragraph text -- confirm). The text between them is collected,
    translated run by run, and inserted around the shapes again.
    """
    doc.Paragraphs(i + 1).Range.Font.Name = font
    t = ""
    te = []  # text runs, in document order, separated by the "/" words
    cnt = 0  # number of "/" words seen
    for j in range(doc.Paragraphs(i + 1).Range.Words.Count):
        word_text = doc.Paragraphs(i + 1).Range.Words(j + 1).Text
        if "/" not in word_text:
            t += word_text
            # Placeholder so the word can be recognised and removed later.
            doc.Paragraphs(i + 1).Range.Words(j + 1).Text = "'' "
        else:
            te.append(t)
            t = ""
            cnt += 1
    if t:
        te.append(t)
    for j, sourse_text in enumerate(te):
        if len(sourse_text.strip()) > 5:
            translated_text = translate(sourse_text)
            # On timeout keep the original run rather than inserting nothing.
            if translated_text:
                te[j] = translated_text
    g = iter(te)
    c = 0
    for word in doc.Paragraphs(i + 1).Range.Words:
        if word.Text == "'' ":
            word.Text = ""
        elif "/" in word.Text:
            try:
                word.InsertBefore(next(g))
                c += 1
                if c == cnt:
                    # After the last shape, append the trailing run as well.
                    word.InsertAfter(next(g))
            except Exception:
                # Best effort, as before: running out of text runs
                # (StopIteration) or a failing Word call skips this shape.
                pass


def _translate_paragraphs(doc, font, translate):
    """Translate every eligible paragraph of ``doc`` in place."""
    length = doc.Paragraphs.Count
    for i in tqdm(range(length)):
        if str(doc.Paragraphs(i + 1).Range.Style) == "TableGrid":
            continue
        para = doc.Paragraphs(i + 1)
        sourse_text = para.Range.Text
        # Remember the formatting so it can be restored after the rewrite.
        fs = para.Range.Font.Size
        alignment = para.Alignment
        lindent = para.LeftIndent
        rindent = para.RightIndent
        if para.Range.InlineShapes.Count:
            if sourse_text.strip() == "/":
                continue
            _translate_mixed_paragraph(doc, i, font, translate)
        else:
            # Skip paragraphs containing control characters and very short
            # ones.
            if re.search(r"[\x00-\x1F\x7F]",
                         sourse_text.strip()) or len(sourse_text.strip()) < 5:
                continue
            translated_text = translate(sourse_text)
            if not translated_text:
                # Timed out: leave the paragraph untranslated.
                continue
            doc.Paragraphs(i + 1).Range.Text = translated_text
            doc.Paragraphs(i + 1).Range.Font.Name = font
        # The paragraph is fetched again after the edit, as before.
        doc.Paragraphs(i + 1).Alignment = alignment
        doc.Paragraphs(i + 1).Range.Font.Size = fs
        doc.Paragraphs(i + 1).LeftIndent = lindent
        doc.Paragraphs(i + 1).RightIndent = rindent


def Deeptrans(file_path, font, timeout=120):
    """Translate a PDF with DeepL in a single thread and save ``*_jp.pdf``.

    The file is opened in Word, each paragraph is translated through one
    Chrome instance, the font size, alignment and indents of each rewritten
    paragraph are restored, and the result is saved as ``<name>_jp.pdf``.
    Paragraphs styled "TableGrid" are skipped.

    Args:
        file_path: Absolute path of the PDF to translate.
        font: Name of the font applied to the translated paragraphs.
        timeout: Maximum number of seconds to wait for one translation
            before leaving that piece of text untranslated.
    """
    app = win32com.client.Dispatch("Word.Application")
    app.Visible = True
    try:
        doc = app.Documents.Open(file_path)
        try:
            driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                                      options=options)
            try:
                url = 'https://www.deepl.com/ja/translator#en/ja'
                driver.get(url)
                stextarea = driver.find_element_by_css_selector(
                    '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style'
                )

                def translate(text):
                    return _deepl_translate(driver, stextarea, text, timeout)

                _translate_paragraphs(doc, font, translate)
            finally:
                # Shut Chrome down even when the translation fails.
                driver.quit()
            # FileFormat=17 is Word's wdFormatPDF.
            doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf",
                                        file_path),
                        FileFormat=17)
        finally:
            doc.Close(SaveChanges=0)
    finally:
        app.Quit()
    print('Process is completed.')
if __name__ == '__main__':
    # Strip stray whitespace and surrounding double quotes (a pasted or
    # dropped path may be quoted) so the path can be opened.
    file_path = input(
        'Please enter the absolute path of the PDF: ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = ' '.join(", ".join(item) for item in fonts.items()) + ": "
    choice = input(menu).strip()
    # Ask again instead of crashing with a KeyError on an unknown number.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name: ')
    Deeptrans(file_path, font)
I previously wrote an article about automatic PDF translation, but I was left wishing that **the original layout of images, formulas, columns, etc. could be preserved!** While looking for a way to do that, I arrived at a method that uses Word, which I would like to introduce here. Note that, depending on how well the PDF converts to Word, the processing may not work very well.
I borrowed the PDF from here → https://mirela.net.technion.ac.il/publications/
Positions shift because the character width and the number of characters differ after translation.
・ Windows PC ・ Microsoft Word ・ ChromeDriver (to run the programs below as they are, save chromedriver.exe in the directory you run them from)
***Program start ↓ Open the target PDF as docx in Word ↓ Get the text of each paragraph ↓ Translate with DeepL via Selenium ↓ Write the translation back via Word ↓ Save as PDF ↓ Program end***
Translating one paragraph at a time is slow, so I decided to run it in multiple threads. In case the paragraph numbering goes out of sync for some reason, I am also posting a version that translates paragraph by paragraph in a single thread; it takes longer, but please try it.
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
import pyperclip as ppc
from math import ceil
from threading import Thread, Lock
# ChromeDriver binary, looked up relative to the working directory.
DRIVER_PATH = 'chromedriver.exe'

# Chrome start-up flags, added in this order.
options = Options()
for _flag in ('--disable-gpu',
              '--disable-extensions',
              '--proxy-server="direct://"',
              '--proxy-bypass-list=*',
              '--start-maximized'):
    options.add_argument(_flag)
del _flag
def Deeptrans(t, driver, timeout=120):
    """Translate one worker's slice of paragraphs via the clipboard.

    Worker ``t`` handles indices ``t * unit`` up to (but not including)
    ``min((t + 1) * unit, length)`` of the module-level ``sourse_texts``
    list and stores each translation in the module-level
    ``translated_texts`` dict under the 1-based paragraph number as a
    string.

    Args:
        t: Zero-based worker number that selects the slice.
        driver: Selenium WebDriver that has already loaded the DeepL
            translator page.
        timeout: Maximum number of seconds to wait for one translation.
            When it expires the paragraph is skipped (left untranslated)
            instead of blocking the worker forever.
    """
    global translated_texts
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')
    for i in range(t * unit, min((t + 1) * unit, length)):
        sourse_text = sourse_texts[i]
        # Skip paragraphs containing control characters and very short ones.
        if re.search(r"[\x00-\x1F\x7F]",
                     sourse_text.strip()) or len(sourse_text.strip()) < 5:
            continue
        # The clipboard is shared by all workers, so copy and paste must not
        # be interleaved. ``with`` releases the lock even if the paste
        # raises; a bare acquire()/release() pair would block every other
        # worker forever.
        with lock:
            ppc.copy(sourse_text)
            stextarea.send_keys(Keys.CONTROL, "v")
        translated_text = ""
        deadline = time.monotonic() + timeout
        while not translated_text and time.monotonic() < deadline:
            time.sleep(1)
            translated_text = ttextarea.get_property("value")
        # Clear the source box for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        if translated_text:
            translated_texts[str(i + 1)] = translated_text
def runDriver(t):
    """Start a Chrome instance, translate worker ``t``'s slice, then quit.

    Args:
        t: Zero-based worker number passed through to ``Deeptrans``.
    """
    # NOTE(review): the module-level ``options`` object is not passed here,
    # so its flags do not apply to these browsers -- confirm whether that is
    # intended.
    driver = webdriver.Chrome(DRIVER_PATH)
    try:
        url = 'https://www.deepl.com/ja/translator'
        driver.get(url)
        Deeptrans(t, driver)
    finally:
        # Always shut the browser down, even when the translation fails.
        driver.quit()
def multiThreadTranslate(file_path, font):
    """Translate a PDF using parallel Chrome workers fed via the clipboard.

    The file is opened in Word, the text of every paragraph is collected,
    ``n`` threads each run ``runDriver`` on one slice of the paragraphs, the
    translations are written back into the document, and the result is
    saved as ``<name>_jp.pdf``. The clipboard content found at the start is
    put back afterwards.

    Args:
        file_path: Absolute path of the PDF to translate.
        font: Name of the font applied to the translated paragraphs.
    """
    global lock, length, unit, sourse_texts, translated_texts
    app = win32com.client.Dispatch("Word.Application")
    app.Visible = True  #Hide Word by commenting out
    try:
        doc = app.Documents.Open(file_path)
        try:
            try:
                doc.Paragraphs(1).Range.Font.Name = font
            except Exception:
                print('The specified font does not exist')
                return
            length = doc.Paragraphs.Count
            n = 9  #Open 9 Chrome and run at the same time
            unit = ceil(length / n)
            lock = Lock()
            clipboard = ppc.paste()
            try:
                sourse_texts = [
                    doc.Paragraphs(i + 1).Range.Text for i in range(length)
                ]
                translated_texts = {}
                threads = []
                for t in range(n):
                    thread = Thread(target=runDriver, args=(t, ))
                    thread.daemon = True
                    thread.start()
                    threads.append(thread)
                for thread in threads:
                    thread.join()
            finally:
                # Give the user's clipboard back even if a step above fails.
                ppc.copy(clipboard)
            # Write back from the last paragraph to the first: a replacement
            # that changes the number of paragraphs (e.g. a translation with
            # line breaks, which become paragraph marks) only shifts indices
            # after it, and those have already been handled.
            for k in sorted(translated_texts, key=int, reverse=True):
                doc.Paragraphs(int(k)).Range.Text = translated_texts[
                    k].replace('\n', '\r')
                doc.Paragraphs(int(k)).Range.Font.Name = font
            # FileFormat=17 is Word's wdFormatPDF.
            doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf",
                                        file_path),
                        FileFormat=17)
        finally:
            doc.Close(SaveChanges=0)
    finally:
        # Never leave a Word process behind, whatever happened above.
        app.Quit()
    print('Process is completed.')
if __name__ == '__main__':
    # Strip stray whitespace and surrounding double quotes (a pasted or
    # dropped path may be quoted) so the path can be opened.
    file_path = input(
        'Please enter the absolute path of the PDF: ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = ' '.join(", ".join(item) for item in fonts.items()) + ": "
    choice = input(menu).strip()
    # Ask again instead of crashing with a KeyError on an unknown number.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name: ')
    multiThreadTranslate(file_path, font=font)
import win32com.client
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
import pyperclip as ppc
from tqdm import tqdm
# ChromeDriver binary, looked up relative to the working directory.
DRIVER_PATH = 'chromedriver.exe'

# Chrome start-up flags, added in this order.
options = Options()
for _flag in ('--disable-gpu',
              '--disable-extensions',
              '--proxy-server="direct://"',
              '--proxy-bypass-list=*',
              '--start-maximized'):
    options.add_argument(_flag)
del _flag
def _translate_via_clipboard(doc, driver, font, timeout):
    """Translate every eligible paragraph of ``doc`` through the clipboard."""
    stextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__source_textarea.lmt__textarea_base_style')
    ttextarea = driver.find_element_by_css_selector(
        '.lmt__textarea.lmt__target_textarea.lmt__textarea_base_style')
    for i in tqdm(range(doc.Paragraphs.Count)):
        sourse_text = doc.Paragraphs(i + 1).Range.Text
        # Skip paragraphs containing control characters and very short ones.
        if re.search(r"[\x00-\x1F\x7F]",
                     sourse_text.strip()) or len(sourse_text.strip()) < 5:
            continue
        ppc.copy(sourse_text)
        stextarea.send_keys(Keys.CONTROL, "v")
        translated_text = ""
        deadline = time.monotonic() + timeout
        while not translated_text and time.monotonic() < deadline:
            time.sleep(1)
            translated_text = ttextarea.get_property("value")
        # Clear the source box for the next paragraph.
        stextarea.send_keys(Keys.CONTROL, "a")
        stextarea.send_keys(Keys.BACKSPACE)
        if not translated_text:
            # Timed out: leave the paragraph untranslated.
            continue
        doc.Paragraphs(i + 1).Range.Text = translated_text
        doc.Paragraphs(i + 1).Range.Font.Name = font


def Deeptrans(file_path, font, timeout=120):
    """Translate a PDF with DeepL in a single thread, pasting via clipboard.

    The file is opened in Word, each paragraph is pasted into DeepL through
    one Chrome instance, the translation replaces the paragraph text, and
    the result is saved as ``<name>_jp.pdf``. The clipboard content found at
    the start is put back afterwards.

    Args:
        file_path: Absolute path of the PDF to translate.
        font: Name of the font applied to the translated paragraphs.
        timeout: Maximum number of seconds to wait for one translation
            before leaving that paragraph untranslated.
    """
    clipboard = ppc.paste()
    try:
        app = win32com.client.Dispatch("Word.Application")
        app.Visible = True
        try:
            doc = app.Documents.Open(file_path)
            try:
                driver = webdriver.Chrome(executable_path=DRIVER_PATH,
                                          options=options)
                try:
                    url = 'https://www.deepl.com/ja/translator#en/ja'
                    driver.get(url)
                    _translate_via_clipboard(doc, driver, font, timeout)
                finally:
                    # Shut Chrome down even when the translation fails.
                    driver.quit()
                # FileFormat=17 is Word's wdFormatPDF.
                doc.SaveAs2(FileName=re.sub(r"(.+)(\.pdf)", r"\1_jp.pdf",
                                            file_path),
                            FileFormat=17)
            finally:
                doc.Close(SaveChanges=0)
        finally:
            app.Quit()
    finally:
        # Give the user's clipboard back even if a step above fails.
        ppc.copy(clipboard)
    print('Process is completed.')
if __name__ == '__main__':
    # Strip stray whitespace and surrounding double quotes (a pasted or
    # dropped path may be quoted) so the path can be opened.
    file_path = input(
        'Please enter the absolute path of the PDF: ').strip().strip('"')
    print('Please select a font')
    fonts = {'1': 'Yu Mincho', '2': 'Meiryo', '3': 'BIZ UDP Mincho Medium', '4': 'Other'}
    menu = ' '.join(", ".join(item) for item in fonts.items()) + ": "
    choice = input(menu).strip()
    # Ask again instead of crashing with a KeyError on an unknown number.
    while choice not in fonts:
        choice = input(menu).strip()
    font = fonts[choice]
    if font == 'Other':
        font = input('Please enter the font name: ')
    Deeptrans(file_path, font)
To use it, just save it and run it from the command line (please install the required libraries separately).
After a while, a file named <original name>_jp.pdf
will be written to the same directory as the original file.
It is difficult to distinguish mathematical formulas from body text, so formulas may lose their shape or disappear. I handled this with a stopgap measure, but as a side effect some sentences are left untranslated. I am still looking for a better way.
It's Word. If you are interested, please try it.
Recommended Posts