I wrote Python code to OCR a PDF, following the example of those who came before me, and I use it myself. It converts the PDF to JPEG images with poppler and then transcribes those images into text files with Tesseract OCR.
I'm a beginner, so I'd be happy if you could point out any strange things in the code. I also referred to many sites that I did not quote. Thank you very much.
The folder structure below follows the article "How to convert PDF to image file (JPEG, PNG) with Python".
Parent folder | Child folder |
---|---|
\Current | \image_file |
\pdf_file | |
\poppler | |
\txt_file | |
import os
import pathlib
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
import sys
import pyocr
import pyocr.builders
import pathlib
import glob
def cleanup():
    """Delete the outputs left over from a previous run.

    Removes every ``*.jpeg`` file found recursively under ./image_file and
    every ``*.txt`` file found recursively under ./txt_file. Files of other
    types and the directories themselves are left in place.
    """
    stale_images = list(pathlib.Path('./image_file').glob('**/*.jpeg'))
    stale_texts = list(pathlib.Path('./txt_file').glob('**/*.txt'))

    # Both listings are taken before anything is deleted; images are removed
    # first, then text files. Empty listings simply result in no deletions.
    for stale_file in stale_images + stale_texts:
        os.remove(stale_file)
def pdf_to_image():
    """Convert the first PDF found under ./pdf_file into one JPEG per page.

    The ``poppler/bin`` folder next to this script is appended to ``PATH``
    for the current process so that pdf2image can find the poppler
    executables. Each page is saved in ./image_file as
    ``<pdf stem>_<NN>.jpeg``, where NN is the 1-based, zero-padded page
    number.

    Returns:
        None. When no PDF exists under ./pdf_file a message is printed and
        nothing is converted.
    """
    # Use the real __file__ variable (not the string "__file__") so poppler is
    # located relative to this script rather than the working directory.
    try:
        script_dir = pathlib.Path(__file__).resolve().parent
    except NameError:
        # No __file__ (e.g. an interactive session): fall back to the
        # working directory.
        script_dir = pathlib.Path.cwd()
    poppler_dir = script_dir / "poppler" / "bin"

    # os.pathsep is the separator between PATH entries. Append only once so
    # repeated calls do not keep growing PATH.
    search_path = os.environ.get("PATH", "")
    if str(poppler_dir) not in search_path.split(os.pathsep):
        if search_path:
            os.environ["PATH"] = search_path + os.pathsep + str(poppler_dir)
        else:
            os.environ["PATH"] = str(poppler_dir)

    # Collect the PDF files (searched recursively) and bail out cleanly when
    # there is nothing to convert instead of failing on an empty list.
    pdf_files = list(pathlib.Path('./pdf_file').glob('**/*.pdf'))
    if not pdf_files:
        print("No PDF file found in ./pdf_file")
        return
    source_pdf = pdf_files[0]

    # PDF -> list of page images (pdf2image renders at 200 dpi by default).
    pages = convert_from_path(str(source_pdf))

    # Save the pages one file per page; create the output folder if missing.
    image_dir = pathlib.Path("./image_file")
    image_dir.mkdir(parents=True, exist_ok=True)
    for page_number, page in enumerate(pages, start=1):
        # The extension must be exactly ".jpeg" (no trailing space), otherwise
        # the '*.jpeg' globs used elsewhere in this script never match.
        file_name = "{}_{:02d}.jpeg".format(source_pdf.stem, page_number)
        page.save(str(image_dir / file_name), "JPEG")
def image_ocr():
    """OCR every JPEG under ./image_file and save the text under ./txt_file.

    The Tesseract-OCR install folder is appended to ``PATH`` for the current
    process, the first OCR tool reported by pyocr is used, and each
    ``*.jpeg`` found recursively under ./image_file is recognised as
    Japanese text. The result is written to ``./txt_file/<image stem>.txt``.

    Exits the process with status 1 when pyocr finds no OCR tool.
    """
    # Tesseract-OCR install folder. A raw string keeps the backslashes
    # literal; the value is unchanged, only the invalid "\*" and "\T"
    # escape sequences are gone.
    tessera_path = r"C:\***\Tesseract-OCR"

    # os.pathsep is the separator between PATH entries. Append only once so
    # repeated calls do not keep growing PATH.
    search_path = os.environ.get("PATH", "")
    if tessera_path not in search_path.split(os.pathsep):
        if search_path:
            os.environ["PATH"] = search_path + os.pathsep + tessera_path
        else:
            os.environ["PATH"] = tessera_path

    tools = pyocr.get_available_tools()
    if not tools:
        print("No OCR tool found")
        sys.exit(1)  # exit status 1 signals failure
    tool = tools[0]

    # OCR input images (searched recursively) and the output folder.
    image_dir = pathlib.Path('./image_file')
    txt_dir = pathlib.Path('./txt_file')
    txt_dir.mkdir(parents=True, exist_ok=True)

    for image_path in image_dir.glob('**/*.jpeg'):
        # Open the image in a context manager so its file handle is released
        # as soon as recognition is done.
        with Image.open(str(image_path)) as image:
            txt = tool.image_to_string(
                image,
                lang="jpn",
                builder=pyocr.builders.TextBuilder(tesseract_layout=6),
            )
        # Write as UTF-8 explicitly: the locale default codec may be unable
        # to encode some of the recognised Japanese characters.
        txt_path = txt_dir / (image_path.stem + '.txt')
        with open(str(txt_path), mode='wt', encoding='utf-8') as t:
            t.write(txt)
Recommended Posts