I wrote a program that uses OCR to extract text from PDF and image files. A friend and I had been talking about automating assignments that are sent as PDFs, and I was also typing another friend's report into Word by hand. I wondered whether OCR would make this easier, so I wrote a program to try it out.
main.py
import os
import pyocr
import tkinter
from tkinter import filedialog
from pdf2image import convert_from_path
from PIL import Image
class UseOCR:
    """Extract text from PDF and image files with Tesseract via PyOCR.

    The class bundles three steps: picking files through a Tkinter dialog,
    converting a PDF into PIL images with pdf2image, and running OCR on an
    image with PyOCR.
    """

    def __init__(self):
        """Configure tool locations and default options.

        Note that assigning ``pyocr.tesseract.TESSERACT_CMD`` changes a
        module-level setting of PyOCR, not just this instance.
        """
        # Path of the tesseract executable (find it with `which tesseract`).
        pyocr.tesseract.TESSERACT_CMD = '/usr/local/bin/tesseract'
        # Directory holding the poppler binaries (find it with `which pdfinfo`).
        self.poppler_executable_path = '/usr/local/bin/'
        # Directory the file dialog opens in.
        self.initialdir = '~/'
        # Languages handed to the OCR engine.
        self.extract_lang = 'jpn+eng'
        # File type filters offered by the file dialog.
        self.extension = [('pdf files', '*.pdf'),
                          ('jpeg file', '*.jpeg'),
                          ('jpg file', '*.jpg'),
                          ('png file', '*.png')]

    def askfilenames(self):
        """Open a file dialog and return the selected paths.

        Returns:
            What ``filedialog.askopenfilenames`` returns: a tuple of the
            full paths of the selected files.
        """
        root = tkinter.Tk()
        # Hide the empty root window; only the dialog should be visible.
        root.withdraw()
        try:
            path = filedialog.askopenfilenames(
                filetypes=self.extension,
                # Expand '~' here instead of relying on Tcl to do it.
                initialdir=os.path.expanduser(self.initialdir),
            )
        finally:
            # Release the hidden root so repeated calls do not pile up
            # Tk instances.
            root.destroy()
        return path

    @staticmethod
    def get_fileinfo(path):
        """Map each file's base name to its full path.

        Args:
            path: Iterable of full file paths.

        Returns:
            A dict ``{basename: full_path}``. When two paths share a base
            name, the later one replaces the earlier one.
        """
        return {os.path.basename(full_path): full_path for full_path in path}

    def pdf_to_image(self, pdf):
        """Convert a PDF file into images.

        Args:
            pdf: Path of the PDF file.

        Returns:
            The result of ``pdf2image.convert_from_path``: a list of PIL
            Image objects.
        """
        return convert_from_path(pdf, poppler_path=self.poppler_executable_path)

    def image_to_text(self, image):
        """Run OCR on an image and return the recognized text.

        Args:
            image: PIL Image object to recognize.

        Returns:
            The recognized text as a string.

        Raises:
            IndexError: If PyOCR reports no available OCR tool.
        """
        # Use the first tool PyOCR finds.
        tool = pyocr.get_available_tools()[0]
        return tool.image_to_string(
            image,
            # Honor the configured languages rather than a hard-coded 'jpn'.
            lang=self.extract_lang,
            builder=pyocr.builders.TextBuilder(),
        )
if __name__ == '__main__':
    # Let the user pick files, OCR each one, and write the recognized text
    # to ./output/<name>.txt.
    OCR = UseOCR()
    path = OCR.askfilenames()
    fileinfo = OCR.get_fileinfo(path)
    if fileinfo:
        # open() does not create missing directories, so make sure the
        # output folder exists before the first write.
        os.makedirs('./output', exist_ok=True)
    for basename, filepath in fileinfo.items():
        filename, extension = os.path.splitext(basename)
        # Compare case-insensitively so that e.g. 'scan.PDF' is still
        # treated as a PDF.
        if extension.lower() == '.pdf':
            # OCR every page, not just the first, and join the pages with
            # a newline. A single-page PDF yields the same text as before.
            pages = OCR.pdf_to_image(filepath)
            txt = '\n'.join(OCR.image_to_text(page) for page in pages)
        else:
            # Close the image file as soon as OCR has finished.
            with Image.open(filepath) as image:
                txt = OCR.image_to_text(image)
        # Write UTF-8 explicitly so Japanese text does not depend on the
        # platform's default encoding.
        with open('./output/{}.txt'.format(filename), mode='w', encoding='utf-8') as f:
            f.write(txt)
I will explain using a PDF that contains the following text. (The image below is that PDF exported to JPG and cropped.)
# Set pyocr's TESSERACT_CMD to the path of the tesseract executable. Find it with: which tesseract
pyocr.tesseract.TESSERACT_CMD = '/usr/local/bin/tesseract'
# Path of poppler, passed as an argument to convert_from_path(). Find it with: which pdfinfo
self.poppler_executable_path = '/usr/local/bin/'
# Initial directory shown when the tkinter file dialog opens
self.initialdir = '~/'
# Languages to recognize with OCR
self.extract_lang = 'jpn+eng'
# File extensions that can be selected in the tkinter file dialog
self.extension = [('pdf files', '*.pdf'),
('jpeg file', '*.jpeg'),
('jpg file', '*.jpg'),
('png file', '*.png')]
askfilenames: Returns a tuple of the full paths of the files selected in the Tkinter dialog.
>>> path = OCR.askfilenames()
>>> path
('/Users/Username/Desktop/hoge.pdf',)
get_fileinfo: Takes a tuple of full paths as an argument and returns a dictionary that maps each file name to its full path.
>>> fileinfo = OCR.get_fileinfo(path)
>>> fileinfo
{'hoge.pdf': '/Users/Username/Desktop/hoge.pdf'}
pdf_to_image: If you pass the path of a PDF file as an argument, it returns a list of PIL Image objects. Since pdf2image and PyOCR both depend on Pillow, returning Image objects is easier to handle than writing image files.
>>> for k,v in fileinfo.items():
... image = OCR.pdf_to_image(v)
>>> image
[<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=1654x2339 at 0x10E1749E8>]
image_to_text: This is the central step that performs OCR. If you pass an image file or an Image object as an argument, OCR is performed and the recognized text is returned.
>>> txt = OCR.image_to_text(image[0])
>>> txt
'Test test character 0123'
Recommended Posts