In I tried to extract characters from subtitles (OpenCV: tesseract-ocr edition), the image processing on the video took 810 seconds (about 13.5 minutes). Here, I try to improve the speed by refactoring the video-generation process.
To move the same image data between PIL and OpenCV, I had been saving it to a temporary file and reading it back. On closer inspection, it turns out the same thing can be done simply by converting the numpy array type. This change alone reduced the time from 810 seconds (about 13.5 minutes) to 450 seconds (about 7.5 minutes).
Before correction
def createTextImage(src, sentence, px, py, color=(8,8,8), fsize=28):
    # Save the image to a temporary file
    tmp_path = "src_temp.png"
    cv2.imwrite(tmp_path, src)
    # Load it into a PIL object
    img = Image.open(tmp_path)
    draw = ImageDraw.Draw(img)
    # Write the text on the image with PIL
    font = ImageFont.truetype("./IPAfont00303/ipag.ttf", fsize)
    draw.text((px, py), sentence, fill=color, font=font)
    img.save(tmp_path)
    # Read it back into OpenCV
    return cv2.imread(tmp_path)
Revised
# OpenCV (BGR) -> PIL (RGB)
cvimg = cv2.imread("sample.png")
rgbImg = cv2.cvtColor(cvimg, cv2.COLOR_BGR2RGB)
pilImg = Image.fromarray(rgbImg)

# PIL (RGB) -> OpenCV (BGR)
cvImg = np.array(pilImg, dtype=np.uint8)
dst = cv2.cvtColor(cvImg, cv2.COLOR_RGB2BGR)
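As a quick sanity check (a minimal sketch, assuming a local sample.png exists), the in-memory round trip reproduces the original pixels exactly:

import cv2
import numpy as np
from PIL import Image

src = cv2.imread("sample.png")  # hypothetical test image

# OpenCV (BGR) -> PIL (RGB) -> OpenCV (BGR), entirely in memory
pilImg = Image.fromarray(cv2.cvtColor(src, cv2.COLOR_BGR2RGB))
dst = cv2.cvtColor(np.array(pilImg, dtype=np.uint8), cv2.COLOR_RGB2BGR)

print(np.array_equal(src, dst))  # True: no pixel is lost in the conversion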
Previously, I read, processed, and wrote the video frame by frame. I rewrote the script with the following flow so that only the processing step is parallelized:
- Temporarily store the frames in an array
- Process the subtitle extraction for each frame in parallel and collect the results
- Sort the results by id
- Write them out to the video
Parallel processing uses joblib. On the calling side, it can be written in almost a single line with a list comprehension. n_jobs=16 is the number of processes.
from joblib import Parallel, delayed

def main_image_process(src, tool):
    # Do the actual image processing here
    # Preprocessing
    gray_frame = pre_process(src.content)
    # Character extraction
    # Subtitle creation
    ...

Parallel(n_jobs=16)( [delayed(main_image_process)(f, tool) for f in frames] )

...
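As a side note, the same calling pattern can be tried standalone with a toy function (square here is just an illustration, not part of the script):

from joblib import Parallel, delayed

def square(x):
    return x * x

# delayed() wraps each call; the list comprehension builds the task list
results = Parallel(n_jobs=4)([delayed(square)(i) for i in range(8)])
print(results)  # squares of 0..7

The whole source code of the script follows.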
import sys
import cv2
import io
import os
import numpy as np
import pyocr
import pyocr.builders
from PIL import Image, ImageDraw, ImageFont
from collections import namedtuple
from joblib import Parallel, delayed
import time
MovieFrame = namedtuple("MovieFrame", ["id", "content", "timestamp"])
telop_height = 50
cap_width = 1
cap_height = 1
def pre_process(src):
    kernel = np.ones((3,3), np.uint8)
    gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
    # Binarization (Otsu picks the threshold automatically)
    o_ret, o_dst = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU)
    # Opening: erosion -> dilation
    dst = cv2.morphologyEx(o_dst, cv2.MORPH_OPEN, kernel)
    # Invert
    dst = cv2.bitwise_not(dst)
    # Convert 1 channel -> 3 channels
    dst = cv2.cvtColor(dst, cv2.COLOR_GRAY2BGR)
    return dst
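# Illustrative note (not in the original): pre_process keeps the frame size and
# returns a binarized, inverted, 3-channel uint8 image, e.g.
#   out = pre_process(frame)   # (H, W, 3) uint8 in -> (H, W, 3) uint8 out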
# Character extraction
def extractTelopText(src, tool):
    rgbImg = cv2.cvtColor(src, cv2.COLOR_BGR2RGB)
    dst = tool.image_to_string(
        Image.fromarray(rgbImg),
        lang='jpn',
        builder=pyocr.builders.WordBoxBuilder(tesseract_layout=6)
    )
    sentence = []
    for item in dst:
        sentence.append(item.content)
    return "".join(sentence)
# Create an empty subtitle band
def createFooterTelop(src):
    telop = np.zeros((telop_height, cap_width, 3), np.uint8)
    telop[:] = tuple((128,128,128))
    images = [src, telop]
    dst = np.concatenate(images, axis=0)
    return dst
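# Note: np.concatenate(..., axis=0) stacks the images vertically, so the telop
# band must match the frame width; the result is (cap_height + telop_height)
# pixels tall.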
# Per-frame processing (run in parallel)
def main_image_process(src, tool):
    # Preprocess to make the characters easier to recognize
    gray_frame = pre_process(src.content)
    # Trim to the region where the telop is likely to appear
    roi = gray_frame[435:600, :]
    # Extract the text
    text = extractTelopText(roi, tool)
    # Create the subtitle band
    dst = createFooterTelop(src.content)
    # Draw the text on the image
    dst = addJapaneseTelop(dst, text, 20, cap_height + telop_height - 30)
    dst = addASCIITelop(dst, str(src.timestamp) + "[sec]", cap_width - 250, cap_height + telop_height - 10, color=(0,255,0))
    # Store the result in a namedtuple
    return MovieFrame(src.id, dst, src.timestamp)
# Draw characters (alphanumeric only)
def addASCIITelop(src, sentence, px, py, color=(8,8,8), fsize=28):
    cv2.putText(src, sentence,
                (px, py),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                color,
                2,
                cv2.LINE_AA)
    return src
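# Note: cv2.putText draws on src in place; the font scale is fixed at 1 here,
# so the fsize argument is unused in this function.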
# Draw characters (Japanese)
def addJapaneseTelop(src, sentence, px, py, color=(8,8,8), fsize=28):
    rgbImg = cv2.cvtColor(src, cv2.COLOR_BGR2RGB)
    # OpenCV -> PIL
    canvas = Image.fromarray(rgbImg).copy()
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.truetype("./IPAfont00303/ipag.ttf", fsize)
    # Draw the text
    draw.text((px, py), sentence, fill=color, font=font)
    # PIL -> OpenCV
    dst = cv2.cvtColor(np.array(canvas, dtype=np.uint8), cv2.COLOR_RGB2BGR)
    return dst
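# This is the in-memory OpenCV <-> PIL round trip from the "Revised" section,
# applied per frame; the .copy() matters for the parallel runs (see the notes
# after the code).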
if __name__ == '__main__':
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print("No OCR tool found")
        sys.exit(1)
    tool = tools[0]

    cap = cv2.VideoCapture('one_minutes.mp4')
    cap_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    cap_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    telop_height = 50

    fourcc = cv2.VideoWriter_fourcc('m','p','4','v')
    writer = cv2.VideoWriter('extract_telop_async.mp4', fourcc, fps, (cap_width, cap_height + telop_height))

    frames = []
    start = time.time()
    idx = 0

    # Read the video
    try:
        while True:
            if not cap.isOpened():
                break
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            ret, frame = cap.read()
            if frame is None:
                break
            frames.append(MovieFrame(idx, frame, round(idx/fps, 4)))
            idx += 1
    except cv2.error as e:
        print(e)
    cap.release()
    print("read movie file")

    # Parallel processing (caller side)
    r = Parallel(n_jobs=16)( [delayed(main_image_process)(f, tool) for f in frames] )

    # Sort by frame id
    sorted_out = sorted(r, key=lambda x: x.id)

    # Write the video
    try:
        for item in sorted_out:
            writer.write(item.content)
    except cv2.error as e:
        print(e)
    writer.release()
    print("write movie file")
    print("Done!!! {}[sec]".format(round(time.time() - start, 4)))
namedtuple

I wanted to handle each frame's id, image, and timestamp as a single unit, so I created a simple object using a named tuple.
MovieFrame = namedtuple("MovieFrame", ["id", "content", "timestamp"])
With this, src.id, src.content, and so on can be accessed as properties (getters), which shortens the code.
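A minimal illustration (not part of the original script), repeating the definition so it runs on its own:

from collections import namedtuple

MovieFrame = namedtuple("MovieFrame", ["id", "content", "timestamp"])
frame = MovieFrame(id=0, content=None, timestamp=0.0)
print(frame.id, frame.timestamp)  # -> 0 0.0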
Frame order

The frames are collected after the parallel processing finishes, but they can come back slightly out of order, so they are sorted by display order (id).
sorted_out = sorted(r, key=lambda x: x.id)
And the processing time? It was shortened to 267.7924 seconds (about 4.5 minutes). :tada:
It's a big difference compared to the first 810 seconds.
| | processing time [sec] | processing time [min] |
|---|---|---|
| Before correction | 810 | 13.5 |
| Method 1 (PIL <-> OpenCV) | 450 | 7.5 |
| Method 1 (PIL <-> OpenCV) + Method 2 (parallel processing) | 268 | 4.46 |
Error messages are hard to see during parallel runs, so it is difficult to debug when you parallelize from the start; reduce the number of processes to one and squash the bugs first. Also, in the Japanese text drawing step, if you do not duplicate the image data with canvas = Image.fromarray(rgbImg).copy(), sequential processing works fine but parallel processing raises an error.
Parallel processing takes a bit of environment setup, so while the processing itself is fast, it is unavoidably a bit of a hassle. :tired_face: