The execution environment for this article is Google Colaboratory.
** Goal **
First, take a look at the finished product uploaded to YouTube: Traffic Safety-kun.
Table of contents
** 1, Advance preparation **
① Take a video on your smartphone at a place with a traffic sign. Higher image quality works better (1920*1080, 30 FPS in my case). ** Be careful of traffic safety when taking videos! **
② Create warning voices for "stop", "red light", and "no entry". I created them with gTTS; of course, you can also record your own voice.
makeAudio_stop.ipynb
from gtts import gTTS  # Google Text-to-Speech
from google import colab
from IPython.display import Audio  # needed to play the result in the notebook
# Mount Google Drive
colab.drive.mount('/content/gdrive')
path = "gdrive/My Drive/make_video/"
word = "止まれ"  # the phrase to speak ("Stop")
tts = gTTS(word, lang='ja')  # provide the string to convert to speech
tts.save(path + 'STOP_2.wav')  # save the converted speech as an audio file
Audio(path + 'STOP_2.wav')
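The "red light" and "no entry" voices can be generated the same way; a minimal sketch (the Japanese phrases and file names here are my assumptions -- align them with the files used in part 4):
# Sketch: generate the remaining two warning voices with gTTS.
# The phrases and file names are assumptions; adjust them as needed.
for word, fname in [('赤信号です', 'STOP_3.wav'), ('進入禁止です', 'STOP_4.wav')]:
    gTTS(word, lang='ja').save(path + fname)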
③ Prepare images of traffic signs (red light, green light, stop, no entry, blue sky). (These are used in part 2, traffic-sign deep learning.)
Download images from Google Image Search.
1.getImage_STOP.ipynb
# Scrape images of the stop sign
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
import urllib.request as req
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.keys import Keys
from google import colab
colab.drive.mount('/content/gdrive')
#Launch the browser in headless mode and display the website
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=options)
driver.implicitly_wait(10)
count=1
tempImageCount=0
#Specifying the target URL
driver.get("https://www.google.com/search?rlz=1C1CAFC_enJP862JP870&biw=1366&bih=657&tbm=isch&sxsrf=ACYBGNSNeQ5IaB9V8b-6pc6q9gOtbrY4Uw%3A1577968008026&sa=1&ei=iOENXoiRAfCSr7wP-8ee0As&q=%E6%AD%A2%E3%81%BE%E3%82%8C%E6%A8%99%E8%AD%98&oq=%E6%AD%A2%E3%81%BE%E3%82%8C%E6%A8%99%E8%AD%98&gs_l=img.3..0l2j0i24l8.144019.154252..155304...3.0..0.429.5395.0j1j5j10j1......0....1..gws-wiz-img.......0i4i37j35i39j0i23j0i4i37i24.zXhLgCDtIBY&ved=0ahUKEwiI9db09OTmAhVwyYsBHfujB7oQ4dUDCAc&uact=5")
time.sleep(3)
while True:
    # image list acquisition
    image_list = driver.find_elements_by_class_name('rg_ic')
    print(len(image_list))
    # scroll to the bottom of the page
    driver.find_element_by_tag_name('body').send_keys(Keys.END)
    if len(image_list) > tempImageCount:
        tempImageCount = len(image_list)
        print('------------------- go to next page --------------------------')
        try:
            # click the "Show more results" button
            driver.find_element_by_id('smb').click()
            print('------------------- click success --------------------------')
        except:
            driver.find_element_by_tag_name('body').send_keys(Keys.END)
            print('------------------- KEY END --------------------------')
    else:
        break
# image list acquisition
image_list = driver.find_elements_by_class_name('rg_ic')
print(len(image_list))
for image in image_list:
    # get the image URL
    image_url = image.get_attribute('src')
    # save the image
    try:
        image = req.urlopen(image_url).read()
        with open('gdrive/My Drive/image/Traffic safety/stop/' + str(count) + '.jpg', mode='wb') as f:
            f.write(image)
        print('download - {}'.format(count))
        count += 1
    except:
        print("can't open url")
driver.quit()
Let's choose good ones from the acquired images (as many as possible, at least 20). Good example: the sign alone, without background clutter. NG example: the sign is small or buried in the background. (Example images omitted.)
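To sift through the downloads quickly in Colab, a minimal thumbnail-preview sketch may help (assuming the same save directory as the scraper above):
# Sketch: preview downloaded images as thumbnails to weed out bad ones.
import glob
from PIL import Image
import matplotlib.pyplot as plt
files = sorted(glob.glob('gdrive/My Drive/image/Traffic safety/stop/*.jpg'))
fig, axes = plt.subplots(4, 5, figsize=(12, 10))
for ax, f in zip(axes.flat, files):
    ax.imshow(Image.open(f))
    ax.axis('off')
plt.show()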
** 2, Traffic-sign deep learning **
① Convert image files to NumPy format
Each sample in the MNIST dataset is a (28, 28) array. Following that example, first resize each image file to a 50*50 square and then convert it to NumPy format. Since the images are in RGB color mode, each becomes a (50, 50, 3) array.
to_Dat.ipynb
import cv2
import os, glob
from sklearn.model_selection import train_test_split
from PIL import Image
import numpy as np
from google import colab
colab.drive.mount('/content/gdrive')
!ls 'gdrive/My Drive'
#Select a category of classification
root_dir = 'gdrive/My Drive/image/Traffic safety 2/'
train_dir = 'gdrive/My Drive/data/'
groups = ['Green light','one-way','stop','No entry','Red light','blue sky']
nb_classes = len(groups)
image_size = 50
#Convert image data to Numpy format
#Read image data for each folder
X = []
Y = []
# Since there are few pictures, feed the same pictures 20 times.
# Humans remember things through repetition -- is it the same for deep learning?
for n in range(20):
    for idx, group in enumerate(groups):
        image_dir = root_dir + group
        files = glob.glob(image_dir + '/*.jpg')
        for i, f in enumerate(files):
            img = Image.open(f)
            img = img.convert('RGB')  # change to RGB mode
            img = img.resize((image_size, image_size))  # resize to 50*50
            data = np.asarray(img)
            X.append(data)
            Y.append(idx)
X = np.array(X)
Y = np.array(Y)
X_train,X_test,y_train,y_test = train_test_split(X,Y,random_state=0)
xy = (X_train,X_test,y_train,y_test)
np.save('gdrive/My Drive/data/Traffic safety 2.npy', xy)
print(X_train.shape[1:])
print('ok',len(Y))
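As an aside, duplicating identical images mostly has the same effect as training for more epochs; random augmentation gives the model genuinely new views of each sign. A minimal sketch with Keras's ImageDataGenerator (an alternative approach, not what I used above):
# Sketch: instead of duplicating images 20 times, generate randomly
# perturbed variants at training time (alternative approach, not used above).
from keras.preprocessing.image import ImageDataGenerator
datagen = ImageDataGenerator(rotation_range=15, width_shift_range=0.1,
                             height_shift_range=0.1, zoom_range=0.1)
# at training time: model.fit_generator(datagen.flow(X_train, y_train, batch_size=32), ...)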
② Deep learning with a neural network. This introduces the technique of "convolution"; for details, please refer to the following URL: What is a convolutional neural network? The procedure is also explained carefully there. Applying a two-dimensional filter to the image data emphasizes horizontal and vertical lines, which can greatly improve the success rate.
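To get a feel for what such a filter does before training, you can apply one by hand; a minimal sketch with a vertical-edge kernel (the image path is just an example):
# Sketch: apply a hand-made 3x3 vertical-edge kernel to one image
# to visualize what a convolution emphasizes (example path, adjust as needed).
import cv2
import numpy as np
from google.colab.patches import cv2_imshow
img = cv2.imread('gdrive/My Drive/image/Traffic safety 2/stop/1.jpg', cv2.IMREAD_GRAYSCALE)
kernel = np.array([[-1, 0, 1],
                   [-1, 0, 1],
                   [-1, 0, 1]], dtype=np.float32)  # Prewitt-style vertical-edge filter
edges = cv2.filter2D(img, -1, kernel)
cv2_imshow(edges)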
deeplearning.ipynb
!pip install keras==2.2.4
import keras
from google import colab
colab.drive.mount('/content/gdrive')
!ls 'gdrive/My Drive'
import numpy as np
from keras.utils import np_utils
#Read the data saved in process ①
x_train,x_test,y_train,y_test = np.load('gdrive/My Drive/data/Traffic safety 2.npy', mmap_mode=None, allow_pickle=True , fix_imports=True, encoding='ASCII')
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
num_classes = 6  # number of sign categories (len(groups) from step ①)
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)
#neural network
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam
import time
model = Sequential()
model.add(Conv2D(50, (3, 3),
          input_shape=(50, 50, 3), activation='relu'))  # convolution ①
model.add(Conv2D(32, (3, 3), activation='relu'))  # convolution ②
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))  # convolution ③
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
optimizer=Adam(),
metrics=['accuracy'])
startTime = time.time()
history = model.fit(x_train, y_train, batch_size=3000, epochs=20,
verbose=1, validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
#loss
print('loss:', score[0])
#success rate
print('accuracy:', score[1])
#Learning time
print("Computation time:{0:.3f} sec".format(time.time() - startTime))
#Save trained model
model.save('gdrive/My Drive/model/Traffic safety.h5')
The accuracy is about 0.98, which is a good result.
loss: 0.11440953898268777
accuracy: 0.9878378378378379
Computation time: 46.734 sec
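Since fit() returns a history object, the learning curve is also easy to check; a quick sketch (Keras 2.2.4 stores the metric under the key 'acc'):
# Sketch: plot the learning curve from the history object returned by fit().
import matplotlib.pyplot as plt
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()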
** 3, Analyze the video and detect frames with red light, stop, and no entry **
① How do we cut the sign out of an actual frame and identify it with the trained model?
I referred to the following: [Python/OpenCV] Detect moving objects with color tracking
Get the biggest blob.ipynb
# Blob analysis
def analysis_blob(binary_img):
    # label the binary image
    label = cv2.connectedComponentsWithStats(binary_img)
    # extract blob information, dropping the background (label 0)
    n = label[0] - 1
    data = np.delete(label[2], 0, 0)
    center = np.delete(label[3], 0, 0)
    if len(center) <= 0:
        return
    # index of the blob with the largest area
    max_index = np.argmax(data[:, 4])
    return center[max_index]
Prediction method.ipynb (arguments are image file and coordinates)
def predictWithModel(image_file, x_from, y_from, x_to, y_to):
    image_size = 50
    X1 = []
    # trim to the candidate region
    img_crop = image_file[y_from:y_to, x_from:x_to]
    img_crop = cv2.cvtColor(img_crop, cv2.COLOR_BGR2RGB)
    img_crop = cv2.resize(img_crop, (image_size, image_size))  # resize to the model's input size
    X1.append(img_crop)  # vectorize the image
    X1 = np.array(X1)
    x_test = X1.astype('float32')
    x_test /= 255
    y = model.predict(x_test)  # predict
    wk = y[0, :]
    wk_sort = np.sort(wk)
    wk_sort = wk_sort[::-1]
    max_indx = -1
    max_pcnt = -1
    # accept only confident hits, excluding class 0 (green light) and class 5 (blue sky)
    if float(wk_sort[0]) > 0.9 and np.argmax(wk) != 5 and np.argmax(wk) != 0:
        max_pcnt = float(wk_sort[0])
        max_indx = np.argmax(wk)
    return max_indx, max_pcnt
Coordinate adjustment method.ipynb (red light is rectangular, stop and no entry is square)
def adjustSize(height, width, center_x, center_y, div_size, div_size_w, kbn):
    if kbn == 1:
        # rectangle (for red lights)
        x_from = center_x - div_size_w*3//4
        x_to = x_from + div_size_w
        y_from = center_y - div_size//2
        y_to = y_from + div_size
    else:
        # square (for stop / no-entry signs)
        x_from = center_x - div_size//2
        x_to = x_from + div_size
        y_from = center_y - div_size//2
        y_to = y_from + div_size
    # clamp to the image boundaries
    if x_from < 0:
        x_from = 0
    if y_from < 0:
        y_from = 0
    if x_to >= width:
        x_to = width
    if y_to >= height:
        y_to = height
    return x_from, y_from, x_to, y_to
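For example, for a blob centered at (500, 300) in a 1920*1080 frame, the two shapes come out as follows (illustrative values only):
# Quick check with illustrative values: rectangle (kbn=1) vs. square (kbn=0).
print(adjustSize(1080, 1920, 500, 300, 60, 100, 1))  # (425, 270, 525, 330)
print(adjustSize(1080, 1920, 500, 300, 50, 100, 0))  # (475, 275, 525, 325)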
Image file analysis.ipynb
def predictImage3(argimg):
    # image dimensions
    height, width, channels = argimg.shape[:3]
    # crop: the traffic sign is in the upper half, so analyze only that region
    img = argimg[0:height//2, 0:width//2, ::]
    # convert to HSV color space
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # extract the red border by binarization
    binary = cv2.inRange(hsv, (145, 70, 0), (180, 255, 255))
    if len(binary) <= 0:
        return argimg, -1
    # use OPENING to eliminate noise
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    eroded = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
    # blob analysis of the mask image (obtain the blob with the maximum area)
    center = analysis_blob(eroded)
    if center is None:
        return argimg, -1
    # get the center coordinates
    center_x = int(center[0])
    center_y = int(center[1])
    # first, a rectangle
    x_from, y_from, x_to, y_to = adjustSize(height, width, center_x, center_y, 60, 100, 1)
    max_indx, max_pcnt = predictWithModel(img, x_from, y_from, x_to, y_to)
    # then a square
    x_from2, y_from2, x_to2, y_to2 = adjustSize(height, width, center_x, center_y, 50, 100, 0)
    max_indx2, max_pcnt2 = predictWithModel(img, x_from2, y_from2, x_to2, y_to2)
    # rectangle 2 (adjusted size)
    x_from3, y_from3, x_to3, y_to3 = adjustSize(height, width, center_x, center_y, 40, 80, 1)
    max_indx3, max_pcnt3 = predictWithModel(img, x_from3, y_from3, x_to3, y_to3)
    pcnt = [max_pcnt, max_pcnt2, max_pcnt3]
    indx = [max_indx, max_indx2, max_indx3]
    best = np.argmax(pcnt)  # index of the most confident of the three crops
    max_index = -1
    if indx[best] == 2:
        text = "stop"
    elif indx[best] == 4:
        text = "Red light"
    elif indx[best] == 3:
        text = "No entry"
    else:
        text = ""
    if indx[best] > 0:
        # draw a circle around the center of the maximum blob on the frame
        cv2.circle(argimg, (center_x, center_y), 80, (0, 200, 0), thickness=3, lineType=cv2.LINE_AA)
        fontpath = 'gdrive/My Drive/font/MSMINCHO.TTC'  # font
        font = ImageFont.truetype(fontpath, 128)  # font size
        img_pil = Image.fromarray(argimg)  # convert the 8-bit (0-255) array to a PIL Image
        draw = ImageDraw.Draw(img_pil)  # create a draw instance
        position = (center_x, center_y + 100)  # text display position
        draw.text(position, text, font=font, fill=(0, 0, 255, 0))  # draw the text (fill: BGRA color)
        max_index = indx[best]
        return np.array(img_pil), max_index  # convert PIL back to an array
    return argimg, max_index
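One caveat about the binarization step: in OpenCV's HSV space the red hue wraps around 0, so the (145, 70, 0)-(180, 255, 255) range only catches the upper end of red. If some red borders are missed, a variant that combines both ends of the hue range may help (the thresholds below are assumptions to tune on your own footage):
# Sketch: red wraps around hue 0 in OpenCV HSV, so OR two ranges together.
mask_hi = cv2.inRange(hsv, (145, 70, 0), (180, 255, 255))
mask_lo = cv2.inRange(hsv, (0, 70, 0), (10, 255, 255))
binary = cv2.bitwise_or(mask_hi, mask_lo)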
With the above methods in place, let's analyze the prepared video.
Analyze Video.ipynb
import cv2
import glob
import numpy as np
from google.colab.patches import cv2_imshow
from PIL import ImageFont, ImageDraw, Image
target_dir = 'gdrive/My Drive/target/Traffic safety 2/'
files = glob.glob(target_dir + '/src_*.mp4')
target_avi_file = target_dir + "output.avi"
output_file = target_dir + "output.mp4"
# create a VideoWriter
fourcc = cv2.VideoWriter_fourcc(*"DIVX")
writer = cv2.VideoWriter(target_avi_file, fourcc, 30, (1920, 1080))
frame_cnt = 0
frame_index_result = np.empty((0, 2), int)
for i, f in enumerate(files):
    # create a VideoCapture
    cap = cv2.VideoCapture(f)
    temp = np.empty((0, 2), int)
    while True:
        # get the video frame by frame
        ret, frame = cap.read()
        if not ret:
            break  # frame acquisition failed or the end of the video was reached
        frame_cnt += 1
        frame, index = predictImage3(frame)
        if index > 0:
            temp = np.append(temp, np.array([[frame_cnt, index]]), axis=0)
        writer.write(frame)  # write the frame
    cap.release()
    # record the first frame in which the traffic sign was recognized (= timing to insert the voice)
    index_cnt = [np.count_nonzero(temp[:, 1] == 2), np.count_nonzero(temp[:, 1] == 3), np.count_nonzero(temp[:, 1] == 4)]
    max_index = [2, 3, 4][np.argmax(index_cnt)]
    frame_index_result = np.append(frame_index_result, np.array([(temp[temp[:, 1] == max_index])[0]]), axis=0)
writer.release()
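Before moving on, it is worth checking where the detections landed; a quick sketch converting frame numbers to seconds (30 is the FPS used for the VideoWriter above):
# Quick check: print where each sign was first detected.
for frame_no, sign_idx in frame_index_result:
    print('class {} first detected at frame {} ({:.2f} sec)'.format(sign_idx, frame_no, frame_no / 30))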
** 4, Put audio into the detected frames **
Add audio.ipynb
!pip -q install pydub
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio
import cv2
import pyaudio
import sys
import time
import wave
import pydub
from pydub import AudioSegment
import moviepy.editor as mp
import datetime
temp_file = target_dir + "temp.mp4"
# convert the AVI working file to MP4 (no audio yet)
clip_output = mp.VideoFileClip(target_avi_file).subclip()
clip_output.write_videofile(temp_file)
cap = cv2.VideoCapture(target_avi_file)
video_frame = cap.get(cv2.CAP_PROP_FRAME_COUNT)  # get the number of frames
video_fps = cap.get(cv2.CAP_PROP_FPS)  # get the FPS
video_len_sec = video_frame / video_fps  # calculate the length in seconds
print(video_len_sec)
video = mp.VideoFileClip(temp_file).subclip(0, video_len_sec)
video.write_videofile(output_file)
mp3_stop_file_2="gdrive/My Drive/mp3/stop_2.wav"
mp3_stop_file_3="gdrive/My Drive/mp3/stop_3.wav"
mp3_stop_file_4="gdrive/My Drive/mp3/stop_4.wav"
videos = []  # collect the clip segments in order; a plain list works with concatenate_videoclips
startSec = 0
# insert the voice at each traffic-sign frame
for idx in range(frame_index_result.shape[0]):
    if frame_index_result[idx][1] == 2:
        mp3_stop_file = mp3_stop_file_2
    elif frame_index_result[idx][1] == 3:
        mp3_stop_file = mp3_stop_file_3
    elif frame_index_result[idx][1] == 4:
        mp3_stop_file = mp3_stop_file_4
    base_sound = AudioSegment.from_file(mp3_stop_file)
    length_seconds = base_sound.duration_seconds  # length of the voice in seconds
    # first, cut out the part from startSec up to the traffic-sign frame
    video_len_sec_temp = frame_index_result[idx][0] / video_fps
    videos.append(mp.VideoFileClip(temp_file).subclip(startSec, video_len_sec_temp))
    # cut out a clip the same length as the voice and insert the audio
    # (one temp file per insertion, since moviepy reads files lazily at render time)
    temp_file2 = target_dir + "temp2_{}.mp4".format(idx)
    clip_output = mp.VideoFileClip(temp_file).subclip(video_len_sec_temp, video_len_sec_temp + length_seconds)
    clip_output.write_videofile(temp_file2, audio=mp3_stop_file)
    videos.append(mp.VideoFileClip(temp_file2).subclip())
    # the remaining video up to the next sign (or the end)
    if idx == frame_index_result.shape[0] - 1:
        last_sec = video_len_sec
    else:
        last_sec = frame_index_result[idx+1][0] / video_fps
    if video_len_sec_temp + length_seconds < last_sec:
        videos.append(mp.VideoFileClip(temp_file).subclip(video_len_sec_temp + length_seconds, last_sec))
    startSec = last_sec
# concatenate the edited clips into the final video
final_clip = mp.concatenate_videoclips(videos)
final_clip.write_videofile(output_file)
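Finally, a quick sanity check that the concatenated file has the expected length:
# Sanity check: the final file's duration should match the source video.
check = mp.VideoFileClip(output_file)
print('output: {:.2f} sec (expected about {:.2f} sec)'.format(check.duration, video_len_sec))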