Traffic Safety-kun: Recognition of traffic signs in Python

The execution environment for this article is colaboratory.

**Goals**

  1. Recognize red lights, green lights, stop signs, and no-entry signs.
  2. Give a spoken warning when a red light, a stop sign, or a no-entry sign is detected.

First, take a look at the finished product uploaded to YouTube. Traffic Safety-kun

table of contents

  1. Advance preparation
  2. Traffic sign Deep Learning
  3. Analyze Video to detect red light, stop, and prohibited frames
  4. Put audio in the detected Frame

** 1, advance preparation ** ① Take a video with your smartphone at a place with traffic signs. Higher image quality gives better results (1920 * 1080 at 30 FPS in my case). ** Be careful of traffic safety when taking videos! ** ② Create warning voices for "stop", "red light", and "no entry". I created them with gTTS; of course, you can also record your own voice.


from gtts import gTTS #Google Text to Speech
from google import colab

# Mount Google Drive so the generated audio file is written to persistent storage.
colab.drive.mount('/content/gdrive')

path="gdrive/My Drive/make_video/"
word="Stop it"
tts = gTTS(word,lang='ja') #Provide the string to convert to speech
# NOTE: gTTS emits MP3-encoded audio; the .wav extension here is only a file name.
tts.save(path+'STOP_2.wav') #Save the converted speech to a file

③ Prepare images of traffic signs (red light, green light, stop, no entry, blue sky). These are used for the traffic sign deep learning in section 2.

Download images from Google Image Search.


#Take an image of a stop
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium

import urllib.request as req
from selenium import webdriver
from import Options
import time
from selenium.webdriver.common.keys import Keys 

from google import colab'/content/gdrive')

# Purpose of this cell: collect thumbnail elements from an image-search result page with
# Selenium and save each thumbnail's `src` to Google Drive as <count>.jpg.
# NOTE(review): several statements of this cell appear to be missing (see the notes below);
# the code is left exactly as found and will not run as-is.
#Launch the browser in headless mode and display the website
# NOTE(review): no headless flag is set on `options` in the visible code, although the
# comment above mentions headless mode — confirm against the original notebook.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome('chromedriver',options=options)


#Specifying the target URL
# NOTE(review): nothing here navigates to a URL, and `tempImageCount` / `count`, which are
# read below, are never initialised in the visible code.

while True:
    #image list acquisition
    image_list = driver.find_elements_by_class_name('rg_ic')
    #ScrollBar at the bottom
    # NOTE(review): the statement that performs the scroll is not present.

    # Keep paging only while the number of collected thumbnails is still growing.
    # NOTE(review): no `else: break` is visible, so this loop has no exit as written.
    if len(image_list) > tempImageCount:
        tempImageCount = len(image_list)
        print('------------------- go to next page --------------------------')
            # NOTE(review): the deeper indentation and the two messages below suggest a
            # try/except around a button click with a fallback — presumably lost; confirm.
            #Show "Show more results" button
            print('------------------- click success --------------------------')
            print('------------------- KEY END --------------------------')

#image list acquisition
image_list = driver.find_elements_by_class_name('rg_ic')
for image in image_list:
        #Get image URL
        image_url = image.get_attribute('src')
        #Save image
            # NOTE(review): indentation plus the trailing 'cant open url' message suggest a
            # try/except around the download; the `with` body (presumably a write of the
            # downloaded bytes to `f`) is also absent — confirm.
            image = req.urlopen(image_url).read()
            with open('gdrive/My Drive/image/Traffic safety/stop/'+str(count)+'.jpg',mode='wb') as f:
            print('download - {}'.format(count))
            count += 1
            print('cant open url')


Choose the good ones from the acquired images (as many as possible, 20 or more). Good example (no background): 2.jpg. Bad example: 205.jpg

** 2, traffic sign Deep Learning **

① Convert image file to Numpy format

Each sample in the "MNIST" dataset is a (28, 28) array, as shown below. image.png Following that example, first resize each image file to a 50 * 50 square and then convert it to NumPy format. Because the images are in RGB color mode, each one becomes a (50, 50, 3) array.


import cv2
import os
from sklearn.model_selection import train_test_split
from PIL import Image
import os,glob
import numpy as np

from google import colab'/content/gdrive')
!ls 'gdrive/My Drive'

#Select a category of classification
root_dir = 'gdrive/My Drive/image/Traffic safety 2/'
train_dir = 'gdrive/My Drive/data/'
groups = ['Green light','one-way','stop','No entry','Red light','blue sky']
nb_classes = len(groups)
image_size = 50

#Convert image data to Numpy format
#Read image data for each folder
X = []
Y = []
#Since there are few pictures, I will have them study the same picture 20 times. Humans remember things over and over, is it the same for deep learning?
for i in range(0,20,1):
  for idx,group in enumerate(groups):
      image_dir = root_dir + group
      files = glob.glob(image_dir+'/*.jpg')
      for i,f in enumerate(files):
          img =
          img = img.convert('RGB') #Change to RGB mode
          img = img.resize((image_size,image_size))#50*Resize to 50
          data = np.asarray(img)
X = np.array(X)
Y = np.array(Y)

X_train,X_test,y_train,y_test = train_test_split(X,Y,random_state=0)
xy = (X_train,X_test,y_train,y_test)'gdrive/My Drive/data/Traffic safety 2.npy', xy)

② Deep learning with a neural network. This step uses the "convolution" technique; please refer to the following URL: "What is a convolutional neural network? The procedure is also explained carefully". Applying a two-dimensional filter to the image data, as shown above, emphasizes horizontal and vertical lines, which can greatly improve the success rate.


!pip install keras==2.2.4
import keras

from google import colab'/content/gdrive')
!ls 'gdrive/My Drive'

import numpy as np
from keras.datasets import mnist
from keras.utils import np_utils

#Read the data saved in process ①
x_train,x_test,y_train,y_test = np.load('gdrive/My Drive/data/Traffic safety 2.npy', mmap_mode=None, allow_pickle=True , fix_imports=True, encoding='ASCII')

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
num_classes = 10
y_train = np_utils.to_categorical(y_train, num_classes)
y_test = np_utils.to_categorical(y_test, num_classes)

#neural network
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam
import time

model = Sequential()
model.add(Conv2D(50, (3, 3), 
          input_shape=(50, 50, 3), activation='relu'))       #Convolution ①
model.add(Conv2D(32, (3, 3), activation='relu'))          #Convolution ②
model.add(MaxPooling2D(pool_size=(2, 2)))                
model.add(Conv2D(64, (3, 3), activation='relu'))         ##Convolution ③
model.add(MaxPooling2D(pool_size=(2, 2)))                
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))


startTime = time.time()

history =, y_train, batch_size=3000, epochs=20,
                    verbose=1, validation_data=(x_test, y_test))

score = model.evaluate(x_test, y_test, verbose=0)
print('loss:', score[0])
#success rate
print('accuracy:', score[1])
#Learning time
print("Computation time:{0:.3f} sec".format(time.time() - startTime))

#Save trained model'gdrive/My Drive/model/Traffic safety.h5')

The success rate is about 0.98, which looks good. loss: 0.11440953898268777 accuracy: 0.9878378378378379 Computation time:46.734 sec

** 3, Analyze Video to detect red light, stop, and no entry frames **

(1) How do you cut out the image of the sign from the actual photo and identify it with the trained model? 止まれ.jpg

I referred to the following. [Python / OpenCV] Detect moving objects with color tracking

Get the biggest blob.ipynb

#Blob analysis
def analysis_blob(binary_img):
    """Return the centroid of the largest connected blob in a binary image.

    Args:
        binary_img: single-channel binary mask passed to
            cv2.connectedComponentsWithStats.

    Returns:
        The centroid row (x, y) of the blob with the largest area, or None when
        the mask contains no blob.
    """
    #Binary image labeling process
    label = cv2.connectedComponentsWithStats(binary_img)

    #Extract blob information item by item.
    #Row 0 is dropped from both tables: OpenCV reserves label 0 for the background.
    data = np.delete(label[2], 0, 0)
    center = np.delete(label[3], 0, 0)
    if len(center) <= 0:
        # No blob found; the caller checks for None.
        return None

    #Index with the largest blob area (column 4 of the stats table is the area)
    max_index = np.argmax(data[:, 4])

    return center[max_index]

Prediction method.ipynb (arguments are image file and coordinates)

def predictWithModel(image_file, x_from, y_from, x_to, y_to):
    """Classify one rectangular crop of a BGR frame with the trained model.

    Args:
        image_file: BGR image array the crop is taken from.
        x_from, y_from, x_to, y_to: pixel bounds of the crop.

    Returns:
        (class_index, probability) when the best class scores above 0.9 and is
        neither class 0 nor class 5; otherwise (-1, -1).
    """
    side = 50

    # Crop, convert to the RGB order used for training, and resize to the network input.
    patch = image_file[y_from : y_to, x_from: x_to]
    patch = cv2.cvtColor(patch, cv2.COLOR_BGR2RGB)
    patch = cv2.resize(patch, (side, side))

    # Batch of one, scaled to 0..1.
    batch = np.array([patch]).astype('float32')
    batch /= 255

    scores = model.predict(batch)[0, :]
    best_class = np.argmax(scores)
    best_score = float(np.sort(scores)[::-1][0])

    # Reject low-confidence results and the two classes that need no warning.
    if best_score > 0.9 and best_class != 5 and best_class != 0:
        return best_class, best_score
    return -1, -1

Coordinate adjustment method.ipynb (the red-light crop is rectangular; the stop and no-entry crops are square)

def adjustSize(height, width, center_x, center_y, div_size, div_size_w, kbn):
    """Compute a crop window around a blob centre, clamped to the image bounds.

    Args:
        height, width: size of the image the window must stay inside.
        center_x, center_y: centre of the detected blob.
        div_size: window height; also the window width in square mode.
        div_size_w: window width in rectangular mode.
        kbn: 1 for a rectangular window, any other value for a square one.

    Returns:
        (x_from, y_from, x_to, y_to) with each bound limited to [0, width] or
        [0, height].
    """
    if kbn == 1:
        # Rectangular window: three quarters of the width lies to the left of the centre.
        # NOTE(review): presumably because the red lamp sits at the right end of a
        # horizontal signal — confirm.
        x_from = center_x - div_size_w*3//4
        x_to = x_from + div_size_w
    else:
        # Square window centred on the blob.
        x_from = center_x - div_size//2
        x_to = x_from + div_size
    y_from = center_y - div_size//2
    y_to = y_from + div_size

    # Clamp after both ends are computed, so a window cut at the edge is smaller, not shifted.
    x_from = max(x_from, 0)
    y_from = max(y_from, 0)
    x_to = min(x_to, width)
    y_to = min(y_to, height)
    return x_from, y_from, x_to, y_to

Image file analysis.ipynb

def predictImage3(argimg):
    """Look for a warning sign in one BGR frame and annotate the frame if one is found.

    The red-coloured region with the largest area is located, three crop windows
    around its centre are classified with predictWithModel, and the most
    confident result is used.

    Args:
        argimg: BGR frame (height x width x channels array).

    Returns:
        (frame, index): the annotated frame and the detected class index, or the
        unmodified frame and -1 when nothing was recognised.
    """
    height, width = argimg.shape[:2]

    #Image crop: only this region is analysed.
    # NOTE(review): the slice halves the width as well as the height, so only the
    # top-left quarter is searched — confirm this is intended.
    img = argimg[0:height//2,0:width//2,::]
    #Convert to HSV color space.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    #Extract the red border by binarization.
    binary = cv2.inRange(hsv, (145, 70, 0), (180, 255, 255))
    if len(binary) <= 0:
        return argimg,-1

    #Use OPENING to eliminate noise.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    eroded = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    #Blob analysis of mask image (obtain blob information of maximum area)
    center = analysis_blob(eroded)
    if center is None:
        return argimg,-1

    #Get center coordinates (the crop starts at the frame origin, so they are valid in argimg too)
    center_x = int(center[0])
    center_y = int(center[1])

    #Candidate windows as (div_size, div_size_w, kbn): rectangle, square, smaller rectangle
    windows = ((60, 100, 1), (50, 100, 0), (40, 80, 1))
    indx = []
    pcnt = []
    for div_size, div_size_w, kbn in windows:
        x_from, y_from, x_to, y_to = adjustSize(height, width, center_x, center_y, div_size, div_size_w, kbn)
        cls, prob = predictWithModel(img, x_from, y_from, x_to, y_to)
        indx.append(cls)
        pcnt.append(prob)

    #Keep the most confident window (the first one wins on ties)
    best = int(np.argmax(pcnt))
    max_index = indx[best]
    if max_index <= 0:
        return argimg,-1

    #Caption per class index; classes without an entry get the circle but no caption
    labels = {2: "Stop", 4: "Red light", 3: "No entry"}
    text = labels.get(max_index)

    #Draw a circle around the center of the maximum blob on the frame
    cv2.circle(argimg, (center_x, center_y), 80, (0, 200, 0),thickness=3, lineType=cv2.LINE_AA)

    fontpath ='gdrive/My Drive/font/MSMINCHO.TTC' #font
    font = ImageFont.truetype(fontpath, 128) #font size
    img_pil = Image.fromarray(argimg) #Wrap the 8-bit frame array as a PIL Image

    if text is not None:
        draw = ImageDraw.Draw(img_pil) #Create draw instance
        position = (center_x, center_y + 100) #Text display position
        #The array keeps its BGR channel order inside PIL, so fill is given in BGR(A) order
        draw.text(position, text, font = font , fill = (0,0,255,0) )

    return np.array(img_pil),max_index #Convert PIL back to an array

After preparing the above method, analyze the prepared Video

Analyze Video.ipynb

import cv2
from google.colab.patches import cv2_imshow
from PIL import ImageFont, ImageDraw, Image

target_dir = 'gdrive/My Drive/target/Traffic safety 2/'
# Sorted so the clips are appended to the output in a reproducible order.
files = sorted(glob.glob(target_dir+'/src_*.mp4'))

target_avi_file = target_dir + "output.avi"
output_file = target_dir + "output.mp4"
#Create a VideoWriter.
# NOTE: the writer is opened for 1920x1080 at 30 FPS; source frames are assumed to match.
fourcc = cv2.VideoWriter_fourcc(*"DIVX")
writer = cv2.VideoWriter(target_avi_file, fourcc, 30, (1920, 1080))

#One row per source video: [frame number in the output video, class index]
fame_index_result=np.empty((0,2), int)
#Counts frames written to the output so far. It runs across all source files because
#the audio step converts these frame numbers into times within the single output video.
frame_cnt = 0

for f in files:
    #Create a VideoCapture.
    cap = cv2.VideoCapture(f)
    temp=np.empty((0,2), int)

    while True:
        #Get frame by frame.
        ret, frame = cap.read()
        if not ret:
            break  #If frame acquisition fails or the end of the video is reached

        frame,index = predictImage3(frame)
        if index > 0:
            temp = np.append(temp, np.array([[frame_cnt,index]]), axis=0)
        writer.write(frame)  #Write a frame.
        frame_cnt += 1

    cap.release()

    #Traffic sign: record the first recognised frame (= timing to insert voice)
    #of the class detected most often in this video.
    index_cnt = [np.count_nonzero(temp[:,1] == 2),np.count_nonzero(temp[:,1] == 3),np.count_nonzero(temp[:,1] == 4)]
    max_index = [2,3,4][np.argmax(index_cnt)]
    hits = temp[temp[:,1] == max_index]
    if len(hits) == 0:
        continue  #Nothing with a voice warning was detected in this video
    fame_index_result = np.append(fame_index_result, np.array([hits[0]]), axis=0)

#Finalise the output file so the audio step can read it.
writer.release()


** 4, put audio in the detected Frame **

Add audio.ipynb

!pip -q install pydub
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg
!pip install pyaudio
import cv2
import pyaudio
import sys
import time
import wave
import pydub
from pydub import AudioSegment
import moviepy.editor as mp
import datetime

temp_file = target_dir + "temp.mp4"

# Add audio to output video.
# NOTE(review): `mp3_file` is not defined anywhere in the visible source; it must name the
# base audio track before this cell runs — confirm against the original notebook.
clip_output = mp.VideoFileClip(target_avi_file).subclip()
clip_output.write_videofile(temp_file, audio=mp3_file)

cap = cv2.VideoCapture(target_avi_file)
video_frame = cap.get(cv2.CAP_PROP_FRAME_COUNT) #Get the number of frames
video_fps = cap.get(cv2.CAP_PROP_FPS)           #Get FPS
cap.release()
video_len_sec = video_frame / video_fps         #Calculate the length (seconds)
video = mp.VideoFileClip(temp_file).subclip(0,video_len_sec)

mp3_stop_file_2="gdrive/My Drive/mp3/stop_2.wav"
mp3_stop_file_3="gdrive/My Drive/mp3/stop_3.wav"
mp3_stop_file_4="gdrive/My Drive/mp3/stop_4.wav"
#Warning voice per detected class index
voice_files = {2: mp3_stop_file_2, 3: mp3_stop_file_3, 4: mp3_stop_file_4}
#Clips in playback order; a plain list is what concatenate_videoclips expects
videos = []

#Start (seconds) of the stretch of video not yet added to `videos`
startSec = 0
sign_count = fame_index_result.shape[0]

#Insert voice for each frame of traffic sign
for idx in range(sign_count):

    mp3_stop_file = voice_files[fame_index_result[idx][1]]
    base_sound = AudioSegment.from_file(mp3_stop_file)
    length_seconds = base_sound.duration_seconds  #Check the length of the voice

    #First, cut out from the previous position to the frame of the traffic sign
    video_len_sec_temp = fame_index_result[idx][0] / video_fps
    if startSec < video_len_sec_temp:
        videos.append(mp.VideoFileClip(temp_file).subclip(startSec,video_len_sec_temp))

    #Match the length of the audio, cut out a video of the same length, and insert the audio.
    #Each sign gets its own temporary file: clips already in `videos` still read from
    #theirs when the final video is written, so the file must not be overwritten.
    temp_file2 = target_dir + "temp2_{}.mp4".format(idx)
    clip_output = mp.VideoFileClip(temp_file).subclip(video_len_sec_temp, video_len_sec_temp+length_seconds)
    clip_output.write_videofile(temp_file2, audio=mp3_stop_file)
    videos.append(mp.VideoFileClip(temp_file2).subclip())

    #Remaining Video: up to the next sign, or to the end of the video after the last sign
    if idx == sign_count - 1:
        last_sec =  video_len_sec
    else:
        last_sec = fame_index_result[idx+1][0] / video_fps
    if video_len_sec_temp+length_seconds < last_sec:
        videos.append(mp.VideoFileClip(temp_file).subclip(video_len_sec_temp+length_seconds, last_sec))
    startSec = last_sec

#Concatenate edited videos and write the finished video
final_clip = mp.concatenate_videoclips(videos)
final_clip.write_videofile(output_file)

