Visualizing the basis of a model's judgment seems to be in fashion lately. I started Python at university, and since I have been working with YOLOv3, I wanted to try visualizing its feature maps. I am a beginner and not yet comfortable with programming conventions or writing style, so please bear with me. The YOLOv3 code I am working with here is the GitHub repository "qqwweee/keras-yolo3". I will tweak this code a little to visualize the feature maps.
YOLOv3 produces detections at three scales (small, medium, and large), so I extract three feature maps, one per scale. The parts I modified in yolo.py are __init__ at the top, the end of generate(), and detect_image(). Please take a look at the comments.
def __init__(self, **kwargs):
    self.__dict__.update(self._defaults)  # set up default values
    self.__dict__.update(kwargs)  # and update with user overrides
    self.class_names = self._get_class()
    self.anchors = self._get_anchors()
    self.sess = K.get_session()
    # Added feature_get to the values returned by generate().
    self.boxes, self.scores, self.classes, self.feature_get = self.generate()
def generate(self):
    model_path = os.path.expanduser(self.model_path)
    assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.'

    # Load model, or construct model and load weights.
    num_anchors = len(self.anchors)
    num_classes = len(self.class_names)
    is_tiny_version = num_anchors==6  # default setting
    try:
        self.yolo_model = load_model(model_path, compile=False)
    except:
        self.yolo_model = tiny_yolo_body(Input(shape=(None,None,3)), num_anchors//2, num_classes) \
            if is_tiny_version else yolo_body(Input(shape=(None,None,3)), num_anchors//3, num_classes)
        self.yolo_model.load_weights(self.model_path)  # make sure model, anchors and classes match
    else:
        assert self.yolo_model.layers[-1].output_shape[-1] == \
            num_anchors/len(self.yolo_model.output) * (num_classes + 5), \
            'Mismatch between model and given anchor and class sizes'

    print('{} model, anchors, and classes loaded.'.format(model_path))

    # Generate colors for drawing bounding boxes.
    hsv_tuples = [(x / len(self.class_names), 1., 1.)
                  for x in range(len(self.class_names))]
    self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    self.colors = list(
        map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
            self.colors))
    np.random.seed(10101)  # Fixed seed for consistent colors across runs.
    np.random.shuffle(self.colors)  # Shuffle colors to decorrelate adjacent classes.
    np.random.seed(None)  # Reset seed to default.

    # Generate output tensor targets for filtered bounding boxes.
    self.input_image_shape = K.placeholder(shape=(2, ))
    if self.gpu_num>=2:
        self.yolo_model = multi_gpu_model(self.yolo_model, gpus=self.gpu_num)
    boxes, scores, classes = yolo_eval(self.yolo_model.output, self.anchors,
            len(self.class_names), self.input_image_shape,
            score_threshold=self.score, iou_threshold=self.iou)
    # Added the following: pick the three layers whose feature maps we want.
    # I checked model.summary() and chose the activation layer just before each output.
    feature_get = [self.yolo_model.get_layer("leaky_re_lu_58").output,
                   self.yolo_model.get_layer("leaky_re_lu_65").output,
                   self.yolo_model.get_layer("leaky_re_lu_72").output]
    return boxes, scores, classes, feature_get
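One caution: names like "leaky_re_lu_58" are auto-generated by Keras, so they can differ depending on the Keras version or on how many models were built in the same session. Rather than hard-coding them, you can list the candidates and check their output shapes, which is essentially what I did with model.summary(). A small sketch:

    # List every LeakyReLU layer with its output shape and pick the one
    # feeding each of the three output convolutions (the names below may
    # differ in your environment).
    for layer in self.yolo_model.layers:
        if 'leaky_re_lu' in layer.name:
            print(layer.name, layer.output_shape)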
def detect_image(self, image):
    start = timer()

    if self.model_image_size != (None, None):
        assert self.model_image_size[0]%32 == 0, 'Multiples of 32 required'
        assert self.model_image_size[1]%32 == 0, 'Multiples of 32 required'
        boxed_image = letterbox_image(image, tuple(reversed(self.model_image_size)))
    else:
        new_image_size = (image.width - (image.width % 32),
                          image.height - (image.height % 32))
        boxed_image = letterbox_image(image, new_image_size)
    image_data = np.array(boxed_image, dtype='float32')

    print(image_data.shape)
    image_data /= 255.
    image_data = np.expand_dims(image_data, 0)  # Add batch dimension.

    # Added feature_get to the fetched tensors.
    out_boxes, out_scores, out_classes, feature_get = self.sess.run(
        [self.boxes, self.scores, self.classes, self.feature_get],
        feed_dict={
            self.yolo_model.input: image_data,
            self.input_image_shape: [image.size[1], image.size[0]],
            K.learning_phase(): 0
        })

    print('Found {} boxes for {}'.format(len(out_boxes), 'img'))

    font = ImageFont.truetype(font='font/FiraMono-Medium.otf',
                size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 300

    for i, c in reversed(list(enumerate(out_classes))):
        predicted_class = self.class_names[c]
        box = out_boxes[i]
        score = out_scores[i]

        label = '{} {:.2f}'.format(predicted_class, score)
        draw = ImageDraw.Draw(image)
        label_size = draw.textsize(label, font)

        top, left, bottom, right = box
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))
        print(label, (left, top), (right, bottom))

        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # My kingdom for a good redistributable image drawing library.
        for i in range(thickness):
            draw.rectangle(
                [left + i, top + i, right - i, bottom - i],
                outline=self.colors[c])
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=self.colors[c])
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw

    end = timer()
    return image, feature_get
Now let's write the code that turns the captured activations into a feature-map image. Dumping every channel would produce far too many images, so for the time being I extract, for each of the small, medium, and large outputs, the single channel with the highest average activation. This goes into a file called feature_img.py (it is imported under that name in yolo_video.py below).
import numpy as np
import cv2

def feature(conv_list):
    # conv_list holds the three activations, one per scale. For each scale,
    # normalize every channel by its own max, pick the channel with the
    # highest mean activation, scale it to 0-255, resize it to 350x350,
    # and tile the three picks onto one 720x720 canvas.
    bg = np.zeros((720, 720))
    picked = []
    for conv_array in conv_list:
        conv_array = np.asarray(conv_array)
        averages = []
        for i in range(conv_array.shape[3]):
            conv = conv_array[0, :, :, i]
            conv = conv / np.max(conv)
            averages.append(np.mean(conv))
        index = int(np.argmax(averages))
        conv = conv_array[0, :, :, index]
        conv = conv / np.max(conv) * 255
        conv = cv2.resize(conv, (350, 350), interpolation=cv2.INTER_NEAREST)
        picked.append(conv)
    bg[5:355, 5:355] = picked[0]     # first scale, top-left
    bg[5:355, 365:715] = picked[1]   # second scale, top-right
    bg[365:715, 5:355] = picked[2]   # third scale, bottom-left
    return bg
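As a quick sanity check (my own addition, not in the repo): with a 416 x 416 input, the three activations come back from sess.run as numpy arrays with 13 x 13, 26 x 26, and 52 x 52 grids, so a shape dump tells you immediately whether you grabbed the right layers.

    # Sanity check: dump the shape of each captured activation.
    # For a 416x416 input, expect grids of 13x13, 26x26 and 52x52.
    for fmap in feature_get:
        print(np.asarray(fmap).shape)  # (1, grid, grid, channels)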
Finally, I modified yolo_video.py.
# yolo_video.py also needs these imports added at the top:
#     import cv2
#     import feature_img
def detect_img(yolo):
    while True:
        img = input('Input image filename:')
        try:
            image = Image.open(img)
        except:
            print('Open Error! Try again!')
            continue
        else:
            r_image, feature_get = yolo.detect_image(image)
            # Added the following: build the feature-map image and save it.
            img_feature = feature_img.feature(feature_get)
            cv2.imwrite("feature_map.png", img_feature)
            r_image.show()
    yolo.close_session()
Now, running python yolo_video.py --image should write feature_map.png alongside the detection result.
This time I borrowed a cat photo from Pakutaso, a free stock-photo site, and will try detecting it. The weights are the default yolo.h5.
↓ Here is the detected cat
And the feature map looks like this:
I'm honestly not sure how to interpret it, but the cat's eyes and mouth do seem to stand out as features. Note that when the input image enters the network it is letterboxed: resized with its aspect ratio preserved and pasted onto a gray 416 x 416 canvas, and the feature maps correspond to that letterboxed image. YOLOv3 makes its predictions relative to grid cells, but from these maps alone I can't tell which cell it treats as the object's center, nor which of the three maps is responsible for deciding this is a cat, so I would have to make that visible as well. Also, I only looked at the last activation layer before each output this time; other layers may look quite different. I'm not confident my way of handling the extracted map values is right either, so there is probably a better approach.
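As an aside on that letterboxing: a feature-map cell lives in the 416 x 416 letterboxed coordinate system, so relating it back to the original photo means undoing the scale and the gray padding. Here is a minimal sketch (a hypothetical helper of my own, assuming the standard letterbox math used by letterbox_image):

    def cell_to_image(cx, cy, grid, img_w, img_h, input_size=416):
        # Map cell (cx, cy) of a grid x grid feature map back to the
        # original img_w x img_h image, undoing letterbox scale and padding.
        scale = min(input_size / img_w, input_size / img_h)
        pad_x = (input_size - img_w * scale) / 2  # gray border, left/right
        pad_y = (input_size - img_h * scale) / 2  # gray border, top/bottom
        stride = input_size / grid                # network-input pixels per cell
        x0 = (cx * stride - pad_x) / scale        # cell's left edge in the image
        y0 = (cy * stride - pad_y) / scale        # cell's top edge in the image
        return x0, y0, stride / scale             # position and cell size (px)

For the 13 x 13 map, the stride is 32 network pixels, i.e. 32/scale pixels in the original image, which gives a feel for how coarse each scale is.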
This is the first time I have written an article like this, and I hope it will be helpful for other beginners.