"Kumantic Segumantion" to get information about bears from images showing bears. Following Last time, this time I will try to check what the network defined in Kumantic Segumantion is looking at. I did.
Last time Same as the defined one. Hereinafter, it will be referred to as "bear network".
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.encode2 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.encode3 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.encode4 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(64)
            ])

        # Decoder part
        self.decode4 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 64, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.decode3 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.decode2 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.decode1 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
            ])

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size = 2, stride = 2, output_size = dim_3)
        x = F.relu(self.decode4(x))
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
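As a quick sanity check (not in the original post), the architecture maps a 1-channel input to a 1-channel output of the same spatial size, which you can confirm with a dummy tensor:

# Pass a dummy 400x400 grayscale batch through an untrained Kuma and check the output shape
dummy = torch.zeros(1, 1, 400, 400)
print(Kuma()(dummy).shape)  # expected: torch.Size([1, 1, 400, 400])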
Download the trained bear network created last time.
url = "https://github.com/maskot1977/PythonCourse2019/blob/master/kuma_050_20200226.pytorch?raw=true"
import urllib.request
urllib.request.urlretrieve(url, 'kuma_050_20200226.pytorch') #Download data
('kuma_050_20200226.pytorch', <http.client.HTTPMessage at 0x7f73177ebef0>)
Load the trained weights into the defined bear network.
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_050_20200226.pytorch"))
<All keys matched successfully>
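One point the original post does not mention: because the network contains BatchNorm layers, it is usually safer to switch to evaluation mode before running inference, so that the stored running statistics are used.

kuma.eval()  # BatchNorm layers now use running statistics instead of per-batch statistics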
kuma
Kuma(
  (encode1): Sequential(
    (0): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode2): Sequential(
    (0): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode3): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode4): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode4): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode3): Sequential(
    (0): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode2): Sequential(
    (0): ConvTranspose2d(16, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode1): Sequential(
    (0): ConvTranspose2d(6, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)
Looking at the parameters one by one:
for name, param in kuma.named_parameters():
    print(name, param.shape)
encode1.0.weight torch.Size([6, 1, 3, 3])
encode1.0.bias torch.Size([6])
encode1.1.weight torch.Size([6])
encode1.1.bias torch.Size([6])
encode2.0.weight torch.Size([16, 6, 3, 3])
encode2.0.bias torch.Size([16])
encode2.1.weight torch.Size([16])
encode2.1.bias torch.Size([16])
encode3.0.weight torch.Size([32, 16, 3, 3])
encode3.0.bias torch.Size([32])
encode3.1.weight torch.Size([32])
encode3.1.bias torch.Size([32])
encode4.0.weight torch.Size([64, 32, 3, 3])
encode4.0.bias torch.Size([64])
encode4.1.weight torch.Size([64])
encode4.1.bias torch.Size([64])
decode4.0.weight torch.Size([64, 32, 3, 3])
decode4.0.bias torch.Size([32])
decode4.1.weight torch.Size([32])
decode4.1.bias torch.Size([32])
decode3.0.weight torch.Size([32, 16, 3, 3])
decode3.0.bias torch.Size([16])
decode3.1.weight torch.Size([16])
decode3.1.bias torch.Size([16])
decode2.0.weight torch.Size([16, 6, 3, 3])
decode2.0.bias torch.Size([6])
decode2.1.weight torch.Size([6])
decode2.1.bias torch.Size([6])
decode1.0.weight torch.Size([6, 1, 3, 3])
decode1.0.bias torch.Size([1])
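Just to get a feel for the scale, a quick sketch (not in the original post) that totals the parameters listed above:

# Sum the number of elements over all trainable parameters of the bear network
total = sum(p.numel() for p in kuma.parameters() if p.requires_grad)
print(total)  # on the order of a few tens of thousands for this small network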
Right, I see, I completely understand now ← (I don't actually understand)
Apparently, the matrix parameters shown above are called kernels (or filters) in the world of deep learning. Their shapes are as listed above; now let's visualize the numbers they actually contain.
import matplotlib.pyplot as plt

for name, param in kuma.named_parameters():
    print(name)
    print(param.shape)
    if len(param.shape) == 4:
        x, y, z, w = param.shape
        idx = 0
        fig = plt.figure(figsize=(x, y))
        for para in param:
            for par in para:
                idx += 1
                ax = fig.add_subplot(y, x, idx)
                im = ax.imshow(par.detach().numpy(), cmap="gray")
                ax.axis('off')
                # fig.colorbar(im)
        plt.show()
        # break
encode1.0.weight torch.Size([6, 1, 3, 3])
(figure: the 6 × 1 learned 3 × 3 kernels of encode1.0.weight)
encode1.0.bias torch.Size([6])
encode1.1.weight torch.Size([6])
encode1.1.bias torch.Size([6])
encode2.0.weight torch.Size([16, 6, 3, 3])
(figure: the 16 × 6 learned 3 × 3 kernels of encode2.0.weight)
encode2.0.bias torch.Size([16])
encode2.1.weight torch.Size([16])
encode2.1.bias torch.Size([16])
encode3.0.weight torch.Size([32, 16, 3, 3])
(figure: the 32 × 16 learned 3 × 3 kernels of encode3.0.weight)
encode3.0.bias torch.Size([32])
encode3.1.weight torch.Size([32])
encode3.1.bias torch.Size([32])
encode4.0.weight torch.Size([64, 32, 3, 3])
(figure: the 64 × 32 learned 3 × 3 kernels of encode4.0.weight)
encode4.0.bias torch.Size([64])
encode4.1.weight torch.Size([64])
encode4.1.bias torch.Size([64])
decode4.0.weight torch.Size([64, 32, 3, 3])
(figure: the 64 × 32 learned 3 × 3 kernels of decode4.0.weight)
decode4.0.bias torch.Size([32])
decode4.1.weight torch.Size([32])
decode4.1.bias torch.Size([32])
decode3.0.weight torch.Size([32, 16, 3, 3])
(figure: the 32 × 16 learned 3 × 3 kernels of decode3.0.weight)
decode3.0.bias torch.Size([16])
decode3.1.weight torch.Size([16])
decode3.1.bias torch.Size([16])
decode2.0.weight torch.Size([16, 6, 3, 3])
(figure: the 16 × 6 learned 3 × 3 kernels of decode2.0.weight)
decode2.0.bias torch.Size([6])
decode2.1.weight torch.Size([6])
decode2.1.bias torch.Size([6])
decode1.0.weight torch.Size([6, 1, 3, 3])
(figure: the 6 × 1 learned 3 × 3 kernels of decode1.0.weight)
decode1.0.bias torch.Size([1])
As far as my limited knowledge goes, this bear network scans the image with these 3 × 3 kernels (or filters) to extract features in the image such as "lines".
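To see what "scanning with a 3 × 3 kernel" means in practice, here is a minimal sketch (not in the original post) that applies just the first learned filter of encode1 to a grayscale image with F.conv2d. Here bear_im is an assumed name for any PIL image of a bear, for example one produced by the generator further below.

# Apply a single learned 3x3 kernel to one grayscale image and look at its response map
import numpy as np
bear_gray = torch.tensor(np.asarray(bear_im.convert("L")), dtype=torch.float32)[None, None]  # (1, 1, H, W)
first_kernel = kuma.encode1[0].weight[0:1].detach()      # (1, 1, 3, 3): first filter of encode1
response = F.conv2d(bear_gray, first_kernel, padding=1)  # strong values where the filter's pattern matches
plt.imshow(response[0, 0].numpy(), cmap="gray")
plt.axis('off')
plt.show()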
Still, I'm not sure how this ends up recognizing Kuma-san (the bear).
Staring at the kernels (or filters) by themselves doesn't tell me much, so I tried passing a bear image through the bear network and looking at what the image turns into in each layer.
The image generation code is the same as last time.
import numpy as np
import random
from PIL import Image, ImageDraw, ImageFilter
from itertools import product
def draw_bear(n_bear=1):  # Randomly generate an image of a bear
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)

    for _ in range(random.randint(-1, 0)):
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1+dx, y1+dy), fill=(r, g, b))

    for _ in range(n_bear):
        r = g = b = 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 90
        dx2 = 20
        dy1 = 90
        dy2 = 20
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)

        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)

        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2 = modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)

        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        #draw.ellipse(shape4, fill=(r, g, b))

    return im

def modify(shape, zoom=1, center_x=0, center_y=0):
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)

class Noise:  # Add noise to the bear's image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()
        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (0, 0, 0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image

## Process the bear image into teacher data for semantic segmentation
def getdata_for_semantic_segmentation(im):
    x_im = im.filter(ImageFilter.CONTOUR)
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()
    a_im = np.asarray(im)
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im
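To sanity-check the generator, here is a small usage sketch (not in the original post) that draws one sample pair and displays it:

# Generate one noisy contour image and its bear mask and show them side by side
x_im, y_im = getdata_for_semantic_segmentation(draw_bear(3))
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].imshow(x_im)   # noisy contour image that goes into the network
axes[1].imshow(y_im)   # white-on-black mask marking the bears
for ax in axes:
    ax.axis('off')
plt.show()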
I modified the network so that it visualizes the intermediate results along the way.
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.encode2 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.encode3 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.encode4 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(64)
            ])

        # Decoder part
        self.decode4 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 64, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.decode3 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.decode2 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.decode1 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
            ])

    def forward(self, x):
        print("forward input:", x.shape)
        draw_layer(x)

        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode1:", x.shape)
        draw_layer(x)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode2:", x.shape)
        draw_layer(x)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode3:", x.shape)
        draw_layer(x)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode4:", x.shape)
        draw_layer(x)

        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size = 2, stride = 2, output_size = dim_3)
        x = F.relu(self.decode4(x))
        print("after decode4:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        print("after decode3:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)
        x = F.relu(self.decode2(x))
        print("after decode2:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        print("after decode1:", x.shape)
        draw_layer(x)
        return x

def draw_layer(param):  # Visualize every channel of a batch of feature maps
    if len(param.shape) == 4:
        x, y, z, w = param.shape
        idx = 0
        fig = plt.figure(figsize=(y*2, x*2))
        for para in param:
            for par in para:
                idx += 1
                ax = fig.add_subplot(x, y, idx)
                im = ax.imshow(par.detach().numpy(), cmap="gray")
                ax.axis('off')
                # fig.colorbar(im)
        plt.show()
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_050_20200226.pytorch"))
<All keys matched successfully>
I generated 4 images with 3 bears each and checked what they look like in each intermediate layer.
X_test = []  # Stores image data for testing
Y_test = []  # Stores correct answer data for testing
Z_test = []  # Stores prediction results for testing

for i in range(4):  # Generate 4 new data items not used for learning
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear(3))
    X_test.append(x_im)
    Y_test.append(y_im)

# Format test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store predicted values as ndarray
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
forward input: torch.Size([4, 1, 400, 400])
(figure: the 4 input images)
after encode1: torch.Size([4, 6, 200, 200])
(figure: the 6 feature maps per image after encode1)
after encode2: torch.Size([4, 16, 100, 100])
(figure: the 16 feature maps per image after encode2)
after encode3: torch.Size([4, 32, 50, 50])
(figure: the 32 feature maps per image after encode3)
after encode4: torch.Size([4, 64, 25, 25])
(figure: the 64 feature maps per image after encode4)
after decode4: torch.Size([4, 32, 50, 50])
(figure: the 32 feature maps per image after decode4)
after decode3: torch.Size([4, 16, 100, 100])
(figure: the 16 feature maps per image after decode3)
after decode2: torch.Size([4, 6, 200, 200])
(figure: the 6 feature maps per image after decode2)
after decode1: torch.Size([4, 1, 400, 400])
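For completeness, the stored predictions can be put next to the inputs and the ground-truth masks; a minimal sketch (not in the original post) using the X_test, Y_test and Z_test lists built above:

# Show input, ground truth, and predicted mask for each of the 4 test images
fig, axes = plt.subplots(4, 3, figsize=(9, 12))
for i in range(4):
    axes[i][0].imshow(X_test[i])                  # noisy input image
    axes[i][1].imshow(Y_test[i])                  # ground-truth bear mask
    axes[i][2].imshow(Z_test[i][0], cmap="gray")  # predicted mask (single channel)
    for ax in axes[i]:
        ax.axis('off')
plt.show()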
The first encoder layer picks up a rough outline, the second and third layers remove noise, the third and fourth layers extract a "contoured region", and the decoder layers (4 through 1) then restore it step by step so that it can be mapped back onto the original image.
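By the way, the same intermediate feature maps can be captured without editing forward() at all, by registering forward hooks on each block. A minimal sketch (not from the original post), assuming the unmodified Kuma class and the test tensor X_test_t from above; activations and save_output are hypothetical helper names:

# Stash the output of every encoder/decoder block during one forward pass
activations = {}

def save_output(name):
    def hook(module, inputs, output):
        activations[name] = output.detach()
    return hook

for name, module in kuma.named_children():  # encode1..encode4, decode4..decode1
    module.register_forward_hook(save_output(name))

_ = kuma(X_test_t)
for name, act in activations.items():
    # Note: these are the block outputs before ReLU and pooling, so the spatial
    # sizes differ slightly from the "after encodeN" printouts above.
    print(name, act.shape)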