"Kumantic Segumantion" to get information about bears from images showing bears. Following Last time, this time I will try to check what the network defined in Kumantic Segumantion is looking at. I did.
Last time Same as the defined one. Hereinafter, it will be referred to as "bear network".
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.encode2 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.encode3 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.encode4 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(64)
            ])

        # Decoder part
        self.decode4 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 64, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.decode3 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.decode2 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.decode1 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
            ])

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size = 2, stride = 2, output_size = dim_3)
        x = F.relu(self.decode4(x))
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
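As a quick sanity check (not in the original post), the architecture maps a 1-channel input to a 1-channel output of the same spatial size, which you can confirm with a dummy tensor:

# Pass a dummy 400x400 grayscale batch through an untrained Kuma and check the output shape
dummy = torch.zeros(1, 1, 400, 400)
print(Kuma()(dummy).shape)  # expected: torch.Size([1, 1, 400, 400])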
Download the trained bear network created last time.
url = "https://github.com/maskot1977/PythonCourse2019/blob/master/kuma_050_20200226.pytorch?raw=true"
import urllib.request
urllib.request.urlretrieve(url, 'kuma_050_20200226.pytorch') #Download data
('kuma_050_20200226.pytorch', <http.client.HTTPMessage at 0x7f73177ebef0>)
Load the trained weights into the defined bear network.
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_050_20200226.pytorch"))
<All keys matched successfully>
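One point the original post does not mention: because the network contains BatchNorm layers, it is usually safer to switch to evaluation mode before running inference, so that the stored running statistics are used.

kuma.eval()  # BatchNorm layers now use running statistics instead of per-batch statistics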
kuma
Kuma(
  (encode1): Sequential(
    (0): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode2): Sequential(
    (0): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode3): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (encode4): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode4): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode3): Sequential(
    (0): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode2): Sequential(
    (0): ConvTranspose2d(16, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (decode1): Sequential(
    (0): ConvTranspose2d(6, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
)
Looking at the parameters one by one:
for name, param in kuma.named_parameters():
    print(name, param.shape)
encode1.0.weight torch.Size([6, 1, 3, 3])
encode1.0.bias torch.Size([6])
encode1.1.weight torch.Size([6])
encode1.1.bias torch.Size([6])
encode2.0.weight torch.Size([16, 6, 3, 3])
encode2.0.bias torch.Size([16])
encode2.1.weight torch.Size([16])
encode2.1.bias torch.Size([16])
encode3.0.weight torch.Size([32, 16, 3, 3])
encode3.0.bias torch.Size([32])
encode3.1.weight torch.Size([32])
encode3.1.bias torch.Size([32])
encode4.0.weight torch.Size([64, 32, 3, 3])
encode4.0.bias torch.Size([64])
encode4.1.weight torch.Size([64])
encode4.1.bias torch.Size([64])
decode4.0.weight torch.Size([64, 32, 3, 3])
decode4.0.bias torch.Size([32])
decode4.1.weight torch.Size([32])
decode4.1.bias torch.Size([32])
decode3.0.weight torch.Size([32, 16, 3, 3])
decode3.0.bias torch.Size([16])
decode3.1.weight torch.Size([16])
decode3.1.bias torch.Size([16])
decode2.0.weight torch.Size([16, 6, 3, 3])
decode2.0.bias torch.Size([6])
decode2.1.weight torch.Size([6])
decode2.1.bias torch.Size([6])
decode1.0.weight torch.Size([6, 1, 3, 3])
decode1.0.bias torch.Size([1])
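Just to get a feel for the scale, a quick sketch (not in the original post) that totals the parameters listed above:

# Sum the number of elements over all trainable parameters of the bear network
total = sum(p.numel() for p in kuma.parameters() if p.requires_grad)
print(total)  # on the order of a few tens of thousands for this small network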
Right, I see, I completely understand now ← (I don't actually understand)
Apparently, the matrix parameters shown above are called kernels (or filters) in the world of deep learning. Their shapes are as listed above; now let's visualize the numbers they actually contain.
import matplotlib.pyplot as plt

for name, param in kuma.named_parameters():
    print(name)
    print(param.shape)
    if len(param.shape) == 4:
        x, y, z, w = param.shape
        idx = 0
        fig = plt.figure(figsize=(x, y))
        for para in param:
            for par in para:
                idx += 1
                ax = fig.add_subplot(y, x, idx)
                im = ax.imshow(par.detach().numpy(), cmap="gray")
                ax.axis('off')
                # fig.colorbar(im)
        plt.show()
        # break
encode1.0.weight torch.Size([6, 1, 3, 3])
(figure: the 6 × 1 learned 3 × 3 kernels of encode1.0.weight)
encode1.0.bias torch.Size([6])
encode1.1.weight torch.Size([6])
encode1.1.bias torch.Size([6])
encode2.0.weight torch.Size([16, 6, 3, 3])
(figure: the 16 × 6 learned 3 × 3 kernels of encode2.0.weight)
encode2.0.bias torch.Size([16])
encode2.1.weight torch.Size([16])
encode2.1.bias torch.Size([16])
encode3.0.weight torch.Size([32, 16, 3, 3])
(figure: the 32 × 16 learned 3 × 3 kernels of encode3.0.weight)
encode3.0.bias torch.Size([32])
encode3.1.weight torch.Size([32])
encode3.1.bias torch.Size([32])
encode4.0.weight torch.Size([64, 32, 3, 3])
(figure: the 64 × 32 learned 3 × 3 kernels of encode4.0.weight)
encode4.0.bias torch.Size([64])
encode4.1.weight torch.Size([64])
encode4.1.bias torch.Size([64])
decode4.0.weight torch.Size([64, 32, 3, 3])
(figure: the 64 × 32 learned 3 × 3 kernels of decode4.0.weight)
decode4.0.bias torch.Size([32])
decode4.1.weight torch.Size([32])
decode4.1.bias torch.Size([32])
decode3.0.weight torch.Size([32, 16, 3, 3])
(figure: the 32 × 16 learned 3 × 3 kernels of decode3.0.weight)
decode3.0.bias torch.Size([16])
decode3.1.weight torch.Size([16])
decode3.1.bias torch.Size([16])
decode2.0.weight torch.Size([16, 6, 3, 3])
(figure: the 16 × 6 learned 3 × 3 kernels of decode2.0.weight)
decode2.0.bias torch.Size([6])
decode2.1.weight torch.Size([6])
decode2.1.bias torch.Size([6])
decode1.0.weight torch.Size([6, 1, 3, 3])
(figure: the 6 × 1 learned 3 × 3 kernels of decode1.0.weight)
decode1.0.bias torch.Size([1])
As far as my limited knowledge goes, this bear network scans the image with these 3 × 3 kernels (or filters) to extract features in the image such as "lines".
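To see what "scanning with a 3 × 3 kernel" means in practice, here is a minimal sketch (not in the original post) that applies just the first learned filter of encode1 to a grayscale image with F.conv2d. Here bear_im is an assumed name for any PIL image of a bear, for example one produced by the generator further below.

# Apply a single learned 3x3 kernel to one grayscale image and look at its response map
import numpy as np
bear_gray = torch.tensor(np.asarray(bear_im.convert("L")), dtype=torch.float32)[None, None]  # (1, 1, H, W)
first_kernel = kuma.encode1[0].weight[0:1].detach()      # (1, 1, 3, 3): first filter of encode1
response = F.conv2d(bear_gray, first_kernel, padding=1)  # strong values where the filter's pattern matches
plt.imshow(response[0, 0].numpy(), cmap="gray")
plt.axis('off')
plt.show()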
Still, I'm not sure how this ends up recognizing Kuma-san (the bear).
Staring at the kernels (or filters) by themselves doesn't tell me much, so I tried passing a bear image through the bear network and looking at what the image turns into in each layer.
The image generation code is the same as last time.
import numpy as np
import random
from PIL import Image, ImageDraw, ImageFilter
from itertools import product
def draw_bear(n_bear=1):  # Randomly generate an image of a bear
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)

    for _ in range(random.randint(-1, 0)):
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1+dx, y1+dy), fill=(r, g, b))

    for _ in range(n_bear):
        r = g = b = 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 90
        dx2 = 20
        dy1 = 90
        dy2 = 20
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)

        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)

        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2 = modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)

        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        #draw.ellipse(shape4, fill=(r, g, b))

    return im

def modify(shape, zoom=1, center_x=0, center_y=0):
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)

class Noise:  # Add noise to the bear's image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()
        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (0, 0, 0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image

## Process the bear image into teacher data for semantic segmentation
def getdata_for_semantic_segmentation(im):
    x_im = im.filter(ImageFilter.CONTOUR)
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()
    a_im = np.asarray(im)
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im
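To sanity-check the generator, here is a small usage sketch (not in the original post) that draws one sample pair and displays it:

# Generate one noisy contour image and its bear mask and show them side by side
x_im, y_im = getdata_for_semantic_segmentation(draw_bear(3))
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].imshow(x_im)   # noisy contour image that goes into the network
axes[1].imshow(y_im)   # white-on-black mask marking the bears
for ax in axes:
    ax.axis('off')
plt.show()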
I modified the network so that it visualizes the intermediate results along the way.
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.encode2 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.encode3 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.encode4 = nn.Sequential(
            *[
                nn.Conv2d(
                    in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(64)
            ])

        # Decoder part
        self.decode4 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 64, out_channels = 32, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(32)
            ])
        self.decode3 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(16)
            ])
        self.decode2 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
                nn.BatchNorm2d(6)
            ])
        self.decode1 = nn.Sequential(
            *[
                nn.ConvTranspose2d(
                    in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
            ])

    def forward(self, x):
        print("forward input:", x.shape)
        draw_layer(x)

        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode1:", x.shape)
        draw_layer(x)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode2:", x.shape)
        draw_layer(x)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode3:", x.shape)
        draw_layer(x)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        print("after encode4:", x.shape)
        draw_layer(x)

        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size = 2, stride = 2, output_size = dim_3)
        x = F.relu(self.decode4(x))
        print("after decode4:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        print("after decode3:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)
        x = F.relu(self.decode2(x))
        print("after decode2:", x.shape)
        draw_layer(x)
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        print("after decode1:", x.shape)
        draw_layer(x)
        return x

def draw_layer(param):  # Visualize every channel of a batch of feature maps
    if len(param.shape) == 4:
        x, y, z, w = param.shape
        idx = 0
        fig = plt.figure(figsize=(y*2, x*2))
        for para in param:
            for par in para:
                idx += 1
                ax = fig.add_subplot(x, y, idx)
                im = ax.imshow(par.detach().numpy(), cmap="gray")
                ax.axis('off')
                # fig.colorbar(im)
        plt.show()
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_050_20200226.pytorch"))
<All keys matched successfully>
I generated 4 images with 3 bears each and checked what they look like in each intermediate layer.
X_test = []  # Stores image data for testing
Y_test = []  # Stores correct answer data for testing
Z_test = []  # Stores prediction results for testing

for i in range(4):  # Generate 4 new data items not used for learning
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear(3))
    X_test.append(x_im)
    Y_test.append(y_im)

# Format test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store predicted values as ndarray
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
forward input: torch.Size([4, 1, 400, 400])
(figure: the 4 input images)
after encode1: torch.Size([4, 6, 200, 200])
(figure: the 6 feature maps per image after encode1)
after encode2: torch.Size([4, 16, 100, 100])
(figure: the 16 feature maps per image after encode2)
after encode3: torch.Size([4, 32, 50, 50])
(figure: the 32 feature maps per image after encode3)
after encode4: torch.Size([4, 64, 25, 25])
(figure: the 64 feature maps per image after encode4)
after decode4: torch.Size([4, 32, 50, 50])
(figure: the 32 feature maps per image after decode4)
after decode3: torch.Size([4, 16, 100, 100])
(figure: the 16 feature maps per image after decode3)
after decode2: torch.Size([4, 6, 200, 200])
(figure: the 6 feature maps per image after decode2)
after decode1: torch.Size([4, 1, 400, 400])
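For completeness, the stored predictions can be put next to the inputs and the ground-truth masks; a minimal sketch (not in the original post) using the X_test, Y_test and Z_test lists built above:

# Show input, ground truth, and predicted mask for each of the 4 test images
fig, axes = plt.subplots(4, 3, figsize=(9, 12))
for i in range(4):
    axes[i][0].imshow(X_test[i])                  # noisy input image
    axes[i][1].imshow(Y_test[i])                  # ground-truth bear mask
    axes[i][2].imshow(Z_test[i][0], cmap="gray")  # predicted mask (single channel)
    for ax in axes[i]:
        ax.axis('off')
plt.show()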
The first encoder layer picks up a rough outline, the second and third layers remove noise, the third and fourth layers extract a "contoured region", and the decoder layers (4 through 1) then restore it step by step so that it can be mapped back onto the original image.
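By the way, the same intermediate feature maps can be captured without editing forward() at all, by registering forward hooks on each block. A minimal sketch (not from the original post), assuming the unmodified Kuma class and the test tensor X_test_t from above; activations and save_output are hypothetical helper names:

# Stash the output of every encoder/decoder block during one forward pass
activations = {}

def save_output(name):
    def hook(module, inputs, output):
        activations[name] = output.detach()
    return hook

for name, module in kuma.named_children():  # encode1..encode4, decode4..decode1
    module.register_forward_hook(save_output(name))

_ = kuma(X_test_t)
for name, act in activations.items():
    # Note: these are the block outputs before ReLU and pooling, so the spatial
    # sizes differ slightly from the "after encodeN" printouts above.
    print(name, act.shape)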