Kumantic Segmentation

"Kumantic Segumantion" to get information about bears from images showing bears. This is a sequel to Kumantic Segmentation 2.

Difference from last time

Last time, I aimed to "detect multiple bear footprints" by training on "images containing multiple bear footprints" (it did not work well).

This time, we aim to "detect multiple bear silhouettes" by training on "images that each show only one bear silhouette".

Automatically generate an image of a bear

The following code is used to create the datasets for training and validation.

import numpy as np
import random
from PIL import Image, ImageDraw, ImageFilter
from itertools import product

def draw_bear(n_bear=1): #Randomly generate an image of a bear
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)

    # Draw random distractor ellipses; note that range(random.randint(-1, 0))
    # is always empty, so this loop is effectively disabled as written
    for _ in range(random.randint(-1, 0)):
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1+dx, y1+dy), fill=(r, g, b))

    for _ in range(n_bear):
        r = g = b = 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 90
        dx2 = 20
        dy1 = 90
        dy2 = 20
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)

        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)

        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2 = modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)

        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        #draw.ellipse(shape4, fill=(r, g, b))

    return im

def modify(shape, zoom=1, center_x=0, center_y=0):
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)

class Noise: #Add noise to the bear's image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()

        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (  0,   0,   0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image

## Process the bear image into training data for semantic segmentation
def getdata_for_semantic_segmentation(im): 
    x_im = im.filter(ImageFilter.CONTOUR)
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()
    a_im = np.asarray(im)
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im

With the above functions in place, you can obtain a bear image x_im and its ground-truth data y_im as follows.

x_im, y_im = getdata_for_semantic_segmentation(draw_bear())

Check the contents

x_im

output_4_0.png

y_im

output_5_0.png

Alright, I've confirmed that an image of Mr. Bear is generated. This is a bear, no matter how you look at it.

Generate 1000 training samples

Generate the training data as follows.

%%time
X_data = [] #For storing image data
Y_data = [] #For storing correct answer data
for i in range(1000): #Generate 1000 images
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_data.append(x_im) #image data
    Y_data.append(y_im) #Correct answer data
CPU times: user 1min 13s, sys: 865 ms, total: 1min 14s
Wall time: 1min 15s

Just in case, let's check the first 8 of the generated samples.

%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,10))
for i in range(16):
    ax = fig.add_subplot(4, 4, i+1)
    ax.axis('off')
    if i < 8: # Display the first 8 input images
        ax.set_title('input_{}'.format(i))
        ax.imshow(X_data[i],cmap=plt.cm.gray, interpolation='none')
    else: # Display the first 8 ground-truth images
        ax.set_title('answer_{}'.format(i - 8))
        ax.imshow(Y_data[i - 8],cmap=plt.cm.gray, interpolation='none')
plt.show()

output_7_0.png

Format the resulting data for PyTorch.

import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the image data and ground-truth data to ndarrays (keeping one channel of each RGB image)
X_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_data])
Y_a = np.array([[np.asarray(y).transpose((2, 0, 1))[0]] for y in Y_data])

# Convert the ndarrays to tensors
X_t = torch.tensor(X_a, dtype = torch.float32)               
Y_t = torch.tensor(Y_a, dtype = torch.float32)

# Store in a DataLoader for training with PyTorch
data_set = TensorDataset(X_t, Y_t)
data_loader = DataLoader(data_set, batch_size = 100, shuffle = True)
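
As a quick sanity check (my addition, not in the original article), you can confirm that the tensors have the expected shape: 1000 samples, 1 channel, 400×400 pixels.

print(X_t.shape)  # torch.Size([1000, 1, 400, 400])
print(Y_t.shape)  # torch.Size([1000, 1, 400, 400])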

Bear network that recognizes the silhouette of a bear

I used the same network as last time.

from torch import nn, optim
from torch.nn import functional as F
class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        #Encoder part
        self.encode1 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.encode2 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.encode3 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(32)
              ])

        #Decoder part
        self.decode3 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.decode2 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.decode1 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
              ])

    def forward(self, x):
        #Encoder part
        dim_0 = x.size() #For restoring the size in the first layer of the decoder
        x = F.relu(self.encode1(x))
        # Set return_indices=True so the decoder can reuse the max-pool indices (idx)
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_1 = x.size() #For restoring the size in the second layer of the decoder
        x = F.relu(self.encode2(x))
        # Set return_indices=True so the decoder can reuse the max-pool indices (idx)
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)            
        dim_2 = x.size() #For restoring the size in the third layer of the decoder
        x = F.relu(self.encode3(x))
        # Set return_indices=True so the decoder can reuse the max-pool indices (idx)
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        #Decoder part
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)           
        x = F.relu(self.decode2(x))                           
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)           
        x = F.relu(self.decode1(x))                           
        x = torch.sigmoid(x)                                     

        return x
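
Before training, it is worth sanity-checking the architecture with a dummy input (a minimal sketch, not in the original article): since every pooling step is undone by a matching unpooling step and all convolutions preserve spatial size, the output should have exactly the same shape as the input.

kuma = Kuma()
dummy = torch.zeros(1, 1, 400, 400)  # one fake grayscale image
print(kuma(dummy).shape)  # torch.Size([1, 1, 400, 400])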

Start learning

First, train for just 50 epochs.

%%time

kuma = Kuma()
loss_fn = nn.MSELoss()                               
optimizer = optim.Adam(kuma.parameters(), lr = 0.01)

total_loss_history = []                                     
epoch_time = 50
for epoch in range(epoch_time):
    !date
    total_loss = 0.0                          
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()       
        y_pred = kuma(XX)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("epoch:",epoch, " loss:", total_loss/(i + 1))
    total_loss_history.append(total_loss/(i + 1))

plt.plot(total_loss_history)
plt.ylabel("loss")
plt.xlabel("epoch time")
plt.savefig("total_loss_history")
plt.show()
Tue Feb 25 12:07:52 UTC 2020
epoch: 0  loss: 1202.7168701171875
Tue Feb 25 12:10:12 UTC 2020
epoch: 1  loss: 1200.6845336914062
Tue Feb 25 12:12:28 UTC 2020
...
Tue Feb 25 13:53:40 UTC 2020
epoch: 47  loss: 1199.1316650390625
Tue Feb 25 13:55:54 UTC 2020
epoch: 48  loss: 1199.1294555664062
Tue Feb 25 13:58:08 UTC 2020
epoch: 49  loss: 1199.133544921875

output_10_1.png

CPU times: user 1h 36min 47s, sys: 2min 23s, total: 1h 39min 11s
Wall time: 1h 52min 30s

The loss has flattened out, so it looks like training has converged.
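
One likely reason the loss value itself is so large is a scale mismatch (my reading of the code, not something the article discusses): the targets Y_t hold pixel values in the 0-255 range, while the sigmoid output lies in [0, 1]. Normalizing both tensors when they are created would make the loss values far more interpretable, for example:

X_t = torch.tensor(X_a, dtype = torch.float32) / 255  # hypothetical change: scale inputs to [0, 1]
Y_t = torch.tensor(Y_a, dtype = torch.float32) / 255  # hypothetical change: scale targets to [0, 1]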

Verification

Generate 100 new samples that were not used for training, and verify the model on them.

X_test = [] #Stores image data for testing
Y_test = [] #Stores correct answer data for testing
Z_test = [] #Store prediction results for testing

for i in range(100): #Generate 100 new data not used for learning
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_test.append(x_im)
    Y_test.append(y_im)

#Format test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

#Calculate predictions using a trained model
Y_pred = kuma(X_test_t)

#Store predicted values as ndarray
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())

Let's look at just the first 10 samples.

#Draw image data, correct answer data, and predicted values for the first 10 pieces of data
fig = plt.figure(figsize=(12,36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3)+1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()

output_13_0.png

There is a lot of noise, but the silhouettes seem to be predicted reasonably well.
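
One simple way to suppress this noise (my suggestion, not part of the original article) is to binarize each predicted mask and pass it through a median filter, for example with scipy.ndimage:

from scipy import ndimage

pred = Y_pred[0].detach().numpy()[0]  # predicted mask, values in [0, 1]
mask = (pred > 0.5).astype(np.uint8)  # binarize at 0.5
clean = ndimage.median_filter(mask, size=5)  # remove isolated speckles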

The size of the bear

Let's look at the relationship between the actual bear size and the detected (predicted) bear size.

A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # White area in the ground truth (divide by 3 because the RGB image has 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3) 
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum()) #Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()

output_14_0.png

The accuracy is not great, but there is a rough linear relationship.
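
To put a number on how rough that relationship is (a quick check, my addition), the correlation coefficient between observed and predicted sizes can be computed:

print(np.corrcoef(A_ans, A_pred)[0, 1])  # closer to 1.0 means a tighter linear relationship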

Save trained model

You can save the trained model and use it again at any time as follows.

torch.save(kuma.state_dict(), "kuma_050_20200225.pytorch")

An additional 50 epochs

I went ahead and trained for an additional 50 epochs (code omitted).

output_16_1.png

The error seems to have decreased a little (though it doesn't change much).

output_18_0.png

output_19_0.png

There seems to be no major improvement.

torch.save(kuma.state_dict(), "kuma_100_20200225.pytorch")

For the time being, I saved the trained model as above.

Detect multiple bears

Now it's finally time for the real thing. Until now, we trained on "images containing only one bear" and detected bears in "images containing only one bear". This time, still using the model trained on "images containing only one bear", I would like to detect bears in "images containing multiple bears".

Loading trained model

Load the trained model you saved earlier.

kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_100_20200225.pytorch"))
<All keys matched successfully>
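
One caveat (my addition, not in the article): the network contains BatchNorm layers, and the code below runs inference in the default training mode. Common practice is to switch to evaluation mode first, which may change the outputs slightly because BatchNorm then uses its running statistics:

kuma.eval()  # BatchNorm layers switch to their running statistics
# Forward passes can also be wrapped in torch.no_grad() to skip autograd bookkeeping:
# with torch.no_grad():
#     Y_pred = kuma(X_test_t)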

Images containing multiple bears

Generate 100 images containing multiple bears. Since the number of bears is drawn from 0 to 5, it also produces the occasional disappointing image with no bear at all.

X_test = [] #Stores image data for testing
Y_test = [] #Stores correct answer data for testing
Z_test = [] #Store prediction results for testing

for i in range(100): #Generate 100 new data not used for learning
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear(n_bear=random.randint(0, 5)))
    X_test.append(x_im)
    Y_test.append(y_im)

#Format test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

Start predicting

Since the model is already trained, prediction is nearly instantaneous.

#Calculate predictions using a trained model
Y_pred = kuma(X_test_t)

#Store predicted values as ndarray
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())

#Draw image data, correct answer data, and predicted values for the first 10 pieces of data
fig = plt.figure(figsize=(12,36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3)+1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()

output_10_0.png

It's quite noisy, but I was able to make a rough prediction.

Let's see the relationship between the actual size of the bear and the predicted size.

A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # White area in the ground truth (divide by 3 because the RGB image has 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3) 
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum()) #Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()

output_11_0.png

There is a fairly clean linear relationship, so if all you want to know is the total size, it seems you could apply a correction based on this relationship.
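
As a minimal sketch of such a correction (my addition, not in the article), a least-squares line fitted with np.polyfit maps the predicted sizes back onto the observed scale:

coef_a, coef_b = np.polyfit(A_pred, A_ans, 1)  # fit A_ans ≈ coef_a * A_pred + coef_b
A_corrected = [coef_a * p + coef_b for p in A_pred]  # calibrated size estimates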

Deeper network

I added another layer to create a deeper network.

import torch
from torch import nn, optim
from torch.nn import functional as F
class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        #Encoder part
        self.encode1 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.encode2 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.encode3 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(32)
              ])
        
        self.encode4 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 32, out_channels = 64, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(64)
              ])

        #Decoder part
        self.decode4 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 64, out_channels = 32, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(32)
              ])
        self.decode3 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.decode2 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.decode1 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
              ])

    def forward(self, x):
        #Encoder part
        dim_0 = x.size()      
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        dim_1 = x.size() 
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        dim_2 = x.size()
        x = F.relu(self.encode3(x)) 
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        dim_3 = x.size()
        x = F.relu(self.encode4(x)) 
        x, idx_4 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        #Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size = 2, stride = 2, output_size = dim_3)
        x = F.relu(self.decode4(x))

        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))

        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)           
        x = F.relu(self.decode2(x))

        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)           
        x = F.relu(self.decode1(x))

        x = torch.sigmoid(x)                                     

        return x
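
For reference (my addition, not in the article), the two architectures can be compared by their parameter counts:

kuma = Kuma()  # the deeper 4-stage version defined above
print(sum(p.numel() for p in kuma.parameters()))  # total number of trainable parameters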

The results are as follows. The training and evaluation code is the same as above (only the filename used when saving differs).

Learning curve

output_6_1.png

Prediction result example (1 bear)

output_8_0.png

Bear size

output_9_0.png

Example of prediction results (many bears)

output_13_0.png

Bear size

output_14_0.png

Just adding one layer makes the results much cleaner!