Kumantic segmentation 2

I am studying semantic segmentation, which cuts out the part of the image of interest from the deep learning method. This time, I would like to cut out the footprints of bears. This is a sequel to Bear ... not Semantic Segmentation.

The big difference from the last time was that last time it was decided that "the target bear is only one per image" </ b>, but this time "the bear's footprint is one image". It is "3 per" </ b>.

Automatically generate bear footprint images

First and foremost, the author doesn't know what a real bear footprint looks like. The image is an image.

import random
def draw_footprints(): #Randomly generate bear footprint images
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)

    for _ in range(100):
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1+dx, y1+dy), fill=(r, g, b))

    for _ in range(3):
        r = g = b = 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 60
        dx2 = 30
        dy1 = 90
        dy2 = 50
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)

        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)

        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2= modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)

        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        draw.ellipse(shape4, fill=(r, g, b))
        
    return im

def modify(shape, zoom=1, center_x=0, center_y=0):
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)
from PIL import Image, ImageDraw
from itertools import product

class Noise: #Add noise to the bear footprint image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()

        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (  0,   0,   0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image
from PIL import ImageFilter
import numpy as np

#Process bear footprint images into teacher data for semantic segmentation
def getdata_for_semantic_segmentation(im): 
    x_im = im.filter(ImageFilter.CONTOUR)
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()
    a_im = np.asarray(im)
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im

Bear footprint image generation example

x_im, y_im = getdata_for_semantic_segmentation(draw_footprints())

Enter the figure below (an image that includes bear footprints). Do you know where the bear footprints are in this?

x_im

output_5_0.png

The answer is in the figure below. We aim for learning that can output this answer.

y_im

output_6_0.png

Generation of teacher set

This time, 1000 images were automatically generated as a teacher set.

%%time
X_data = [] #For storing image data
Y_data = [] #For storing correct answer data
for i in range(1000): #Generate 1000 images
    x_im, y_im = getdata_for_semantic_segmentation(draw_footprints())
    X_data.append(x_im) #image data
    Y_data.append(y_im) #Correct answer data
CPU times: user 1min 20s, sys: 811 ms, total: 1min 21s
Wall time: 1min 21s

Teacher set generation example

%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10,10))
for i in range(16):
    ax = fig.add_subplot(4, 4, i+1)
    ax.axis('off')
    if i < 8: #Display the top 8 of image data
        ax.set_title('input_{}'.format(i))
        ax.imshow(X_data[i],cmap=plt.cm.gray, interpolation='none')
    else: #Display the top 8 correct answer data
        ax.set_title('answer_{}'.format(i - 8))
        ax.imshow(Y_data[i - 8],cmap=plt.cm.gray, interpolation='none')
plt.show()

output_8_0.png

Learning

Now, let's start learning. First of all, data conversion

import torch
from torch.utils.data import TensorDataset, DataLoader

#Convert image data and correct answer data to ndarray
X_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_data])
Y_a = np.array([[np.asarray(y).transpose((2, 0, 1))[0]] for y in Y_data])

#Convert ndarray image data and correct answer data to tensor
X_t = torch.tensor(X_a, dtype = torch.float32)               
Y_t = torch.tensor(Y_a, dtype = torch.float32)

#Stored in data loader for learning with PyTorch
data_set = TensorDataset(X_t, Y_t)
data_loader = DataLoader(data_set, batch_size = 100, shuffle = True)

Definition of a class for learning Kumantic segmentation

from torch import nn, optim
from torch.nn import functional as F
class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        #Encoder part
        self.encode1 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 1, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.encode2 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 6, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.encode3 = nn.Sequential(
            *[
              nn.Conv2d(
                  in_channels = 16, out_channels = 32, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(32)
              ])

        #Decoder part
        self.decode3 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 32, out_channels = 16, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(16)
              ])
        self.decode2 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 16, out_channels = 6, kernel_size = 3, padding = 1),
              nn.BatchNorm2d(6)
              ])
        self.decode1 = nn.Sequential(
            *[
              nn.ConvTranspose2d(
                  in_channels = 6, out_channels = 1, kernel_size = 3, padding = 1),
              ])

    def forward(self, x):
        #Encoder part
        dim_0 = x.size() #For restoring the size in the first layer of the decoder
        x = F.relu(self.encode1(x))
        # return_indices =Set to True and max in the decoder_Use pool position idx
        x, idx_1 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)
        dim_1 = x.size() #For restoring the size in the second layer of the decoder
        x = F.relu(self.encode2(x))
        # return_indices =Set to True and max in the decoder_Use pool position idx
        x, idx_2 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)            
        dim_2 = x.size()
        x = F.relu(self.encode3(x)) #For restoring the size in the third layer of the decoder
        # return_indices =Set to True and max in the decoder_Use pool position idx
        x, idx_3 = F.max_pool2d(x, kernel_size = 2, stride = 2, return_indices = True)

        #Decoder part
        x = F.max_unpool2d(x, idx_3, kernel_size = 2, stride = 2, output_size = dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size = 2, stride = 2, output_size = dim_1)           
        x = F.relu(self.decode2(x))                           
        x = F.max_unpool2d(x, idx_1, kernel_size = 2, stride = 2, output_size = dim_0)           
        x = F.relu(self.decode1(x))                           
        x = torch.sigmoid(x)                                     

        return x

Start learning

%%time

kuma = Kuma()
loss_fn = nn.MSELoss()                               
optimizer = optim.Adam(kuma.parameters(), lr = 0.01)

total_loss_history = []                                     
epoch_time = 50
for epoch in range(epoch_time):
    !date
    total_loss = 0.0                          
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()       
        y_pred = kuma(XX)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("epoch:",epoch, " loss:", total_loss/(i + 1))
    total_loss_history.append(total_loss/(i + 1))

plt.plot(total_loss_history)
plt.ylabel("loss")
plt.xlabel("epoch time")
plt.savefig("total_loss_history")
plt.show()
Mon Feb  3 12:23:20 UTC 2020
epoch: 0  loss: 2685.716845703125
Mon Feb  3 12:24:47 UTC 2020
epoch: 1  loss: 2681.9998046875
Mon Feb  3 12:26:13 UTC 2020
epoch: 2  loss: 2679.750439453125
Mon Feb  3 12:27:39 UTC 2020
epoch: 3  loss: 2678.707568359375
Mon Feb  3 12:29:05 UTC 2020
...
Mon Feb  3 13:39:11 UTC 2020
epoch: 47  loss: 2677.768359375
Mon Feb  3 13:40:49 UTC 2020
epoch: 48  loss: 2677.7637939453125
Mon Feb  3 13:42:29 UTC 2020
epoch: 49  loss: 2677.7629150390626

output_11_1.png

CPU times: user 2h 15min 56s, sys: 3min, total: 2h 18min 56s
Wall time: 1h 20min 54s

Forecast new data

Let's generate new data that was not used for training and make predictions.

X_test = [] #Stores image data for testing
Y_test = [] #Stores correct answer data for testing
Z_test = [] #Store prediction results for testing

for i in range(100): #Generate 100 new data not used for learning
    x_im, y_im = getdata_for_semantic_segmentation(draw_footprints())
    X_test.append(x_im)
    Y_test.append(y_im)

Data shaping

#Format test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

#Calculate predictions using a trained model
Y_pred = kuma(X_test_t)

#Store predicted values as ndarray
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())

Let's compare the correct answer and the prediction result for the first 10 images.

#Draw image data, correct answer data, and predicted values for the first 10 pieces of data
fig = plt.figure(figsize=(12,36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3)+1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3)+3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()

output_14_0.png

There is. It's not working. It seems that footprints have been taken somehow, but it seems that there are many false positives that judge non-footprints as footprints.

Let's compare the area of the correct answer with the area of the predicted value.

A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    #Correct white area (divide by 3 because there are 3 vectors)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3) 
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum()) #Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()

output_15_0.png

In Bear ... not Semantic Segmentation, there was a linear relationship, but this time it seems that a good relationship has not come out.

The continuation is coming again.

Postscript

Subsequent increases in the number of data from 1000 to 4000 and the number of epochs from 50 to 100 did not change much. By the way, the learning curve did not go down easily from around 60 epochs.

Recommended Posts