Kumantic segmentation 2

I am studying semantic segmentation, a deep learning technique that cuts the regions of interest out of an image. This time, I would like to cut out the footprints of bears. This is a sequel to "Bear ... not Semantic Segmentation".

The big difference from last time is that last time it was assumed that "there is only one target bear per image", whereas this time there are "three bear footprints per image".

Automatically generate bear footprint images

First and foremost, the author doesn't know what a real bear footprint looks like. The generated image is only a rough impression of one.

import random


def draw_footprints(*, n_footprints=3, n_distractors=100):
    """Randomly generate a 400x400 RGB image containing bear footprints.

    The canvas is filled with a near-white background, ``n_distractors``
    randomly coloured ellipses are scattered over it, and finally
    ``n_footprints`` footprints (one pad plus three toes, each an ellipse)
    are drawn on top at a random scale and position.

    Footprints are painted with the colour (1, 1, 1).  The background is
    (250, 250, 250) and every distractor channel is drawn from 10..200, so
    the channel value 1 occurs only on footprint pixels; the answer mask in
    ``getdata_for_semantic_segmentation`` is built by testing for that value.

    Args:
        n_footprints: Number of footprints to draw (default 3).
        n_distractors: Number of random background ellipses (default 100).

    Returns:
        The generated PIL image.
    """
    size = 400
    background = (250, 250, 250)
    footprint_colour = (1, 1, 1)

    im = Image.new('RGB', (size, size), background)
    draw = ImageDraw.Draw(im)

    # Background clutter: random colour, position and extent per ellipse.
    for _ in range(n_distractors):
        colour = (
            random.randint(10, 200),
            random.randint(10, 200),
            random.randint(10, 200),
        )
        x1 = random.randint(0, size)
        y1 = random.randint(0, size)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1 + dx, y1 + dy), fill=colour)

    # Footprint template laid out around (200, 200); it is the same for
    # every footprint, so it is built once outside the loop.
    center_x = 200
    center_y = 200
    wx, wy = 60, 50          # half-width / half-height of the pad
    dx1, dx2 = 60, 30        # horizontal extent of the outer toes
    dy1, dy2 = 90, 50        # vertical extent of the outer toes
    dx3 = 15                 # half-width of the middle toe
    dy3, dy4 = 100, 60       # vertical extent of the middle toe
    template = (
        (center_x - wx, center_y - wy, center_x + wx, center_y + wy),
        (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2),
        (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2),
        (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4),
    )

    for _ in range(n_footprints):
        zoom = 0.2 + random.random() * 0.4
        offset_x = random.randint(-30, 250)
        offset_y = random.randint(-30, 250)
        for shape in template:
            placed = modify(
                shape, zoom=zoom, center_x=offset_x, center_y=offset_y)
            draw.ellipse(placed, fill=footprint_colour)

    return im


def modify(shape, zoom=1, center_x=0, center_y=0):
    """Scale a bounding box about the origin, then translate it.

    Args:
        shape: ``(x1, y1, x2, y2)`` bounding box.
        zoom: Factor every coordinate is multiplied by.
        center_x: Offset added to both x coordinates after scaling.
        center_y: Offset added to both y coordinates after scaling.

    Returns:
        The transformed ``(x1, y1, x2, y2)`` tuple.
    """
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)

from PIL import Image, ImageDraw
from itertools import product


class Noise:
    """Add noise to a bear footprint image."""

    def __init__(self, input_image):
        """Keep the source image, its pixel-access object and its size."""
        self.input_image = input_image
        self.input_pix = input_image.load()
        self.w, self.h = input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        """Return a new RGB image with salt-and-pepper noise applied.

        One uniform random number in [0, 1) is drawn per pixel: below
        ``salt`` the pixel becomes white, above ``1 - pepper`` it becomes
        black, and otherwise the source pixel is copied unchanged.
        """
        white = (255, 255, 255)
        black = (0, 0, 0)
        pepper_floor = 1 - pepper

        noisy = Image.new("RGB", self.input_image.size)
        noisy_pix = noisy.load()
        for col, row in product(range(self.w), range(self.h)):
            roll = random.random()
            if roll < salt:
                value = white
            elif roll > pepper_floor:
                value = black
            else:
                value = self.input_pix[col, row]
            noisy_pix[col, row] = value
        return noisy

from PIL import ImageFilter
import numpy as np


def getdata_for_semantic_segmentation(im):
    """Process a bear footprint image into a teacher pair.

    The input image is the contour-filtered source with salt-and-pepper
    noise added.  The answer image is 255 wherever a channel of the source
    equals 1 and 0 everywhere else.

    Returns:
        ``(x_im, y_im)``: the input image and the answer image.
    """
    contours = im.filter(ImageFilter.CONTOUR)
    x_im = Noise(input_image=contours).saltpepper()

    is_footprint = np.asarray(im) == 1
    mask = np.where(is_footprint, 255, 0).astype(dtype='uint8')
    y_im = Image.fromarray(mask)
    return x_im, y_im

Bear footprint image generation example

# Generate one footprint image and derive its (input, answer) pair.
footprint_image = draw_footprints()
x_im, y_im = getdata_for_semantic_segmentation(footprint_image)

The input is the figure below (an image that includes bear footprints). Can you tell where the bear footprints are in it?

# Bare expression at the end of a notebook cell: shows the input image.
x_im

The answer is in the figure below. The goal is to train a model that can output this answer.

# Bare expression at the end of a notebook cell: shows the answer image.
y_im

Generation of teacher set

This time, 1000 images were automatically generated as a teacher set.

%%time X_data = [] #For storing image data Y_data = [] #For storing correct answer data for i in range(1000): #Generate 1000 images x_im, y_im = getdata_for_semantic_segmentation(draw_footprints()) X_data.append(x_im) #image data Y_data.append(y_im) #Correct answer data

CPU times: user 1min 20s, sys: 811 ms, total: 1min 21s Wall time: 1min 21s

Teacher set generation example

%matplotlib inline import matplotlib.pyplot as plt fig = plt.figure(figsize=(10,10)) for i in range(16): ax = fig.add_subplot(4, 4, i+1) ax.axis('off') if i < 8: #Display the top 8 of image data ax.set_title('input_{}'.format(i)) ax.imshow(X_data[i],cmap=plt.cm.gray, interpolation='none') else: #Display the top 8 correct answer data ax.set_title('answer_{}'.format(i - 8)) ax.imshow(Y_data[i - 8],cmap=plt.cm.gray, interpolation='none') plt.show()

Learning

Now, let's start learning. First of all, data conversion

import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert image data and correct answer data to ndarray.
# Only the first colour channel is kept, giving shape (N, 1, H, W).
X_a = np.array([[np.asarray(x)[..., 0]] for x in X_data])
Y_a = np.array([[np.asarray(y)[..., 0]] for y in Y_data])

# Convert ndarray image data and correct answer data to tensor.
X_t = torch.tensor(X_a, dtype=torch.float32)
# The answer images hold 0 / 255, but the network ends in a sigmoid whose
# output lies in (0, 1).  Left unscaled, the MSE against 255 can never
# approach zero, so the targets are rescaled to 0 / 1 here.  The inputs are
# deliberately left as they are so that the test-time conversion, which
# feeds unscaled images to the model, stays consistent with training.
Y_t = torch.tensor(Y_a, dtype=torch.float32) / 255.0

# Stored in data loader for learning with PyTorch.
data_set = TensorDataset(X_t, Y_t)
data_loader = DataLoader(data_set, batch_size=100, shuffle=True)

Definition of a class for learning Kumantic segmentation

from torch import nn, optim
from torch.nn import functional as F


class Kuma(nn.Module):
    """Encoder-decoder network for bear footprint segmentation.

    The encoder is three Conv2d + BatchNorm2d stages (1 -> 6 -> 16 -> 32
    channels), each followed by ReLU and a 2x2 max-pool that records the
    pooling indices.  The decoder mirrors it: each stage un-pools with the
    recorded indices and applies a ConvTranspose2d (32 -> 16 -> 6 -> 1
    channels).  The final layer is followed only by a sigmoid, so the
    output is a per-pixel value in (0, 1) with the same spatial size as
    the input.
    """

    def __init__(self):
        super().__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6,
                      kernel_size=3, padding=1),
            nn.BatchNorm2d(6),
        )
        self.encode2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16,
                      kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
        )
        self.encode3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32,
                      kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
        )
        # Decoder part
        self.decode3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=16,
                               kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
        )
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=16, out_channels=6,
                               kernel_size=3, padding=1),
            nn.BatchNorm2d(6),
        )
        self.decode1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=6, out_channels=1,
                               kernel_size=3, padding=1),
        )

    def forward(self, x):
        """Map a single-channel image batch to per-pixel values in (0, 1)."""
        # Encoder part.  The size before each pooling step is remembered so
        # the decoder can restore exactly that spatial size, and
        # return_indices=True keeps the max positions for max_unpool2d.
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size=2, stride=2,
                                return_indices=True)

        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size=2, stride=2,
                                return_indices=True)

        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size=2, stride=2,
                                return_indices=True)

        # Decoder part
        x = F.max_unpool2d(x, idx_3, kernel_size=2, stride=2,
                           output_size=dim_2)
        x = F.relu(self.decode3(x))

        x = F.max_unpool2d(x, idx_2, kernel_size=2, stride=2,
                           output_size=dim_1)
        x = F.relu(self.decode2(x))

        x = F.max_unpool2d(x, idx_1, kernel_size=2, stride=2,
                           output_size=dim_0)
        # No ReLU on the last layer: sigmoid(relu(z)) is never below 0.5,
        # which would make it impossible to predict "not a footprint".
        x = torch.sigmoid(self.decode1(x))
        return x

Start learning

%%time kuma = Kuma() loss_fn = nn.MSELoss() optimizer = optim.Adam(kuma.parameters(), lr = 0.01) total_loss_history = [] epoch_time = 50 for epoch in range(epoch_time): !date total_loss = 0.0 kuma.train() for i, (XX, yy) in enumerate(data_loader): optimizer.zero_grad() y_pred = kuma(XX) loss = loss_fn(y_pred, yy) loss.backward() optimizer.step() total_loss += loss.item() print("epoch:",epoch, " loss:", total_loss/(i + 1)) total_loss_history.append(total_loss/(i + 1)) plt.plot(total_loss_history) plt.ylabel("loss") plt.xlabel("epoch time") plt.savefig("total_loss_history") plt.show()

Mon Feb 3 12:23:20 UTC 2020 epoch: 0 loss: 2685.716845703125 Mon Feb 3 12:24:47 UTC 2020 epoch: 1 loss: 2681.9998046875 Mon Feb 3 12:26:13 UTC 2020 epoch: 2 loss: 2679.750439453125 Mon Feb 3 12:27:39 UTC 2020 epoch: 3 loss: 2678.707568359375 Mon Feb 3 12:29:05 UTC 2020 ... Mon Feb 3 13:39:11 UTC 2020 epoch: 47 loss: 2677.768359375 Mon Feb 3 13:40:49 UTC 2020 epoch: 48 loss: 2677.7637939453125 Mon Feb 3 13:42:29 UTC 2020 epoch: 49 loss: 2677.7629150390626

CPU times: user 2h 15min 56s, sys: 3min, total: 2h 18min 56s Wall time: 1h 20min 54s

Predicting on new data

Let's generate new data that was not used for training and make predictions.

# Generate 100 new samples that were not used for learning.
test_pairs = [
    getdata_for_semantic_segmentation(draw_footprints())
    for _ in range(100)
]
X_test = [image for image, _ in test_pairs]    # image data for testing
Y_test = [answer for _, answer in test_pairs]  # correct answer data for testing
Z_test = []                                    # prediction results for testing

Data shaping

# Format test image data for PyTorch (first colour channel only).
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype = torch.float32)

# Calculate predictions using the trained model.
# The training loop leaves the model in train() mode, where BatchNorm
# normalises with the statistics of the current batch.  Switch to eval()
# so the learned running statistics are used, and disable autograd because
# no gradients are needed for prediction.
kuma.eval()
with torch.no_grad():
    Y_pred = kuma(X_test_t)

# Store predicted values as ndarray.
Z_test.extend(pred.numpy() for pred in Y_pred)

Let's compare the correct answer and the prediction result for the first 10 images.

# Draw image data, correct answer data, and predicted values for the
# first 10 pieces of data: one row per sample, three panels per row.
fig = plt.figure(figsize=(12,36))
for row in range(10):
    base = row * 3

    ax = fig.add_subplot(10, 3, base + 1)
    ax.axis('off')
    ax.set_title('input_{}'.format(row))
    ax.imshow(X_test[row])

    ax = fig.add_subplot(10, 3, base + 2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(row))
    ax.imshow(Y_test[row])

    ax = fig.add_subplot(10, 3, base + 3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(row))
    # Scale the single-channel prediction by 255 and replicate it into
    # three identical channels so it can be shown as an RGB image.
    gray = Y_pred[row].detach().numpy()[0] * 255
    rgb = np.stack([gray, gray, gray], axis=-1).astype(dtype='uint8')
    ax.imshow(Image.fromarray(rgb))
plt.show()

Hmm. It's not working well. The footprints seem to be picked up to some extent, but there appear to be many false positives, where regions that are not footprints are judged to be footprints.

Let's compare the area of the correct answer with the area of the predicted value.

# Compare the white area of each correct answer with that of its prediction.
paired = list(zip(Y_test, Z_test))
# The answer image has 3 channels, so its count is divided by 3.
A_ans = [(np.asarray(answer) > 0.5).sum() / 3 for answer, _ in paired]
# Predicted white area
A_pred = [(np.asarray(predicted) > 0.5).sum() for _, predicted in paired]

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()

In "Bear ... not Semantic Segmentation" there was a linear relationship between the two, but this time no clear relationship has emerged.

To be continued.

Postscript

Subsequent increases in the number of data from 1000 to 4000 and the number of epochs from 50 to 100 did not change much. By the way, the learning curve did not go down easily from around 60 epochs.

Recommended Posts
Kumantic segmentation 2

Kumantic Segumantion