"Kumantic Segumantion" to get information about bears from images showing bears. This is a sequel to Kumantic Segmentation 2.
Last time, I aimed to "detect multiple </ b> of bear's footprints" by using "an image of multiple </ b> of bear's footprints" for learning. , Did not work).
This time, we aim to "detect multiple </ b> bear silhouettes" by using "an image showing only one </ b> bear silhouette" for learning.
The following code is used to create the datasets for training and validation.
import numpy as np
import random
from PIL import Image, ImageDraw, ImageFilter
from itertools import product

def draw_bear(n_bear=1):  # Randomly generate an image of a bear
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)
    for _ in range(random.randint(-1, 0)):  # note: randint(-1, 0) yields an empty range, so no background ellipses are drawn
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1 + dx, y1 + dy), fill=(r, g, b))
    for _ in range(n_bear):
        r = g = b = 1  # bear pixels are marked with value 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 90
        dx2 = 20
        dy1 = 90
        dy2 = 20
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)      # large ellipse (face/body)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)  # upper-left ellipse (ear)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)  # upper-right ellipse (ear)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)  # unused (drawing is commented out below)
        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)
        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2 = modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)
        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        #draw.ellipse(shape4, fill=(r, g, b))
    return im

def modify(shape, zoom=1, center_x=0, center_y=0):  # Scale and translate a bounding box
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)
class Noise:  # Add noise to the bear image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()
        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (0, 0, 0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image
## Process the bear image into training data for semantic segmentation
def getdata_for_semantic_segmentation(im):
    x_im = im.filter(ImageFilter.CONTOUR)  # input: contour-filtered image...
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()  # ...with salt-and-pepper noise added
    a_im = np.asarray(im)
    # label: pixels marked with value 1 (the bear) become white, everything else black
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im
With the functions above in place, you can obtain a bear image x_im and its ground-truth data y_im as follows.
x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
Check the contents
x_im
y_im
Alright, I confirmed that the image of Mr. Kuma was generated. No matter how you look at it, this is a bear.
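In a notebook, a cell ending in x_im renders the image inline; outside a notebook you can display the pair explicitly. A minimal sketch (my addition, not in the original):

import matplotlib.pyplot as plt

x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
fig, axes = plt.subplots(1, 2)
axes[0].imshow(x_im); axes[0].set_title('input')
axes[1].imshow(y_im); axes[1].set_title('answer')
for ax in axes:
    ax.axis('off')
plt.show()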
Generate 1,000 training samples as follows.
%%time
X_data = []  # For storing image data
Y_data = []  # For storing ground-truth data
for i in range(1000):  # Generate 1000 images
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_data.append(x_im)  # image data
    Y_data.append(y_im)  # ground-truth data
CPU times: user 1min 13s, sys: 865 ms, total: 1min 14s
Wall time: 1min 15s
Just in case, check only the first 8 of the generated samples.
%matplotlib inline
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 10))
for i in range(16):
    ax = fig.add_subplot(4, 4, i + 1)
    ax.axis('off')
    if i < 8:  # Display the first 8 input images
        ax.set_title('input_{}'.format(i))
        ax.imshow(X_data[i], cmap=plt.cm.gray, interpolation='none')
    else:  # Display the first 8 ground-truth images
        ax.set_title('answer_{}'.format(i - 8))
        ax.imshow(Y_data[i - 8], cmap=plt.cm.gray, interpolation='none')
plt.show()
Format the resulting data for PyTorch.
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert image data and ground-truth data to ndarrays
# (transpose to channel-first and keep only the first channel)
X_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_data])
Y_a = np.array([[np.asarray(y).transpose((2, 0, 1))[0]] for y in Y_data])

# Convert the ndarrays to tensors
X_t = torch.tensor(X_a, dtype=torch.float32)
Y_t = torch.tensor(Y_a, dtype=torch.float32)

# Store in a DataLoader for training with PyTorch
data_set = TensorDataset(X_t, Y_t)
data_loader = DataLoader(data_set, batch_size=100, shuffle=True)
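As a quick sanity check (my addition), you can peek at one batch; with 400x400 single-channel images and batch_size=100, both tensors should come out as [100, 1, 400, 400]:

XX, yy = next(iter(data_loader))
print(XX.shape, yy.shape)  # expected: torch.Size([100, 1, 400, 400]) for both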
I used the same network as last time.
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.encode2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.encode3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        # Decoder part
        self.decode3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=16, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.decode1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=6, out_channels=1, kernel_size=3, padding=1))

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()  # For restoring the size in the first layer of the decoder
        x = F.relu(self.encode1(x))
        # return_indices=True so the decoder can reuse the max-pool indices in max_unpool2d
        x, idx_1 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_1 = x.size()  # For restoring the size in the second layer of the decoder
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_2 = x.size()  # For restoring the size in the third layer of the decoder
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        # Decoder part
        x = F.max_unpool2d(x, idx_3, kernel_size=2, stride=2, output_size=dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size=2, stride=2, output_size=dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size=2, stride=2, output_size=dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
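A quick check (my addition) that the network maps an input back to the same spatial size — the three pool/unpool pairs should cancel exactly:

kuma = Kuma()
with torch.no_grad():
    out = kuma(torch.zeros(1, 1, 400, 400))
print(out.shape)  # expected: torch.Size([1, 1, 400, 400])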
First, train for just 50 epochs.
%%time
kuma = Kuma()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(kuma.parameters(), lr=0.01)

total_loss_history = []
epoch_time = 50
for epoch in range(epoch_time):
    !date
    total_loss = 0.0
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()
        y_pred = kuma(XX)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("epoch:", epoch, " loss:", total_loss / (i + 1))
    total_loss_history.append(total_loss / (i + 1))

plt.plot(total_loss_history)
plt.ylabel("loss")
plt.xlabel("epoch time")
plt.savefig("total_loss_history")
plt.show()
Tue Feb 25 12:07:52 UTC 2020
epoch: 0 loss: 1202.7168701171875
Tue Feb 25 12:10:12 UTC 2020
epoch: 1 loss: 1200.6845336914062
Tue Feb 25 12:12:28 UTC 2020
...
Tue Feb 25 13:53:40 UTC 2020
epoch: 47 loss: 1199.1316650390625
Tue Feb 25 13:55:54 UTC 2020
epoch: 48 loss: 1199.1294555664062
Tue Feb 25 13:58:08 UTC 2020
epoch: 49 loss: 1199.133544921875
CPU times: user 1h 36min 47s, sys: 2min 23s, total: 1h 39min 11s
Wall time: 1h 52min 30s
It looks like it has converged.
Generate 100 new samples that were not used for training, and validate on them.
X_test = []  # Stores image data for testing
Y_test = []  # Stores ground-truth data for testing
Z_test = []  # Stores prediction results for testing

for i in range(100):  # Generate 100 new samples not used for training
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_test.append(x_im)
    Y_test.append(y_im)

# Format the test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype=torch.float32)

# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store the predicted values as ndarrays
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
Only the first 10 samples are shown.
# Draw the image data, ground-truth data, and predictions for the first 10 samples
fig = plt.figure(figsize=(12, 36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3) + 1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255  # scale predictions in [0, 1] to [0, 255]
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()
There is a lot of noise, but the predictions capture the bears to some extent.
Let's look at the relationship between the actual bear size and the detected (predicted) bear size.
A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # Ground-truth white area (divide by 3 because there are 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3)
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum())  # Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()
The accuracy is not great, but there is a rough linear relationship.
You can save the trained model and use it again at any time as follows.
torch.save(kuma.state_dict(), "kuma_050_20200225.pytorch")
I then tried training for an additional 50 epochs (code omitted; a minimal sketch follows). The error may have decreased a little, but it barely changes; there seems to be no major improvement.
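The omitted code is presumably just a rerun of the training loop above; a minimal sketch (my addition), assuming the same kuma, optimizer, loss_fn, and data_loader are still in scope:

for epoch in range(50, 100):  # continue from epoch 50
    total_loss = 0.0
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()
        loss = loss_fn(kuma(XX), yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    total_loss_history.append(total_loss / (i + 1))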
torch.save(kuma.state_dict(), "kuma_100_20200225.pytorch")
For the time being, I saved the trained model as above.
Now it's finally time for the real test. Until now, we trained on images containing only one bear and detected bears in images containing only one bear. This time, I would like to use the model trained on images containing only one bear to detect bears in images containing multiple bears.
Load the trained model you saved earlier.
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_100_20200225.pytorch"))
<All keys matched successfully>
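One caveat (my addition, not in the original): the network contains BatchNorm layers, so before inference it is generally safer to switch to evaluation mode:

kuma.eval()  # use running statistics in BatchNorm instead of per-batch statistics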
Generate 100 images containing multiple bears. Since n_bear can be 0, some images, disappointingly, contain no bear at all.
X_test = []  # Stores image data for testing
Y_test = []  # Stores ground-truth data for testing
Z_test = []  # Stores prediction results for testing

for i in range(100):  # Generate 100 new samples not used for training
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear(n_bear=random.randint(0, 5)))
    X_test.append(x_im)
    Y_test.append(y_im)

# Format the test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype=torch.float32)
As it is a trained model, the prediction is instantaneous.
# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store the predicted values as ndarrays
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
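Since we only need predictions here, an equivalent and slightly more economical form (my addition) avoids building the autograd graph:

with torch.no_grad():
    Y_pred = kuma(X_test_t)
Z_test = [pred.numpy() for pred in Y_pred]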
# Draw the image data, ground-truth data, and predictions for the first 10 samples
fig = plt.figure(figsize=(12, 36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3) + 1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()
It's quite noisy, but I was able to make a rough prediction.
Let's see the relationship between the actual size of the bear and the predicted size.
A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # Ground-truth white area (divide by 3 because there are 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3)
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum())  # Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()
There is a fairly clean linear relationship, so if you only want to know the total bear size, it seems you could calibrate the prediction using this relationship.
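For example (my addition, a hypothetical calibration), a least-squares line through the scatter gives a correction for the predicted sizes:

# Fit A_pred ≈ slope * A_ans + intercept, then invert to calibrate predictions
slope, intercept = np.polyfit(A_ans, A_pred, 1)
A_calibrated = [(p - intercept) / slope for p in A_pred]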
I added another layer to create a deeper network.
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.encode2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.encode3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        self.encode4 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64))
        # Decoder part
        self.decode4 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        self.decode3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=16, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.decode1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=6, out_channels=1, kernel_size=3, padding=1))

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size=2, stride=2, output_size=dim_3)
        x = F.relu(self.decode4(x))
        x = F.max_unpool2d(x, idx_3, kernel_size=2, stride=2, output_size=dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size=2, stride=2, output_size=dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size=2, stride=2, output_size=dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
The results are as follows. The code is the same as above (only the file name used for saving differs).
[Figure: learning curve]
[Figure: example prediction results (one bear)]
[Figure: observed vs. predicted bear sizes (one bear)]
[Figure: example prediction results (many bears)]
[Figure: observed vs. predicted bear sizes (many bears)]
Just adding one layer makes the results considerably cleaner!