"Kumantic Segumantion" to get information about bears from images showing bears. This is a sequel to Kumantic Segmentation 2.
Last time, I aimed to "detect multiple </ b> of bear's footprints" by using "an image of multiple </ b> of bear's footprints" for learning. , Did not work).
This time, we aim to "detect multiple </ b> bear silhouettes" by using "an image showing only one </ b> bear silhouette" for learning.
The following code is used to create the datasets for training and validation.
import numpy as np
import random
from PIL import Image, ImageDraw, ImageFilter
from itertools import product

def draw_bear(n_bear=1):  # Randomly generate an image of a bear
    r = g = b = 250
    im = Image.new('RGB', (400, 400), (r, g, b))
    draw = ImageDraw.Draw(im)
    for _ in range(random.randint(-1, 0)):  # note: randint(-1, 0) yields an empty range, so no background ellipses are drawn
        r = random.randint(10, 200)
        g = random.randint(10, 200)
        b = random.randint(10, 200)
        x1 = random.randint(0, 400)
        y1 = random.randint(0, 400)
        dx = random.randint(10, 50)
        dy = random.randint(10, 50)
        draw.ellipse((x1, y1, x1 + dx, y1 + dy), fill=(r, g, b))
    for _ in range(n_bear):
        r = g = b = 1  # bear pixels are marked with value 1
        center_x = 200
        center_y = 200
        wx = 60
        wy = 50
        dx1 = 90
        dx2 = 20
        dy1 = 90
        dy2 = 20
        dx3 = 15
        dy3 = 100
        dy4 = 60
        shape1 = (center_x - wx, center_y - wy, center_x + wx, center_y + wy)      # large ellipse (face/body)
        shape2 = (center_x - dx1, center_y - dy1, center_x - dx2, center_y - dy2)  # upper-left ellipse (ear)
        shape3 = (center_x + dx2, center_y - dy1, center_x + dx1, center_y - dy2)  # upper-right ellipse (ear)
        shape4 = (center_x - dx3, center_y - dy3, center_x + dx3, center_y - dy4)  # unused (drawing is commented out below)
        zoom = 0.2 + random.random() * 0.4
        center_x = random.randint(-30, 250)
        center_y = random.randint(-30, 250)
        shape1 = modify(shape1, zoom=zoom, center_x=center_x, center_y=center_y)
        shape2 = modify(shape2, zoom=zoom, center_x=center_x, center_y=center_y)
        shape3 = modify(shape3, zoom=zoom, center_x=center_x, center_y=center_y)
        shape4 = modify(shape4, zoom=zoom, center_x=center_x, center_y=center_y)
        draw.ellipse(shape1, fill=(r, g, b))
        draw.ellipse(shape2, fill=(r, g, b))
        draw.ellipse(shape3, fill=(r, g, b))
        #draw.ellipse(shape4, fill=(r, g, b))
    return im

def modify(shape, zoom=1, center_x=0, center_y=0):  # Scale and translate a bounding box
    x1, y1, x2, y2 = np.array(shape) * zoom
    return (x1 + center_x, y1 + center_y, x2 + center_x, y2 + center_y)
class Noise:  # Add noise to the bear image
    def __init__(self, input_image):
        self.input_image = input_image
        self.input_pix = self.input_image.load()
        self.w, self.h = self.input_image.size

    def saltpepper(self, salt=0.05, pepper=0.05):
        output_image = Image.new("RGB", self.input_image.size)
        output_pix = output_image.load()
        for x, y in product(*map(range, (self.w, self.h))):
            r = random.random()
            if r < salt:
                output_pix[x, y] = (255, 255, 255)
            elif r > 1 - pepper:
                output_pix[x, y] = (0, 0, 0)
            else:
                output_pix[x, y] = self.input_pix[x, y]
        return output_image
## Process the bear image into training data for semantic segmentation
def getdata_for_semantic_segmentation(im):
    x_im = im.filter(ImageFilter.CONTOUR)  # input: contour-filtered image...
    im2 = Noise(input_image=x_im)
    x_im = im2.saltpepper()  # ...with salt-and-pepper noise added
    a_im = np.asarray(im)
    # label: pixels marked with value 1 (the bear) become white, everything else black
    y_im = Image.fromarray(np.where(a_im == 1, 255, 0).astype(dtype='uint8'))
    return x_im, y_im
With the functions above in place, you can obtain a bear image x_im and its ground-truth data y_im as follows.
x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
Check the contents
x_im
y_im
Alright, I confirmed that the image of Mr. Kuma was generated. No matter how you look at it, this is a bear.
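In a notebook, a cell ending in x_im renders the image inline; outside a notebook you can display the pair explicitly. A minimal sketch (my addition, not in the original):

import matplotlib.pyplot as plt

x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
fig, axes = plt.subplots(1, 2)
axes[0].imshow(x_im); axes[0].set_title('input')
axes[1].imshow(y_im); axes[1].set_title('answer')
for ax in axes:
    ax.axis('off')
plt.show()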
Generate 1,000 training samples as follows.
%%time
X_data = []  # For storing image data
Y_data = []  # For storing ground-truth data
for i in range(1000):  # Generate 1000 images
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_data.append(x_im)  # image data
    Y_data.append(y_im)  # ground-truth data
CPU times: user 1min 13s, sys: 865 ms, total: 1min 14s
Wall time: 1min 15s
Just in case, check only the first 8 of the generated samples.
%matplotlib inline
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(10, 10))
for i in range(16):
    ax = fig.add_subplot(4, 4, i + 1)
    ax.axis('off')
    if i < 8:  # Display the first 8 input images
        ax.set_title('input_{}'.format(i))
        ax.imshow(X_data[i], cmap=plt.cm.gray, interpolation='none')
    else:  # Display the first 8 ground-truth images
        ax.set_title('answer_{}'.format(i - 8))
        ax.imshow(Y_data[i - 8], cmap=plt.cm.gray, interpolation='none')
plt.show()
Format the resulting data for PyTorch.
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert image data and ground-truth data to ndarrays
# (transpose to channel-first and keep only the first channel)
X_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_data])
Y_a = np.array([[np.asarray(y).transpose((2, 0, 1))[0]] for y in Y_data])

# Convert the ndarrays to tensors
X_t = torch.tensor(X_a, dtype=torch.float32)
Y_t = torch.tensor(Y_a, dtype=torch.float32)

# Store in a DataLoader for training with PyTorch
data_set = TensorDataset(X_t, Y_t)
data_loader = DataLoader(data_set, batch_size=100, shuffle=True)
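As a quick sanity check (my addition), you can peek at one batch; with 400x400 single-channel images and batch_size=100, both tensors should come out as [100, 1, 400, 400]:

XX, yy = next(iter(data_loader))
print(XX.shape, yy.shape)  # expected: torch.Size([100, 1, 400, 400]) for both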
I used the same network as last time.
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.encode2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.encode3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        # Decoder part
        self.decode3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=16, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.decode1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=6, out_channels=1, kernel_size=3, padding=1))

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()  # For restoring the size in the first layer of the decoder
        x = F.relu(self.encode1(x))
        # return_indices=True so the decoder can reuse the max-pool indices in max_unpool2d
        x, idx_1 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_1 = x.size()  # For restoring the size in the second layer of the decoder
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_2 = x.size()  # For restoring the size in the third layer of the decoder
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        # Decoder part
        x = F.max_unpool2d(x, idx_3, kernel_size=2, stride=2, output_size=dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size=2, stride=2, output_size=dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size=2, stride=2, output_size=dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
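A quick check (my addition) that the network maps an input back to the same spatial size — the three pool/unpool pairs should cancel exactly:

kuma = Kuma()
with torch.no_grad():
    out = kuma(torch.zeros(1, 1, 400, 400))
print(out.shape)  # expected: torch.Size([1, 1, 400, 400])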
First, train for just 50 epochs.
%%time
kuma = Kuma()
loss_fn = nn.MSELoss()
optimizer = optim.Adam(kuma.parameters(), lr=0.01)

total_loss_history = []
epoch_time = 50
for epoch in range(epoch_time):
    !date
    total_loss = 0.0
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()
        y_pred = kuma(XX)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print("epoch:", epoch, " loss:", total_loss / (i + 1))
    total_loss_history.append(total_loss / (i + 1))

plt.plot(total_loss_history)
plt.ylabel("loss")
plt.xlabel("epoch time")
plt.savefig("total_loss_history")
plt.show()
Tue Feb 25 12:07:52 UTC 2020
epoch: 0 loss: 1202.7168701171875
Tue Feb 25 12:10:12 UTC 2020
epoch: 1 loss: 1200.6845336914062
Tue Feb 25 12:12:28 UTC 2020
...
Tue Feb 25 13:53:40 UTC 2020
epoch: 47 loss: 1199.1316650390625
Tue Feb 25 13:55:54 UTC 2020
epoch: 48 loss: 1199.1294555664062
Tue Feb 25 13:58:08 UTC 2020
epoch: 49 loss: 1199.133544921875
CPU times: user 1h 36min 47s, sys: 2min 23s, total: 1h 39min 11s
Wall time: 1h 52min 30s
It looks like it has converged.
Generate 100 new samples that were not used for training, and validate on them.
X_test = []  # Stores image data for testing
Y_test = []  # Stores ground-truth data for testing
Z_test = []  # Stores prediction results for testing

for i in range(100):  # Generate 100 new samples not used for training
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear())
    X_test.append(x_im)
    Y_test.append(y_im)

# Format the test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype=torch.float32)

# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store the predicted values as ndarrays
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
Only the first 10 samples are shown.
# Draw the image data, ground-truth data, and predictions for the first 10 samples
fig = plt.figure(figsize=(12, 36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3) + 1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255  # scale predictions in [0, 1] to [0, 255]
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()
There is a lot of noise, but the predictions capture the bears to some extent.
Let's look at the relationship between the actual bear size and the detected (predicted) bear size.
A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # Ground-truth white area (divide by 3 because there are 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3)
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum())  # Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()
The accuracy is not great, but there is a rough linear relationship.
You can save the trained model and use it again at any time as follows.
torch.save(kuma.state_dict(), "kuma_050_20200225.pytorch")
I then tried training for an additional 50 epochs (code omitted; a minimal sketch follows). The error may have decreased a little, but it barely changes; there seems to be no major improvement.
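The omitted code is presumably just a rerun of the training loop above; a minimal sketch (my addition), assuming the same kuma, optimizer, loss_fn, and data_loader are still in scope:

for epoch in range(50, 100):  # continue from epoch 50
    total_loss = 0.0
    kuma.train()
    for i, (XX, yy) in enumerate(data_loader):
        optimizer.zero_grad()
        loss = loss_fn(kuma(XX), yy)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    total_loss_history.append(total_loss / (i + 1))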
torch.save(kuma.state_dict(), "kuma_100_20200225.pytorch")
For the time being, I saved the trained model as above.
Now it's finally time for the real test. Until now, we trained on images containing only one bear and detected bears in images containing only one bear. This time, I would like to use the model trained on images containing only one bear to detect bears in images containing multiple bears.
Load the trained model you saved earlier.
kuma = Kuma()
kuma.load_state_dict(torch.load("kuma_100_20200225.pytorch"))
<All keys matched successfully>
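One caveat (my addition, not in the original): the network contains BatchNorm layers, so before inference it is generally safer to switch to evaluation mode:

kuma.eval()  # use running statistics in BatchNorm instead of per-batch statistics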
Generate 100 images containing multiple bears. Since n_bear can be 0, some images, disappointingly, contain no bear at all.
X_test = []  # Stores image data for testing
Y_test = []  # Stores ground-truth data for testing
Z_test = []  # Stores prediction results for testing

for i in range(100):  # Generate 100 new samples not used for training
    x_im, y_im = getdata_for_semantic_segmentation(draw_bear(n_bear=random.randint(0, 5)))
    X_test.append(x_im)
    Y_test.append(y_im)

# Format the test image data for PyTorch
X_test_a = np.array([[np.asarray(x).transpose((2, 0, 1))[0]] for x in X_test])
X_test_t = torch.tensor(X_test_a, dtype=torch.float32)
As it is a trained model, the prediction is instantaneous.
# Calculate predictions using the trained model
Y_pred = kuma(X_test_t)

# Store the predicted values as ndarrays
for pred in Y_pred:
    Z_test.append(pred.detach().numpy())
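Since we only need predictions here, an equivalent and slightly more economical form (my addition) avoids building the autograd graph:

with torch.no_grad():
    Y_pred = kuma(X_test_t)
Z_test = [pred.numpy() for pred in Y_pred]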
# Draw the image data, ground-truth data, and predictions for the first 10 samples
fig = plt.figure(figsize=(12, 36))
for i in range(10):
    ax = fig.add_subplot(10, 3, (i * 3) + 1)
    ax.axis('off')
    ax.set_title('input_{}'.format(i))
    ax.imshow(X_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 2)
    ax.axis('off')
    ax.set_title('answer_{}'.format(i))
    ax.imshow(Y_test[i])
    ax = fig.add_subplot(10, 3, (i * 3) + 3)
    ax.axis('off')
    ax.set_title('predicted_{}'.format(i))
    yp2 = Y_pred[i].detach().numpy()[0] * 255
    z_im = Image.fromarray(np.array([yp2, yp2, yp2]).transpose((1, 2, 0)).astype(dtype='uint8'))
    ax.imshow(z_im)
plt.show()
It's quite noisy, but I was able to make a rough prediction.
Let's see the relationship between the actual size of the bear and the predicted size.
A_ans = []
A_pred = []
for yt, zt in zip(Y_test, Z_test):
    # Ground-truth white area (divide by 3 because there are 3 channels)
    A_ans.append(np.where(np.asarray(yt) > 0.5, 1, 0).sum() / 3)
    A_pred.append(np.where(np.asarray(zt) > 0.5, 1, 0).sum())  # Predicted white area

plt.figure(figsize=(4, 4))
plt.scatter(A_ans, A_pred, alpha=0.5)
plt.grid()
plt.xlabel('Observed sizes')
plt.ylabel('Predicted sizes')
#plt.xlim([0, 1700])
#plt.ylim([0, 1700])
plt.show()
There is a fairly clean linear relationship, so if you only want to know the total bear size, it seems you could calibrate the prediction using this relationship.
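For example (my addition, a hypothetical calibration), a least-squares line through the scatter gives a correction for the predicted sizes:

# Fit A_pred ≈ slope * A_ans + intercept, then invert to calibrate predictions
slope, intercept = np.polyfit(A_ans, A_pred, 1)
A_calibrated = [(p - intercept) / slope for p in A_pred]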
I added another layer to create a deeper network.
import torch
from torch import nn, optim
from torch.nn import functional as F

class Kuma(nn.Module):
    def __init__(self):
        super(Kuma, self).__init__()
        # Encoder part
        self.encode1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.encode2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.encode3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        self.encode4 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64))
        # Decoder part
        self.decode4 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32))
        self.decode3 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16))
        self.decode2 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=16, out_channels=6, kernel_size=3, padding=1),
            nn.BatchNorm2d(6))
        self.decode1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=6, out_channels=1, kernel_size=3, padding=1))

    def forward(self, x):
        # Encoder part
        dim_0 = x.size()
        x = F.relu(self.encode1(x))
        x, idx_1 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_1 = x.size()
        x = F.relu(self.encode2(x))
        x, idx_2 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_2 = x.size()
        x = F.relu(self.encode3(x))
        x, idx_3 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        dim_3 = x.size()
        x = F.relu(self.encode4(x))
        x, idx_4 = F.max_pool2d(x, kernel_size=2, stride=2, return_indices=True)
        # Decoder part
        x = F.max_unpool2d(x, idx_4, kernel_size=2, stride=2, output_size=dim_3)
        x = F.relu(self.decode4(x))
        x = F.max_unpool2d(x, idx_3, kernel_size=2, stride=2, output_size=dim_2)
        x = F.relu(self.decode3(x))
        x = F.max_unpool2d(x, idx_2, kernel_size=2, stride=2, output_size=dim_1)
        x = F.relu(self.decode2(x))
        x = F.max_unpool2d(x, idx_1, kernel_size=2, stride=2, output_size=dim_0)
        x = F.relu(self.decode1(x))
        x = torch.sigmoid(x)
        return x
The results are as follows. The code is the same as above (only the file name used for saving differs).
[Figure: learning curve]
[Figure: example prediction results (one bear)]
[Figure: observed vs. predicted bear sizes (one bear)]
[Figure: example prediction results (many bears)]
[Figure: observed vs. predicted bear sizes (many bears)]
Just adding one layer makes the results considerably cleaner!