MediaPipe can be installed as a Python package, so I played with it. Most of what follows is based on the references below. What took some effort was classifying rock-paper-scissors (goo, choki, par) with an MLP in pytorch-lightning by imitating Reference ②; in other words, the input is ordinary numeric data rather than an image, so this was my first chance to build my own dataset.
【reference】
① ML solutions in MediaPipe
② Python package version of MediaPipe is super easy + finger gesture estimation with simple MLP
Installation was very easy with the command below. On a Raspberry Pi 4, however, I got an error saying the version is not supported, so it did not install there.
pip install mediapipe
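To confirm that the install worked, a quick check can be run (my addition; as far as I know the pip wheels expose a __version__ attribute):

import mediapipe as mp
print(mp.__version__)  # prints the installed version, e.g. 0.8.x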
What I will do:
- Try running everything??
- Hands in detail
- Data accumulation
- Data analysis (scatter plot, cos similarity)
- Try plotting with matplotlib
- Own dataset and DataLoader
- Network and training
- Draw while predicting
The page above lists 15 ML solutions, of which the following four can be used from Python: Face Mesh, Hands, Pose, and Holistic. I tried running all of them. It may look tedious, but the code has the same structure in every case, so it is easy to follow. The working code for each solution is arranged below.
import cv2
import mediapipe as mp

mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh

# For webcam input:
face_mesh = mp_face_mesh.FaceMesh(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = face_mesh.process(image)
    # Draw the face mesh annotations on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    if results.multi_face_landmarks:
        for face_landmarks in results.multi_face_landmarks:
            mp_drawing.draw_landmarks(
                image=image,
                landmark_list=face_landmarks,
                connections=mp_face_mesh.FACE_CONNECTIONS,
                landmark_drawing_spec=drawing_spec,
                connection_drawing_spec=drawing_spec)
    cv2.imshow('MediaPipe FaceMesh', image)
    if cv2.waitKey(5) & 0xFF == 27:
        break
face_mesh.close()
cap.release()
import mediapipe as mp
import cv2

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = hands.process(image)
    # Draw the hand annotations on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image_width, image_height = image.shape[1], image.shape[0]
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            #print('Handedness:', results.multi_handedness)
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    cv2.imshow('MediaPipe Hands', image)
    if cv2.waitKey(5) & 0xFF == 27:
        break
hands.close()
cap.release()
import mediapipe as mp
import cv2

mp_drawing = mp.solutions.drawing_utils
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = pose.process(image)
    # Draw the pose annotation on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    mp_drawing.draw_landmarks(
        image, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
    cv2.imshow('MediaPipe Pose', image)
    if cv2.waitKey(5) & 0xFF == 27:
        break
pose.close()
cap.release()
import cv2
import mediapipe as mp

mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

# For webcam input:
holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = holistic.process(image)
    # Draw landmark annotation on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    mp_drawing.draw_landmarks(
        image, results.face_landmarks, mp_holistic.FACE_CONNECTIONS)
    mp_drawing.draw_landmarks(
        image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(
        image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    mp_drawing.draw_landmarks(
        image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    cv2.imshow('MediaPipe Holistic', image)
    if cv2.waitKey(5) & 0xFF == 27:
        break
holistic.close()
cap.release()
In other words, each program above is just the camera-input code below with MediaPipe's ML analysis inserted into the loop. Note that this code shows a mirror image, that is, the view you would see in an ordinary mirror.
import cv2

cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    #cv2.imshow('Camera', image) # Camera view
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    cv2.imshow('Camera_mirror', image) # Mirror view
    if cv2.waitKey(5) & 0xFF == 27:
        break
cap.release()
import mediapipe as mp
import cv2

mp_drawing = mp.solutions.drawing_utils
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    min_detection_confidence=0.5, min_tracking_confidence=0.5)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)

image_blank = cv2.imread('blank.jpg') # Use a blank sheet as the backing canvas
sk = 0
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    image_blank = cv2.imread('blank.jpg') # Reload a fresh blank canvas every frame
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    cv2.imshow('Camera', image)
    image.flags.writeable = False
    results_face = face_mesh.process(image)
    results_pose = pose.process(image)
    results_hands = hands.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    if results_face.multi_face_landmarks:
        for face_landmarks in results_face.multi_face_landmarks:
            mp_drawing.draw_landmarks(
                image=image_blank,
                landmark_list=face_landmarks,
                connections=mp_face_mesh.FACE_CONNECTIONS,
                landmark_drawing_spec=drawing_spec,
                connection_drawing_spec=drawing_spec)
        cv2.imshow('MediaPipe FaceMesh', image_blank)
        cv2.imwrite('./image/blank/face/image'+ str(sk) + '.png', image_blank)
    if results_hands.multi_hand_landmarks:
        for hand_landmarks in results_hands.multi_hand_landmarks:
            #print('Handedness:', results.multi_handedness)
            mp_drawing.draw_landmarks(image_blank, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        cv2.imshow('MediaPipe Hands', image_blank)
        cv2.imwrite('./image/blank/facehands/image'+ str(sk) + '.png', image_blank)
    mp_drawing.draw_landmarks(
        image_blank, results_pose.pose_landmarks, mp_pose.POSE_CONNECTIONS)
    cv2.imshow('MediaPipe Pose', image_blank)
    sk += 1
    if cv2.waitKey(5) & 0xFF == 27:
        break
face_mesh.close()
hands.close()
pose.close()
cap.release()
An expressive face image was obtained; it is vivid enough that you can almost hear the voice as well as read the expression. (face video / face + hands video)
I have not gone through the implementation itself, but for explanation: Face_mesh consists of a FACE DETECTION MODEL plus a FACE LANDMARK MODEL, and as the images above show, I find it very accurate and fast. Hands is detected and drawn by the same approach, and the obtainable landmarks are as shown in the figure from Reference ①. The landmarks and connections obtained in results_hands are superimposed and drawn on image_blank with the following code.
results_hands = hands.process(image)

if results_hands.multi_hand_landmarks:
    for hand_landmarks in results_hands.multi_hand_landmarks:
        #print('Handedness:', results.multi_handedness)
        mp_drawing.draw_landmarks(image_blank, hand_landmarks, mp_hands.HAND_CONNECTIONS)
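For reference, the 21 hand-landmark indices and their names can be listed from MediaPipe's HandLandmark enum (a small sketch I added, not part of the original script):

import mediapipe as mp

# 0 = WRIST, 4 = THUMB_TIP, 8 = INDEX_FINGER_TIP, ..., 20 = PINKY_TIP
for lm in mp.solutions.hands.HandLandmark:
    print(lm.value, lm.name)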
The animation is assembled into a GIF with the following code. Frames are missing wherever the hands were not visible, because nothing was saved for those frames. The exception handling below therefore skips over the missing image files; since there are many dropped frames, the outer loop restarts up to s0 times from the index at which reading was interrupted.
import numpy as np
import PIL.Image

s = 583
images = []
s1 = 1
s0 = 100
sk = 0
for j in range(0, s0, 1):
    try:
        for i in range(s1, s, 2):
            im = PIL.Image.open('./image/blank/facehands/image'+str(i)+'.png')
            im = im.resize(size=(640, 480), resample=PIL.Image.NEAREST)
            images.append(im)
    except Exception as e:
        s1 = i+1
        sk += 1
        print(sk, e)
print("finish", s)
images[0].save('./image/blank/facehands.gif', save_all=True, append_images=images[1:s], duration=100*1, loop=0)
Next comes data accumulation: inside the webcam loop, each detected hand's 21 landmarks are appended to a CSV file.

with open('./hands/sample_hands.csv', 'a', newline='') as f:
    landmark_point = []
    writer = csv.writer(f)
    if results.multi_hand_landmarks:
        idx += 1
        print('Handedness:', results.multi_handedness)
        for hand_landmarks in results.multi_hand_landmarks:
            for index, landmark in enumerate(hand_landmarks.landmark):
                landmark_x = min(int(landmark.x * image_width), image_width - 1)
                landmark_y = min(int(landmark.y * image_height), image_height - 1)
                landmark_ = landmark_x, landmark_y
                landmark_point.append(landmark_x)
                landmark_point.append(landmark_y)
            print(landmark_point)
            writer.writerow(np.array(landmark_point))
Step by step: first open the CSV file for appending and create a writer.

with open('./hands/sample_hands8.csv', 'a', newline='') as f:
    landmark_point = []
    writer = csv.writer(f)
Next, if landmarks have been detected, convert the normalized positions to integer pixel coordinates and collect them into one landmark_point set.
Once one set (1, 21, 2) of landmark_point is complete, write it out with writer.writerow.
    if results.multi_hand_landmarks:
        idx += 1
        #print('Handedness:', results.multi_handedness)
        for hand_landmarks in results.multi_hand_landmarks:
            for index, landmark in enumerate(hand_landmarks.landmark):
                landmark_x = min(int(landmark.x * image_width), image_width - 1)
                landmark_y = min(int(landmark.y * image_height), image_height - 1)
                landmark_ = landmark_x, landmark_y
                landmark_point.append(landmark_x)
                landmark_point.append(landmark_y)
            writer.writerow(np.array(landmark_point))
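Each written row is just 42 integers (x0, y0, x1, y1, ...), so a row maps straight back to a (21, 2) array. A minimal sketch, assuming the CSV above already contains at least one row:

import numpy as np

row = np.loadtxt('./hands/sample_hands8.csv', delimiter=',', max_rows=1)
points = row.reshape(21, 2)  # points[i] = (x, y) of landmark i
print(points.shape)          # (21, 2)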
The rest just shows the annotated image with cv2.imshow and saves it to a file. Whether to keep the mirrored view or not is, I think, a matter of taste.
import mediapipe as mp
from PIL import Image
import cv2
import csv
import numpy as np

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)
idx = 0
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = hands.process(image)
    # Draw the hand annotations on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image_width, image_height = image.shape[1], image.shape[0]
    with open('./hands/sample_hands6.csv', 'a', newline='') as f:
        list_landmarks = []
        landmark_point = []
        writer = csv.writer(f)
        if results.multi_hand_landmarks:
            idx += 1
            print('Handedness:', results.multi_handedness)
            for hand_landmarks in results.multi_hand_landmarks:
                for index, landmark in enumerate(hand_landmarks.landmark):
                    landmark_x = min(int(landmark.x * image_width), image_width - 1)
                    landmark_y = min(int(landmark.y * image_height), image_height - 1)
                    landmark_ = landmark_x, landmark_y #[idx, index, np.array((landmark_x, landmark_y))]
                    landmark_point.append(landmark_x)
                    landmark_point.append(landmark_y)
                print(landmark_point)
                writer.writerow(np.array(landmark_point))
                mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
    cv2.imshow('MediaPipe Hands', image)
    #cv2.imwrite('./image/annotated_image' + str(idx) + '.png', cv2.flip(image, 1))
    cv2.imwrite('./image/annotated_image' + str(idx) + '.png', image)
    if cv2.waitKey(5) & 0xFF == 27:
        break
hands.close()
cap.release()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

df = pd.read_csv('./hands/sample_hands9.csv', sep=',')
#print(df.head(3)) # Check the data
df = df.astype(int)
print(df.iloc[0, :])
# The loop below shifts each row so that landmark 0 (df.iloc[i,0], df.iloc[i,1]) is treated as the origin
for i in range(1, len(df), 1):
    for j in range(0, 21, 2):
        df.iloc[i, 2*j+1] = df.iloc[i, 2*j+1] - df.iloc[i, 1]
        df.iloc[i, 2*j] = df.iloc[i, 2*j] - df.iloc[i, 0]

cs_sim = []
for i in range(1, len(df), 1):
    cs = cos_sim(df.iloc[30, :], df.iloc[i, :])
    #print(df.iloc[i, :] - df.iloc[i, 0])
    print('cos similarity: {}-{}'.format(30, i), cs)
    cs_sim.append(cs)

plt.figure(figsize=(12, 6))
plt.plot(cs_sim)
plt.ylim(0.9,)
plt.savefig('./hands/cos_sim_hands_plot9.png')
plt.show()
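As a quick sanity check of the cos_sim helper (my addition, not in the original script): vectors pointing the same way give 1, orthogonal vectors give 0.

print(cos_sim(np.array([1, 0]), np.array([1, 0])))  # 1.0
print(cos_sim(np.array([1, 0]), np.array([0, 1])))  # 0.0
print(cos_sim(np.array([1, 1]), np.array([1, 0])))  # about 0.707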
import pandas as pd
import matplotlib.pyplot as plt
from pandas import plotting
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

df = pd.read_csv('./hands/sample_hands9.csv', sep=',')
print(df.head(3))
df = df.astype(int)

plotting.scatter_matrix(df[df.columns[1:11]], figsize=(6, 6), alpha=0.8, diagonal='kde')
plt.savefig('./hands/scatter_plot0-10.png')
plt.pause(5)
plt.close()

# Split into three clusters (the random seed is fixed to 10)
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(df.iloc[:, :])
# Get the labels of the clustering result
labels = kmeans_model.labels_
# Check the clustering result
print(len(labels), labels)

# Choose a color for each cluster
color_codes = {0:'#00FF00', 1:'#FF0000', 2:'#0000FF'} #, 3:'#FF00FF', 4:'#00FFFF', 5:'#FFFF00', 6:'#000000'}
# Assign a color to each sample
colors = [color_codes[x] for x in labels]

# Draw a color-coded scatter matrix
plotting.scatter_matrix(df[df.columns[1:11]], figsize=(6, 6), c=colors, diagonal='kde', alpha=0.8) # Data plot
plt.savefig('./hands/scatter_color_plot0-10.png')
plt.pause(1)
plt.close()

# Perform principal component analysis
pca = PCA()
pca.fit(df.iloc[:, :])
PCA(copy=True, n_components=None, whiten=False)
# Map the data onto the principal-component space (dimensionality reduction)
feature = pca.transform(df.iloc[:, :])

# Plot the first and second principal components
plt.figure(figsize=(6, 6))
for x, y, name in zip(feature[:, 0], feature[:, 1], df.iloc[:, 0]):
    plt.text(x, y, name, alpha=0.8, size=10)
plt.scatter(feature[:, 0], feature[:, 1], alpha=0.8, color=colors[:])
plt.title("Principal Component Analysis")
plt.xlabel("The first principal component score")
plt.ylabel("The second principal component score")
plt.savefig('./hands/PCA_hands_plot.png')
plt.pause(1)
plt.close()
plotting.scatter_matrix(df[df.columns[0:10]], figsize=(6, 6), c=colors, diagonal='kde', alpha=0.8)

Result: the scatter matrix above is drawn in three colors; you can see that the correlation patterns differ up to about landmark 5 and that the clustering separates them.
Next, PCA (principal component analysis) is performed, the data are projected onto two dimensions, and then the clusters are drawn. The result splits neatly into three groups, as shown below. The points lying near the boundary regions of each cluster (14, 15, 64, 65, 88, 89, 132, 133, and so on) correspond to the transition regions of the cos-similarity graph above. As for the colors: blue is goo, green is choki, red is par.

plt.scatter(feature[:, 0], feature[:, 1], alpha=0.8, color=colors)
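To check that a two-dimensional projection is reasonable, the share of variance carried by the first two components can be printed (a small addition, using the pca object fitted above):

ratio = pca.explained_variance_ratio_
print(ratio[:2], ratio[:2].sum())  # variance explained by PC1 and PC2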
The k-means classification of the 139 samples gives the following labels.
139
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2]
In this way, goo/choki/par can be classified without supervision.
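The number of samples per cluster can be counted directly from the labels (a small addition, assuming the labels array from the k-means fit above):

import numpy as np
print(np.bincount(labels))  # samples assigned to cluster 0, 1, 2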
import pandas as pd
import matplotlib.pyplot as plt
from pandas import plotting
from sklearn.cluster import KMeans
import csv
import numpy as np

df = pd.read_csv('./hands/sample_hands9.csv', sep=',')
# Split into three clusters (the random seed is fixed to 10)
kmeans_model = KMeans(n_clusters=3, random_state=10).fit(df.iloc[:, :])
# Get the labels of the clustering result
labels = kmeans_model.labels_
# Check the clustering result
print(len(labels), labels)

df['42'] = labels
header = ['name', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
with open('./hands/sample_hands9_.csv', 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for i in range(len(df)):
        writer.writerow(np.array(df.iloc[i, :]))

# Verification below
df_ = pd.read_csv('./hands/sample_hands9_.csv', sep=',')
print(df_.head(3))
print(df_['42'].astype(int))
plotting.scatter_matrix(df_[df_.columns[1:11]], figsize=(6, 6), alpha=0.8, diagonal='kde')
plt.savefig('./hands/scatter_plot_0-10.png')
plt.pause(5)
plt.close()
import cv2
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('./hands/sample_hands9_.csv', sep=',')
#print(df.head(3))
data_num = len(df)
#print(data_num)
df = df.astype(int)

x = []
for j in range(data_num):
    x_ = []
    for i in range(0, 21, 1):
        x__ = [df['{}'.format(2*i)][j], df['{}'.format(2*i+1)][j]]
        x_.append(x__)
    x.append(x_)
y = df['42']
x = np.array(x)
y = np.array(y)
print(x.shape, y.shape)

fig = plt.figure()
ax = plt.axes()
while 1:
    for j in range(0, data_num):
        for i in range(20):
            plt.plot(x[j][i][0], x[j][i][1], color='black', marker='o')
        plt.text(600, -120, y[j], size=50)
        plt.xlim(700, 0)
        plt.ylim(600, -200)
        plt.title(j)
        plt.pause(0.1)
        plt.savefig('./hands/draw/data_plot{}.png'.format(j))
        plt.clf()
    if cv2.waitKey(5) & 0xFF == 27:
        break
So I ended up with the following code. It is almost the same as Reference ⑤; however, the style of the snippet below is borrowed from the references here. How you return the data and the label data is the key to using a DataLoader.
【reference】
・Let's play with Pytorch [from data shaping to an FNN]
・Input numpy ndarray instead of images in a CNN
# Specifying float() and long() below is the key point this time
self.data = torch.from_numpy(np.array(x)).float()
self.label = torch.from_numpy(np.array(y)).long()
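The reason is that F.nll_loss expects float log-probabilities and int64 (long) class indices. A minimal check of that contract (my addition, with made-up shapes):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 3).log_softmax(dim=1)  # float log-probabilities: batch of 4, 3 classes
target = torch.tensor([0, 2, 1, 0]).long()     # int64 class indices
print(F.nll_loss(logits, target))              # scalar loss tensor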
The Dataset reuses the CSV-reading code shown above almost as is. transform is not explicitly defined this time.
class HandsDataset(torch.utils.data.Dataset):
    def __init__(self, data_num, transform=None):
        self.transform = transform
        self.data_num = data_num
        self.data = []
        self.label = []
        df = pd.read_csv('./hands/sample_hands7.csv', sep=',')
        print(df.head(3)) # Check the data
        df = df.astype(int)
        x = []
        for j in range(self.data_num):
            x_ = []
            for i in range(0, 21, 1):
                x__ = [df['{}'.format(2*i)][j], df['{}'.format(2*i+1)][j]]
                x_.append(x__)
            x.append(x_)
        y = df['42'][:self.data_num]
        # Specifying float() and long() below is the key point this time
        self.data = torch.from_numpy(np.array(x)).float()
        print(self.data)
        self.label = torch.from_numpy(np.array(y)).long()
        print(self.label)

    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        out_data = self.data[idx]
        out_label = self.label[idx]
        if self.transform:
            out_data = self.transform(out_data)
        return out_data, out_label
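A minimal usage sketch of this Dataset (my addition; it assumes sample_hands7.csv exists with at least 100 rows and that the imports from the full script, i.e. torch, pandas and numpy, are in place):

from torch.utils.data import DataLoader

dataset = HandsDataset(data_num=100, transform=None)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
x, y = next(iter(loader))
print(x.shape, y.shape)  # torch.Size([32, 21, 2]) torch.Size([32])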
class LitHands(pl.LightningModule):
    def __init__(self, hidden_size=10, learning_rate=2e-4):
        super().__init__()
        ...
        # Hardcode some dataset specific attributes
        self.num_classes = 3
        self.dims = (1, 21, 2)
        channels, width, height = self.dims
        ...

    def forward(self, x):
        ...
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        ...
        return loss

    def validation_step(self, batch, batch_idx):
        ...
        return loss

    def test_step(self, batch, batch_idx):
        ...

    def configure_optimizers(self):
        ...

    def setup(self, stage=None):
        data_num = 1350 #292
        self.dataset = HandsDataset(data_num, transform=None)
        n_train = int(len(self.dataset)*0.5)
        n_val = int(len(self.dataset)*0.3)
        n_test = len(self.dataset) - n_train - n_val
        print("n_train, n_val, n_test ", n_train, n_val, n_test)
        self.train_data, self.val_data, self.test_data = random_split(self.dataset, [n_train, n_val, n_test])
        print('type(train_data)', type(self.train_data))

    def train_dataloader(self):
        self.trainloader = DataLoader(self.train_data, shuffle=True, drop_last=True, batch_size=32, num_workers=0)
        return self.trainloader

    def val_dataloader(self):
        return DataLoader(self.val_data, shuffle=False, batch_size=32, num_workers=0)

    def test_dataloader(self):
        return DataLoader(self.test_data, shuffle=False, batch_size=32)
network
Finally, the network and the training. The code below follows Reference ②.
Input: (batch, landmark index, (x, y)) = (32, 21, 2). Output: num_classes = 3 (goo, choki, par). The layer widths go 42 ⇒ 20 ⇒ 20 ⇒ 3, and the final output of forward is F.log_softmax(x, dim=1). In training_step the loss is F.nll_loss(logits, y), which training drives down; logits is the predicted output and y is the teacher (label) data.
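As a cross-check of the 42 ⇒ 20 ⇒ 20 ⇒ 3 widths, the parameter count works out to the "1.3 K" reported in the training log further below:

# weights + biases per Linear layer
print(42*20 + 20, 20*20 + 20, 20*3 + 3)            # 860 420 63
print((42*20 + 20) + (20*20 + 20) + (20*3 + 3))    # 1343 -> about 1.3 K trainable params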
# Imports needed by the full script (assumption: pytorch_lightning 1.x, where
# accuracy lives in pytorch_lightning.metrics; newer versions use torchmetrics instead)
import time
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy

class LitHands(pl.LightningModule):
    def __init__(self, hidden_size=10, learning_rate=2e-4):
        super().__init__()
        # Set our init args as class attributes
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        # Hardcode some dataset specific attributes
        self.num_classes = 3
        self.dims = (1, 21, 2)
        channels, width, height = self.dims
        # Define PyTorch model
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels * width * height, 2*hidden_size),
            nn.ReLU(),
            #nn.Dropout(0.1),
            nn.Linear(2*hidden_size, 2*hidden_size),
            nn.ReLU(),
            #nn.Dropout(0.1),
            nn.Linear(2*hidden_size, self.num_classes)
        )

    def forward(self, x):
        x = self.model(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        #print(logits, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = accuracy(preds, y)
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        # Here we just reuse the validation_step for testing
        return self.validation_step(batch, batch_idx)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    # ...plus the setup() and *_dataloader() methods shown in the previous section

def main():
    model = LitHands()
    print(model)
    trainer = pl.Trainer(max_epochs=2000) # Convergence is fast
    #trainer = pl.Trainer(max_epochs=1, gpus=1) # Probably only worth it with much more data
    trainer.fit(model) #, DataLoader(train, batch_size=32, shuffle=True), DataLoader(val, batch_size=32))
    trainer.test(model)
    print('training_finished')

    PATH = "hands_mlp.ckpt"
    trainer.save_checkpoint(PATH) # Save the training result
    pretrained_model = model.load_from_checkpoint(PATH) # Load the pretrained weights (the ones just trained)
    pretrained_model.freeze()
    pretrained_model.eval()

    a = torch.tensor([[315., 420.],  # One sample, output as a trial
                      [409., 401.],
                      [485., 349.],
                      [534., 302.],
                      [574., 279.],
                      [418., 205.],
                      [442., 126.],
                      [462., 74.],
                      [477., 33.],
                      [364., 186.],
                      [370., 89.],
                      [379., 22.],
                      [386., -33.],
                      [312., 192.],
                      [311., 98.],
                      [316., 37.],
                      [321., -9.],
                      [259., 218.],
                      [230., 154.],
                      [215., 113.],
                      [204., 77.]])
    print(a[:])
    results = pretrained_model(a[:].reshape(1, 21, 2))
    print(results)
    preds = torch.argmax(results)
    print(preds)

    df = pd.read_csv('./hands/sample_hands7.csv', sep=',') # Prepare suitable data, different from the teacher data
    print(df.head(3)) # Check the data
    df = df.astype(int)
    data_num = len(df)
    x = []
    for j in range(data_num):
        x_ = []
        for i in range(0, 21, 1):
            x__ = [df['{}'.format(2*i)][j], df['{}'.format(2*i+1)][j]]
            x_.append(x__)
        x.append(x_)
    data_ = torch.from_numpy(np.array(x)).float()
    y = df['42'][:data_num]
    label_ = torch.from_numpy(np.array(y)).long()

    count = 0
    for j in range(data_num):
        a = data_[j] # Evaluate every sample
        results = pretrained_model(a[:].reshape(1, 21, 2)) # Predicted values for every sample
        preds = torch.argmax(results)
        print(j, preds, label_[j]) # Show the prediction preds next to the original label
        if preds == label_[j]:
            count += 1
    acc = count/data_num
    print("acc = ", acc)

if __name__ == '__main__':
    start_time = time.time()
    main()
    print('elapsed time: {:.3f} [sec]'.format(time.time() - start_time))
The execution result is as follows.
>python mediapipe_mlp_last.py
LitHands(
(model): Sequential(
(0): Flatten(start_dim=1, end_dim=-1)
(1): Linear(in_features=42, out_features=20, bias=True)
(2): ReLU()
(3): Linear(in_features=20, out_features=20, bias=True)
(4): ReLU()
(5): Linear(in_features=20, out_features=3, bias=True)
)
)
GPU available: True, used: False
TPU available: None, using: 0 TPU cores
...
n_train, n_val, n_test 675 405 270
type(train_data) <class 'torch.utils.data.dataset.Subset'>
...
| Name | Type | Params
-------------------------------------
0 | model | Sequential | 1.3 K
-------------------------------------
1.3 K Trainable params
0 Non-trainable params
1.3 K Total params
Epoch 1999: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:00<00:00, 558.63it/s, loss=0.00861, v_num=449, val_loss=0.184, val_acc=0.983]
...
Testing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 1128.01it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'val_acc': tensor(1.), 'val_loss': tensor(0.0014)}
--------------------------------------------------------------------------------
training_finished
acc = 0.9911176905995559
elapsed time: 128.257 [sec]
Finally, the trained model is used for real-time prediction from the webcam. The checkpoint is loaded and frozen as follows.

pretrained_model = model.load_from_checkpoint(PATH)
print(pretrained_model)
pretrained_model.eval()
pretrained_model.freeze()
preds = pretrained_model(X) # This prediction runs inside the loop, on each x obtained from the webcam
Inside the loop, the collected landmark_point list is converted to a float tensor, trimmed to 42 values, reshaped to (1, 21, 2) and fed to the model.

a = np.array(landmark_point).astype(int)
a = torch.from_numpy(a).float()
#print(a.reshape(1, 21, 2))
a = a[:42]
results_ = pretrained_model(a[:].reshape(1, 21, 2))
print(results_)
preds = torch.argmax(results_)
import mediapipe as mp
from PIL import Image
import cv2
import csv
import numpy as np
import torch
from mediapipe_mlp_last import LitHands

mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5,
)

model = LitHands()
PATH = "hands_mlp.ckpt"
pretrained_model = model.load_from_checkpoint(PATH)
print(pretrained_model)
pretrained_model.eval()
pretrained_model.freeze()

image0 = cv2.imread('blank.jpg') # Use a blank sheet as the backing canvas
idx = 0
cap = cv2.VideoCapture(0)
while cap.isOpened():
    success, image = cap.read()
    image_blank = image0.copy() # Fresh blank canvas every frame
    cv2.imwrite('./image/x/image_o' + str(idx) + '.png', image)
    if not success:
        print("Ignoring empty camera frame.")
        # If loading a video, use 'break' instead of 'continue'.
        continue
    # Flip the image horizontally for a later selfie-view display, and convert
    # the BGR image to RGB.
    image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
    # To improve performance, optionally mark the image as not writeable to
    # pass by reference.
    image.flags.writeable = False
    results = hands.process(image)
    # Draw the hand annotations on the image.
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    image_width, image_height = image.shape[1], image.shape[0]
    with open('./hands/sample_hands8.csv', 'a', newline='') as f:
        list_landmarks = []
        landmark_point = []
        writer = csv.writer(f)
        if results.multi_hand_landmarks:
            idx += 1
            #print('Handedness:', results.multi_handedness)
            for hand_landmarks in results.multi_hand_landmarks:
                for index, landmark in enumerate(hand_landmarks.landmark):
                    landmark_x = min(int(landmark.x * image_width), image_width - 1)
                    landmark_y = min(int(landmark.y * image_height), image_height - 1)
                    landmark_ = landmark_x, landmark_y
                    landmark_point.append(landmark_x)
                    landmark_point.append(landmark_y)
                #print(landmark_point)
                a = np.array(landmark_point).astype(int)
                a = torch.from_numpy(a).float()
                #print(a.reshape(1, 21, 2))
                a = a[:42]
                results_ = pretrained_model(a[:].reshape(1, 21, 2))
                print(results_)
                preds = torch.argmax(results_)
                print(preds)
                landmark_point.append(preds)
                writer.writerow(np.array(landmark_point))
                mp_drawing.draw_landmarks(image_blank, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                cv2.imshow('MediaPipe Hands_{}'.format(preds), image_blank)
                cv2.imwrite('./'+'image/{}'.format(preds) +'/image{}_'.format(preds) + str(idx) + '.png', cv2.flip(image_blank, 1))
    if cv2.waitKey(5) & 0xFF == 27:
        break
hands.close()
cap.release()
Below are the original images and a plot of the prediction results (only every other frame is used, to keep the file size down). This part gave me a little trouble, so I will post the code.
import os
import pickle
import numpy as np
import PIL.Image
import pandas as pd

df = pd.read_csv('./hands/sample_hands_results_.csv', sep=',')
print(df.head(3))
df = df.astype(int)
print(df['name'], df['42'])

s = len(df) #139 #583
images = []
s1 = 1
s0 = 3 #100
sk = 0
for j in range(0, s0, 1):
    try:
        for i in range(s1, s, 2):
            im = PIL.Image.open('./image/{}'.format(df['42'][i])+'/image{}_'.format(df['42'][i])+str(df['name'][i])+'.png')
            #im = PIL.Image.open('./hands/draw_results/data_plot'+str(i)+'.png')
            im = im.resize(size=(640, 478), resample=PIL.Image.NEAREST)
            images.append(im)
    except Exception as e:
        s1 = i+1
        sk += 1
        print(sk, e)
print("finish", s, len(images))
images[0].save('./hands/hands_results_.gif', save_all=True, append_images=images[1:s], duration=100*1, loop=0)
import cv2
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('./hands/sample_hands_results_.csv', sep=',')
print(df.head(3))
data_num = len(df)
print(data_num)
df = df.astype(int)

x = []
for j in range(data_num):
    x_ = []
    for i in range(0, 21, 1):
        x__ = [df['{}'.format(2*i)][j], df['{}'.format(2*i+1)][j]]
        x_.append(x__)
    x.append(x_)
y = df['42']
x = np.array(x)
y = np.array(y)
print(x.shape, y.shape)

fig = plt.figure()
ax = plt.axes()
while 1:
    for j in range(0, data_num):
        for i in range(20):
            plt.plot(x[j][i][0], x[j][i][1], color='black', marker='o')
        plt.text(600, -120, y[j], size=50)
        plt.xlim(700, 0)
        plt.ylim(600, -200)
        plt.title(j)
        plt.pause(0.1)
        plt.savefig('./hands/draw_results/data_plot{}.png'.format(j))
        plt.clf()
    if cv2.waitKey(5) & 0xFF == 27:
        break
・I would like to map (convert) the face_mesh onto an actual face.