|
import mediapipe as mp
|
|
import numpy as np
|
|
import cv2
|
|
import copy
|
|
|
|
# Short aliases for the MediaPipe solution modules referenced in this file.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Frame size in pixels. normalize_keypoint() multiplies landmark coordinates
# by these values to measure distances in pixel space.
width = 640
height = 480

# Single Holistic instance, built once at import time and reused by
# mediapipe_detection() for every frame.
model = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)
|
|
|
|
def mediapipe_detection(image):
    """Run the module-level Holistic ``model`` on ``image`` and return its output.

    The image is handed to ``model.process`` exactly as received; no colour
    conversion or copying happens here.

    NOTE(review): MediaPipe solutions document RGB input, so callers are
    presumably expected to convert OpenCV BGR frames beforehand -- confirm.
    """
    return model.process(image)
|
|
|
|
def _hand_keypoints(current, previous):
    """Return ``[x, y, z]`` arrays for one hand: current frame, else previous, else zeros."""
    if current:
        source = current
    elif previous:
        source = previous
    else:
        # 21 zero placeholders stand in for a hand that was never seen.
        return [np.zeros(3) for _ in range(21)]
    return [np.array([p.x, p.y, p.z]) for p in source.landmark]


def extract_keypoint(results, last):
    """Collect the pose and hand landmarks of one frame as a list of arrays.

    The returned list is laid out as pose entries, then left-hand entries,
    then right-hand entries:

    * pose: one ``[x, y, z, visibility]`` array per landmark in
      ``results.pose_landmarks.landmark``, or 33 zero arrays of length 4
      when no pose is present;
    * each hand: one ``[x, y, z]`` array per landmark. When the hand is
      missing from ``results``, the landmarks stored on ``last`` are used
      instead; when those are missing too, 21 zero arrays of length 3.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``landmark`` sequence.
        last: The same kind of object for the previous frame, or ``None``.

    Returns:
        list of ``numpy.ndarray``. Placeholders are float arrays so every
        entry has the same dtype as the real landmark arrays.
    """
    if results.pose_landmarks:
        res = [
            np.array([p.x, p.y, p.z, p.visibility])
            for p in results.pose_landmarks.landmark
        ]
    else:
        res = [np.zeros(4) for _ in range(33)]

    # `last` is None on the first frame, so there is nothing to fall back on.
    prev_left = last.left_hand_landmarks if last is not None else None
    prev_right = last.right_hand_landmarks if last is not None else None

    res.extend(_hand_keypoints(results.left_hand_landmarks, prev_left))
    res.extend(_hand_keypoints(results.right_hand_landmarks, prev_right))
    return res
|
|
|
|
def normalize_keypoint(res, img=None):
    """Re-centre and re-scale keypoints around the midpoint of entries 11 and 12.

    Entries 11 and 12 of ``res`` are the reference points (the two shoulders
    in MediaPipe's pose landmark numbering -- confirm against the model in
    use). Every keypoint is translated so the midpoint of the references
    lands on (0.5, 0.5), then scaled about (0.5, 0.5) so the distance between
    the references becomes ``200 * width / 640`` pixels. Only the x and y
    components are changed.

    Keypoints whose x and y are both exactly 0 are treated as "missing"
    placeholders and left untouched.

    Args:
        res: List of per-landmark arrays as produced by ``extract_keypoint``.
            The arrays are modified in place.
        img: Optional image; when given, the two reference points are drawn
            on it as small filled circles (debug aid only).

    Returns:
        The same ``res`` list. When the two reference points coincide (for
        example both are zero placeholders because no pose was detected) no
        scale can be derived, so ``res`` is returned unchanged instead of
        being filled with inf/nan.
    """
    ref_a, ref_b = res[11], res[12]
    x1, y1 = ref_a[0] * width, ref_a[1] * height
    x2, y2 = ref_b[0] * width, ref_b[1] * height

    if img is not None:
        # Drawing is best-effort: a bad image or non-finite coordinate must
        # not abort normalization.
        try:
            cv2.circle(img, (int(x1), int(y1)), 4, (0, 255, 255), -1)
            cv2.circle(img, (int(x2), int(y2)), 4, (0, 255, 255), -1)
        except (cv2.error, TypeError, ValueError, OverflowError):
            pass

    dis = np.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
    if dis == 0:
        # Dividing by a zero reference distance would give an infinite scale.
        return res

    x_cen = (ref_a[0] + ref_b[0]) / 2
    y_cen = (ref_a[1] + ref_b[1]) / 2
    shift_x, shift_y = 0.5 - x_cen, 0.5 - y_cen
    scale = (200 * width / 640) / dis

    for point in res:
        if point[0] == 0 and point[1] == 0:
            continue  # missing-landmark placeholder
        # Translate first, then scale about the frame centre (0.5, 0.5).
        point[0] = 0.5 + (shift_x + point[0] - 0.5) * scale
        point[1] = 0.5 + (shift_y + point[1] - 0.5) * scale
    return res
|
|
|
|
def _sync_hand(res, results, last, attr, offset):
    """Write x/y from ``res`` into one hand of ``results``; return the next offset.

    When ``results`` has no landmarks for ``attr``, a deep copy of the ones
    stored on ``last`` (if any) is attached instead, and the offset advances
    past the hand's 21 slots.
    """
    hand = getattr(results, attr)
    if hand:
        for p in hand.landmark:
            p.x = res[offset][0]
            p.y = res[offset][1]
            offset += 1
        return offset

    previous = getattr(last, attr) if last is not None else None
    if previous:
        setattr(results, attr, copy.deepcopy(previous))
    return offset + 21


def update_mpresult(res, results, last):
    """Copy the x/y values in ``res`` back onto the landmarks of ``results``.

    ``res`` is read in the layout used by ``extract_keypoint``: pose entries
    first, then left hand, then right hand. Only ``x`` and ``y`` of each
    landmark are overwritten.

    Side effects on ``last`` (the previous frame's results, may be ``None``):
    when the updated pose landmark 20 has ``y > 1.1`` the cached
    ``last.right_hand_landmarks`` is cleared, and likewise landmark 19 clears
    ``last.left_hand_landmarks``. This happens before the hand fallback
    below, so a hand cleared this way is not carried over.
    NOTE(review): 19/20 look like the left/right index-finger pose landmarks
    in MediaPipe numbering, i.e. "hand left the bottom of the frame" --
    confirm.

    Side effects on ``results``: a hand that is missing from ``results`` but
    present on ``last`` is replaced by a deep copy of the one from ``last``.
    NOTE(review): that copy keeps the coordinates stored on ``last``; the
    matching ``res`` values are not written into it -- confirm this is
    intended.

    Returns:
        The same ``results`` object.
    """
    c = 0
    if results.pose_landmarks:
        for p in results.pose_landmarks.landmark:
            p.x = res[c][0]
            p.y = res[c][1]
            if p.y > 1.1 and last:
                if c == 20:
                    last.right_hand_landmarks = None
                elif c == 19:
                    last.left_hand_landmarks = None
            c += 1
    else:
        # Skip the 33 pose slots so the hand offsets stay aligned.
        c += 33

    c = _sync_hand(res, results, last, "left_hand_landmarks", c)
    _sync_hand(res, results, last, "right_hand_landmarks", c)
    return results
|
|
|
|
def extract_keypoints_flatten(result, last, img=None):
    """Turn one frame's detection output into a single flat keypoint vector.

    Steps: gather the landmarks with ``extract_keypoint``, normalize them
    with ``normalize_keypoint`` (``img`` is forwarded to it), write the
    normalized values back onto ``result`` via ``update_mpresult``, then
    join all per-landmark arrays end to end.

    Args:
        result: Detection output for the current frame; modified in place.
        last: Detection output of the previous frame, or ``None``.
        img: Optional image passed through to ``normalize_keypoint``.

    Returns:
        A 1-D ``numpy.ndarray`` holding every landmark array concatenated.
    """
    keypoints = normalize_keypoint(extract_keypoint(result, last), img)
    update_mpresult(keypoints, result, last)
    return np.concatenate(keypoints)
|
|
|
|
def mediapipe_process(frames):
    """Convert a batch of frames into an array of keypoint vectors.

    This is the module's main entry point. Each frame is run through
    ``mediapipe_detection`` and flattened with ``extract_keypoints_flatten``.
    A deep copy of each frame's (already updated) detection output is kept
    and passed as ``last`` when processing the next frame.

    Args:
        frames: Iterable of images, processed in order.

    Returns:
        ``numpy.ndarray`` built from the per-frame keypoint vectors, one row
        per frame.
    """
    previous = None
    rows = []
    for frame in frames:
        current = mediapipe_detection(frame)
        rows.append(extract_keypoints_flatten(current, previous))
        # Snapshot after the write-back so the next frame sees updated values.
        previous = copy.deepcopy(current)
    return np.array(rows)