added averaging and parallelism
app.py CHANGED
@@ -32,12 +32,13 @@ os.environ['OMP_NUM_THREADS'] = '4'
 os.environ['AWS_ACCESS_KEY_ID'] = 'AKIA3JAMX4K53MFDKMGJ'
 os.environ['AWS_SECRET_ACCESS_KEY'] = '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'
 
-app_version = '
+app_version = 'dsdg_vid_2'
 
 device = torch.device("cpu")
 labels = ['Live', 'Spoof']
 PIX_THRESHOLD = 0.45
-DSDG_THRESHOLD = 0
+DSDG_THRESHOLD = 50.0
+DSDG_FACTOR = 1000000
 MIN_FACE_WIDTH_THRESHOLD = 210
 examples = [
     ['examples/1_1_21_2_33_scene_fake.jpg'],
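Note on the new constants: the raw map score computed in dsdg_model_inference (later hunk) is a tiny fraction, so the code rescales it by DSDG_FACTOR before comparing it with DSDG_THRESHOLD. A minimal sketch of that scaling, with a hypothetical raw score (real values come from the CDCN model):

    # Sketch only: `raw_score` is a made-up stand-in for
    # torch.sum(mu) / torch.sum(test_maps[:, t]) in the app.
    DSDG_FACTOR = 1000000
    DSDG_THRESHOLD = 50.0
    raw_score = 7.3e-05
    scaled = raw_score * DSDG_FACTOR          # 73.0, the scale the slider uses
    label = 'Real' if scaled >= DSDG_THRESHOLD else 'Spoof'
    print(label)                              # Real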
@@ -79,29 +80,6 @@ class Normaliztion_valtest(object):
         return image_x
 
 
-def prepare_data_dsdg(images, boxes, depths):
-    transform = transforms.Compose([Normaliztion_valtest()])
-    files_total = 1
-    image_x = np.zeros((files_total, 256, 256, 3))
-    depth_x = np.ones((files_total, 32, 32))
-
-    for i, (image, bbox, depth_img) in enumerate(
-            zip(images, boxes, depths)):
-        x, y, x2, y2 = bbox
-        depth_img = cv.cvtColor(depth_img, cv.COLOR_RGB2GRAY)
-        image = image[y:y2, x:x2]
-        depth_img = depth_img[y:y2, x:x2]
-
-        image_x[i, :, :, :] = cv.resize(image, (256, 256))
-        # transform to binary mask --> threshold = 0
-        depth_x[i, :, :] = cv.resize(depth_img, (32, 32))
-    image_x = image_x.transpose((0, 3, 1, 2))
-    image_x = transform(image_x)
-    image_x = torch.from_numpy(image_x.astype(float)).float()
-    depth_x = torch.from_numpy(depth_x.astype(float)).float()
-    return image_x, depth_x
-
-
 def find_largest_face(faces):
     # find the largest face in the list
     largest_face = None
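prepare_data_dsdg is not deleted: it moves below analyze_face in the next hunk, with files_total = 1 replaced by files_total = len(images), so one call now batches every sampled frame. A short sketch of the resulting shape flow, assuming a hypothetical N sampled frames:

    import numpy as np

    N = 4                                      # hypothetical number of sampled frames
    image_x = np.zeros((N, 256, 256, 3))       # stacked RGB face crops
    image_x = image_x.transpose((0, 3, 1, 2))  # channels-first: (N, 3, 256, 256)
    # dsdg_model_inference later adds a batch axis via unsqueeze(0), giving
    # (1, N, 3, 256, 256), and iterates over dim 1 one frame at a time.
    print(image_x.shape)                       # (4, 3, 256, 256)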
@@ -144,54 +122,84 @@ def deepix_model_inference(img, bbox):
     return img_deepix, confidences_deepix, cls_deepix
 
 
-def
-
-
-
-
-
+def get_depth_img(img, bbox):
+    bbox_conf = list(bbox)
+    bbox_conf.append(1)
+    param_lst, roi_box_lst = tddfa(img, [bbox_conf])
+    ver_lst = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=True)
+    depth_img = depth(img, ver_lst, tddfa.tri, with_bg_flag=False)
+    return depth_img
+
+
+def analyze_face(img):
+    face = extract_face(img)
+    if face is None:
+        return img, (), None
+    x, y, w, h = face
+    x2 = x + w
+    y2 = y + h
+    bbox = (x, y, x2, y2)
+    img_dsdg = img.copy()
     if w < MIN_FACE_WIDTH_THRESHOLD:
         color_dsdg = (0, 0, 0)
         text = f'Small res ({w}*{h})'
-        img_dsdg = cv.rectangle(
+        img_dsdg = cv.rectangle(img_dsdg, (x, y), (x2, y2), color_dsdg, 2)
         cv.putText(img_dsdg, text, (x, y2 + 30),
                    cv.FONT_HERSHEY_COMPLEX, 1, color_dsdg)
-        cls_dsdg = -1
-        return img_dsdg,
-
-
-
-
-
+        # cls_dsdg = -1
+        return img_dsdg, bbox, None
+    depth_img = get_depth_img(img, bbox)
+    return img_dsdg, bbox, depth_img
+
+
+def prepare_data_dsdg(images, boxes, depths):
+    transform = transforms.Compose([Normaliztion_valtest()])
+    files_total = len(images)
+    image_x = np.zeros((files_total, 256, 256, 3))
+    depth_x = np.ones((files_total, 32, 32))
+
+    for i, (image, bbox, depth_img) in enumerate(
+            zip(images, boxes, depths)):
+        x, y, x2, y2 = bbox
+        depth_img = cv.cvtColor(depth_img, cv.COLOR_RGB2GRAY)
+        image = image[y:y2, x:x2]
+        depth_img = depth_img[y:y2, x:x2]
+
+        image_x[i, :, :, :] = cv.resize(image, (256, 256))
+        # transform to binary mask --> threshold = 0
+        depth_x[i, :, :] = cv.resize(depth_img, (32, 32))
+    image_x = image_x.transpose((0, 3, 1, 2))
+    image_x = transform(image_x)
+    image_x = torch.from_numpy(image_x.astype(float)).float()
+    depth_x = torch.from_numpy(depth_x.astype(float)).float()
+    return image_x, depth_x
+
+
+def dsdg_model_inference(imgs, bboxes, depth_imgs):
     with torch.no_grad():
         map_score_list = []
-        image_x, map_x = prepare_data_dsdg(
+        image_x, map_x = prepare_data_dsdg(imgs, bboxes, depth_imgs)
         # get the inputs
         image_x = image_x.unsqueeze(0)
         map_x = map_x.unsqueeze(0)
        inputs = image_x.to(device)
         test_maps = map_x.to(device)
         optimizer.zero_grad()
+
+        scores = []
         map_score = 0.0
         for frame_t in range(inputs.shape[1]):
             mu, logvar, map_x, x_concat, x_Block1, x_Block2, x_Block3, x_input = cdcn_model(inputs[:, frame_t, :, :, :])
             score_norm = torch.sum(mu) / torch.sum(test_maps[:, frame_t, :, :])
+            scores.append(score_norm.item() * DSDG_FACTOR)
             map_score += score_norm
         map_score = map_score / inputs.shape[1]
         map_score_list.append(map_score)
         res_dsdg = map_score_list[0].item()
         if res_dsdg > 10:
             res_dsdg = 0.0
-
-
-        confidences_dsdg = {'Real confidence': res_dsdg}
-        color_dsdg = (0, 255, 0) if cls_dsdg == 'Real' else (255, 0, 0)
-        img_dsdg = cv.rectangle(img.copy(), (x, y), (x2, y2), color_dsdg, 2)
-        cv.putText(img_dsdg, text, (x, y2 + 30),
-                   cv.FONT_HERSHEY_COMPLEX, 1, color_dsdg)
-        res_dsdg = res_dsdg * 1000000
-        # cls_dsdg = 1 if cls_dsdg == 'Real' else 0
-        return img_dsdg, confidences_dsdg, res_dsdg
+        res_dsdg = res_dsdg * DSDG_FACTOR
+        return res_dsdg, scores
 
 
 def inference(img, dsdg_thresh):
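This hunk carries the averaging in the commit title: dsdg_model_inference now scores a whole batch of frames, keeps the rescaled per-frame values in scores, and returns their mean as res_dsdg. The reduction in isolation, with hypothetical per-frame values:

    import torch

    # Stand-ins for torch.sum(mu) / torch.sum(test_maps[:, t]) per frame.
    per_frame = [torch.tensor(6.1e-05), torch.tensor(8.4e-05), torch.tensor(7.0e-05)]
    DSDG_FACTOR = 1000000

    scores = [s.item() * DSDG_FACTOR for s in per_frame]  # per-frame, rescaled
    map_score = sum(per_frame) / len(per_frame)           # clip-level average
    res_dsdg = map_score.item()
    if res_dsdg > 10:                                     # same outlier guard as the app
        res_dsdg = 0.0
    res_dsdg = res_dsdg * DSDG_FACTOR
    print(scores, res_dsdg)                               # ~[61.0, 84.0, 70.0] ~71.67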
@@ -210,17 +218,16 @@ def inference(img, dsdg_thresh):
 
 def process_video(vid_path, dsdg_thresh):
     cap = cv.VideoCapture(vid_path)
-
     input_width = int(cap.get(cv.CAP_PROP_FRAME_WIDTH))
     input_height = int(cap.get(cv.CAP_PROP_FRAME_HEIGHT))
-
-    # Set video codec and create VideoWriter object to save the output video
     fourcc = cv.VideoWriter_fourcc(*'mp4v')
     output_vid_path = 'output_dsdg.mp4'
-
-
+
     frame_counter = 0
-
+    all_frames = []
+    inference_images = []
+    inference_bboxes = []
+    inference_depths = []
     while cap.isOpened():
        ret, frame = cap.read()
         if not ret:
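The new lists are the parallelism half: frames are collected during the read loop and scored in a single batched dsdg_model_inference call afterwards, instead of one model call per frame. The every-5th-frame sampling itself is unchanged; in isolation it is just:

    def sample_every(frames, step=5):
        # Keep every `step`-th frame, as the frame_counter check below does.
        return [f for i, f in enumerate(frames) if i % step == 0]

    print(sample_every(list(range(23))))  # [0, 5, 10, 15, 20]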
@@ -228,21 +235,38 @@ def process_video(vid_path, dsdg_thresh):
         # Process only every 5th frame
         if frame_counter % 5 == 0:
             # Run inference on the current frame
-
-
-
-
-
-
+            frame = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
+            img, bbox, depth_img = analyze_face(frame)
+            if bbox and (depth_img is not None):
+                inference_images.append(img)
+                inference_bboxes.append(bbox)
+                inference_depths.append(depth_img)
+            all_frames.append(img)
         frame_counter += 1
-    # Release resources
     cap.release()
-
-    if not confidences_arr:
+    if not inference_images:
         return vid_path, {'Not supported right now': 0}, -1, vid_path, 'Faces too small or not found', -1
-
-
-
+
+    res_dsdg, scores = dsdg_model_inference(inference_images, inference_bboxes, inference_depths)
+    cls_dsdg = 'Real' if res_dsdg >= dsdg_thresh else 'Spoof'
+    for img, bbox, score in zip(inference_images, inference_bboxes, scores):
+        x, y, x2, y2 = bbox
+        w = x2 - x
+        h = y2 - y
+        frame_cls = 'Real' if score >= dsdg_thresh else 'Spoof'
+        color_dsdg = (0, 255, 0) if frame_cls == 'Real' else (255, 0, 0)
+        text = f'{cls_dsdg} {w}*{h}'
+        cv.rectangle(img, (x, y), (x2, y2), color_dsdg, 2)
+        cv.putText(img, text, (x, y2 + 30), cv.FONT_HERSHEY_COMPLEX, 1, color_dsdg)
+
+    out_dsdg = cv.VideoWriter(output_vid_path, fourcc, 6.0, (input_width, input_height))
+    for img in all_frames:
+        # Write the DSDG frame to the output video
+        img_dsdg = cv.cvtColor(img, cv.COLOR_RGB2BGR)
+        out_dsdg.write(img_dsdg)
+    out_dsdg.release()
+    text_dsdg = f'Label: {cls_dsdg}, average real confidence: {res_dsdg}\nFrames used: {len(scores)}\nConfidences: {scores}'
+    return vid_path, {'Not supported right now': 0}, -1, output_vid_path, text_dsdg, res_dsdg
 
 
 def upload_to_s3(vid_path, app_version, *labels):
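Each kept frame is colored by its own per-frame score (frame_cls), while the drawn text uses the clip-level cls_dsdg; the output is written at 6 fps, consistent with keeping every 5th frame of a roughly 30 fps webcam clip. The two-level decision in isolation, with hypothetical scores:

    DSDG_THRESHOLD = 50.0
    scores = [61.0, 84.0, 12.0]                # hypothetical per-frame scores
    res_dsdg = sum(scores) / len(scores)       # ~52.33
    cls_dsdg = 'Real' if res_dsdg >= DSDG_THRESHOLD else 'Spoof'
    frame_cls = ['Real' if s >= DSDG_THRESHOLD else 'Spoof' for s in scores]
    print(cls_dsdg, frame_cls)                 # Real ['Real', 'Real', 'Spoof']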
@@ -281,7 +305,7 @@ with demo:
     with gr.Row():
         with gr.Column():
             input_vid = gr.Video(format='mp4', source='webcam')
-            dsdg_thresh = gr.Slider(value=DSDG_THRESHOLD, label='DSDG threshold', maximum=
+            dsdg_thresh = gr.Slider(value=DSDG_THRESHOLD, label='DSDG threshold', maximum=300, step=5)
             btn_run = gr.Button(value="Run")
         with gr.Column():
             outputs=[
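The slider puts the clip threshold in the UI; its value reaches process_video as dsdg_thresh. A minimal sketch of the wiring, assuming the Gradio 3.x Blocks API this file already uses (the click handler and outputs list are elided here):

    import gradio as gr

    with gr.Blocks() as demo:
        input_vid = gr.Video(format='mp4', source='webcam')
        dsdg_thresh = gr.Slider(value=50.0, label='DSDG threshold', maximum=300, step=5)
        btn_run = gr.Button(value='Run')
        # In app.py: btn_run.click(process_video, inputs=[input_vid, dsdg_thresh], outputs=[...])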
|