Spaces:

Chaerin5
/

FoundHand

Running on Zero

App Files Community

Chaerin5 committed on Dec 29, 2024

Commit

702c185

1 Parent(s): 7df9bdd

fix bug

Browse files

Files changed (1) hide show

app.py +168 -49

app.py CHANGED Viewed

@@ -256,19 +256,128 @@ hands = mp_hands.Hands(
     min_detection_confidence=0.1,
 )
-def make_ref_cond(
-    image
-):
-    print("ready to run autoencoder")
-    # print(f"image.device: {image.device}, type(image): {type(image)}")
-    # image = image.to("cuda")
-    print(f"autoencoder device: {next(autoencoder.parameters()).device}")
-    latent = opts.latent_scaling_factor * autoencoder.encode(image[None, ...]).sample()
-    return image[None, ...], latent
 def get_ref_anno(ref):
-    print("inside get_ref_anno")
     if ref is None:
         return (
             None,
@@ -280,11 +389,8 @@ def get_ref_anno(ref):
     img = ref["composite"][..., :3]
     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
     keypts = np.zeros((42, 2))
-    print("ready to run mediapipe")
     if REF_POSE_MASK:
-        print(f"type(img): {type(img)}, img.shape: {img.shape}, img.dtype: {img.dtype}")
         mp_pose = hands.process(img)
-        print("processed mediapipe")
         detected = np.array([0, 0])
         start_idx = 0
         if mp_pose.multi_hand_landmarks:
@@ -317,13 +423,11 @@ def get_ref_anno(ref):
             elif keypts[21].sum() != 0:
                 input_point = np.array(keypts[21:22])
                 input_label = np.array([1])
-            print("ready to run SAM")
             masks, _, _ = sam_predictor.predict(
                 point_coords=input_point,
                 point_labels=input_label,
                 multimask_output=False,
             )
-            print("finished SAM")
             hand_mask = masks[0]
             masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
             ref_pose = visualize_hand(keypts, masked_img)
@@ -332,47 +436,62 @@ def get_ref_anno(ref):
     else:
         hand_mask = np.zeros_like(img[:,:, 0])
         ref_pose = np.zeros_like(img)
-    image_transform = Compose(
-        [
-            ToTensor(),
-            Resize(opts.image_size),
-            Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
-        ]
-    )
-    image = image_transform(img)
-    kpts_valid = check_keypoints_validity(keypts, opts.image_size)
-    heatmaps = torch.tensor(
-        keypoint_heatmap(
-            scale_keypoint(keypts, opts.image_size, opts.latent_size), opts.latent_size, var=1.0
         )
-        * kpts_valid[:, None, None],
-        dtype=torch.float,
-        # device=device,
-    )[None, ...]
-    mask = torch.tensor(
-        cv2.resize(
-            hand_mask.astype(int),
-            dsize=opts.latent_size,
-            interpolation=cv2.INTER_NEAREST,
-        ),
-        dtype=torch.float,
-        # device=device,
-    ).unsqueeze(0)[None, ...]
-    image, latent = make_ref_cond(
-        image,
-        # keypts,
-        # hand_mask,
-        # device=device,
-        # target_size=opts.image_size,
-        # latent_size=opts.latent_size,
-    )
-    print("finished autoencoder")
     if not REF_POSE_MASK:
         heatmaps = torch.zeros_like(heatmaps)
         mask = torch.zeros_like(mask)
     ref_cond = torch.cat([latent, heatmaps, mask], 1)
     return img, ref_pose, ref_cond

     min_detection_confidence=0.1,
 )
+# def make_ref_cond(
+#     image
+# ):
+#     print("ready to run autoencoder")
+#     # print(f"image.device: {image.device}, type(image): {type(image)}")
+#     # image = image.to("cuda")
+#     print(f"autoencoder device: {next(autoencoder.parameters()).device}")
+#     latent = opts.latent_scaling_factor * autoencoder.encode(image[None, ...]).sample()
+#     return image[None, ...], latent
+# def get_ref_anno(ref):
+#     print("inside get_ref_anno")
+#     if ref is None:
+#         return (
+#             None,
+#             None,
+#             None,
+#             None,
+#             None,
+#         )
+#     img = ref["composite"][..., :3]
+#     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
+#     keypts = np.zeros((42, 2))
+#     print("ready to run mediapipe")
+#     if REF_POSE_MASK:
+#         print(f"type(img): {type(img)}, img.shape: {img.shape}, img.dtype: {img.dtype}")
+#         mp_pose = hands.process(img)
+#         print("processed mediapipe")
+#         detected = np.array([0, 0])
+#         start_idx = 0
+#         if mp_pose.multi_hand_landmarks:
+#             # handedness is flipped assuming the input image is mirrored in MediaPipe
+#             for hand_landmarks, handedness in zip(
+#                 mp_pose.multi_hand_landmarks, mp_pose.multi_handedness
+#             ):
+#                 # actually right hand
+#                 if handedness.classification[0].label == "Left":
+#                     start_idx = 0
+#                     detected[0] = 1
+#                 # actually left hand
+#                 elif handedness.classification[0].label == "Right":
+#                     start_idx = 21
+#                     detected[1] = 1
+#                 for i, landmark in enumerate(hand_landmarks.landmark):
+#                     keypts[start_idx + i] = [
+#                         landmark.x * opts.image_size[1],
+#                         landmark.y * opts.image_size[0],
+#                     ]
+#             sam_predictor.set_image(img)
+#             l = keypts[:21].shape[0]
+#             if keypts[0].sum() != 0 and keypts[21].sum() != 0:
+#                 input_point = np.array([keypts[0], keypts[21]])
+#                 input_label = np.array([1, 1])
+#             elif keypts[0].sum() != 0:
+#                 input_point = np.array(keypts[:1])
+#                 input_label = np.array([1])
+#             elif keypts[21].sum() != 0:
+#                 input_point = np.array(keypts[21:22])
+#                 input_label = np.array([1])
+#             print("ready to run SAM")
+#             masks, _, _ = sam_predictor.predict(
+#                 point_coords=input_point,
+#                 point_labels=input_label,
+#                 multimask_output=False,
+#             )
+#             print("finished SAM")
+#             hand_mask = masks[0]
+#             masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
+#             ref_pose = visualize_hand(keypts, masked_img)
+#         else:
+#             raise gr.Error("No hands detected in the reference image.")
+#     else:
+#         hand_mask = np.zeros_like(img[:,:, 0])
+#         ref_pose = np.zeros_like(img)
+#     image_transform = Compose(
+#         [
+#             ToTensor(),
+#             Resize(opts.image_size),
+#             Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+#         ]
+#     )
+#     image = image_transform(img)
+#     kpts_valid = check_keypoints_validity(keypts, opts.image_size)
+#     heatmaps = torch.tensor(
+#         keypoint_heatmap(
+#             scale_keypoint(keypts, opts.image_size, opts.latent_size), opts.latent_size, var=1.0
+#         )
+#         * kpts_valid[:, None, None],
+#         dtype=torch.float,
+#         # device=device,
+#     )[None, ...]
+#     mask = torch.tensor(
+#         cv2.resize(
+#             hand_mask.astype(int),
+#             dsize=opts.latent_size,
+#             interpolation=cv2.INTER_NEAREST,
+#         ),
+#         dtype=torch.float,
+#         # device=device,
+#     ).unsqueeze(0)[None, ...]
+#     image, latent = make_ref_cond(
+#         image,
+#         # keypts,
+#         # hand_mask,
+#         # device=device,
+#         # target_size=opts.image_size,
+#         # latent_size=opts.latent_size,
+#     )
+#     print("finished autoencoder")
+#     if not REF_POSE_MASK:
+#         heatmaps = torch.zeros_like(heatmaps)
+#         mask = torch.zeros_like(mask)
+#     ref_cond = torch.cat([latent, heatmaps, mask], 1)
+#     return img, ref_pose, ref_cond
 def get_ref_anno(ref):
     if ref is None:
         return (
             None,
     img = ref["composite"][..., :3]
     img = cv2.resize(img, opts.image_size, interpolation=cv2.INTER_AREA)
     keypts = np.zeros((42, 2))
     if REF_POSE_MASK:
         mp_pose = hands.process(img)
         detected = np.array([0, 0])
         start_idx = 0
         if mp_pose.multi_hand_landmarks:
             elif keypts[21].sum() != 0:
                 input_point = np.array(keypts[21:22])
                 input_label = np.array([1])
             masks, _, _ = sam_predictor.predict(
                 point_coords=input_point,
                 point_labels=input_label,
                 multimask_output=False,
             )
             hand_mask = masks[0]
             masked_img = img * hand_mask[..., None] + 255 * (1 - hand_mask[..., None])
             ref_pose = visualize_hand(keypts, masked_img)
     else:
         hand_mask = np.zeros_like(img[:,:, 0])
         ref_pose = np.zeros_like(img)
+    print(f"keypts.max(): {keypts.max()}, keypts.min(): {keypts.min()}")
+    def make_ref_cond(
+        img,
+        keypts,
+        hand_mask,
+        device="cuda",
+        target_size=(256, 256),
+        latent_size=(32, 32),
+    ):
+        image_transform = Compose(
+            [
+                ToTensor(),
+                Resize(target_size),
+                Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+            ]
         )
+        image = image_transform(img)
+        kpts_valid = check_keypoints_validity(keypts, target_size)
+        heatmaps = torch.tensor(
+            keypoint_heatmap(
+                scale_keypoint(keypts, target_size, latent_size), latent_size, var=1.0
+            )
+            * kpts_valid[:, None, None],
+            dtype=torch.float,
+        )[None, ...]
+        mask = torch.tensor(
+            cv2.resize(
+                hand_mask.astype(int),
+                dsize=latent_size,
+                interpolation=cv2.INTER_NEAREST,
+            ),
+            dtype=torch.float,
+        ).unsqueeze(0)[None, ...]
+        return image[None, ...], heatmaps, mask
+    print(f"img.max(): {img.max()}, img.min(): {img.min()}")
+    image, heatmaps, mask = make_ref_cond(
+        img,
+        keypts,
+        hand_mask,
+        device="cuda",
+        target_size=opts.image_size,
+        latent_size=opts.latent_size,
+    )
+    print(f"image.max(): {image.max()}, image.min(): {image.min()}")
+    print(f"opts.latent_scaling_factor: {opts.latent_scaling_factor}")
+    latent = opts.latent_scaling_factor * autoencoder.encode(image).sample()
+    print(f"latent.max(): {latent.max()}, latent.min(): {latent.min()}")
     if not REF_POSE_MASK:
         heatmaps = torch.zeros_like(heatmaps)
         mask = torch.zeros_like(mask)
+    print(f"heatmaps.max(): {heatmaps.max()}, heatmaps.min(): {heatmaps.min()}")
+    print(f"mask.max(): {mask.max()}, mask.min(): {mask.min()}")
     ref_cond = torch.cat([latent, heatmaps, mask], 1)
+    print(f"ref_cond.max(): {ref_cond.max()}, ref_cond.min(): {ref_cond.min()}")
     return img, ref_pose, ref_cond