Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -201,7 +201,7 @@ def is_valid_image_filename(name):
|
|
201 |
return False
|
202 |
|
203 |
|
204 |
-
def
|
205 |
video = cv2.VideoCapture(video_file)
|
206 |
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
207 |
interval = total_frames // num_frames
|
@@ -216,7 +216,7 @@ def sample_frames_old(video_file, num_frames):
|
|
216 |
video.release()
|
217 |
return frames
|
218 |
|
219 |
-
def
|
220 |
video_frames = []
|
221 |
vr = VideoReader(video_path, ctx=cpu(0))
|
222 |
total_frames = len(vr)
|
@@ -240,6 +240,22 @@ def sample_frames(video_path, frame_count=32):
|
|
240 |
|
241 |
return video_frames
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
|
244 |
def load_image(image_file):
|
245 |
if image_file.startswith("http") or image_file.startswith("https"):
|
@@ -319,6 +335,7 @@ def bot(history, temperature, top_p, max_output_tokens):
|
|
319 |
images_this_term = []
|
320 |
text_this_term = ""
|
321 |
|
|
|
322 |
num_new_images = 0
|
323 |
# previous_image = False
|
324 |
for i, message in enumerate(history[:-1]):
|
@@ -332,7 +349,9 @@ def bot(history, temperature, top_p, max_output_tokens):
|
|
332 |
if is_valid_video_filename(message[0][0]):
|
333 |
# raise ValueError("Video is not supported")
|
334 |
# num_new_images += our_chatbot.num_frames
|
335 |
-
num_new_images += len(sample_frames(message[0][0], our_chatbot.num_frames))
|
|
|
|
|
336 |
elif is_valid_image_filename(message[0][0]):
|
337 |
print("#### Load image from local file",message[0][0])
|
338 |
num_new_images += 1
|
@@ -343,6 +362,7 @@ def bot(history, temperature, top_p, max_output_tokens):
|
|
343 |
num_new_images = 0
|
344 |
# previous_image = False
|
345 |
|
|
|
346 |
image_list = []
|
347 |
for f in images_this_term:
|
348 |
if is_valid_video_filename(f):
|
@@ -388,19 +408,21 @@ def bot(history, temperature, top_p, max_output_tokens):
|
|
388 |
with open(file_path, "rb") as src, open(filename, "wb") as dst:
|
389 |
dst.write(src.read())
|
390 |
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
|
|
|
|
|
|
|
|
395 |
]
|
396 |
-
.
|
397 |
-
|
398 |
-
|
399 |
-
]
|
400 |
|
401 |
|
402 |
-
|
403 |
-
image_token = DEFAULT_IMAGE_TOKEN * num_new_images
|
404 |
|
405 |
inp = text
|
406 |
inp = image_token + "\n" + inp
|
@@ -440,6 +462,7 @@ def bot(history, temperature, top_p, max_output_tokens):
|
|
440 |
max_new_tokens=max_output_tokens,
|
441 |
use_cache=False,
|
442 |
stopping_criteria=[stopping_criteria],
|
|
|
443 |
)
|
444 |
|
445 |
t = Thread(target=our_chatbot.model.generate, kwargs=generate_kwargs)
|
|
|
201 |
return False
|
202 |
|
203 |
|
204 |
+
def sample_frames_v1(video_file, num_frames):
|
205 |
video = cv2.VideoCapture(video_file)
|
206 |
total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
207 |
interval = total_frames // num_frames
|
|
|
216 |
video.release()
|
217 |
return frames
|
218 |
|
219 |
+
def sample_frames_v2(video_path, frame_count=32):
|
220 |
video_frames = []
|
221 |
vr = VideoReader(video_path, ctx=cpu(0))
|
222 |
total_frames = len(vr)
|
|
|
240 |
|
241 |
return video_frames
|
242 |
|
243 |
+
def sample_frames(video_path, num_frames=8):
    """Uniformly sample `num_frames` RGB frames from a video file.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Number of frames to sample (default 8).

    Returns:
        List of PIL.Image frames in RGB order. May be shorter than
        `num_frames` if some frames fail to decode, and is empty if the
        video cannot be opened or reports no frames.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard: an unopened/empty video reports 0 frames; without this,
        # np.linspace(0, -1, ...) yields negative seek indices and the
        # function only returned [] by accident of failed reads.
        if total_frames <= 0:
            return []

        frames = []
        # Evenly spaced indices spanning the whole clip (endpoints included).
        indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        for idx in indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                # OpenCV decodes BGR; convert so PIL consumers see RGB.
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
        return frames
    finally:
        # Always release the capture, even if seeking/decoding raises —
        # the original released it only on the normal exit path.
        cap.release()
|
258 |
+
|
259 |
|
260 |
def load_image(image_file):
|
261 |
if image_file.startswith("http") or image_file.startswith("https"):
|
|
|
335 |
images_this_term = []
|
336 |
text_this_term = ""
|
337 |
|
338 |
+
is_video = False
|
339 |
num_new_images = 0
|
340 |
# previous_image = False
|
341 |
for i, message in enumerate(history[:-1]):
|
|
|
349 |
if is_valid_video_filename(message[0][0]):
|
350 |
# raise ValueError("Video is not supported")
|
351 |
# num_new_images += our_chatbot.num_frames
|
352 |
+
# num_new_images += len(sample_frames(message[0][0], our_chatbot.num_frames))
|
353 |
+
num_new_images += 1
|
354 |
+
is_video = True
|
355 |
elif is_valid_image_filename(message[0][0]):
|
356 |
print("#### Load image from local file",message[0][0])
|
357 |
num_new_images += 1
|
|
|
362 |
num_new_images = 0
|
363 |
# previous_image = False
|
364 |
|
365 |
+
|
366 |
image_list = []
|
367 |
for f in images_this_term:
|
368 |
if is_valid_video_filename(f):
|
|
|
408 |
with open(file_path, "rb") as src, open(filename, "wb") as dst:
|
409 |
dst.write(src.read())
|
410 |
|
411 |
+
if not is_video:
|
412 |
+
image_tensor = [
|
413 |
+
our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][
|
414 |
+
0
|
415 |
+
]
|
416 |
+
.half()
|
417 |
+
.to(our_chatbot.model.device)
|
418 |
+
for f in image_list
|
419 |
]
|
420 |
+
image_tensor = torch.stack(image_tensor)
|
421 |
+
else:
|
422 |
+
image_tensor = our_chatbot.image_processor.preprocess(image_list, return_tensors="pt")["pixel_values"].half().to(our_chatbot.model.device)
|
|
|
423 |
|
424 |
|
425 |
+
image_token = DEFAULT_IMAGE_TOKEN * num_new_images if not is_video else DEFAULT_IMAGE_TOKEN * num_new_images
|
|
|
426 |
|
427 |
inp = text
|
428 |
inp = image_token + "\n" + inp
|
|
|
462 |
max_new_tokens=max_output_tokens,
|
463 |
use_cache=False,
|
464 |
stopping_criteria=[stopping_criteria],
|
465 |
+
modalities=["video"] if is_video else ["image"]
|
466 |
)
|
467 |
|
468 |
t = Thread(target=our_chatbot.model.generate, kwargs=generate_kwargs)
|