Update app.py
app.py
CHANGED
@@ -2,31 +2,32 @@ import av
 import torch
 import numpy as np
 from huggingface_hub import hf_hub_download
-from transformers import
-#import time
+from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
 
-#start = time.time()
 
-
-
-
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16
+)
 
+processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
 model = LlavaNextVideoForConditionalGeneration.from_pretrained(
-
-
-
-)
+    "llava-hf/LLaVA-NeXT-Video-7B-hf",
+    quantization_config=quantization_config,
+    device_map='auto'
+)
 
-processor = LlavaNextVideoProcessor.from_pretrained(model_id)
 
 def read_video_pyav(container, indices):
     '''
     Decode the video with PyAV decoder.
+
     Args:
-        container (
-        indices (
+        container (av.container.input.InputContainer): PyAV container.
+        indices (List[int]): List of frame indices to decode.
+
     Returns:
-
+        np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
     '''
     frames = []
     container.seek(0)
@@ -39,34 +40,56 @@ def read_video_pyav(container, indices):
             frames.append(frame)
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
 
+from huggingface_hub import hf_hub_download
+
+# Download video from the hub
+video_path_1 = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
+video_path_2 = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="karate.mp4", repo_type="dataset")
+
+container = av.open(video_path_1)
+
+# sample uniformly 8 frames from the video (we can sample more for longer videos)
+total_frames = container.streams.video[0].frames
+indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+clip_baby = read_video_pyav(container, indices)
+
+
+container = av.open(video_path_2)
+
+# sample uniformly 8 frames from the video (we can sample more for longer videos)
+total_frames = container.streams.video[0].frames
+indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+clip_karate = read_video_pyav(container, indices)
 
-#
-# Each value in "content" has to be a list of dicts with types ("text", "image", "video")
+# Each "content" is a list of dicts and you can add image/video/text modalities
 conversation = [
-
-
-
-
-
-
-
-
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Why is this video funny?"},
+            {"type": "video"},
+        ],
+    },
+]
+
+conversation_2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What do you see in this video?"},
+            {"type": "video"},
+        ],
+    },
 ]
 
 prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
 
-
-#video_path="/Users/aa469627/Desktop/videollama/scene/sample1-Scene-049.mp4"
-container = av.open(video_path)
+inputs = processor([prompt, prompt_2], videos=[clip_baby, clip_karate], padding=True, return_tensors="pt").to(model.device)
 
-
-total_frames = container.streams.video[0].frames
-indices = np.arange(0, total_frames, total_frames / 8).astype(int)
-clip = read_video_pyav(container, indices)
-inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
+generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
 
-output = model.generate(**
-
+output = model.generate(**inputs, **generate_kwargs)
+generated_text = processor.batch_decode(output, skip_special_tokens=True)
 
-
-#print(end-start)
+print(generated_text)
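The frame-sampling line added for both clips, indices = np.arange(0, total_frames, total_frames / 8).astype(int), picks 8 roughly evenly spaced frame indices across the clip. A quick sketch of what it produces, using a hypothetical 240-frame clip (the frame count is only an illustration, not a value read from the test videos):

import numpy as np

# Hypothetical frame count; in app.py it comes from container.streams.video[0].frames.
total_frames = 240

# A step of total_frames / 8 spreads 8 indices uniformly over the clip.
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
print(indices)       # [  0  30  60  90 120 150 180 210]
print(len(indices))  # 8

# For clips shorter than 8 frames the float step drops below 1, so some indices
# repeat after the int cast, e.g. a 5-frame clip:
print(np.arange(0, 5, 5 / 8).astype(int))  # [0 0 1 1 2 3 3 4]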
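print(generated_text) at the end of the updated script prints the full decoded sequences, prompts included, because batch_decode is applied to the whole output tensor. If only the answers are wanted, one option is to slice off the prompt tokens before decoding. A minimal sketch, assuming the inputs and output objects defined in app.py above:

# Decode only the newly generated tokens; everything before prompt_len is the
# (padded) prompt that generate() echoes back at the start of `output`.
prompt_len = inputs["input_ids"].shape[1]
answers = processor.batch_decode(output[:, prompt_len:], skip_special_tokens=True)
for answer in answers:
    print(answer.strip())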