TH9817 committed
Commit 7d19143 · verified · 1 Parent(s): 82ceac9

Update app.py

Files changed (1)
1. app.py +41 -11
app.py CHANGED
@@ -1,7 +1,23 @@
 import av
 import torch
 import numpy as np
-from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor
+from huggingface_hub import hf_hub_download
+from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
+#import time
+
+#start = time.time()
+
+model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
+
+#device = torch.device('mps')
+
+model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
+).to(0)
+
+processor = LlavaNextVideoProcessor.from_pretrained(model_id)
 
 def read_video_pyav(container, indices):
     '''
@@ -23,20 +39,34 @@ def read_video_pyav(container, indices):
             frames.append(frame)
     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
 
-# Load the model in half-precision
-model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, device_map="auto")
-processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf")
 
-# Load the video as an np.arrau, sampling uniformly 8 frames
+# define a chat history and use `apply_chat_template` to get correctly formatted prompt
+# Each value in "content" has to be a list of dicts with types ("text", "image", "video")
+conversation = [
+    {
+
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What is happening in this video?"},
+            {"type": "video"},
+        ],
+    },
+]
+
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
 video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
+#video_path="/Users/aa469627/Desktop/videollama/scene/sample1-Scene-049.mp4"
 container = av.open(video_path)
+
+# sample uniformly 8 frames from the video, can sample more for longer videos
 total_frames = container.streams.video[0].frames
 indices = np.arange(0, total_frames, total_frames / 8).astype(int)
-video = read_video_pyav(container, indices)
+clip = read_video_pyav(container, indices)
+inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
 
-# For better results, we recommend to prompt the model in the following format
-prompt = "USER: <video>\nWhy is this funny? ASSISTANT:"
-inputs = processor(text=prompt, videos=video, return_tensors="pt")
+output = model.generate(**inputs_video, max_new_tokens=200, do_sample=False)
+print(processor.decode(output[0][2:], skip_special_tokens=True))
 
-out = model.generate(**inputs, max_new_tokens=60)
-processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+#end = time.time()
+#print(end-start)
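
Both hunks elide the body of read_video_pyav, which this commit leaves unchanged. For reference, the helper as it appears in the transformers video examples looks roughly like the sketch below; the exact docstring wording in app.py may differ.

def read_video_pyav(container, indices):
    '''
    Decode selected frames from a video with the PyAV decoder.
    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): List of frame indices to decode.
    Returns:
        np.ndarray: decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        # keep only the uniformly sampled frame indices
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])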
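The conversation structure plus apply_chat_template(..., add_generation_prompt=True) replaces the hand-written "USER: ... ASSISTANT:" prompt string used with Video-LLaVA; the chat template stored with the processor inserts the <video> placeholder and role markers itself. A quick way to check the rendered prompt before running generation (a suggestion, not part of the commit):

# Inspect the prompt produced by the model's chat template before generating.
print(repr(prompt))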
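The commented-out torch.device('mps') line hints the script was also tried on Apple silicon, while .to(0) hard-codes CUDA GPU 0. Below is a minimal sketch of backend-flexible loading, assuming float16 is workable on whichever device is picked (it will be slower on CPU); this is an alternative, not what the commit does.

import torch
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor

model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# Prefer CUDA, then Apple MPS, then fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to(device)
processor = LlavaNextVideoProcessor.from_pretrained(model_id)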