TH9817 committed
Commit e167623 · verified · 1 Parent(s): ef4a010

Update app.py

Files changed (1)
  1. app.py +59 -36
app.py CHANGED
@@ -2,31 +2,32 @@ import av
import torch
import numpy as np
from huggingface_hub import hf_hub_download
- from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
- #import time
-
- #start = time.time()
-
- model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
-
- #device = torch.device('mps')
+ from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
+
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )

+ processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     low_cpu_mem_usage=True,
- ).to(0)
+     "llava-hf/LLaVA-NeXT-Video-7B-hf",
+     quantization_config=quantization_config,
+     device_map='auto'
+ )

- processor = LlavaNextVideoProcessor.from_pretrained(model_id)

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
+
    Args:
-         container (`av.container.input.InputContainer`): PyAV container.
-         indices (`List[int]`): List of frame indices to decode.
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+
    Returns:
-         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
@@ -39,34 +40,56 @@ def read_video_pyav(container, indices):
        frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

+ from huggingface_hub import hf_hub_download
+
+ # Download video from the hub
+ video_path_1 = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
+ video_path_2 = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="karate.mp4", repo_type="dataset")
+
+ container = av.open(video_path_1)
+
+ # sample uniformly 8 frames from the video (we can sample more for longer videos)
+ total_frames = container.streams.video[0].frames
+ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+ clip_baby = read_video_pyav(container, indices)
+
+
+ container = av.open(video_path_2)
+
+ # sample uniformly 8 frames from the video (we can sample more for longer videos)
+ total_frames = container.streams.video[0].frames
+ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+ clip_karate = read_video_pyav(container, indices)

- # define a chat history and use `apply_chat_template` to get correctly formatted prompt
- # Each value in "content" has to be a list of dicts with types ("text", "image", "video")
+ # Each "content" is a list of dicts and you can add image/video/text modalities
conversation = [
-     {
-
-         "role": "user",
-         "content": [
-             {"type": "text", "text": "What is happening in this video?"},
-             {"type": "video"},
-         ],
-     },
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "Why is this video funny?"},
+             {"type": "video"},
+         ],
+     },
+ ]
+
+ conversation_2 = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "What do you see in this video?"},
+             {"type": "video"},
+         ],
+     },
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+ prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)

- video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
- #video_path="/Users/aa469627/Desktop/videollama/scene/sample1-Scene-049.mp4"
- container = av.open(video_path)
+ inputs = processor([prompt, prompt_2], videos=[clip_baby, clip_karate], padding=True, return_tensors="pt").to(model.device)

- # sample uniformly 8 frames from the video, can sample more for longer videos
- total_frames = container.streams.video[0].frames
- indices = np.arange(0, total_frames, total_frames / 8).astype(int)
- clip = read_video_pyav(container, indices)
- inputs_video = processor(text=prompt, videos=clip, padding=True, return_tensors="pt").to(model.device)
+ generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}

- output = model.generate(**inputs_video, max_new_tokens=200, do_sample=False)
- print(processor.decode(output[0][2:], skip_special_tokens=True))
+ output = model.generate(**inputs, **generate_kwargs)
+ generated_text = processor.batch_decode(output, skip_special_tokens=True)

- #end = time.time()
- #print(end-start)
+ print(generated_text)
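
Both hunks skip the middle of read_video_pyav, since the frame-decoding loop between container.seek(0) and frames.append(frame) is unchanged by this commit. For readers who want to run the snippet on its own, a minimal sketch of a PyAV sampler consistent with the docstring and with how `indices` is built above could look like the following; this is an illustrative reconstruction, not necessarily the exact loop in app.py:

# Illustrative reconstruction (not taken from the diff): a typical PyAV frame
# sampler matching the read_video_pyav docstring and the uniform `indices` above.
import av
import numpy as np

def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    # Decode the stream in order and keep only the sampled frame indices.
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    # Stack into a (num_frames, height, width, 3) RGB array.
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])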