Tags: Video-Text-to-Text · Transformers · Safetensors · English · llava · text-generation · multimodal · Eval Results · Inference Endpoints
Committed by ZhangYuanhan and mfarre (HF staff)
Commit d5cd10a · verified · 1 parent: 7bcec28

Update README.md (#3)


- Update README.md (e0b7f7a7a63a5dd8ec988ef5ed2014cca7d4cfe4)


Co-authored-by: Miquel Farré <mfarre@users.noreply.huggingface.co>

Files changed (1): README.md (+3, -3)
README.md:

```diff
@@ -171,7 +171,7 @@ import warnings
 from decord import VideoReader, cpu
 import numpy as np
 warnings.filterwarnings("ignore")
-def load_video(self, video_path, max_frames_num,fps=1,force_sample=False):
+def load_video(video_path, max_frames_num,fps=1,force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
     vr = VideoReader(video_path, ctx=cpu(0),num_threads=1)
@@ -196,9 +196,9 @@ device_map = "auto"
 tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map) # Add any other thing you want to pass in llava_model_args
 model.eval()
 video_path = "XXXX"
-max_frames_num = "64"
+max_frames_num = 64
 video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
-video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
+video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().half()
 video = [video]
 conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
 time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
```
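The first hunk removes a stray `self` from a function the README calls at module level; with `self` in place, every positional argument shifted by one, so `video_path` landed in `self` and the call broke. The diff only shows the opening lines of `load_video`, so here is a minimal, self-contained sketch of such a helper for context: the signature, the `max_frames_num == 0` early return, and the `VideoReader` call come from the diff, while the sampling policy and the `frame_time` string format are assumptions about a typical implementation, not necessarily this repository's exact code.

```python
from decord import VideoReader, cpu
import numpy as np

def load_video(video_path, max_frames_num, fps=1, force_sample=False):
    # Degenerate case from the diff: no frames requested, return one
    # black 336x336 RGB placeholder frame.
    if max_frames_num == 0:
        return np.zeros((1, 336, 336, 3))
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frames = len(vr)
    avg_fps = vr.get_avg_fps()
    video_time = total_frames / avg_fps  # clip duration in seconds
    # Assumed sampling policy: roughly `fps` frames per second of video...
    step = max(1, round(avg_fps / fps))
    frame_idx = list(range(0, total_frames, step))
    # ...falling back to uniform sampling of exactly max_frames_num frames
    # when the clip is long or force_sample=True (as in the README's call).
    if len(frame_idx) > max_frames_num or force_sample:
        frame_idx = np.linspace(0, total_frames - 1, max_frames_num, dtype=int).tolist()
    # Timestamps of the sampled frames, later interpolated into the
    # time instruction shown in the second hunk.
    frame_time = ",".join(f"{i / avg_fps:.2f}s" for i in frame_idx)
    frames = vr.get_batch(frame_idx).asnumpy()  # (N, H, W, 3) uint8 array
    return frames, frame_time, video_time
```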
 
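The second hunk's two fixes are easy to see against that sketch. Below is the corrected call site, i.e. the "+" lines from the diff with explanatory comments added; `image_processor` comes from `load_pretrained_model` in the surrounding README code.

```python
max_frames_num = 64  # was the string "64"; against the sketch above, both
                     # `len(frame_idx) > max_frames_num` and np.linspace(...)
                     # raise TypeError when given a str instead of an int
video, frame_time, video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
# Cast the preprocessed pixel tensor to float16 via .half() instead of
# .bfloat16(); one practical difference is that float16 also runs on
# pre-Ampere NVIDIA GPUs, where bfloat16 kernels are unavailable.
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().half()
```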