Video-Text-to-Text
Transformers
Safetensors
English
llava
text-generation
multimodal
Eval Results
Inference Endpoints
ZhangYuanhan committed on
Commit
01a5f7c
1 Parent(s): 8f131cb

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +2 -1
README.md CHANGED
@@ -196,6 +196,7 @@ video_path = "XXXX"
196
  max_frames_num = "64"
197
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
198
  video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
 
199
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
200
  question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
201
  conv = copy.deepcopy(conv_templates[conv_template])
@@ -206,7 +207,7 @@ input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX,
206
  cont = model.generate(
207
  input_ids,
208
  images=video,
209
- modalities="video"
210
  do_sample=False,
211
  temperature=0,
212
  max_new_tokens=4096,
 
196
  max_frames_num = "64"
197
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
198
  video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
199
+ video = [video]
200
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
201
  question = DEFAULT_IMAGE_TOKEN + "\nPlease describe this video in detail."
202
  conv = copy.deepcopy(conv_templates[conv_template])
 
207
  cont = model.generate(
208
  input_ids,
209
  images=video,
210
+ modalities= ["video"],
211
  do_sample=False,
212
  temperature=0,
213
  max_new_tokens=4096,