Update modeling_videochat_flash.py
modeling_videochat_flash.py
CHANGED
@@ -636,7 +636,7 @@ class VideoChatFlashQwenForCausalLM(LlavaMetaForCausalLM, Qwen2ForCausalLM_Flash
 
         image_sizes = [frames[0].shape[:2]]
 
-        frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].
+        frames = [self.get_vision_tower().image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(self.model.dtype).cuda()]
 
         conv = conv_templates["qwen_2"].copy()
 
@@ -679,7 +679,7 @@ class VideoChatFlashQwenForCausalLM(LlavaMetaForCausalLM, Qwen2ForCausalLM_Flash
 
         outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
         if outputs.endswith(stop_str):
-
+            outputs = outputs[: -len(stop_str)]
 
         outputs = outputs.strip()
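The first hunk changes the tail of the frame-preprocessing call (truncated in this view) to `.to(self.model.dtype).cuda()`, so the pixel values follow whatever dtype the checkpoint was loaded in (fp16, bf16, or fp32) instead of assuming one. A minimal sketch of the idea outside the class, assuming a Hugging Face-style image processor; `image_processor` and `model_dtype` are stand-ins for the attributes the method uses:

import torch

def preprocess_frames(frames, image_processor, model_dtype: torch.dtype):
    # Image processors emit float32 pixel values by default; casting to the
    # model's own dtype avoids a matmul dtype mismatch when the model was
    # loaded in half precision (e.g. torch.bfloat16).
    pixel_values = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"]
    return [pixel_values.to(model_dtype).cuda()]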
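The second hunk makes the generation post-processing trim the conversation stop string before the final strip. A minimal sketch of that cleanup as a standalone helper (the helper name is hypothetical, and "<|im_end|>" is the usual stop string for Qwen-style chat templates, used here only as an illustrative default):

def trim_stop_str(outputs: str, stop_str: str = "<|im_end|>") -> str:
    # Remove the trailing stop string that batch_decode leaves in the text,
    # then strip surrounding whitespace, mirroring lines 681-684 above.
    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    return outputs.strip()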