mfarre HF staff committed on
Commit
4c7362f
·
1 Parent(s): 2f5ae20

initial test

Browse files
Files changed (2) hide show
  1. app.py +9 -1
  2. video_highlight_detector.py +48 -9
app.py CHANGED
@@ -8,6 +8,8 @@ from typing import Tuple, Optional
8
  import torch
9
  from pathlib import Path
10
  import time
 
 
11
 
12
  from video_highlight_detector import (
13
  load_model,
@@ -159,5 +161,11 @@ def create_ui(examples_path: str):
159
  return app
160
 
161
  if __name__ == "__main__":
 
 
 
 
 
 
162
  app = create_ui("video_spec.json")
163
- app.launch(share=True)
 
8
  import torch
9
  from pathlib import Path
10
  import time
11
+ import torch
12
+
13
 
14
  from video_highlight_detector import (
15
  load_model,
 
161
  return app
162
 
163
  if __name__ == "__main__":
164
+ # Initialize CUDA
165
+ if not torch.cuda.is_available():
166
+ raise RuntimeError("This application requires a GPU to run")
167
+ torch.cuda.init()
168
+ torch.cuda.empty_cache()
169
+
170
  app = create_ui("video_spec.json")
171
+ app.launch()
video_highlight_detector.py CHANGED
@@ -732,35 +732,74 @@ class BatchedVideoHighlightDetector:
732
 
733
  def load_model(
734
  checkpoint_path: Optional[str] = None,
735
- base_model_id: str = "HuggingFaceTB/SmolVLM-2.2B-Instruct",
736
  device: str = "cuda"
737
  ):
738
  """Load the model and processor."""
739
- # For demonstration, we set the target size
 
 
 
 
 
 
 
740
  video_target_size = 384
741
-
742
  processor = AutoProcessor.from_pretrained(base_model_id)
743
- # Configure the image processor
744
  processor.image_processor.size = {"longest_edge": video_target_size}
745
  processor.image_processor.do_resize = True
746
  processor.image_processor.do_image_splitting = False
747
 
 
 
 
 
 
748
  if checkpoint_path:
749
  model = SmolVLMForConditionalGeneration.from_pretrained(
750
  checkpoint_path,
751
- torch_dtype=torch.bfloat16,
752
- device_map=device
753
  )
754
  else:
755
- model = SmolVLMForConditionalGeneration.from_pretrained(
756
  base_model_id,
757
- torch_dtype=torch.bfloat16,
758
- device_map=device
759
  )
760
 
761
  return model, processor
762
 
763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
  def main():
765
  checkpoint_path = "/fsx/miquel/smolvlmvideo/checkpoints/final-visionUnfrozen-balanced/checkpoint-6550"
766
  base_model_id = "HuggingFaceTB/SmolVLM-2.2B-Instruct"
 
732
 
733
  def load_model(
734
  checkpoint_path: Optional[str] = None,
735
+ base_model_id: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
736
  device: str = "cuda"
737
  ):
738
  """Load the model and processor."""
739
+ if device == "cuda" and not torch.cuda.is_available():
740
+ raise RuntimeError("CUDA requested but not available")
741
+
742
+ if device == "cuda":
743
+ torch.cuda.empty_cache()
744
+ # Initialize CUDA
745
+ torch.cuda.init()
746
+
747
  video_target_size = 384
 
748
  processor = AutoProcessor.from_pretrained(base_model_id)
 
749
  processor.image_processor.size = {"longest_edge": video_target_size}
750
  processor.image_processor.do_resize = True
751
  processor.image_processor.do_image_splitting = False
752
 
753
+ model_kwargs = {
754
+ "torch_dtype": torch.bfloat16,
755
+ "device_map": device
756
+ }
757
+
758
  if checkpoint_path:
759
  model = SmolVLMForConditionalGeneration.from_pretrained(
760
  checkpoint_path,
761
+ **model_kwargs
 
762
  )
763
  else:
764
+ model = SmolVLMForConditionalGeneration.from_pretrained(
765
  base_model_id,
766
+ **model_kwargs
 
767
  )
768
 
769
  return model, processor
770
 
771
 
772
+ # def load_model(
773
+ # checkpoint_path: Optional[str] = None,
774
+ # base_model_id: str = "HuggingFaceTB/SmolVLM-2.2B-Instruct",
775
+ # device: str = "cuda"
776
+ # ):
777
+ # """Load the model and processor."""
778
+ # # For demonstration, we set the target size
779
+ # video_target_size = 384
780
+
781
+ # processor = AutoProcessor.from_pretrained(base_model_id)
782
+ # # Configure the image processor
783
+ # processor.image_processor.size = {"longest_edge": video_target_size}
784
+ # processor.image_processor.do_resize = True
785
+ # processor.image_processor.do_image_splitting = False
786
+
787
+ # if checkpoint_path:
788
+ # model = SmolVLMForConditionalGeneration.from_pretrained(
789
+ # checkpoint_path,
790
+ # torch_dtype=torch.bfloat16,
791
+ # device_map=device
792
+ # )
793
+ # else:
794
+ # model = SmolVLMForConditionalGeneration.from_pretrained(
795
+ # base_model_id,
796
+ # torch_dtype=torch.bfloat16,
797
+ # device_map=device
798
+ # )
799
+
800
+ # return model, processor
801
+
802
+
803
  def main():
804
  checkpoint_path = "/fsx/miquel/smolvlmvideo/checkpoints/final-visionUnfrozen-balanced/checkpoint-6550"
805
  base_model_id = "HuggingFaceTB/SmolVLM-2.2B-Instruct"