initial test
Browse files- app.py +9 -1
- video_highlight_detector.py +48 -9
app.py
CHANGED
@@ -8,6 +8,8 @@ from typing import Tuple, Optional
|
|
8 |
import torch
|
9 |
from pathlib import Path
|
10 |
import time
|
|
|
|
|
11 |
|
12 |
from video_highlight_detector import (
|
13 |
load_model,
|
@@ -159,5 +161,11 @@ def create_ui(examples_path: str):
|
|
159 |
return app
|
160 |
|
161 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
app = create_ui("video_spec.json")
|
163 |
-
app.launch(
|
|
|
8 |
import torch
|
9 |
from pathlib import Path
|
10 |
import time
|
11 |
+
import torch
|
12 |
+
|
13 |
|
14 |
from video_highlight_detector import (
|
15 |
load_model,
|
|
|
161 |
return app
|
162 |
|
163 |
if __name__ == "__main__":
    # GPU-only application: fail fast, before any UI is constructed,
    # when no CUDA device is present.
    if not torch.cuda.is_available():
        raise RuntimeError("This application requires a GPU to run")

    # Force CUDA context creation up front and drop any cached
    # allocations so startup failures surface here.
    torch.cuda.init()
    torch.cuda.empty_cache()

    app = create_ui("video_spec.json")
    app.launch()
|
video_highlight_detector.py
CHANGED
@@ -732,35 +732,74 @@ class BatchedVideoHighlightDetector:
|
|
732 |
|
733 |
def load_model(
|
734 |
checkpoint_path: Optional[str] = None,
|
735 |
-
base_model_id: str = "HuggingFaceTB/
|
736 |
device: str = "cuda"
|
737 |
):
|
738 |
"""Load the model and processor."""
|
739 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
740 |
video_target_size = 384
|
741 |
-
|
742 |
processor = AutoProcessor.from_pretrained(base_model_id)
|
743 |
-
# Configure the image processor
|
744 |
processor.image_processor.size = {"longest_edge": video_target_size}
|
745 |
processor.image_processor.do_resize = True
|
746 |
processor.image_processor.do_image_splitting = False
|
747 |
|
|
|
|
|
|
|
|
|
|
|
748 |
if checkpoint_path:
|
749 |
model = SmolVLMForConditionalGeneration.from_pretrained(
|
750 |
checkpoint_path,
|
751 |
-
|
752 |
-
device_map=device
|
753 |
)
|
754 |
else:
|
755 |
-
model =
|
756 |
base_model_id,
|
757 |
-
|
758 |
-
device_map=device
|
759 |
)
|
760 |
|
761 |
return model, processor
|
762 |
|
763 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
764 |
def main():
|
765 |
checkpoint_path = "/fsx/miquel/smolvlmvideo/checkpoints/final-visionUnfrozen-balanced/checkpoint-6550"
|
766 |
base_model_id = "HuggingFaceTB/SmolVLM-2.2B-Instruct"
|
|
|
732 |
|
733 |
def load_model(
    checkpoint_path: Optional[str] = None,
    base_model_id: str = "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    device: str = "cuda"
):
    """Load the SmolVLM model and its processor.

    Args:
        checkpoint_path: Optional local checkpoint directory; when given it
            is used for the model weights instead of ``base_model_id``.
        base_model_id: Hub id used for the processor and, absent a
            checkpoint, for the model weights.
        device: Device map passed to ``from_pretrained`` (default "cuda").

    Returns:
        A ``(model, processor)`` tuple.

    Raises:
        RuntimeError: If ``device == "cuda"`` but CUDA is not available.
    """
    if device == "cuda" and not torch.cuda.is_available():
        raise RuntimeError("CUDA requested but not available")

    if device == "cuda":
        # Drop cached allocations and force CUDA context creation now, so
        # initialization failures surface here rather than mid-inference.
        torch.cuda.empty_cache()
        torch.cuda.init()

    # Resize frames so the longest edge is 384 px; image splitting is
    # disabled because whole video frames are fed to the model.
    video_target_size = 384
    processor = AutoProcessor.from_pretrained(base_model_id)
    processor.image_processor.size = {"longest_edge": video_target_size}
    processor.image_processor.do_resize = True
    processor.image_processor.do_image_splitting = False

    # Both branches previously issued identical from_pretrained calls that
    # differed only in the first argument; collapse them into one call.
    # A provided checkpoint takes precedence over the Hub id.
    model = SmolVLMForConditionalGeneration.from_pretrained(
        checkpoint_path or base_model_id,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )

    return model, processor
|
770 |
|
771 |
|
772 |
+
# def load_model(
|
773 |
+
# checkpoint_path: Optional[str] = None,
|
774 |
+
# base_model_id: str = "HuggingFaceTB/SmolVLM-2.2B-Instruct",
|
775 |
+
# device: str = "cuda"
|
776 |
+
# ):
|
777 |
+
# """Load the model and processor."""
|
778 |
+
# # For demonstration, we set the target size
|
779 |
+
# video_target_size = 384
|
780 |
+
|
781 |
+
# processor = AutoProcessor.from_pretrained(base_model_id)
|
782 |
+
# # Configure the image processor
|
783 |
+
# processor.image_processor.size = {"longest_edge": video_target_size}
|
784 |
+
# processor.image_processor.do_resize = True
|
785 |
+
# processor.image_processor.do_image_splitting = False
|
786 |
+
|
787 |
+
# if checkpoint_path:
|
788 |
+
# model = SmolVLMForConditionalGeneration.from_pretrained(
|
789 |
+
# checkpoint_path,
|
790 |
+
# torch_dtype=torch.bfloat16,
|
791 |
+
# device_map=device
|
792 |
+
# )
|
793 |
+
# else:
|
794 |
+
# model = SmolVLMForConditionalGeneration.from_pretrained(
|
795 |
+
# base_model_id,
|
796 |
+
# torch_dtype=torch.bfloat16,
|
797 |
+
# device_map=device
|
798 |
+
# )
|
799 |
+
|
800 |
+
# return model, processor
|
801 |
+
|
802 |
+
|
803 |
def main():
|
804 |
checkpoint_path = "/fsx/miquel/smolvlmvideo/checkpoints/final-visionUnfrozen-balanced/checkpoint-6550"
|
805 |
base_model_id = "HuggingFaceTB/SmolVLM-2.2B-Instruct"
|