quarterturn committed on
Commit
7cbe3e4
·
verified ·
1 Parent(s): 25739ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -4
app.py CHANGED
@@ -27,7 +27,7 @@ def unzip_images(zip_file):
27
 
28
  return image_paths, image_data, session_dir
29
 
30
- @spaces.GPU(duration=120) # Keep increased timeout
31
  def generate_caption(image_path, prompt):
32
  try:
33
  # Load processor and model in FP16
@@ -35,7 +35,7 @@ def generate_caption(image_path, prompt):
35
  model = AutoModelForCausalLM.from_pretrained(
36
  model_id,
37
  trust_remote_code=True,
38
- torch_dtype=torch.float16, # Cast model to FP16
39
  device_map='auto'
40
  )
41
 
@@ -48,8 +48,11 @@ def generate_caption(image_path, prompt):
48
  text=prompt,
49
  )
50
 
51
- # Move and cast inputs to FP16 on GPU
52
- inputs = {k: v.to('cuda', dtype=torch.float16).unsqueeze(0) for k, v in inputs.items()}
 
 
 
53
 
54
  with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
55
  output = model.generate_from_batch(
 
27
 
28
  return image_paths, image_data, session_dir
29
 
30
+ @spaces.GPU(duration=120)
31
  def generate_caption(image_path, prompt):
32
  try:
33
  # Load processor and model in FP16
 
35
  model = AutoModelForCausalLM.from_pretrained(
36
  model_id,
37
  trust_remote_code=True,
38
+ torch_dtype=torch.float16,
39
  device_map='auto'
40
  )
41
 
 
48
  text=prompt,
49
  )
50
 
51
+ # Move inputs to GPU, keeping input_ids as torch.long, others as FP16
52
+ inputs = {
53
+ k: v.to('cuda', dtype=torch.float16 if k != 'input_ids' else torch.long).unsqueeze(0)
54
+ for k, v in inputs.items()
55
+ }
56
 
57
  with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
58
  output = model.generate_from_batch(