quarterturn committed
Commit 63f27a5 · verified · 1 parent: 285c7ab

Update app.py

Files changed (1)
  1. app.py +14 -10
app.py CHANGED
@@ -27,14 +27,19 @@ def unzip_images(zip_file):
 
     return image_paths, image_data, session_dir
 
-@spaces.GPU(duration=86)
+@spaces.GPU(duration=180)  # Keep increased timeout
 def generate_caption(image_path, prompt):
     try:
-        # Load processor and model
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto')
-        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype='auto', device_map='auto')
+        # Load processor and model in FP16
+        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,  # Cast model to FP16
+            device_map='auto'
+        )
 
-        # Explicitly move to GPU
+        # Move model to GPU
         model.to('cuda')
 
         image = Image.open(image_path)
@@ -43,10 +48,10 @@ def generate_caption(image_path, prompt):
             text=prompt,
         )
 
-        # Move inputs to GPU
-        inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}
+        # Move and cast inputs to FP16 on GPU
+        inputs = {k: v.to('cuda', dtype=torch.float16).unsqueeze(0) for k, v in inputs.items()}
 
-        with torch.autocast(device_type="cuda", enabled=True):
+        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
             output = model.generate_from_batch(
                 inputs,
                 GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
@@ -60,12 +65,11 @@ def generate_caption(image_path, prompt):
         del model
        del inputs
        del output
-        torch.cuda.empty_cache()  # Clear GPU memory
+        torch.cuda.empty_cache()
 
         return generated_text
 
     except Exception as e:
-        # Clean up on error
         torch.cuda.empty_cache()
         raise e
 
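
One caveat when reusing the FP16 pattern above: the dict comprehension casts every tensor returned by the processor to float16, including integer tensors such as input_ids, which token-embedding lookup requires to stay integral. A minimal dtype-aware sketch, not part of the commit (the helper name move_inputs_fp16 is hypothetical):

import torch

# Hypothetical helper: move processor outputs to the GPU, add a batch
# dimension, and down-cast only floating-point tensors (e.g. image tensors)
# to FP16, leaving integer tensors such as input_ids untouched.
def move_inputs_fp16(inputs, device="cuda"):
    batched = {}
    for name, tensor in inputs.items():
        tensor = tensor.to(device).unsqueeze(0)
        if tensor.is_floating_point():
            tensor = tensor.to(torch.float16)
        batched[name] = tensor
    return batched

Under torch.autocast the manual down-cast is usually unnecessary anyway, since autocast selects the compute dtype per operation.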