Spaces:

Jonny001
/

Image-to-Text

Running

App Files Community

Jonny001 committed on Dec 13, 2024

Commit

20e841b

verified ·

1 Parent(s): c36fdb3

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -31

app.py CHANGED Viewed

@@ -4,46 +4,42 @@ import torch
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
 florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
-def generate_captions(image):
     if not isinstance(image, Image.Image):
         image = Image.fromarray(image)
     inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
-    captions = []
-    for i in range(3):
-        generated_ids = florence_model.generate(
-            input_ids=inputs["input_ids"],
-            pixel_values=inputs["pixel_values"],
-            max_new_tokens=1024,
-            early_stopping=False,
-            do_sample=True,
-            temperature=0.7 + i * 0.1,
-            num_beams=3
-        )
-        generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-        parsed_answer = florence_processor.post_process_generation(
-            generated_text,
-            task="<MORE_DETAILED_CAPTION>",
-            image_size=(image.width, image.height)
-        )
-        prompt = parsed_answer["<MORE_DETAILED_CAPTION>"]
-        captions.append(prompt)
-        print(f"\n\nGeneration {i+1} completed!:" + prompt)
-    return "\n\n".join([f"Caption {i+1}: {caption}" for i, caption in enumerate(captions)])
-io = gr.Interface(
-    generate_captions,
-    inputs=[gr.Image(label="Input Image")],
-    outputs=[gr.Textbox(label="Output Captions", lines=10, show_copy_button=True)],
-    theme="Yntec/HaleyCH_Theme_Orange"
-)
 io.launch(debug=True)

 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 florence_model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True).to(device).eval()
 florence_processor = AutoProcessor.from_pretrained('microsoft/Florence-2-base', trust_remote_code=True)
+def generate_caption(image):
     if not isinstance(image, Image.Image):
         image = Image.fromarray(image)
     inputs = florence_processor(text="<MORE_DETAILED_CAPTION>", images=image, return_tensors="pt").to(device)
+    generated_ids = florence_model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=1024,
+        early_stopping=False,
+        do_sample=False,
+        num_beams=3,
+    )
+    generated_text = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    parsed_answer = florence_processor.post_process_generation(
+        generated_text,
+        task="<MORE_DETAILED_CAPTION>",
+        image_size=(image.width, image.height)
+    )
+    prompt =  parsed_answer["<MORE_DETAILED_CAPTION>"]
+    print("\n\nGeneration completed!:"+ prompt)
+    return prompt
+io = gr.Interface(generate_caption,
+                  inputs=[gr.Image(label="Input Image")],
+                  outputs = [gr.Textbox(label="Output Prompt", lines=3, show_copy_button = True),
+                            ],
+                  theme="Yntec/HaleyCH_Theme_Orange",
+                  description="⚠ Sorry for the inconvenience. The space are currently running on the CPU, which might affect performance. We appreciate your understanding."
+                 )
 io.launch(debug=True)