Spaces:

Aashi
/

Text-Image-Analyzer

Running on Zero

App Files Files Community

Aashi committed on Sep 27, 2024

Commit

2f8eff4

verified ·

1 Parent(s): 99f09a0

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -4

app.py CHANGED Viewed

@@ -1,7 +1,79 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+# import gradio as gr
+# def greet(name):
+#     return "Hello " + name + "!!"
+# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# demo.launch()
 import gradio as gr
+import requests
+import torch
+from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+# Load the Llama 3.2 Vision Model
+def load_llama_model():
+    """Return the Llama 3.2 Vision model and its processor, loading them once.
+
+    The first call loads the checkpoint in bfloat16 with device_map="auto"
+    and stores the pair on the function object; later calls return that same
+    pair, so callers that invoke this per request do not reload the weights.
+
+    Returns:
+        A (model, processor) tuple.
+    """
+    cached = getattr(load_llama_model, "_cached", None)
+    if cached is None:
+        model_id = "meta-llama/Llama-3.2-11B-Vision"
+        # Load model and processor
+        model = MllamaForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+        processor = AutoProcessor.from_pretrained(model_id)
+        # Only cache after both loads succeeded, so a failed load is retried.
+        cached = load_llama_model._cached = (model, processor)
+    return cached
+# Function to generate predictions for text and image
+def process_input(text, image=None):
+    """Generate a model completion for a text prompt and an optional image.
+
+    Args:
+        text: Prompt text; None is treated as an empty string.
+        image: Optional PIL image. When given, it is converted to RGB and
+            passed to the processor together with an "<|image|>" prompt.
+
+    Returns:
+        The decoded text of the first generated sequence, with special
+        tokens stripped.
+    """
+    model, processor = load_llama_model()
+    text = text or ""
+    # Compare against None explicitly rather than relying on the truthiness
+    # of an image object.
+    if image is not None:
+        # The image is passed at its original size: a fixed 224x224 resize
+        # would distort the aspect ratio, and the processor is expected to
+        # do its own resizing.
+        prompt = f"<|image|><|begin_of_text|>{text}"
+        inputs = processor(images=image.convert("RGB"), text=prompt, return_tensors="pt")
+    else:
+        # Pass the prompt by keyword: the processor's first positional
+        # parameter is the image input, not the text.
+        prompt = f"<|begin_of_text|>{text}"
+        inputs = processor(text=prompt, return_tensors="pt")
+    inputs = inputs.to(model.device)
+    # Generate output from the model
+    outputs = model.generate(**inputs, max_new_tokens=100)
+    # Decode the output to return a readable text
+    return processor.decode(outputs[0], skip_special_tokens=True)
+# Gradio Interface Setup
+def demo():
+    """Build the text + image Gradio interface around process_input and launch it."""
+    app = gr.Interface(
+        fn=process_input,
+        inputs=[
+            gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5),
+            # type="pil" hands the upload to the callback as a PIL Image object
+            gr.Image(label="Upload an Image", type="pil"),
+        ],
+        outputs=gr.Textbox(label="Model Output", lines=5),
+        title="Llama 3.2 Multimodal Text-Image Analyzer",
+        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model."
+    )
+    app.launch()
+# Start the UI only when this file is executed as a script
+if __name__ == "__main__":
+    demo()