# louiecerv's picture
# saved
# 73adaf9
import streamlit as st
import google.generativeai as genai
from PIL import Image
import os
# The Gemini API key is read from the GEMINI_API_KEY environment variable
# (None when the variable is unset) and handed to the client library once,
# at import time.
API_KEY = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=API_KEY)
def multimodal_prompt(image_file, analysis_task, model_name="gemini-1.5-flash"):
    """Ask a Gemini model to perform an analysis task on an uploaded image.

    Args:
        image_file: Uploaded image object exposing ``getvalue()`` (the raw
            bytes) and ``type`` (the MIME type string), such as the object
            returned by ``st.file_uploader``.
        analysis_task: Text describing the task to perform on the image.
        model_name: Identifier of the Gemini model to query. Defaults to the
            model this function previously hard-coded.

    Returns:
        The model's text response, or a string of the form
        ``"An error occurred: ..."`` if anything fails. Failures are returned
        rather than raised so the caller can display them directly.
    """
    try:
        # Built inside the try so a failure here is reported like any other.
        model = genai.GenerativeModel(model_name)

        # Forward the MIME type reported for the upload instead of labelling
        # every non-PNG image as JPEG; fall back to JPEG only when the
        # reported type is missing or is not an image type.
        reported_type = getattr(image_file, "type", None) or ""
        if reported_type.startswith("image/"):
            mime_type = reported_type
        else:
            mime_type = "image/jpeg"

        image_part = {
            "mime_type": mime_type,
            "data": image_file.getvalue(),
        }

        # The prompt pairs the textual instruction with the inline image.
        prompt = [
            f"Perform the following task on the image: {analysis_task}",
            image_part,
        ]

        response = model.generate_content(prompt)
        return response.text
    except Exception as e:  # UI boundary: surface the failure as text.
        return f"An error occurred: {e}"
def main():
    """Render the page: upload an image, pick a task, and show the analysis."""
    st.title("Multimodal Gemini Image Analysis App")
    st.write("Upload an image and choose a task for analysis.")

    # Uploads are restricted to JPEG and PNG files.
    image_upload = st.file_uploader(
        "Choose an image file",
        type=["jpg", "jpeg", "png"],
    )

    # Each entry is both the dropdown label and the task text sent to the model.
    task_options = [
        "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
        "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
        "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
        "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
        "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
        "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
        "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
        "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
        "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
        "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases.",
    ]
    chosen_task = st.selectbox("Select an image analysis task:", task_options)

    # Nothing further to render until an image has been uploaded.
    if image_upload is None:
        return

    # Preview the upload before offering the analysis button.
    st.image(image_upload, caption="Uploaded Image", use_container_width=True)

    if chosen_task and st.button("Analyze Image"):
        with st.spinner("Processing..."):
            analysis = multimodal_prompt(image_upload, chosen_task)
            st.markdown(analysis)
# Build the UI only when this file is executed as the main script, so that
# importing the module does not render the page.
if __name__ == "__main__":
    main()