import os

import streamlit as st
import google.generativeai as genai

# Load the Gemini API key from the GEMINI_API_KEY environment variable
API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=API_KEY)


def multimodal_prompt(image_file, analysis_task):
    """
    Sends a multimodal prompt to the Gemini model with an image and a selected analysis task.

    Args:
        image_file: The uploaded image file object.
        analysis_task: The selected task for image analysis.

    Returns:
        The model's response as a string.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")
    try:
        # Load image data as bytes
        image_bytes = image_file.getvalue()

        # Create the image input for the model
        image_part = {
            "mime_type": "image/png" if image_file.type == "image/png" else "image/jpeg",
            "data": image_bytes,
        }

        # Construct the multimodal prompt: task instruction first, then the image
        prompt = [
            f"Perform the following task on the image: {analysis_task}",
            image_part,
        ]

        # Send the request to the model
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        return f"An error occurred: {e}"


def main():
    # Streamlit UI
    st.title("Multimodal Gemini Image Analysis App")
    st.write("Upload an image and choose a task for analysis.")

    # File uploader for images (JPEG, PNG)
    uploaded_image = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])

    # List of image analysis tasks
    analysis_tasks = [
        "Scene Analysis: Describe the scene depicted in the image. Identify the objects present, their spatial relationships, and any actions taking place.",
        "Object Detection and Classification: Identify and classify all objects present in the image. Provide detailed descriptions of each object, including its size, shape, color, and texture.",
        "Image Captioning: Generate a concise and accurate caption that describes the content of the image.",
        "Visual Question Answering: Answer specific questions about the image, such as 'What color is the car?' or 'How many people are in the image?'",
        "Image Similarity Search: Given a query image, find similar images from a large dataset based on visual features.",
        "Image Segmentation: Segment the image into different regions corresponding to objects or areas of interest.",
        "Optical Character Recognition (OCR): Extract text from the image, such as printed or handwritten text.",
        "Diagram Understanding: Analyze a diagram (e.g., flowchart, circuit diagram) and extract its structure and meaning.",
        "Art Analysis: Describe the artistic style, subject matter, and emotional impact of an image.",
        "Medical Image Analysis: Analyze medical images (e.g., X-rays, MRIs) to detect abnormalities or diagnose diseases.",
    ]

    # Task selection dropdown
    selected_task = st.selectbox("Select an image analysis task:", analysis_tasks)

    if uploaded_image is not None:
        # Preview the uploaded image
        st.image(uploaded_image, caption="Uploaded Image", use_container_width=True)

    if uploaded_image is not None and selected_task:
        if st.button("Analyze Image"):
            with st.spinner("Processing..."):
                response = multimodal_prompt(uploaded_image, selected_task)
                st.markdown(response)


if __name__ == "__main__":
    main()
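
# How to run this app locally (a minimal sketch; the filename app.py is an
# assumption, not something the script defines):
#
#   export GEMINI_API_KEY="your-api-key"   # make the key visible to os.getenv above
#   streamlit run app.py                   # launch the Streamlit UI in your browser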