import streamlit as st
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt

# Load the pre-trained CLIP model and processor.
# Streamlit re-executes this script from top to bottom on each user
# interaction, so the loader is wrapped in st.cache_resource to keep a single
# loaded copy around instead of calling from_pretrained on every rerun.
@st.cache_resource
def _load_clip(model_name="openai/clip-vit-base-patch16"):
    """Return the ``(processor, model)`` pair loaded for ``model_name``."""
    return (
        CLIPProcessor.from_pretrained(model_name),
        CLIPModel.from_pretrained(model_name),
    )


processor, model = _load_clip()

# Candidate labels scored against the image. Each label is embedded in the
# prompt "a photo of <label>" before being handed to the CLIP processor.
_DEFAULT_LABELS = ("an animal", "a human", "a car", "a tree", "a house")


# Function to make predictions from the image
def predict_image_description(image, *, labels=None, top_k=3):
    """Rank candidate labels for ``image`` with the module-level CLIP model.

    Args:
        image: The image to classify. It is passed unchanged to ``processor``
            as ``images`` (the app passes a PIL image).
        labels: Optional sequence of label phrases such as ``"a car"``.
            Defaults to the five built-in labels when omitted.
        top_k: Maximum number of predictions to return. Values larger than
            the number of labels are clamped; values below zero yield an
            empty list.

    Returns:
        A list of ``"<label>: <percent>%"`` strings (percent with two
        decimals), ordered from most to least probable.

    Raises:
        ValueError: If ``labels`` is given but empty.
    """
    candidates = list(_DEFAULT_LABELS if labels is None else labels)
    if not candidates:
        raise ValueError("labels must contain at least one entry")

    # Build the text prompts from the labels so the two can never drift apart.
    prompts = [f"a photo of {label}" for label in candidates]
    inputs = processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image is the image-text similarity score; softmax over
    # dim=1 turns the scores for each image into probabilities.
    probs = outputs.logits_per_image.softmax(dim=1)

    # Never ask topk for more entries than there are labels.
    k = max(0, min(top_k, len(candidates)))
    # Only the first image's scores are used.
    top_probs, top_indices = torch.topk(probs[0], k)

    return [
        f"{candidates[index]}: {prob * 100:.2f}%"
        for prob, index in zip(top_probs.tolist(), top_indices.tolist())
    ]

# Streamlit UI
st.title("Real-Time Image-to-Text Generator")
st.markdown("Upload an image, and I will tell you what it is!")

# Image upload feature
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    try:
        # Open the uploaded image and decode it right away, so a corrupt or
        # mislabeled file is reported here rather than as a traceback later.
        image = Image.open(uploaded_file)
        image.load()
    except OSError:
        # PIL's UnidentifiedImageError is an OSError subclass, so this also
        # covers files that are not images at all.
        st.error("Could not read the uploaded file as an image. Please upload a valid JPG or PNG.")
    else:
        # Display the image
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # Predict the description
        predictions = predict_image_description(image)

        # Display the predictions
        st.write("Predictions:")
        for prediction in predictions:
            st.write(prediction)