"""Streamlit app: zero-shot image classification with a pre-trained CLIP model.

The user uploads an image; the app scores it against a small set of text
prompts ("a photo of <label>") and shows the most likely labels.
"""

from typing import List, Sequence

import matplotlib.pyplot as plt  # noqa: F401  (unused here; kept from the original imports)
import streamlit as st
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

MODEL_NAME = "openai/clip-vit-base-patch16"

# Candidate labels. The text prompts are derived from these, so the prompt
# list and the label list can never get out of sync.
DEFAULT_LABELS = ("an animal", "a human", "a car", "a tree", "a house")


@st.cache_resource
def load_clip(model_name: str = MODEL_NAME):
    """Load the CLIP processor and model once and reuse them across reruns.

    Streamlit re-executes the script on every interaction; caching avoids
    reloading the weights each time.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``(processor, model)`` tuple.
    """
    clip_processor = CLIPProcessor.from_pretrained(model_name)
    clip_model = CLIPModel.from_pretrained(model_name)
    return clip_processor, clip_model


# Module-level names kept so existing references to them still work.
processor, model = load_clip()


def predict_image_description(
    image,
    labels: Sequence[str] = DEFAULT_LABELS,
    top_k: int = 3,
) -> List[str]:
    """Return the most likely labels for ``image`` as formatted strings.

    Args:
        image: The image to classify (a PIL image in this app).
        labels: Candidate labels; each one is scored via the prompt
            ``"a photo of <label>"``.
        top_k: Maximum number of predictions to return. Clamped to the
            number of labels.

    Returns:
        Strings of the form ``"<label>: <probability>%"``, most likely first.

    Raises:
        ValueError: If ``labels`` is empty or ``top_k`` is less than 1.
    """
    labels = list(labels)
    if not labels:
        raise ValueError("labels must contain at least one entry")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Uploaded PNGs may be RGBA or palette images; normalise to RGB up front.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    prompts = [f"a photo of {label}" for label in labels]
    inputs = processor(
        text=prompts,
        images=image,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image holds the image-text similarity scores; softmax over
    # the prompts turns them into probabilities. Row 0 is the single image.
    probs = outputs.logits_per_image.softmax(dim=1)[0]

    k = min(top_k, len(labels))
    top_probs, top_indices = torch.topk(probs, k)

    return [
        f"{labels[index]}: {probability * 100:.2f}%"
        for probability, index in zip(top_probs.tolist(), top_indices.tolist())
    ]


def main() -> None:
    """Render the upload widget and show predictions for the uploaded image."""
    st.title("Real-Time Image-to-Text Generator")
    st.markdown("Upload an image, and I will tell you what it is!")

    uploaded_file = st.file_uploader(
        "Choose an image...", type=["jpg", "jpeg", "png"]
    )
    if uploaded_file is None:
        return

    try:
        image = Image.open(uploaded_file)
        # Image.open is lazy; force decoding so a corrupt file fails here.
        image.load()
    except OSError as err:  # includes PIL.UnidentifiedImageError
        st.error(f"Could not read the uploaded image: {err}")
        return

    st.image(image, caption="Uploaded Image", use_column_width=True)

    predictions = predict_image_description(image)

    st.write("Predictions:")
    for prediction in predictions:
        st.write(prediction)


if __name__ == "__main__":
    main()