from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import streamlit as st
from PIL import Image
import requests
from io import BytesIO
import pandas as pd


# Mean pooling: take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# Initialize the tokenizer and the sentence-embedding model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Initialize the image-to-text pipeline
image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

st.title('Image Captioning and Review Visualization Application')


def get_embeddings(sentences):
    # Tokenize sentences
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # Normalize embeddings so dot products equal cosine similarities
    return F.normalize(sentence_embeddings, p=2, dim=1)


# Load the dataset and embed every review once; st.cache_resource replaces the
# deprecated st.cache(allow_output_mutation=True)
@st.cache_resource
def load_dataset():
    df = pd.read_csv('toys_and_games_reviews.csv')
    review_texts = df['reviewText'].dropna().tolist()
    # Embeds the whole corpus in one forward pass; very large files would need batching
    review_embeddings = get_embeddings(review_texts).numpy()
    return review_texts, review_embeddings


review_texts, review_embeddings = load_dataset()


# Find the top-N reviews most similar to the query embedding
def find_top_n_similar_reviews(query_embedding, review_embeddings, review_texts, top_n=3):
    # Both sides are L2-normalized, so the dot product is the cosine similarity
    similarities = torch.mm(query_embedding, review_embeddings.T).squeeze(0)
    top_n = min(top_n, len(review_texts))
    top_n_indices = torch.topk(similarities, top_n).indices.tolist()
    return [review_texts[i] for i in top_n_indices]
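# A minimal usage sketch for the retrieval step (illustrative only, not part
# of the app flow; the query string here is made up):
#
#   query = get_embeddings(["a fun puzzle for toddlers"])
#   top = find_top_n_similar_reviews(query,
#                                    torch.tensor(review_embeddings).float(),
#                                    review_texts)
#
# Because the embeddings are unit-normalized, the scores behind this call are
# cosine similarities in [-1, 1], with higher values meaning closer reviews.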
input_type = st.radio("Select input type:", ("Upload Image", "Image URL", "Text"))
image = None
text_input = ""

# Image upload handling
if input_type == "Upload Image":
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image', use_column_width=True)
elif input_type == "Image URL":
    image_url = st.text_input("Enter the image URL here:", "")
    if image_url:
        try:
            response = requests.get(image_url)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            st.image(image, caption='Image from URL', use_column_width=True)
        except Exception as e:
            st.error(f"Error loading image from URL: {e}")
            image = None
elif input_type == "Text":
    text_input = st.text_area("Enter text here:", "")

if st.button('Generate Caption'):
    result_text = ""
    if input_type in ["Upload Image", "Image URL"] and image is not None:
        with st.spinner("Generating caption..."):
            # The captioning pipeline accepts a PIL image directly, which
            # covers both the upload and URL cases
            result = image_to_text(image)
            result_text = result[0]['generated_text'] if result else "Failed to generate caption."
    elif input_type == "Text" and text_input:
        result_text = text_input
    else:
        st.warning("Please upload an image, enter an image URL, or input text.")

    if result_text:
        st.success(f'Generated Caption: {result_text}')
        # get_embeddings already returns a normalized float tensor, so it can
        # be passed to the retrieval function without a numpy round-trip
        query_embedding = get_embeddings([result_text])
        similar_reviews = find_top_n_similar_reviews(
            query_embedding,
            torch.tensor(review_embeddings).float(),
            review_texts,
        )
        st.write("Similar Reviews Based on the Caption:")
        for review in similar_reviews:
            st.write(review)
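# ---------------------------------------------------------------------------
# Running the app: a sketch, assuming this file is saved as app.py and that
# toys_and_games_reviews.csv (with a 'reviewText' column) sits next to it:
#
#   pip install streamlit transformers torch pandas pillow requests
#   streamlit run app.py
#
# The first run downloads both models from the Hugging Face Hub and embeds the
# whole review corpus, so expect a noticeable warm-up before the UI responds.
# ---------------------------------------------------------------------------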