import streamlit as st
from transformers import AutoTokenizer, AutoModel, pipeline as transformers_pipeline, AutoModelForCausalLM
from diffusers import DiffusionPipeline
import requests
from PIL import Image
import io
import torch
import torch.nn.functional as F
import pandas as pd


# Function for mean pooling of token embeddings, weighted by the attention mask
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Load model and tokenizer from the Hugging Face Hub for sentence embeddings
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


def load_image(input_type, uploaded_file=None, image_url=""):
    """
    Loads an image from an uploaded file or URL.
    """
    if input_type == "Upload Image" and uploaded_file is not None:
        return Image.open(io.BytesIO(uploaded_file.getvalue()))
    elif input_type == "Image URL" and image_url:
        try:
            response = requests.get(image_url)
            return Image.open(io.BytesIO(response.content))
        except Exception as e:
            st.error(f"Error loading image from URL: {e}")
    return None


def image_to_caption(image, input_type, uploaded_file, image_url):
    """
    Generates a caption for the given image.
    """
    image_to_text_pipeline = transformers_pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    if input_type == "Upload Image" and uploaded_file is not None:
        # The image-to-text pipeline expects a PIL image or a URL/path, not raw bytes,
        # so pass the already-loaded PIL image rather than uploaded_file.getvalue().
        return image_to_text_pipeline(image)[0]['generated_text']
    elif input_type == "Image URL" and image_url:
        return image_to_text_pipeline(image_url)[0]['generated_text']
    return ""


def select_closest_sentence(generated_text):
    """
    Selects the sentence closest in meaning to the generated_text.
    """
    # Load CSV data
    df = pd.read_csv('toys_and_games_reviews.csv', encoding='ISO-8859-1')
    sentences = df.iloc[:, -1].tolist()  # Assuming the last column contains sentences

    # Tokenize and compute embeddings for sentences from the CSV
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    # Tokenize and compute the embedding for the generated_text
    encoded_new_sentence = tokenizer([generated_text], padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output_new_sentence = model(**encoded_new_sentence)
    new_sentence_embedding = mean_pooling(model_output_new_sentence, encoded_new_sentence['attention_mask'])
    new_sentence_embedding = F.normalize(new_sentence_embedding, p=2, dim=1)

    # Find the most similar sentence in the corpus
    most_similar_idx = F.cosine_similarity(new_sentence_embedding, sentence_embeddings).topk(1).indices.item()
    most_similar_sentence = sentences[most_similar_idx]
    return most_similar_sentence


def generate_text_from_caption(caption):
    """
    Generates text based on the provided caption.
""" text_generator = transformers_pipeline('text-generation', model='pranavpsv/genre-story-generator-v2') generated = text_generator(caption, max_length=100, num_return_sequences=1) return generated[0]['generated_text'] def main(): st.title('Image to Story to Image Converter') # User interface for input selection input_type = st.radio("Select input type:", ("Upload Image", "Image URL")) uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"]) if input_type == "Upload Image" else None image_url = st.text_input("Enter the image URL