"""Streamlit app: caption an image (or take a typed sentence) and extend it with GPT-2.

The user supplies an uploaded image, an image URL, or a typed sentence. Images
are captioned with an image-to-text pipeline; the resulting text (or the typed
sentence) is then used as the prompt for GPT-2 text generation.
"""

import io
from io import BytesIO

import requests
import streamlit as st
import torch
from PIL import Image
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForCausalLM

CAPTION_MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
TEXT_MODEL_NAME = 'jim33282007/5240_grp27_proj'
GENERATION_MODEL_NAME = 'gpt2-xl'

# Seconds to wait for the remote server before giving up on an image download.
REQUEST_TIMEOUT_SECONDS = 10
# Default total length (prompt + continuation, in tokens) of the generated text.
GENERATION_MAX_LENGTH = 100
# Minimum number of tokens GPT-2 is always allowed to add to the prompt.
MIN_NEW_TOKENS = 20

# Streamlit re-executes this script on every widget interaction, so model
# loading must be cached or the multi-gigabyte checkpoints are rebuilt each
# time. Newer Streamlit releases provide st.cache_resource; older ones expose
# st.experimental_singleton. If neither exists, fall back to no caching.
_cache_resource = (
    getattr(st, "cache_resource", None)
    or getattr(st, "experimental_singleton", None)
    or (lambda func: func)
)


# Function to perform mean pooling on the model outputs
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Mapping with a 'last_hidden_state' tensor of token
            embeddings (presumably (batch, seq_len, hidden) -- confirm
            against the model in use).
        attention_mask: Tensor that is unsqueezed on its last axis and
            expanded to the embedding shape; positions with value 0 do not
            contribute to the average.

    Returns:
        Tensor of embeddings summed over dimension 1 and divided by the
        summed mask.
    """
    token_embeddings = model_output['last_hidden_state']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp so a fully masked sequence does not cause a division by zero.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


@_cache_resource
def _load_captioner():
    """Build the image-to-text pipeline once and reuse it across reruns."""
    return pipeline("image-to-text", model=CAPTION_MODEL_NAME)


@_cache_resource
def _load_text_encoder():
    """Load the tokenizer and model used for text processing."""
    return (
        AutoTokenizer.from_pretrained(TEXT_MODEL_NAME),
        AutoModel.from_pretrained(TEXT_MODEL_NAME),
    )


@_cache_resource
def _load_generator():
    """Load the tokenizer and causal language model used for text generation."""
    return (
        AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME),
        AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME),
    )


# Initialize the pipeline for image-to-text
image_to_text = _load_captioner()

# Initialize tokenizer and model for text processing
tokenizer_text, model_text = _load_text_encoder()

# Initialize a text generation model
tokenizer_gpt2, model_gpt2 = _load_generator()

st.title('Image Captioning, Text Embedding, Text Generation, and Input Application')


# Function to load images from URL
def load_image_from_url(url):
    """Download and open the image at ``url``.

    Returns the opened PIL image, or None after showing an error in the UI
    when the download or decoding fails.
    """
    # Broad catch is deliberate: this is a UI boundary and any failure
    # (network, HTTP status, undecodable bytes) is reported to the user.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Without this an HTTP error page would be handed to PIL as "image" bytes.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception as e:
        st.error(f"Error loading image from URL: {e}")
        return None


def _generate_continuation(prompt):
    """Return GPT-2's continuation of ``prompt`` (the prompt text is included)."""
    context_limit = model_gpt2.config.max_position_embeddings
    # Truncate very long prompts so prompt + continuation fits the model context.
    encoded = tokenizer_gpt2(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=context_limit - MIN_NEW_TOKENS,
    )
    prompt_length = encoded['input_ids'].shape[1]
    # max_length counts prompt tokens too; make sure there is always room to
    # generate something even when the prompt alone exceeds the default budget.
    total_length = min(
        context_limit,
        max(GENERATION_MAX_LENGTH, prompt_length + MIN_NEW_TOKENS),
    )
    generated_outputs = model_gpt2.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=total_length,
        num_return_sequences=1,
        # GPT-2 has no pad token; reuse EOS so generate() does not have to guess.
        pad_token_id=tokenizer_gpt2.eos_token_id,
    )
    return tokenizer_gpt2.decode(generated_outputs[0], skip_special_tokens=True)


# User option to select input type: Upload, URL, or Type Sentence
input_type = st.radio("Select input type:", ("Upload Image", "Image URL", "Type Sentence"))

image = None
typed_text = ""

if input_type == "Upload Image":
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        try:
            image = Image.open(io.BytesIO(uploaded_file.getvalue()))
        except Exception as e:
            # A corrupt or mislabelled upload should not crash the page.
            st.error(f"Error loading uploaded image: {e}")
            image = None
        if image is not None:
            st.image(image, caption='Uploaded Image', use_column_width=True)
elif input_type == "Image URL":
    image_url = st.text_input("Enter the image URL here:", "")
    if image_url:
        image = load_image_from_url(image_url)
        if image is not None:
            st.image(image, caption='Image from URL', use_column_width=True)
elif input_type == "Type Sentence":
    typed_text = st.text_area("Type your sentence here:")

# Generate caption and process text button
if st.button('Generate Caption and Process Text'):
    has_text = bool(typed_text.strip())
    if image is None and not has_text:
        st.error("Please upload an image, enter an image URL, or type a sentence first.")
    else:
        with st.spinner("Processing..."):
            if image is not None:
                # Caption the image already in memory; this covers both the
                # upload and the URL case without downloading the URL again.
                result = image_to_text(image)
                generated_text_p1 = result[0]['generated_text']
            else:
                generated_text_p1 = typed_text

            if generated_text_p1:
                st.success(f'Processed Text: {generated_text_p1}')

                # Generate additional text using GPT-2 based on the processed text
                generated_text = _generate_continuation(generated_text_p1)

                st.text_area("Generated Text:", generated_text, height=200)