import streamlit as st
from transformers import pipeline
#from diffusers import DiffusionPipeline
from PIL import Image
import requests
import io
from io import BytesIO

@st.cache_resource
def _load_pipeline(task, model):
    """Build a Hugging Face pipeline once and reuse it across Streamlit reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, each model below would be re-instantiated on every click.

    Args:
        task: Pipeline task name passed to ``transformers.pipeline``.
        model: Model identifier passed as the ``model`` argument.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(task, model=model)


# Load the image-to-text pipeline
image_to_text = _load_pipeline("image-to-text", "nlpconnect/vit-gpt2-image-captioning")

# Load the text mask pipeline
generate_mask = _load_pipeline("fill-mask", "google-bert/bert-base-uncased")

# Load the text generation pipeline
extend_text = _load_pipeline("text-generation", "pranavpsv/genre-story-generator-v2")

# Load the text-to-image model
# NOTE: currently disabled; generate_image() refers to `text_to_image`, so it
# cannot run until this line (and the diffusers import) is re-enabled.
#text_to_image = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")

def _render_recommendation(caption):
    """Generate the follow-up text for *caption* and show both in the app."""
    generated_text = generate_mask_from_result(caption)
    st.success(f'Generated Caption: {caption}')
    st.success(f'Generated Text: {generated_text}')


def main():
    """Render the Streamlit page and react to the user's input.

    The user chooses between typing text directly and supplying an image URL.
    For a URL, the image is captioned first; in both cases the resulting text
    is passed to ``generate_mask_from_result`` and the caption plus generated
    text are displayed. Empty input and image-loading failures are reported
    in the page instead of being ignored or shown as a raw traceback.
    """
    st.title("SmartCart (Product Recommender)")

    # User input for text or URL
    input_option = st.radio("Select input option:", ("Text", "URL"))

    # Input text
    if input_option == "Text":
        text_input = st.text_input("Enter the text:").strip()
        if st.button("Generate Story and Image"):
            if text_input:
                _render_recommendation(text_input)
            else:
                st.warning("Please enter some text first.")

    # Input URL
    elif input_option == "URL":
        image_url = st.text_input("Enter the image URL:").strip()
        if st.button("Generate Story and Image"):
            if not image_url:
                st.warning("Please enter an image URL first.")
                return
            try:
                image_text = image_to_text_from_url(image_url)
            except (requests.RequestException, OSError) as err:
                # RequestException: bad URL / network / HTTP failure.
                # OSError: the downloaded bytes could not be decoded as an image.
                st.error(f"Could not load an image from that URL: {err}")
                return
            _render_recommendation(image_text)
            

def image_to_text_from_file(uploaded_file):
    """Caption an image supplied as a readable binary file object.

    Args:
        uploaded_file: Object with a ``read()`` method returning the encoded
            image bytes.

    Returns:
        The ``generated_text`` of the first caption produced by the
        image-to-text pipeline.

    Raises:
        OSError: If the bytes cannot be decoded as an image by Pillow.
    """
    # The pipeline expects a path/URL string or a PIL image, not a raw byte
    # buffer, so decode the bytes first (same as image_to_text_from_url).
    image = Image.open(io.BytesIO(uploaded_file.read()))
    return image_to_text(image)[0]['generated_text']

def image_to_text_from_url(image_url, timeout=10):
    """Download the image at *image_url* and return a caption for it.

    Args:
        image_url: HTTP(S) URL of the image to caption.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The ``generated_text`` of the first caption produced by the
        image-to-text pipeline.

    Raises:
        requests.RequestException: If the URL is invalid, the request fails
            or times out, or the server answers with an error status.
        OSError: If the response body cannot be decoded as an image by Pillow.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than trying to decode an error page as an image.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    return image_to_text(image)[0]['generated_text']

def generate_image(text):
    """Generate an image for *text* and display it in the Streamlit page.

    The text-to-image model is looked up under the module-level name
    ``text_to_image``. If that name is not defined (the model is not loaded),
    a warning is shown and nothing is generated instead of raising
    ``NameError``.

    Args:
        text: Product description inserted into the image prompt.
    """
    model = globals().get("text_to_image")
    if model is None:
        st.warning("Image generation is unavailable: the text-to-image model is not loaded.")
        return

    # NOTE(review): the literal "[MASK]" looks carried over from the fill-mask
    # prompt and is probably not meaningful to an image model - confirm.
    rephrased_text = "I want to buy " + text + " and [MASK] for my children"
    output = model(rephrased_text)

    # Diffusers pipelines return an output object whose `.images` list holds
    # the generated images; fall back to the raw result for callables that
    # return an image directly.
    images = getattr(output, "images", None)
    generated_image = images[0] if images else output
    st.image(generated_image, caption="Generated Image", use_column_width=True)

def generate_mask_from_result(text):
    """Suggest a companion item for *text* and extend both into a short story.

    The fill-mask pipeline predicts a word for the ``[MASK]`` slot in a
    shopping sentence containing *text*. The highest-ranked prediction that
    is not simply *text* again is used as the suggestion, and the
    text-generation pipeline then continues a prompt built from both.

    Args:
        text: Item description (typed by the user or produced by captioning).

    Returns:
        The ``generated_text`` of the first text-generation result.
    """
    candidates = generate_mask(
        f"I want to buy 2 toys for my children. I will buy {text} and [MASK]."
    )

    # Compare case- and whitespace-insensitively: the uncased model emits
    # lowercase tokens, so an exact match would miss e.g. "Lego" vs "lego".
    wanted = text.strip().casefold()
    result = None
    for candidate in candidates or ():
        token = candidate['token_str'].strip()
        if token and token.casefold() != wanted:
            result = token
            break

    # With no usable suggestion, leave out the "and ..." clause instead of
    # feeding the literal string "None" into the story prompt.
    if result is None:
        prompt = f"A child with {text} "
    else:
        prompt = f"A child with {text} and {result} "

    extended_text = extend_text(prompt)
    return extended_text[0]['generated_text']


# Script entry point: build the page via main() when this file is run directly.
if __name__ == "__main__":
    main()