import streamlit as st
from transformers import pipeline
from huggingface_hub import InferenceClient
from PIL import Image
import os


def initialize():
    if 'initialized' not in st.session_state: 
        print("Initializing...")
        st.session_state['initialized'] = True
        st.session_state['api_key'] = os.getenv("HUGGINGFACE_TOKEN")
        st.session_state['client'] = InferenceClient(api_key=st.session_state['api_key'])


def main():
    initialize()
    st.header("Character Captions")
    st.write("Have a character caption any image you upload!")
    character = st.selectbox("Choose a character", ["artist", "elmo", "unintelligible", "goku"])

    uploaded_img = st.file_uploader("Upload an image here")

    if uploaded_img is not None:
        image = Image.open(uploaded_img)
        st.image(image)

        
        image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
        response = image_captioner(image)
        caption = response[0]['generated_text']


        character_prompts = {
            "artist": f"Describe this caption like you're a artist: {caption}.",
            "elmo": f"Describe this caption like you're elmo: {caption}.",
            "unintelligible": f"Describe this caption in a way that makes no sense: {caption}.",
            "goku": f"Describe this caption like you're goku: {caption}."
        }

        prompt = character_prompts[character]
        messages = [
            { "role": "user", "content": prompt }
        ]

        
        stream = st.session_state['client'].chat.completions.create(
            model="meta-llama/Llama-3.2-3B-Instruct", 
            messages=messages, 
            max_tokens=500,
            stream=True
        )

        response = ''
        for chunk in stream:
            response += chunk.choices[0].delta.content
        
        st.write(response)


if __name__ == '__main__':
    main()