#############################################################################################################################
# Filename   : app.py
# Description: A Streamlit application to turn an image into an audio story.
# Author     : Georgios Ioannou
#
# Copyright © 2024 by Georgios Ioannou
#############################################################################################################################

# Import libraries.
import os  # Load environment variable(s).
import requests  # Send HTTP POST requests to Hugging Face models for inference.
import streamlit as st  # Build the GUI of the application.

from langchain.chat_models import ChatOpenAI  # Access to the OpenAI gpt-3.5-turbo model.
from langchain.chains import LLMChain  # Chain to run queries against LLMs.

# A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model.
from langchain.prompts import PromptTemplate
from transformers import pipeline  # Access to Hugging Face models.


#############################################################################################################################
# Load environment variable(s).
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")


#############################################################################################################################
# Function to apply local CSS.
def local_css(file_name):
    with open(file_name) as f:
        # Inject the stylesheet into the page by wrapping it in a <style> tag.
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


#############################################################################################################################
# Return the text generated by the model for the image.
# Using pipeline.
def img_to_text(image_path):
    # https://huggingface.co/tasks
    # Task used here : "image-to-text".
    # Model used here: "Salesforce/blip-image-captioning-base".
    # Backup model   : "nlpconnect/vit-gpt2-image-captioning".
    # Backup model   : "Salesforce/blip-image-captioning-large".
    image_to_text = pipeline(
        "image-to-text", model="Salesforce/blip-image-captioning-base"
    )
    # image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
    # image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

    # The pipeline returns a list of dictionaries; keep the caption of the first (and only) result.
    scenario = image_to_text(image_path)[0]["generated_text"]

    return scenario
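
# For reference, a standalone call would look like the sketch below. The path "photo.jpg" is hypothetical,
# and the caption is only illustrative -- the actual text depends on the image:
#
#   caption = img_to_text("photo.jpg")
#   print(caption)  # e.g. "a dog running through a field"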

#############################################################################################################################
# Return the story generated by the model for the scenario.
# Using LangChain.
def generate_story(scenario, personality):
    # Model used here: "gpt-3.5-turbo".

    # The template can be customized to meet one's needs, such as:
    # generate a story or generate the lyrics of a song.
    template = """
    You are a storyteller.
    You must sound like {personality}.
    The story should be less than 50 words.

    Generate a story based on the above constraints and the following scenario: {scenario}.
    """

    prompt = PromptTemplate(
        template=template, input_variables=["scenario", "personality"]
    )

    story_llm = LLMChain(
        llm=ChatOpenAI(
            model_name="gpt-3.5-turbo", temperature=0
        ),  # Increasing the temperature makes the model more creative but slows down inference.
        prompt=prompt,
        verbose=True,  # Print intermediate values to the console.
    )

    story = story_llm.predict(
        scenario=scenario, personality=personality
    )  # Format the prompt with the kwargs and pass it to the LLM.

    return story
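
# For reference, the chain simply fills in the template and sends it to the chat model. With hypothetical
# inputs, the rendered prompt can be inspected without calling the LLM:
#
#   prompt.format(scenario="a dog running through a field", personality="Abraham Lincoln")
#
# Note that ChatOpenAI reads the OPENAI_API_KEY environment variable, which must be set for this chain to run.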

#############################################################################################################################
# Return the speech generated by the model for the story.
# Using the Inference API.
def text_to_speech(story):
    # Model used here: "espnet/kan-bayashi_ljspeech_vits".
    # Backup model   : "facebook/mms-tts-eng".
    API_URL = (
        "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    )
    # API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payload = {"inputs": story}

    # POST the story to the Inference API and write the returned audio bytes to disk.
    response = requests.post(API_URL, headers=headers, json=payload)
    with open("audio.flac", "wb") as file:
        file.write(response.content)
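
# Note: on success the Inference API returns raw FLAC bytes, but on failure (for example, while the model
# is still loading) it returns a JSON error body instead. A defensive variant of the call above might add:
#
#   if not response.ok:
#       raise RuntimeError(f"Inference API error {response.status_code}: {response.text}")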

#############################################################################################################################
# Main function to create the Streamlit web application.
def main():
    try:
        # Page title and favicon.
        st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️")

        # Load CSS.
        local_css("styles/style.css")

        # Title. Minimal centered-heading markup; the detailed styling comes from styles/style.css.
        title = f"""
        <h1 align="center">Turn Image to Audio Story</h1>
        """
        st.markdown(title, unsafe_allow_html=True)

        # Subtitle.
        title = f"""
        <h3 align="center">CUNY Tech Prep Tutorial 1</h3>
        """
        st.markdown(title, unsafe_allow_html=True)
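
        # Layout note: st.columns(3) below is a common Streamlit centering trick -- content placed in the
        # middle of three equal-width columns renders horizontally centered on the page.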
        # Image.
        image = "./ctp.png"
        left_co, cent_co, last_co = st.columns(3)
        with cent_co:
            st.image(image=image)

        # Define the personalities for the dropdown menu.
        personalities = [
            "Donald Trump",
            "Abraham Lincoln",
            "Aristotle",
            "Cardi B",
            "Kanye West",
        ]
        personality = st.selectbox("Select a personality:", personalities)

        # Upload an image.
        uploaded_file = st.file_uploader("Choose an image:")

        if uploaded_file is not None:
            # Save the uploaded image to disk and display it.
            bytes_data = uploaded_file.getvalue()
            with open(uploaded_file.name, "wb") as file:
                file.write(bytes_data)
            st.image(uploaded_file, caption="Uploaded Image.", use_column_width=True)

            # Spinner to keep the application interactive during model inference.
            with st.spinner(text="Model Inference..."):
                scenario = img_to_text(uploaded_file.name)
                story = generate_story(scenario=scenario, personality=personality)
                text_to_speech(story)

            # Display the scenario and story.
            with st.expander("Scenario"):
                st.write(scenario)
            with st.expander("Story"):
                st.write(story)

            # Display the audio.
            st.audio("audio.flac")
    except Exception as e:
        # Display any errors.
        st.error(e)

    # GitHub repository of author.
    st.markdown(
        f"""
        <p align="center">
            <!-- "#" is a placeholder; point the link at the project repository. -->
            <a href="#">Check out our GitHub repository</a>
        </p>
        """,
        unsafe_allow_html=True,
    )


#############################################################################################################################
if __name__ == "__main__":
    main()