############################################################################################################################# # Filename : app.py # Description: A Streamlit application to turn an image to audio story. # Author : Georgios Ioannou # # Copyright © 2024 by Georgios Ioannou ############################################################################################################################# # Import libraries. import os # Load environment variable(s). import requests # Send HTTP GET request to Hugging Face models for inference. import streamlit as st # Build the GUI of the application. from langchain.chat_models import ChatOpenAI # Access to OpenAI gpt-3.5-turbo model. from langchain.chains import LLMChain # Chain to run queries against LLMs. # A prompt template. It accepts a set of parameters from the user that can be used to generate a prompt for a language model. from langchain.prompts import PromptTemplate from transformers import pipeline # Access to Hugging Face models. ############################################################################################################################# # Load environment variable(s). HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN") ############################################################################################################################# # Function to apply local CSS. def local_css(file_name): with open(file_name) as f: st.markdown(f"", unsafe_allow_html=True) ############################################################################################################################# # Return the text generated by the model for the image. # Using pipeline. def img_to_text(image_path): # https://huggingface.co/tasks # Task used here : "image-to-text". # Model used here: "Salesforce/blip-image-captioning-base". # Backup model: "nlpconnect/vit-gpt2-image-captioning". image_to_text = pipeline( "image-to-text", model="Salesforce/blip-image-captioning-base" ) # image_to_text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning") scenario = image_to_text(image_path)[0]["generated_text"] return scenario ############################################################################################################################# # Return the story generated by the model for the scenario. # Using Langchain. def generate_story(scenario, personality): # Model used here: "gpt-3.5-turbo". # The template can be customized to meet one's needs such as: # Generate a story and generate lyrics of a song. template = """ You are a story teller. You must sound like {personality}. The story should be less than 50 words. Generate a story based on the above constraints and the following scenario: {scenario}. """ prompt = PromptTemplate( template=template, input_variables=["scenario", "personality"] ) story_llm = LLMChain( llm=ChatOpenAI( model_name="gpt-3.5-turbo", temperature=0 ), # Increasing the temperature, the model becomes more creative and takes longer for inference. prompt=prompt, verbose=True, # Print intermediate values to the console. ) story = story_llm.predict( scenario=scenario, personality=personality ) # Format prompt with kwargs and pass to LLM. return story ############################################################################################################################# # Return the speech generated by the model for the story. # Using inference api. def text_to_speech(story): # Model used here: "espnet/kan-bayashi_ljspeech_vits. # Backup model: "facebook/mms-tts-eng". API_URL = ( "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits" ) # API_URL = "https://api-inference.huggingface.co/models/facebook/mms-tts-eng" headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"} payload = {"inputs": story} response = requests.post(API_URL, headers=headers, json=payload) with open("audio.flac", "wb") as file: file.write(response.content) ############################################################################################################################# # Main function to create the Streamlit web application. def main(): try: # Page title and favicon. st.set_page_config(page_title="Image To Audio Story", page_icon="🖼️") # Load CSS. local_css("styles/style.css") # Title. title = f"""