# Use a pipeline as a high-level helper
import torch
from transformers import pipeline
from scipy.io import wavfile
from PIL import Image
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"

# Image-to-text pipeline produces a caption; text-to-speech pipeline narrates it
image_pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

def generate_audio(text):
    # Generate speech from the caption text; the pipeline returns a dict with
    # an "audio" array of shape (1, num_samples) and its "sampling_rate"
    narrated = narrator(text)
    # Save the audio to a WAV file and return its path for Gradio to play
    wavfile.write(filename="audio.wav", rate=narrated["sampling_rate"], data=narrated["audio"][0])
    return "audio.wav"

def caption_my_image(image):
    # Caption the image (Gradio passes a PIL image), then narrate the caption
    captions = image_pipe(image)
    caption_text = captions[0]["generated_text"]
    return generate_audio(caption_text)

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Image", type="pil")],
    outputs=[gr.Audio(label="Image Caption")],
    title="@SmartChoiceLearningHub HF Project 1: Image to Text to Speech",
    description="This app generates a caption for an image and converts the caption to speech.",
)
demo.launch()
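# Optional local smoke test before launching the UI -- a minimal sketch, not part
# of the app. It assumes a local image file named "example.jpg" (the filename is
# illustrative) and that the pipeline dependencies (transformers, torch, scipy,
# gradio, and, for the VITS checkpoint, possibly phonemizer) are installed:
#
#   img = Image.open("example.jpg")
#   wav_path = caption_my_image(img)
#   print(wav_path)  # -> "audio.wav", playable with any audio player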