import gradio as gr
from transformers import pipeline
# Caption the input image with BLIP, then narrate the caption with VITS.
img_text_pipe = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-base")
narrator = pipeline("text-to-speech",
                    model="kakao-enterprise/vits-ljs")
def describe_image(image):
    # gr.Image(type="pil") hands us a PIL image, which the
    # image-to-text pipeline accepts directly.
    caption_output = img_text_pipe(image)
    description_text = caption_output[0]["generated_text"]
    print(description_text)
    # The TTS pipeline returns a dict with "audio" (shape (1, n_samples))
    # and "sampling_rate"; gr.Audio with type="numpy" expects a
    # (sampling_rate, 1-D waveform) tuple.
    narrated_text = narrator(description_text)
    return (narrated_text["sampling_rate"], narrated_text["audio"][0])
iface = gr.Interface(fn=describe_image,
                     inputs=gr.Image(label="Input image", type="pil"),
                     outputs=gr.Audio(label="Narration", type="numpy",
                                      autoplay=True))
iface.launch()
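# Note (assumption, not stated in the original): running this script requires
# gradio, transformers, and torch to be installed. The kakao-enterprise/vits-ljs
# checkpoint may additionally need the phonemizer package (with an espeak
# backend) for text normalization; if loading the narrator pipeline fails,
# try: pip install phonemizer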