File size: 881 Bytes
a7ab7f4
 
 
 
 
7812e4e
a7ab7f4
 
7812e4e
a7ab7f4
 
 
 
 
 
9c6e277
a7ab7f4
 
9c6e277
 
a7ab7f4
9c6e277
 
a7ab7f4
 
 
 
 
9c6e277
a7ab7f4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import gradio as gr
from transformers import pipeline


# Image captioning model: turns the input image into a short text description.
img_text_pipe = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

# Speech synthesis model: reads the generated caption aloud.
narrator = pipeline(
    task="text-to-speech",
    model="kakao-enterprise/vits-ljs",
)

def describe_image(file_path):
    """Caption an image and return the caption as narrated audio.

    Args:
        file_path: The image to describe. Despite the name, the Gradio
            interface in this file is configured with ``type="pil"``, so
            the value passed in is the image itself and is forwarded
            unchanged to the captioning pipeline.

    Returns:
        A ``(sampling_rate, audio)`` tuple built from the text-to-speech
        pipeline output, the shape the ``gr.Audio`` output component is
        configured to receive with ``type="numpy"``.
    """
    captions = img_text_pipe(file_path)

    # The pipeline returns a list of candidates; use the first caption.
    description_text = captions[0]['generated_text']
    print(description_text)

    narrated_text = narrator(description_text)

    # "audio" is indexed with [0] to get a single waveform
    # (presumably the leading axis is batch/channel -- confirm against the
    # text-to-speech pipeline's output shape).
    return (narrated_text["sampling_rate"], narrated_text["audio"][0])



# Web UI: the uploaded image is handed to describe_image as a PIL image,
# and the returned narration is played back automatically.
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=gr.Audio(label="Narration", type="numpy", autoplay=True),
)
iface.launch()