File size: 1,513 Bytes
e585f50
f42c65b
536a548
f42c65b
 
e585f50
 
 
536a548
c545edf
 
 
536a548
f42c65b
 
 
 
 
 
536a548
e585f50
536a548
e585f50
536a548
e585f50
 
f42c65b
 
 
 
 
 
 
 
 
 
 
e585f50
 
 
 
 
f42c65b
 
 
 
e585f50
 
f42c65b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import os
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
from helper import load_image_from_url, render_results_in_image 
from helper import summarize_predictions_natural_language
from transformers import pipeline
# NOTE: `logging` below is the transformers logging utility, not the
# standard-library `logging` module; the name shadows the stdlib one here.
from transformers.utils import logging
# Restrict transformers' own log output to errors.
logging.set_verbosity_error()

from helper import ignore_warnings
# NOTE(review): presumably silences Python warnings for the session; the
# helper is project-local, so confirm its exact effect in helper.py.
ignore_warnings()




# Object detector used to locate instances in the uploaded image.
od_pipe = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Speech synthesizer used to read the detection summary aloud.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")


def get_pipeline_prediction(pil_image):
    """Detect objects in an image and narrate the detections as speech.

    The image is run through the module-level object-detection pipeline,
    the detections are drawn onto the image, summarized as text, and that
    text is passed to the module-level text-to-speech pipeline.

    Args:
        pil_image: The input image as delivered by the Gradio image input
            (configured with ``type="pil"``), or ``None`` when the request
            was submitted without an image.

    Returns:
        A ``(processed_image, (sampling_rate, waveform))`` tuple, one entry
        per output component of the interface. ``(None, None)`` is returned
        when no image was supplied, which leaves both outputs empty.
    """
    # Gradio passes None when the form is submitted without an image; the
    # detection pipeline cannot handle that, so clear both outputs instead
    # of failing with an opaque error.
    if pil_image is None:
        return None, None

    pipeline_output = od_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    text = summarize_predictions_natural_language(pipeline_output)
    print(text)

    narrated_text = tts_pipe(text)
    # Index 0 selects the first row of the returned audio array.
    # NOTE(review): assumes the TTS output is shaped (1, num_samples) -- confirm
    # against the installed transformers version.
    waveform = narrated_text["audio"][0]
    return processed_image, (narrated_text["sampling_rate"], waveform)


# Web UI: one image in; the annotated image and the spoken summary out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(type="pil", label="Input image"),
    outputs=[
        gr.Image(type="pil", label="Output image with predicted instances"),
        gr.Audio(type="numpy", label="Narration", autoplay=True),
    ],
)

# Listen on all network interfaces, port 7860.
demo.launch(server_port=7860, server_name="0.0.0.0")