import os

import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
from transformers.utils import logging

from helper import (
    ignore_warnings,
    load_image_from_url,
    render_results_in_image,
    summarize_predictions_natural_language,
)

# Silence verbose transformers logging and helper warnings.
logging.set_verbosity_error()
ignore_warnings()

# Object-detection and text-to-speech pipelines.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def get_pipeline_prediction(pil_image):
    # Detect objects and draw the predicted boxes on the input image.
    pipeline_output = od_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    # Summarize the detections in natural language and narrate the summary.
    text = summarize_predictions_natural_language(pipeline_output)
    narrated_text = tts_pipe(text)

    # Gradio's Audio component (type="numpy") expects a
    # (sampling_rate, waveform) tuple.
    return processed_image, (
        narrated_text["sampling_rate"],
        narrated_text["audio"][0],
    )


demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=[
        gr.Image(label="Output image with predicted instances", type="pil"),
        gr.Audio(label="Narration", type="numpy", autoplay=True),
    ],
)

demo.launch(server_name="0.0.0.0", server_port=7860)
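
# A minimal sketch of exercising get_pipeline_prediction without the web UI.
# "sample.jpg" and "narration.wav" are hypothetical file names, not part of the
# original script; uncomment and adapt for a quick local check. Note that
# demo.launch() above blocks, so place any such check before it if you want it
# to run first.
#
# import numpy as np
# import scipy.io.wavfile
#
# test_image = Image.open("sample.jpg")
# annotated, (rate, audio) = get_pipeline_prediction(test_image)
# annotated.save("annotated.jpg")
# scipy.io.wavfile.write("narration.wav", rate, np.asarray(audio, dtype=np.float32))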