Image_Whisper / app.py
chethu's picture
Update app.py
209f8c0 verified
raw
history blame
1.82 kB
import os
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
from helper import load_image_from_url, render_results_in_image
from helper import summarize_predictions_natural_language
from transformers import pipeline
from tokenizers import Tokenizer, Encoding
from tokenizers import decoders
from tokenizers import models
from tokenizers import normalizers
from tokenizers import pre_tokenizers
from tokenizers import processors
import matplotlib.pyplot as plt
import requests
import inflect
from predictions import get_predictions
from helper import ignore_warnings
# Project helper from helper.py; presumably silences Python warnings for the
# rest of the process -- confirm against its definition.
ignore_warnings()
# NOTE: from here on the name `logging` refers to transformers' logging
# utilities, not the standard-library `logging` module.
from transformers.utils import logging
# Restrict transformers' own log output to errors only.
logging.set_verbosity_error()
# Module-level pipelines, built once at import time and reused by
# get_pipeline_prediction below.

# Object detector used to find instances in the input image.
od_pipe = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Speech synthesizer used to narrate the textual detection summary.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")
def get_pipeline_prediction(pil_image):
    """Detect objects in an image and produce a spoken summary of them.

    The image is passed through the module-level ``od_pipe``; the detections
    are drawn with ``render_results_in_image`` and turned into a sentence by
    ``summarize_predictions_natural_language``, which is then fed to the
    module-level ``tts_pipe``.

    Args:
        pil_image: Image handed straight to ``od_pipe`` and to
            ``render_results_in_image`` (presumably a PIL image, going by
            the name -- confirm against the caller).

    Returns:
        A 2-tuple ``(annotated_image, (sampling_rate, waveform))`` where
        ``waveform`` is the first entry of the TTS output's ``"audio"``
        field and ``sampling_rate`` is its ``"sampling_rate"`` field.
    """
    detections = od_pipe(pil_image)
    annotated_image = render_results_in_image(pil_image, detections)

    summary = summarize_predictions_natural_language(detections)
    print(summary)

    speech = tts_pipe(summary)

    # Diagnostic output: echo the waveform and its sampling rate to stdout.
    waveform = speech["audio"][0]
    print(waveform)
    sampling_rate = speech["sampling_rate"]
    print(sampling_rate)

    return annotated_image, (sampling_rate, waveform)
# --- Gradio UI -------------------------------------------------------------
# One image goes in; an annotated image and an auto-playing narration come out.
input_image = gr.Image(label="Input image", type="pil")
output_image = gr.Image(
    label="Output image with predicted instances",
    type="pil",
)
narration_audio = gr.Audio(label="Narration", type="numpy", autoplay=True)

# NOTE(review): the handler is get_predictions, imported from the predictions
# module; get_pipeline_prediction defined in this file is not wired in --
# confirm that this is intended.
demo = gr.Interface(
    fn=get_predictions,
    inputs=input_image,
    outputs=[output_image, narration_audio],
)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)