Spaces:
Sleeping
Sleeping
import os | |
from PIL import Image, ImageDraw, ImageFont | |
import gradio as gr | |
from helper import load_image_from_url, render_results_in_image | |
from helper import summarize_predictions_natural_language | |
from transformers import pipeline | |
from tokenizers import Tokenizer, Encoding | |
from tokenizers import decoders | |
from tokenizers import models | |
from tokenizers import normalizers | |
from tokenizers import pre_tokenizers | |
from tokenizers import processors | |
import matplotlib.pyplot as plt | |
import requests | |
import inflect | |
from predictions import get_predictions | |
from helper import ignore_warnings | |
ignore_warnings() | |
from transformers.utils import logging | |
logging.set_verbosity_error() | |
# Object-detection pipeline; called on the input image in
# get_pipeline_prediction.
od_pipe = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline; called on the text summary of the detections.
tts_pipe = pipeline(task="text-to-speech", model="kakao-enterprise/vits-ljs")
def get_pipeline_prediction(pil_image):
    """Detect objects in an image and narrate the detections as speech.

    Runs ``od_pipe`` on the image, draws the detections with
    ``render_results_in_image``, converts them to a sentence with
    ``summarize_predictions_natural_language`` and synthesizes that sentence
    with ``tts_pipe``.

    Args:
        pil_image: Image to analyze (presumably a PIL image, going by the
            name -- confirm against callers), or ``None`` when no image was
            supplied.

    Returns:
        A ``(processed_image, (sampling_rate, audio))`` tuple. The inner
        tuple is the ``(rate, data)`` form used by the ``type="numpy"``
        audio output declared in this file. ``(None, None)`` is returned
        when ``pil_image`` is ``None``.
    """
    # Gradio hands the callback None when the form is submitted without an
    # image; bail out early instead of letting the detector raise on it.
    if pil_image is None:
        return None, None

    pipeline_output = od_pipe(pil_image)
    processed_image = render_results_in_image(pil_image, pipeline_output)

    text = summarize_predictions_natural_language(pipeline_output)
    print(text)

    narrated_text = tts_pipe(text)
    sampling_rate = narrated_text["sampling_rate"]
    # First entry of the pipeline's audio output -- presumably the waveform
    # for the single input sentence; confirm the output shape.
    audio = narrated_text["audio"][0]

    # Debug output retained from the original implementation.
    print(audio)
    print(sampling_rate)

    return processed_image, (sampling_rate, audio)
# Input: a single image, delivered to the handler as a PIL image.
image_input = gr.Image(label="Input image", type="pil")

# Outputs: the annotated image plus an auto-playing spoken narration.
annotated_output = gr.Image(
    label="Output image with predicted instances",
    type="pil",
)
narration_output = gr.Audio(label="Narration", type="numpy", autoplay=True)

# NOTE(review): the handler is get_predictions imported from the predictions
# module; the get_pipeline_prediction function defined in this file is not
# referenced here -- confirm which one is intended.
demo = gr.Interface(
    fn=get_predictions,
    inputs=image_input,
    outputs=[annotated_output, narration_output],
)

# Listen on all interfaces, port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)