Spaces:
Running
Running
File size: 4,100 Bytes
d7750ec f96d4eb d7750ec f96d4eb d7750ec f96d4eb d7750ec 8c7b6a7 d7750ec a2c75f4 d7750ec 76486da d7750ec a2c75f4 d7750ec f96d4eb d7750ec cf236b2 d7750ec cf236b2 d7750ec cf236b2 d7750ec f96d4eb d7750ec a2c75f4 d7750ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import scipy.io.wavfile as wavfile
# Use a pipeline as a high-level helper
from transformers import pipeline
# Both models are fetched by hub id. For offline use, point ``model`` at a
# local snapshot directory instead, e.g.
# "../Models/models--facebook--detr-resnet-50/snapshots"
# "/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b".

# Object-detection pipeline used to locate objects in the uploaded image.
object_detector = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
)

# Text-to-speech pipeline used to narrate the detection summary.
narrator = pipeline(
    task="text-to-speech",
    model="kakao-enterprise/vits-ljs",
)
# Define the function to generate audio from text
def generate_audio(text):
    """Narrate ``text`` with the module-level ``narrator`` and save it as WAV.

    The narrator result is indexed by ``"sampling_rate"`` and ``"audio"``;
    the first entry of ``"audio"`` is what gets written to disk.

    Args:
        text: Sentence to be spoken.

    Returns:
        The path of the written file, always ``"output.wav"`` (each call
        overwrites the previous narration).
    """
    speech = narrator(text)
    sample_rate = speech["sampling_rate"]
    waveform = speech["audio"][0]
    wavfile.write("output.wav", rate=sample_rate, data=waveform)
    return "output.wav"
def read_objects(detection_objects):
    """Build a one-sentence English summary of the detected objects.

    Args:
        detection_objects: Iterable of detection dicts; only the ``'label'``
            entry of each dict is read.

    Returns:
        A sentence such as ``"This picture contains 2 cats and 1 dog."``.
        With no detections the sentence states that nothing was recognised
        instead of stopping right after "contains".
    """
    from collections import Counter

    # Counter keeps first-seen order, so labels are listed in the order the
    # detections were supplied.
    object_counts = Counter(detection['label'] for detection in detection_objects)
    if not object_counts:
        return "This picture contains no recognizable objects."

    # NOTE: pluralisation is a plain "s" suffix, exactly as before.
    phrases = [
        f"{count} {label}s" if count > 1 else f"{count} {label}"
        for label, count in object_counts.items()
    ]

    # "a", "a and b", "a, b and c": commas between all but the last pair.
    if len(phrases) == 1:
        listing = phrases[0]
    else:
        listing = ", ".join(phrases[:-1]) + " and " + phrases[-1]
    return f"This picture contains {listing}."
def draw_bounding_boxes(image, detections, font_path=None, font_size=50):
    """Return a copy of ``image`` with every detection outlined and captioned.

    Args:
        image: PIL image to annotate; it is copied and never modified.
        detections: Iterable of dicts, each providing a ``'box'`` mapping
            with ``xmin``/``ymin``/``xmax``/``ymax``, a ``'label'`` and a
            ``'score'``.
        font_path: Optional path to a TrueType font file for the captions.
        font_size: Caption size. Used with ``font_path``, and also for the
            built-in font when the installed Pillow can scale it.

    Returns:
        The annotated copy of ``image``.
    """
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    if font_path:
        font = ImageFont.truetype(font_path, font_size)
    else:
        # Newer Pillow releases can scale the built-in font; older ones
        # reject the ``size`` argument (TypeError), and builds without
        # FreeType cannot honour it, so fall back to the fixed-size default.
        try:
            font = ImageFont.load_default(size=font_size)
        except (TypeError, OSError, ImportError):
            font = ImageFont.load_default()

    for detection in detections:
        box = detection['box']
        top_left = (box['xmin'], box['ymin'])
        bottom_right = (box['xmax'], box['ymax'])

        # Outline the detected object.
        draw.rectangle([top_left, bottom_right], outline="red", width=5)

        # Caption with a filled background so it stays readable. Measure
        # with the same font that is used for drawing so the two line up.
        caption = f"{detection['label']} {detection['score']:.2f}"
        left, top, right, bottom = draw.textbbox(top_left, caption, font=font)
        draw.rectangle([(left, top), (right, bottom)], fill="red")
        draw.text(top_left, caption, fill="white", font=font)

    return annotated
def detect_object(image):
    """Detect objects in ``image`` and produce its annotated and spoken forms.

    Args:
        image: The image passed to ``object_detector`` and to
            ``draw_bounding_boxes``.

    Returns:
        A ``(annotated_image, audio_path)`` tuple: the image with bounding
        boxes drawn on it and the path of the narrated summary.
    """
    detections = object_detector(image)
    annotated_image = draw_bounding_boxes(image, detections)
    summary = read_objects(detections)
    audio_path = generate_audio(summary)
    return annotated_image, audio_path
# Sample inputs offered in the UI; one single-element row per example image.
examples = [
    ["example1.jpg"],
    ["example2.jpg"],
]

demo = gr.Interface(
    fn=detect_object,
    inputs=[gr.Image(label="Select Image", type="pil")],
    theme='freddyaboulton/dracula_revamped',
    # detect_object returns (annotated image, audio file path), so each
    # value needs its own output component; without the Audio component
    # the narration promised in the description is never presented.
    outputs=[
        gr.Image(label="Processed Image", type="pil"),
        gr.Audio(label="Generated Audio"),
    ],
    examples=examples,
    title="Object Detector",
    description="Detect objects in the input image with bounding boxes with audio description.",
)
demo.launch()
|