Spaces:

basharat8763
/

P06_ObjectDetectorWithAudio

Sleeping

App Files Files Community

P06_ObjectDetectorWithAudio / app.py

basharat8763

Create app.py

1814a78 verified 22 days ago

raw

history blame contribute delete

4.49 kB

	import torch
	import gradio as gr
	from PIL import Image, ImageDraw, ImageFont
	import scipy.io.wavfile as wavfile

	# Use a pipeline as a high-level helper
	from transformers import pipeline


	# Both pipelines are built once, at module import time, and shared by every request.
	# Text-to-speech pipeline (model "kakao-enterprise/vits-ljs"); generate_audio() calls it.
	narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
	# Object-detection pipeline (model "facebook/detr-resnet-50"); detect_object() calls it.
	object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

	def generate_audio(text):
	    """
	    Narrate the given text with the module-level ``narrator`` pipeline and
	    save the result as a WAV file.

	    Each call writes to its own uniquely named temporary file, so two
	    requests handled at the same time can no longer overwrite each other's
	    audio (the previous implementation always wrote to "output.wav").

	    :param text: Text to convert to speech.
	    :return: Path (str) of the WAV file that was written.
	    """
	    # Local import keeps this change self-contained within the function.
	    import tempfile

	    # Generate the narrated text
	    narrated_text = narrator(text)

	    # Reserve a unique file name, then close the handle so that
	    # wavfile.write can reopen the path itself. delete=False because the
	    # caller still needs the file after this function returns.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
	        wav_path = wav_file.name

	    # "audio" is indexed with [0] exactly as before.
	    # NOTE(review): presumably the pipeline returns a leading batch/channel
	    # axis for this model -- confirm against the installed transformers version.
	    wavfile.write(
	        wav_path,
	        rate=narrated_text["sampling_rate"],
	        data=narrated_text["audio"][0],
	    )

	    # Return the path to the saved audio file
	    return wav_path


	# One shot prompting technique
	# Labels whose plural is spelled the same as the singular.
	_INVARIANT_PLURALS = frozenset({"sheep", "skis", "scissors", "broccoli"})
	# Plurals that the suffix rules in _pluralize() cannot produce.
	_IRREGULAR_PLURALS = {"person": "people", "mouse": "mice", "knife": "knives"}


	def _pluralize(label):
	    """Return the English plural of ``label`` using a few common spelling rules."""
	    if label in _INVARIANT_PLURALS:
	        return label
	    if label in _IRREGULAR_PLURALS:
	        return _IRREGULAR_PLURALS[label]
	    # bus -> buses, bench -> benches, toothbrush -> toothbrushes
	    if label.endswith(("s", "x", "z", "ch", "sh")):
	        return label + "es"
	    # consonant + y -> ies (vowel + y, e.g. "toy", just takes "s")
	    if len(label) > 1 and label.endswith("y") and label[-2] not in "aeiou":
	        return label[:-1] + "ies"
	    return label + "s"


	def read_objects(detection_objects):
	    """
	    Build a one-sentence summary of the detected objects.

	    Labels are counted and listed in order of first appearance, e.g.
	    "This picture contains 2 buses, 2 people and 1 bench."

	    :param detection_objects: Iterable of detection dictionaries; only the
	        'label' key of each dictionary is read.
	    :return: Summary sentence. When there are no detections the sentence
	        says so explicitly instead of ending after "contains".
	    :raises KeyError: If a detection has no 'label' key.
	    """
	    # Local import keeps this change self-contained within the function.
	    from collections import Counter

	    # Counter keeps first-seen order, so the sentence follows detection order.
	    object_counts = Counter(detection['label'] for detection in detection_objects)

	    if not object_counts:
	        return "This picture contains no recognizable objects."

	    phrases = [
	        f"{count} {_pluralize(label) if count > 1 else label}"
	        for label, count in object_counts.items()
	    ]

	    # "a", "a and b", "a, b and c" -- commas between all but the last two items.
	    if len(phrases) == 1:
	        listing = phrases[0]
	    else:
	        listing = ", ".join(phrases[:-1]) + " and " + phrases[-1]

	    return f"This picture contains {listing}."


	def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
	    """
	    Draws bounding boxes on a copy of the given image based on the detections.

	    Each detection gets a red outline plus a "<label> <score>" caption on a
	    red background at the box's top-left corner. The input image is not
	    modified.

	    :param image: PIL.Image object
	    :param detections: List of detection results, where each result is a dictionary containing
	                       'score', 'label', and 'box' keys. 'box' itself is a dictionary with 'xmin',
	                       'ymin', 'xmax', 'ymax'.
	    :param font_path: Path to the TrueType font file to use for text. When omitted,
	                      Pillow's built-in default font is used.
	    :param font_size: Size of the font to use for text. Also applied to the default
	                      font when the installed Pillow supports sizing it.
	    :return: PIL.Image object with bounding boxes drawn.
	    """
	    # Make a copy of the image to draw on
	    draw_image = image.copy()
	    draw = ImageDraw.Draw(draw_image)

	    # Load custom font, or the default font if no path was provided
	    if font_path:
	        font = ImageFont.truetype(font_path, font_size)
	    else:
	        try:
	            # Newer Pillow releases accept a size for the default font, so
	            # font_size is no longer silently ignored in this branch.
	            font = ImageFont.load_default(size=font_size)
	        except TypeError:
	            # Older Pillow: load_default() takes no arguments and the
	            # default font has a fixed size.
	            font = ImageFont.load_default()

	    for detection in detections:
	        box = detection['box']
	        xmin, ymin = box['xmin'], box['ymin']
	        xmax, ymax = box['xmax'], box['ymax']

	        # Draw the bounding box
	        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)

	        # Caption: label and score rounded to two decimals
	        text = f"{detection['label']} {detection['score']:.2f}"

	        # Measure with the same font that is used for drawing, so the
	        # background rectangle always matches the rendered text.
	        left, top, right, bottom = draw.textbbox((xmin, ymin), text, font=font)
	        draw.rectangle([(left, top), (right, bottom)], fill="red")
	        draw.text((xmin, ymin), text, fill="white", font=font)

	    return draw_image

	# for text output
	# raw_image = Image.open("../Files/cat.jpg")
	# output = object_detector(raw_image)
	# print(output)


	# Function for object detection and bounding box drawing
	def detect_object(image):
	    """
	    Run object detection on an image and produce the annotated image plus
	    a spoken summary of what was found.

	    :param image: PIL.Image supplied by the Gradio image input, or None when
	                  no image was provided.
	    :return: Tuple of (image with bounding boxes drawn, path to the WAV file
	             containing the narrated summary).
	    :raises gr.Error: If no image was provided.
	    """
	    # Fail with a readable message instead of an opaque error from inside
	    # the detection pipeline when the form is submitted without an image.
	    if image is None:
	        raise gr.Error("Please select an image before submitting.")

	    detections = object_detector(image)
	    processed_image = draw_bounding_boxes(image, detections)

	    # Turn the detections into a sentence, then into speech.
	    natural_text = read_objects(detections)
	    processed_audio = generate_audio(natural_text)
	    return processed_image, processed_audio

	# return processed_image


	# Gradio UI definition: a single image input (requested as type "pil") is passed to
	# detect_object, whose two return values fill the image and audio outputs in order.
	demo = gr.Interface(
	fn=detect_object,
	inputs=[gr.Image(label="Select Image", type="pil")],
	outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
	title="Project 06: Object Detector with Audio",
	description="As understood from the title, if not already, this application will detect objects in your image and "
	"will provide audio of the same"
	)

	# Launch the interface; as a top-level statement this runs whenever the module is executed or imported.
	demo.launch()