basharat8763 committed on
Commit 1814a78 · verified · 1 Parent(s): f00d80d

Create app.py

Files changed (1)
1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
+ import torch
+ import gradio as gr
+ from PIL import Image, ImageDraw, ImageFont
+ import scipy.io.wavfile as wavfile
+
+ # Use pipelines as high-level helpers
+ from transformers import pipeline
+
+
+ # Text-to-speech model for narrating the detection results
+ narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
+ # Object-detection model (DETR with a ResNet-50 backbone)
+ object_detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+
+
+ def generate_audio(text):
+     # Generate speech from the text
+     narrated_text = narrator(text)
+
+     # Save the audio to a WAV file
+     wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
+                   data=narrated_text["audio"][0])
+
+     # Return the path to the saved audio file
+     return "output.wav"
+
+
+ # Build a natural-language description of the detected objects
+ def read_objects(detection_objects):
+     # Count the occurrences of each label
+     object_counts = {}
+     for detection in detection_objects:
+         label = detection['label']
+         object_counts[label] = object_counts.get(label, 0) + 1
+
+     # Handle the case where nothing was detected
+     if not object_counts:
+         return "This picture contains no recognizable objects."
+
+     # Generate the response string, e.g. "This picture contains 2 cats and 1 dog."
+     response = "This picture contains"
+     labels = list(object_counts.keys())
+     for i, label in enumerate(labels):
+         response += f" {object_counts[label]} {label}"
+         if object_counts[label] > 1:
+             response += "s"
+         if i < len(labels) - 2:
+             response += ","
+         elif i == len(labels) - 2:
+             response += " and"
+
+     response += "."
+
+     return response
+
+
+ def draw_bounding_boxes(image, detections, font_path=None, font_size=20):
+     """
+     Draws bounding boxes on the given image based on the detections.
+
+     :param image: PIL.Image object
+     :param detections: List of detection results, where each result is a dictionary containing
+                        'score', 'label', and 'box' keys. 'box' itself is a dictionary with
+                        'xmin', 'ymin', 'xmax', 'ymax'.
+     :param font_path: Path to a TrueType font file to use for the labels.
+     :param font_size: Size of the font to use for the labels.
+     :return: PIL.Image object with bounding boxes drawn.
+     """
+     # Make a copy of the image to draw on
+     draw_image = image.copy()
+     draw = ImageDraw.Draw(draw_image)
+
+     # Load the custom font, or fall back to the default bitmap font (its size is fixed);
+     # to get larger labels, download a .ttf file and pass its path via font_path.
+     if font_path:
+         font = ImageFont.truetype(font_path, font_size)
+     else:
+         font = ImageFont.load_default()
+
+     for detection in detections:
+         box = detection['box']
+         xmin = box['xmin']
+         ymin = box['ymin']
+         xmax = box['xmax']
+         ymax = box['ymax']
+
+         # Draw the bounding box
+         draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=3)
+
+         # Compose the label text with the confidence score
+         label = detection['label']
+         score = detection['score']
+         text = f"{label} {score:.2f}"
+
+         # Draw the text on a filled background rectangle for visibility
+         text_bbox = draw.textbbox((xmin, ymin), text, font=font)
+         draw.rectangle(text_bbox, fill="red")
+         draw.text((xmin, ymin), text, fill="white", font=font)
+
+     return draw_image
+
+
+ # Local test of the raw detector output:
+ # raw_image = Image.open("../Files/cat.jpg")
+ # output = object_detector(raw_image)
+ # print(output)
+
+
+ # Run object detection, draw the boxes, and narrate the result
+ def detect_object(image):
+     raw_image = image
+     output = object_detector(raw_image)
+     processed_image = draw_bounding_boxes(raw_image, output)
+
+     natural_text = read_objects(output)
+     processed_audio = generate_audio(natural_text)
+     return processed_image, processed_audio
+
+
+ demo = gr.Interface(
+     fn=detect_object,
+     inputs=[gr.Image(label="Select Image", type="pil")],
+     outputs=[gr.Image(label="Processed Image", type="pil"), gr.Audio(label="Generated Audio")],
+     title="Project 06: Object Detector with Audio",
+     description="Detects objects in an uploaded image, draws bounding boxes around them, "
+                 "and generates a spoken description of what was found."
+ )
+
+ demo.launch()
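
For reference, a minimal sketch of the list-of-dicts shape the object-detection pipeline returns and how read_objects turns it into a sentence. The sample_detections values below are hand-written for illustration, not real model output.

# Illustrative only: hypothetical detections in the shape the pipeline returns
# (a list of dicts with 'score', 'label', and a 'box' of pixel coordinates).
sample_detections = [
    {"score": 0.99, "label": "cat", "box": {"xmin": 40, "ymin": 70, "xmax": 310, "ymax": 420}},
    {"score": 0.97, "label": "cat", "box": {"xmin": 330, "ymin": 60, "xmax": 600, "ymax": 410}},
    {"score": 0.95, "label": "remote", "box": {"xmin": 40, "ymin": 335, "xmax": 175, "ymax": 370}},
]

print(read_objects(sample_detections))
# -> "This picture contains 2 cats and 1 remote."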