ankanxopencv committed on
Commit
fb26b5c
1 Parent(s): 0816a52

Upload 13 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sample/person.mp4 filter=lfs diff=lfs merge=lfs -text
+ sample/video_1.mp4 filter=lfs diff=lfs merge=lfs -text
+ yolov3.weights filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,127 @@
+ import cv2
+ import numpy as np
+ import gradio as gr  # type: ignore
+ from mbnet import load_model, detect_objects, get_box_dimensions, draw_labels, load_img
+ from yolov3 import load_image, load_yolo, detect_objects_yolo, get_box_dimensions_yolo, draw_labels_yolo
+
+
+ # Image Inference
+
+ def img_inf(img, model):
+     if model == "MobileNet-SSD":
+         model, classes, colors = load_model()
+         image, height, width, channels = load_img(img)
+         blob, outputs = detect_objects(image, model)
+         boxes, class_ids = get_box_dimensions(outputs, height, width)
+         image1 = draw_labels(boxes, colors, class_ids, classes, image)
+         return cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
+     else:
+         model, classes, colors, output_layers = load_yolo()
+         image, height, width, channels = load_image(img)
+         blob, outputs = detect_objects_yolo(image, model, output_layers)
+         boxes, confs, class_ids = get_box_dimensions_yolo(outputs, height, width)
+         image = draw_labels_yolo(boxes, confs, colors, class_ids, classes, image)
+         return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+
+ model_name = gr.Radio(["MobileNet-SSD", "YOLOv3"], value="YOLOv3", label="Model", info="Choose your model")
+ inputs_image = gr.Image(type="filepath", label="Input Image")
+ outputs_image = gr.Image(type="numpy", label="Output Image")
+ interface_image = gr.Interface(
+     fn=img_inf,
+     inputs=[inputs_image, model_name],
+     outputs=outputs_image,
+     title="Image Inference",
+     description="Upload your photo, select a model, and see the results!",
+     examples=[["sample/dog.jpg"]],
+     cache_examples=False,
+ )
+
+
+ # Video Inference
+
+ def vid_inf(vid, model_type):
+     if model_type == "MobileNet-SSD":
+         cap = cv2.VideoCapture(vid)
+         # get the video frame width and height for proper saving of the output video
+         frame_width = int(cap.get(3))
+         frame_height = int(cap.get(4))
+         fps = int(cap.get(cv2.CAP_PROP_FPS))
+         frame_size = (frame_width, frame_height)
+         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+         output_video = "output_recorded.mp4"
+
+         # create the `VideoWriter()` object
+         out = cv2.VideoWriter(output_video, fourcc, fps, frame_size)
+
+         model, classes, colors = load_model()
+         while cap.isOpened():
+             ret, frame = cap.read()
+             if ret:
+                 height, width, channels = frame.shape
+                 blob, outputs = detect_objects(frame, model)
+                 boxes, class_ids = get_box_dimensions(outputs, height, width)
+                 frame = draw_labels(boxes, colors, class_ids, classes, frame)
+                 out.write(frame)
+                 yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), None
+             else:
+                 break
+
+         cap.release()
+         out.release()
+         cv2.destroyAllWindows()
+         yield None, output_video
+
+     else:
+         cap = cv2.VideoCapture(vid)
+         # get the video frame width and height for proper saving of the output video
+         frame_width = int(cap.get(3))
+         frame_height = int(cap.get(4))
+         fps = int(cap.get(cv2.CAP_PROP_FPS))
+         frame_size = (frame_width, frame_height)
+         fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+         output_video = "output_recorded.mp4"
+
+         # create the `VideoWriter()` object
+         out = cv2.VideoWriter(output_video, fourcc, fps, frame_size)
+
+         model, classes, colors, output_layers = load_yolo()
+         while cap.isOpened():
+             ret, frame_y = cap.read()
+             if ret:
+                 height, width, channels = frame_y.shape
+                 blob, outputs = detect_objects_yolo(frame_y, model, output_layers)
+                 boxes, confs, class_ids = get_box_dimensions_yolo(outputs, height, width)
+                 frame_y = draw_labels_yolo(boxes, confs, colors, class_ids, classes, frame_y)
+                 out.write(frame_y)
+                 yield cv2.cvtColor(frame_y, cv2.COLOR_BGR2RGB), None
+             else:
+                 break
+
+         cap.release()
+         out.release()
+         cv2.destroyAllWindows()
+         yield None, output_video
+
+
+ model_name = gr.Radio(["MobileNet-SSD", "YOLOv3"], value="YOLOv3", label="Model", info="Choose your model")
+ input_video = gr.Video(sources=None, label="Input Video")
+ output_frame = gr.Image(type="numpy", label="Output Frames")
+ output_video_file = gr.Video(label="Output video")
+
+
+ interface_video = gr.Interface(
+     fn=vid_inf,
+     inputs=[input_video, model_name],
+     outputs=[output_frame, output_video_file],
+     title="Video Inference",
+     description="Upload your video, select a model, and see the results!",
+     examples=[["sample/video_1.mp4"], ["sample/person.mp4"]],
+     cache_examples=False,
+ )
+
+ gr.TabbedInterface(
+     [interface_image, interface_video],
+     tab_names=['Image', 'Video'],
+     title='GradioxOpenCV-DNN'
+ ).queue().launch()
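
A note on the streaming pattern above: `vid_inf` is a generator, so Gradio refreshes the "Output Frames" image on every `yield frame, None` and only fills the "Output video" component when the final `yield None, output_video` arrives. Below is a minimal sketch of the same two-output streaming pattern, assuming a Gradio version with generator (iterative output) support; `fake_frames` and `demo` are illustrative names, not part of this commit:

```python
import numpy as np
import gradio as gr

def fake_frames(n):
    # stream one synthetic frame at a time, leaving the video output empty
    for i in range(int(n)):
        yield np.full((64, 64, 3), i * 25, dtype=np.uint8), None
    # the final yield would normally carry the recorded file path instead of None
    yield None, None

demo = gr.Interface(
    fn=fake_frames,
    inputs=gr.Number(value=5, label="Frames"),
    outputs=[gr.Image(type="numpy"), gr.Video()],
)
# demo.queue().launch()
```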
coco.names ADDED
@@ -0,0 +1,80 @@
1
+ person
2
+ bicycle
3
+ car
4
+ motorbike
5
+ aeroplane
6
+ bus
7
+ train
8
+ truck
9
+ boat
10
+ traffic light
11
+ fire hydrant
12
+ stop sign
13
+ parking meter
14
+ bench
15
+ bird
16
+ cat
17
+ dog
18
+ horse
19
+ sheep
20
+ cow
21
+ elephant
22
+ bear
23
+ zebra
24
+ giraffe
25
+ backpack
26
+ umbrella
27
+ handbag
28
+ tie
29
+ suitcase
30
+ frisbee
31
+ skis
32
+ snowboard
33
+ sports ball
34
+ kite
35
+ baseball bat
36
+ baseball glove
37
+ skateboard
38
+ surfboard
39
+ tennis racket
40
+ bottle
41
+ wine glass
42
+ cup
43
+ fork
44
+ knife
45
+ spoon
46
+ bowl
47
+ banana
48
+ apple
49
+ sandwich
50
+ orange
51
+ broccoli
52
+ carrot
53
+ hot dog
54
+ pizza
55
+ donut
56
+ cake
57
+ chair
58
+ sofa
59
+ pottedplant
60
+ bed
61
+ diningtable
62
+ toilet
63
+ tvmonitor
64
+ laptop
65
+ mouse
66
+ remote
67
+ keyboard
68
+ cell phone
69
+ microwave
70
+ oven
71
+ toaster
72
+ sink
73
+ refrigerator
74
+ book
75
+ clock
76
+ vase
77
+ scissors
78
+ teddy bear
79
+ hair drier
80
+ toothbrush
frozen_inference_graph.pb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a8d8a89d695842e60d8c6d144181100555563e21acf2fa1e8f561fec5c3c6ad
3
+ size 69688296
mbnet.py ADDED
@@ -0,0 +1,80 @@
+ import cv2
+ import numpy as np
+ #import time
+
+
+ #video_path = 'D:/OfficeWork/VS_code_exp/exp/video_1.mp4'
+ #image_path = 'D:/OfficeWork/VS_code_exp/exp/test.jpg.jpg'
+
+ def load_model():
+     model = cv2.dnn.readNet(model='frozen_inference_graph.pb',
+                             config='ssd_mobilenet_v2_coco_2018_03_29.pbtxt.txt',
+                             framework='TensorFlow')
+     with open('object_detection_classes_coco.txt', 'r') as f:
+         class_names = f.read().split('\n')
+     COLORS = np.random.uniform(0, 255, size=(len(class_names), 3))
+     return model, class_names, COLORS
+
+ def load_img(img_path):
+     img = cv2.imread(img_path)
+     img = cv2.resize(img, None, fx=0.4, fy=0.4)
+     height, width, channels = img.shape
+     return img, height, width, channels
+
+ def detect_objects(img, net):
+     blob = cv2.dnn.blobFromImage(img, size=(300, 300), mean=(104, 117, 123), swapRB=True)
+     net.setInput(blob)
+     outputs = net.forward()
+     #print(outputs)
+     return blob, outputs
+
+ def get_box_dimensions(outputs, height, width):
+     boxes = []
+     class_ids = []
+
+     for detect in outputs[0, 0, :, :]:
+         scores = detect[2]
+         class_id = detect[1]
+         if scores > 0.3:
+             # detect[3:7] holds the normalized box corners
+             x_min = int(detect[3] * width)
+             y_min = int(detect[4] * height)
+             x_max = int(detect[5] * width)
+             y_max = int(detect[6] * height)
+             # draw_labels() treats these as two opposite corner points
+             boxes.append([x_min, y_min, x_max, y_max])
+             class_ids.append(class_id)
+     return boxes, class_ids
+
+ def draw_labels(boxes, colors, class_ids, classes, img):
+     font = cv2.FONT_HERSHEY_PLAIN
+     # classes and colors are supplied by the caller; no need to reload the model here
+     for i in range(len(boxes)):
+         x_min, y_min, x_max, y_max = boxes[i]
+         label = classes[int(class_ids[i]) - 1]
+         color = colors[i]
+         cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 5)
+         cv2.putText(img, label, (x_min, y_min - 5), font, 5, color, 5)
+     return img
+
+ def image_detect(img_path):
+     model, classes, colors = load_model()
+     image, height, width, channels = load_img(img_path)
+     blob, outputs = detect_objects(image, model)
+     boxes, class_ids = get_box_dimensions(outputs, height, width)
+     image1 = draw_labels(boxes, colors, class_ids, classes, image)
+     return image1
+
+
+ def start_video(video_path):
+     model, classes, colors = load_model()
+     cap = cv2.VideoCapture(video_path)
+     while True:
+         _, frame = cap.read()
+         height, width, channels = frame.shape
+         blob, outputs = detect_objects(frame, model)
+         boxes, class_ids = get_box_dimensions(outputs, height, width)
+         frame = draw_labels(boxes, colors, class_ids, classes, frame)
+         yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+
+ cv2.destroyAllWindows()
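
For reference, the layout that `get_box_dimensions` unpacks: the MobileNet-SSD detection head returns a `(1, 1, N, 7)` tensor whose rows are `[image_id, class_id, confidence, x_min, y_min, x_max, y_max]` with coordinates normalized to `[0, 1]`. A small sketch that prints the raw detections using the functions above; it assumes the model files from this commit and `sample/dog.jpg` sit in the working directory:

```python
from mbnet import load_model, load_img, detect_objects

net, class_names, _ = load_model()
img, h, w, _ = load_img("sample/dog.jpg")
_, outputs = detect_objects(img, net)

print(outputs.shape)  # (1, 1, N, 7)
for det in outputs[0, 0, :, :]:
    if det[2] > 0.3:  # same confidence threshold as get_box_dimensions()
        print(class_names[int(det[1]) - 1], float(det[2]))
```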
object_detection_classes_coco.txt ADDED
@@ -0,0 +1,90 @@
1
+ person
2
+ bicycle
3
+ car
4
+ motorcycle
5
+ airplane
6
+ bus
7
+ train
8
+ truck
9
+ boat
10
+ traffic light
11
+ fire hydrant
12
+ street sign
13
+ stop sign
14
+ parking meter
15
+ bench
16
+ bird
17
+ cat
18
+ dog
19
+ horse
20
+ sheep
21
+ cow
22
+ elephant
23
+ bear
24
+ zebra
25
+ giraffe
26
+ hat
27
+ backpack
28
+ umbrella
29
+ shoe
30
+ eye glasses
31
+ handbag
32
+ tie
33
+ suitcase
34
+ frisbee
35
+ skis
36
+ snowboard
37
+ sports ball
38
+ kite
39
+ baseball bat
40
+ baseball glove
41
+ skateboard
42
+ surfboard
43
+ tennis racket
44
+ bottle
45
+ plate
46
+ wine glass
47
+ cup
48
+ fork
49
+ knife
50
+ spoon
51
+ bowl
52
+ banana
53
+ apple
54
+ sandwich
55
+ orange
56
+ broccoli
57
+ carrot
58
+ hot dog
59
+ pizza
60
+ donut
61
+ cake
62
+ chair
63
+ couch
64
+ potted plant
65
+ bed
66
+ mirror
67
+ dining table
68
+ window
69
+ desk
70
+ toilet
71
+ door
72
+ tv
73
+ laptop
74
+ mouse
75
+ remote
76
+ keyboard
77
+ cell phone
78
+ microwave
79
+ oven
80
+ toaster
81
+ sink
82
+ refrigerator
83
+ blender
84
+ book
85
+ clock
86
+ vase
87
+ scissors
88
+ teddy bear
89
+ hair drier
90
+ toothbrush
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ opencv-contrib-python
2
+ numpy
3
+ gradio
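
The requirements are unpinned, so whatever current releases of `opencv-contrib-python`, `numpy`, and `gradio` are available will be installed. A quick sanity check after installing (a sketch; no version pins are implied by this commit):

```python
import cv2
import numpy
import gradio

print("opencv:", cv2.__version__)
print("numpy:", numpy.__version__)
print("gradio:", gradio.__version__)
```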
sample/dog.jpg ADDED
sample/person.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46636187a6af45c1fe71b5b8e14d96eb64908f8b285f29bd194e9e9e66c0cb02
3
+ size 8497766
sample/video_1.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4faf22572a0655605807476f3766e79be5b97bfdb55af020d6404e5561b9e122
3
+ size 1803408
ssd_mobilenet_v2_coco_2018_03_29.pbtxt.txt ADDED
The diff for this file is too large to render. See raw diff
 
yolov3.cfg ADDED
@@ -0,0 +1,789 @@
1
+ [net]
2
+ # Testing
3
+ # batch=1
4
+ # subdivisions=1
5
+ # Training
6
+ batch=64
7
+ subdivisions=16
8
+ width=608
9
+ height=608
10
+ channels=3
11
+ momentum=0.9
12
+ decay=0.0005
13
+ angle=0
14
+ saturation = 1.5
15
+ exposure = 1.5
16
+ hue=.1
17
+
18
+ learning_rate=0.001
19
+ burn_in=1000
20
+ max_batches = 500200
21
+ policy=steps
22
+ steps=400000,450000
23
+ scales=.1,.1
24
+
25
+ [convolutional]
26
+ batch_normalize=1
27
+ filters=32
28
+ size=3
29
+ stride=1
30
+ pad=1
31
+ activation=leaky
32
+
33
+ # Downsample
34
+
35
+ [convolutional]
36
+ batch_normalize=1
37
+ filters=64
38
+ size=3
39
+ stride=2
40
+ pad=1
41
+ activation=leaky
42
+
43
+ [convolutional]
44
+ batch_normalize=1
45
+ filters=32
46
+ size=1
47
+ stride=1
48
+ pad=1
49
+ activation=leaky
50
+
51
+ [convolutional]
52
+ batch_normalize=1
53
+ filters=64
54
+ size=3
55
+ stride=1
56
+ pad=1
57
+ activation=leaky
58
+
59
+ [shortcut]
60
+ from=-3
61
+ activation=linear
62
+
63
+ # Downsample
64
+
65
+ [convolutional]
66
+ batch_normalize=1
67
+ filters=128
68
+ size=3
69
+ stride=2
70
+ pad=1
71
+ activation=leaky
72
+
73
+ [convolutional]
74
+ batch_normalize=1
75
+ filters=64
76
+ size=1
77
+ stride=1
78
+ pad=1
79
+ activation=leaky
80
+
81
+ [convolutional]
82
+ batch_normalize=1
83
+ filters=128
84
+ size=3
85
+ stride=1
86
+ pad=1
87
+ activation=leaky
88
+
89
+ [shortcut]
90
+ from=-3
91
+ activation=linear
92
+
93
+ [convolutional]
94
+ batch_normalize=1
95
+ filters=64
96
+ size=1
97
+ stride=1
98
+ pad=1
99
+ activation=leaky
100
+
101
+ [convolutional]
102
+ batch_normalize=1
103
+ filters=128
104
+ size=3
105
+ stride=1
106
+ pad=1
107
+ activation=leaky
108
+
109
+ [shortcut]
110
+ from=-3
111
+ activation=linear
112
+
113
+ # Downsample
114
+
115
+ [convolutional]
116
+ batch_normalize=1
117
+ filters=256
118
+ size=3
119
+ stride=2
120
+ pad=1
121
+ activation=leaky
122
+
123
+ [convolutional]
124
+ batch_normalize=1
125
+ filters=128
126
+ size=1
127
+ stride=1
128
+ pad=1
129
+ activation=leaky
130
+
131
+ [convolutional]
132
+ batch_normalize=1
133
+ filters=256
134
+ size=3
135
+ stride=1
136
+ pad=1
137
+ activation=leaky
138
+
139
+ [shortcut]
140
+ from=-3
141
+ activation=linear
142
+
143
+ [convolutional]
144
+ batch_normalize=1
145
+ filters=128
146
+ size=1
147
+ stride=1
148
+ pad=1
149
+ activation=leaky
150
+
151
+ [convolutional]
152
+ batch_normalize=1
153
+ filters=256
154
+ size=3
155
+ stride=1
156
+ pad=1
157
+ activation=leaky
158
+
159
+ [shortcut]
160
+ from=-3
161
+ activation=linear
162
+
163
+ [convolutional]
164
+ batch_normalize=1
165
+ filters=128
166
+ size=1
167
+ stride=1
168
+ pad=1
169
+ activation=leaky
170
+
171
+ [convolutional]
172
+ batch_normalize=1
173
+ filters=256
174
+ size=3
175
+ stride=1
176
+ pad=1
177
+ activation=leaky
178
+
179
+ [shortcut]
180
+ from=-3
181
+ activation=linear
182
+
183
+ [convolutional]
184
+ batch_normalize=1
185
+ filters=128
186
+ size=1
187
+ stride=1
188
+ pad=1
189
+ activation=leaky
190
+
191
+ [convolutional]
192
+ batch_normalize=1
193
+ filters=256
194
+ size=3
195
+ stride=1
196
+ pad=1
197
+ activation=leaky
198
+
199
+ [shortcut]
200
+ from=-3
201
+ activation=linear
202
+
203
+
204
+ [convolutional]
205
+ batch_normalize=1
206
+ filters=128
207
+ size=1
208
+ stride=1
209
+ pad=1
210
+ activation=leaky
211
+
212
+ [convolutional]
213
+ batch_normalize=1
214
+ filters=256
215
+ size=3
216
+ stride=1
217
+ pad=1
218
+ activation=leaky
219
+
220
+ [shortcut]
221
+ from=-3
222
+ activation=linear
223
+
224
+ [convolutional]
225
+ batch_normalize=1
226
+ filters=128
227
+ size=1
228
+ stride=1
229
+ pad=1
230
+ activation=leaky
231
+
232
+ [convolutional]
233
+ batch_normalize=1
234
+ filters=256
235
+ size=3
236
+ stride=1
237
+ pad=1
238
+ activation=leaky
239
+
240
+ [shortcut]
241
+ from=-3
242
+ activation=linear
243
+
244
+ [convolutional]
245
+ batch_normalize=1
246
+ filters=128
247
+ size=1
248
+ stride=1
249
+ pad=1
250
+ activation=leaky
251
+
252
+ [convolutional]
253
+ batch_normalize=1
254
+ filters=256
255
+ size=3
256
+ stride=1
257
+ pad=1
258
+ activation=leaky
259
+
260
+ [shortcut]
261
+ from=-3
262
+ activation=linear
263
+
264
+ [convolutional]
265
+ batch_normalize=1
266
+ filters=128
267
+ size=1
268
+ stride=1
269
+ pad=1
270
+ activation=leaky
271
+
272
+ [convolutional]
273
+ batch_normalize=1
274
+ filters=256
275
+ size=3
276
+ stride=1
277
+ pad=1
278
+ activation=leaky
279
+
280
+ [shortcut]
281
+ from=-3
282
+ activation=linear
283
+
284
+ # Downsample
285
+
286
+ [convolutional]
287
+ batch_normalize=1
288
+ filters=512
289
+ size=3
290
+ stride=2
291
+ pad=1
292
+ activation=leaky
293
+
294
+ [convolutional]
295
+ batch_normalize=1
296
+ filters=256
297
+ size=1
298
+ stride=1
299
+ pad=1
300
+ activation=leaky
301
+
302
+ [convolutional]
303
+ batch_normalize=1
304
+ filters=512
305
+ size=3
306
+ stride=1
307
+ pad=1
308
+ activation=leaky
309
+
310
+ [shortcut]
311
+ from=-3
312
+ activation=linear
313
+
314
+
315
+ [convolutional]
316
+ batch_normalize=1
317
+ filters=256
318
+ size=1
319
+ stride=1
320
+ pad=1
321
+ activation=leaky
322
+
323
+ [convolutional]
324
+ batch_normalize=1
325
+ filters=512
326
+ size=3
327
+ stride=1
328
+ pad=1
329
+ activation=leaky
330
+
331
+ [shortcut]
332
+ from=-3
333
+ activation=linear
334
+
335
+
336
+ [convolutional]
337
+ batch_normalize=1
338
+ filters=256
339
+ size=1
340
+ stride=1
341
+ pad=1
342
+ activation=leaky
343
+
344
+ [convolutional]
345
+ batch_normalize=1
346
+ filters=512
347
+ size=3
348
+ stride=1
349
+ pad=1
350
+ activation=leaky
351
+
352
+ [shortcut]
353
+ from=-3
354
+ activation=linear
355
+
356
+
357
+ [convolutional]
358
+ batch_normalize=1
359
+ filters=256
360
+ size=1
361
+ stride=1
362
+ pad=1
363
+ activation=leaky
364
+
365
+ [convolutional]
366
+ batch_normalize=1
367
+ filters=512
368
+ size=3
369
+ stride=1
370
+ pad=1
371
+ activation=leaky
372
+
373
+ [shortcut]
374
+ from=-3
375
+ activation=linear
376
+
377
+ [convolutional]
378
+ batch_normalize=1
379
+ filters=256
380
+ size=1
381
+ stride=1
382
+ pad=1
383
+ activation=leaky
384
+
385
+ [convolutional]
386
+ batch_normalize=1
387
+ filters=512
388
+ size=3
389
+ stride=1
390
+ pad=1
391
+ activation=leaky
392
+
393
+ [shortcut]
394
+ from=-3
395
+ activation=linear
396
+
397
+
398
+ [convolutional]
399
+ batch_normalize=1
400
+ filters=256
401
+ size=1
402
+ stride=1
403
+ pad=1
404
+ activation=leaky
405
+
406
+ [convolutional]
407
+ batch_normalize=1
408
+ filters=512
409
+ size=3
410
+ stride=1
411
+ pad=1
412
+ activation=leaky
413
+
414
+ [shortcut]
415
+ from=-3
416
+ activation=linear
417
+
418
+
419
+ [convolutional]
420
+ batch_normalize=1
421
+ filters=256
422
+ size=1
423
+ stride=1
424
+ pad=1
425
+ activation=leaky
426
+
427
+ [convolutional]
428
+ batch_normalize=1
429
+ filters=512
430
+ size=3
431
+ stride=1
432
+ pad=1
433
+ activation=leaky
434
+
435
+ [shortcut]
436
+ from=-3
437
+ activation=linear
438
+
439
+ [convolutional]
440
+ batch_normalize=1
441
+ filters=256
442
+ size=1
443
+ stride=1
444
+ pad=1
445
+ activation=leaky
446
+
447
+ [convolutional]
448
+ batch_normalize=1
449
+ filters=512
450
+ size=3
451
+ stride=1
452
+ pad=1
453
+ activation=leaky
454
+
455
+ [shortcut]
456
+ from=-3
457
+ activation=linear
458
+
459
+ # Downsample
460
+
461
+ [convolutional]
462
+ batch_normalize=1
463
+ filters=1024
464
+ size=3
465
+ stride=2
466
+ pad=1
467
+ activation=leaky
468
+
469
+ [convolutional]
470
+ batch_normalize=1
471
+ filters=512
472
+ size=1
473
+ stride=1
474
+ pad=1
475
+ activation=leaky
476
+
477
+ [convolutional]
478
+ batch_normalize=1
479
+ filters=1024
480
+ size=3
481
+ stride=1
482
+ pad=1
483
+ activation=leaky
484
+
485
+ [shortcut]
486
+ from=-3
487
+ activation=linear
488
+
489
+ [convolutional]
490
+ batch_normalize=1
491
+ filters=512
492
+ size=1
493
+ stride=1
494
+ pad=1
495
+ activation=leaky
496
+
497
+ [convolutional]
498
+ batch_normalize=1
499
+ filters=1024
500
+ size=3
501
+ stride=1
502
+ pad=1
503
+ activation=leaky
504
+
505
+ [shortcut]
506
+ from=-3
507
+ activation=linear
508
+
509
+ [convolutional]
510
+ batch_normalize=1
511
+ filters=512
512
+ size=1
513
+ stride=1
514
+ pad=1
515
+ activation=leaky
516
+
517
+ [convolutional]
518
+ batch_normalize=1
519
+ filters=1024
520
+ size=3
521
+ stride=1
522
+ pad=1
523
+ activation=leaky
524
+
525
+ [shortcut]
526
+ from=-3
527
+ activation=linear
528
+
529
+ [convolutional]
530
+ batch_normalize=1
531
+ filters=512
532
+ size=1
533
+ stride=1
534
+ pad=1
535
+ activation=leaky
536
+
537
+ [convolutional]
538
+ batch_normalize=1
539
+ filters=1024
540
+ size=3
541
+ stride=1
542
+ pad=1
543
+ activation=leaky
544
+
545
+ [shortcut]
546
+ from=-3
547
+ activation=linear
548
+
549
+ ######################
550
+
551
+ [convolutional]
552
+ batch_normalize=1
553
+ filters=512
554
+ size=1
555
+ stride=1
556
+ pad=1
557
+ activation=leaky
558
+
559
+ [convolutional]
560
+ batch_normalize=1
561
+ size=3
562
+ stride=1
563
+ pad=1
564
+ filters=1024
565
+ activation=leaky
566
+
567
+ [convolutional]
568
+ batch_normalize=1
569
+ filters=512
570
+ size=1
571
+ stride=1
572
+ pad=1
573
+ activation=leaky
574
+
575
+ [convolutional]
576
+ batch_normalize=1
577
+ size=3
578
+ stride=1
579
+ pad=1
580
+ filters=1024
581
+ activation=leaky
582
+
583
+ [convolutional]
584
+ batch_normalize=1
585
+ filters=512
586
+ size=1
587
+ stride=1
588
+ pad=1
589
+ activation=leaky
590
+
591
+ [convolutional]
592
+ batch_normalize=1
593
+ size=3
594
+ stride=1
595
+ pad=1
596
+ filters=1024
597
+ activation=leaky
598
+
599
+ [convolutional]
600
+ size=1
601
+ stride=1
602
+ pad=1
603
+ filters=255
604
+ activation=linear
605
+
606
+
607
+ [yolo]
608
+ mask = 6,7,8
609
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610
+ classes=80
611
+ num=9
612
+ jitter=.3
613
+ ignore_thresh = .7
614
+ truth_thresh = 1
615
+ random=1
616
+
617
+
618
+ [route]
619
+ layers = -4
620
+
621
+ [convolutional]
622
+ batch_normalize=1
623
+ filters=256
624
+ size=1
625
+ stride=1
626
+ pad=1
627
+ activation=leaky
628
+
629
+ [upsample]
630
+ stride=2
631
+
632
+ [route]
633
+ layers = -1, 61
634
+
635
+
636
+
637
+ [convolutional]
638
+ batch_normalize=1
639
+ filters=256
640
+ size=1
641
+ stride=1
642
+ pad=1
643
+ activation=leaky
644
+
645
+ [convolutional]
646
+ batch_normalize=1
647
+ size=3
648
+ stride=1
649
+ pad=1
650
+ filters=512
651
+ activation=leaky
652
+
653
+ [convolutional]
654
+ batch_normalize=1
655
+ filters=256
656
+ size=1
657
+ stride=1
658
+ pad=1
659
+ activation=leaky
660
+
661
+ [convolutional]
662
+ batch_normalize=1
663
+ size=3
664
+ stride=1
665
+ pad=1
666
+ filters=512
667
+ activation=leaky
668
+
669
+ [convolutional]
670
+ batch_normalize=1
671
+ filters=256
672
+ size=1
673
+ stride=1
674
+ pad=1
675
+ activation=leaky
676
+
677
+ [convolutional]
678
+ batch_normalize=1
679
+ size=3
680
+ stride=1
681
+ pad=1
682
+ filters=512
683
+ activation=leaky
684
+
685
+ [convolutional]
686
+ size=1
687
+ stride=1
688
+ pad=1
689
+ filters=255
690
+ activation=linear
691
+
692
+
693
+ [yolo]
694
+ mask = 3,4,5
695
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696
+ classes=80
697
+ num=9
698
+ jitter=.3
699
+ ignore_thresh = .7
700
+ truth_thresh = 1
701
+ random=1
702
+
703
+
704
+
705
+ [route]
706
+ layers = -4
707
+
708
+ [convolutional]
709
+ batch_normalize=1
710
+ filters=128
711
+ size=1
712
+ stride=1
713
+ pad=1
714
+ activation=leaky
715
+
716
+ [upsample]
717
+ stride=2
718
+
719
+ [route]
720
+ layers = -1, 36
721
+
722
+
723
+
724
+ [convolutional]
725
+ batch_normalize=1
726
+ filters=128
727
+ size=1
728
+ stride=1
729
+ pad=1
730
+ activation=leaky
731
+
732
+ [convolutional]
733
+ batch_normalize=1
734
+ size=3
735
+ stride=1
736
+ pad=1
737
+ filters=256
738
+ activation=leaky
739
+
740
+ [convolutional]
741
+ batch_normalize=1
742
+ filters=128
743
+ size=1
744
+ stride=1
745
+ pad=1
746
+ activation=leaky
747
+
748
+ [convolutional]
749
+ batch_normalize=1
750
+ size=3
751
+ stride=1
752
+ pad=1
753
+ filters=256
754
+ activation=leaky
755
+
756
+ [convolutional]
757
+ batch_normalize=1
758
+ filters=128
759
+ size=1
760
+ stride=1
761
+ pad=1
762
+ activation=leaky
763
+
764
+ [convolutional]
765
+ batch_normalize=1
766
+ size=3
767
+ stride=1
768
+ pad=1
769
+ filters=256
770
+ activation=leaky
771
+
772
+ [convolutional]
773
+ size=1
774
+ stride=1
775
+ pad=1
776
+ filters=255
777
+ activation=linear
778
+
779
+
780
+ [yolo]
781
+ mask = 0,1,2
782
+ anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783
+ classes=80
784
+ num=9
785
+ jitter=.3
786
+ ignore_thresh = .7
787
+ truth_thresh = 1
788
+ random=1
789
+
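
The three `[yolo]` sections at the end of this cfg are the detection heads for the stride-32, stride-16 and stride-8 feature maps; in OpenCV they surface as the unconnected output layers that `detect_objects_yolo` in yolov3.py forwards to. A minimal sketch, assuming `yolov3.cfg` and `yolov3.weights` from this commit are in the working directory:

```python
import cv2

net = cv2.dnn.readNetFromDarknet("yolov3.cfg", "yolov3.weights")
# the three [yolo] heads; names are typically ('yolo_82', 'yolo_94', 'yolo_106')
print(net.getUnconnectedOutLayersNames())
```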
yolov3.py ADDED
@@ -0,0 +1,134 @@
+ import cv2
+ import numpy as np
+ #import argparse
+ #import time
+
+
+
+ #video_path = 'D:/OfficeWork/VS_code_exp/exp/video_1.mp4'
+ #image_path = 'D:\OfficeWork/VS_code_exp/exp/test.jpg.jpg'
+
+
+ # Load YOLO
+ def load_yolo():
+     net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")
+     classes = []
+     with open("coco.names", "r") as f:
+         classes = [line.strip() for line in f.readlines()]
+
+     output_layers = [layer_name for layer_name in net.getUnconnectedOutLayersNames()]
+     colors = np.random.uniform(0, 255, size=(len(classes), 3))
+     return net, classes, colors, output_layers
+
+ def load_image(img_path):
+     # image loading
+     img = cv2.imread(img_path)
+     img = cv2.resize(img, None, fx=0.4, fy=0.4)
+     height, width, channels = img.shape
+     return img, height, width, channels
+
+ def start_webcam():
+     cap = cv2.VideoCapture(0)
+
+     return cap
+
+
+ def display_blob(blob):
+     '''
+     Three images each for RED, GREEN, BLUE channel
+     '''
+     for b in blob:
+         for n, imgb in enumerate(b):
+             cv2.imshow(str(n), imgb)
+
+ def detect_objects_yolo(img, net, outputLayers):
+     blob = cv2.dnn.blobFromImage(img, scalefactor=0.00392, size=(320, 320), mean=(0, 0, 0), swapRB=True, crop=False)
+     net.setInput(blob)
+     outputs = net.forward(outputLayers)
+     #output=np.ascontiguousarray(list(outputs))
+     #print(outputs)
+     #for i, out in enumerate(outputs):
+     #    print(i, np.array(out).shape)
+     return blob, outputs
+
+ def get_box_dimensions_yolo(outputs, height, width):
+     boxes = []
+     confs = []
+     class_ids = []
+     for output in outputs:
+         for detect in output:
+             scores = detect[5:]
+             #print('detect', scores)
+             class_id = np.argmax(scores)
+             conf = scores[class_id]
+             if conf > 0.3:
+                 center_x = int(detect[0] * width)
+                 center_y = int(detect[1] * height)
+                 w = int(detect[2] * width)
+                 h = int(detect[3] * height)
+                 x = int(center_x - w / 2)
+                 y = int(center_y - h / 2)
+                 boxes.append([x, y, w, h])
+                 #print(boxes)
+                 confs.append(float(conf))
+                 class_ids.append(class_id)
+     return boxes, confs, class_ids
+
+ def draw_labels_yolo(boxes, confs, colors, class_ids, classes, img):
+     indexes = cv2.dnn.NMSBoxes(boxes, confs, 0.5, 0.4)
+     font = cv2.FONT_HERSHEY_PLAIN
+     for i in range(len(boxes)):
+         if i in indexes:
+             x, y, w, h = boxes[i]
+             label = str(classes[class_ids[i]])
+             color = colors[i]
+             cv2.rectangle(img, (x, y), (x + w, y + h), color, 5)
+             cv2.putText(img, label, (x, y - 5), font, 5, color, 5)
+     return img
+
+ def image_detect_yolo(img_path):
+     model, classes, colors, output_layers = load_yolo()
+     image, height, width, channels = load_image(img_path)
+     blob, outputs = detect_objects_yolo(image, model, output_layers)
+     #print(outputs)
+     boxes, confs, class_ids = get_box_dimensions_yolo(outputs, height, width)
+     image = draw_labels_yolo(boxes, confs, colors, class_ids, classes, image)
+     return image
+     '''while True:
+         key = cv2.waitKey(1)
+         if key == 27:
+             break'''
+
+ def webcam_detect():
+     model, classes, colors, output_layers = load_yolo()
+     cap = start_webcam()
+     while True:
+         _, frame = cap.read()
+         height, width, channels = frame.shape
+         blob, outputs = detect_objects_yolo(frame, model, output_layers)
+         boxes, confs, class_ids = get_box_dimensions_yolo(outputs, height, width)
+         draw_labels_yolo(boxes, confs, colors, class_ids, classes, frame)
+         key = cv2.waitKey(1)
+         if key == 27:
+             break
+     cap.release()
+
+
+ def start_video_yolo(video_path):
+     model, classes, colors, output_layers = load_yolo()
+     cap = cv2.VideoCapture(video_path)
+     while True:
+         _, frame = cap.read()
+         height, width, channels = frame.shape
+         blob, outputs = detect_objects_yolo(frame, model, output_layers)
+         boxes, confs, class_ids = get_box_dimensions_yolo(outputs, height, width)
+         frame = draw_labels_yolo(boxes, confs, colors, class_ids, classes, frame)
+         yield cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+         '''key = cv2.waitKey(1)
+         if key == 27:
+             break
+         cap.release()'''
+
+
+
+ cv2.destroyAllWindows()
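
A quick end-to-end check of the YOLOv3 path without launching the Gradio app, using only the functions defined above (a sketch; it assumes the weights, cfg, coco.names and `sample/dog.jpg` from this commit are in the working directory):

```python
import cv2
from yolov3 import load_yolo, load_image, detect_objects_yolo, get_box_dimensions_yolo, draw_labels_yolo

net, classes, colors, output_layers = load_yolo()
img, h, w, _ = load_image("sample/dog.jpg")
_, outputs = detect_objects_yolo(img, net, output_layers)
boxes, confs, class_ids = get_box_dimensions_yolo(outputs, h, w)
result = draw_labels_yolo(boxes, confs, colors, class_ids, classes, img)
cv2.imwrite("yolo_result.jpg", result)
```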
yolov3.weights ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523e4e69e1d015393a1b0a441cef1d9c7659e3eb2d7e15f793f060a21b32f297
3
+ size 248007048
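
The .mp4 samples, frozen_inference_graph.pb and yolov3.weights above are Git LFS pointers; after a clone without LFS they remain small text stubs and `cv2.dnn.readNet` will fail to load them. A small sketch to confirm the real binaries were fetched (file names taken from this commit; the 1 KB cutoff is an arbitrary heuristic):

```python
import os

for path in ["yolov3.weights", "frozen_inference_graph.pb",
             "sample/person.mp4", "sample/video_1.mp4"]:
    size = os.path.getsize(path)
    status = "still an LFS pointer?" if size < 1024 else f"{size} bytes"
    print(path, "->", status)
```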