dawn17 committed
Commit bb8e219
1 Parent(s): 22543b5

Upload 21 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample/street.jpeg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,57 @@
1
+ import os
2
+
3
+ import gradio as gr
4
+
5
+ from src.run.yolov3.inference import YoloInfer
6
+
7
+
8
+ infer = YoloInfer(model_path="./checkpoint/model.pt")
9
+
10
+ demo = gr.Interface(
11
+ fn=infer.infer,
12
+ inputs=[
13
+ gr.Image(
14
+ shape=(416, 416),
15
+ label="Input Image",
16
+ value="./sample/bird_plane.jpeg",
17
+ ),
18
+ gr.Slider(
19
+ minimum=0,
20
+ maximum=1,
21
+ value=0.2,
22
+ label="IOU Threshold",
23
+ info="Permissible overlap for the same class bounding boxes",
24
+ ),
25
+ gr.Slider(
26
+ minimum=0,
27
+ maximum=1,
28
+ value=0.95,
29
+ label="Objectness Threshold",
30
+ info="Confidence for each pixel to predict an object",
31
+ ),
32
+ gr.Slider(
33
+ minimum=0,
34
+ maximum=1,
35
+ value=0.5,
36
+ label="Class Threshold",
37
+ info="Confidence for each pixel to predict a class",
38
+ ),
39
+ gr.Slider(
40
+ minimum=0,
41
+ maximum=10,
42
+ value=1,
43
+ label="Font Size",
44
+ info="Bounding box text size",
45
+ ),
46
+ ],
47
+ outputs=[
48
+ gr.Image(),
49
+ ],
50
+ examples=[
51
+ [os.path.join("./sample/", f)]
52
+ for f in os.listdir("./sample/")
53
+ ],
54
+ )
55
+
56
+
57
+ demo.launch()
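For reference, the `YoloInfer.infer` call that the interface above wires to the sliders can also be exercised directly; a minimal sketch, assuming the `./checkpoint/model.pt` weights and the sample images from this commit are present (the output path is illustrative):

# Minimal sketch: one inference outside Gradio, mirroring the slider defaults above.
from src.run.yolov3.inference import YoloInfer

infer = YoloInfer(model_path="./checkpoint/model.pt")
image = YoloInfer.load_image_as_array("./sample/bird_plane.jpeg")
result = infer.infer(
    image,
    iou_threshold=0.2,
    object_threshold=0.95,
    class_threshold=0.5,
    font_size=1,
)
YoloInfer.save_numpy_as_image(result, "./prediction.jpeg")  # illustrative output path

The app itself is launched with `python app.py` from the repository root.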
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch
2
+ gradio
3
+ torchvision
4
+ numpy
5
+ grad-cam
6
+ Pillow
7
+ albumentations
8
+ tqdm
9
+ opencv-python
10
+ matplotlib
sample/bird_plane.jpeg ADDED
sample/bird_plane_2.jpeg ADDED
sample/cartoon.png ADDED
sample/horse_person_cycle.jpeg ADDED
sample/street.jpeg ADDED

Git LFS Details

  • SHA256: aeb32f667b0f5b4967393101fa5b09085162754e0b911752ebf658748cc93cc3
  • Pointer size: 132 Bytes
  • Size of remote file: 5.39 MB
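If needed, the downloaded file can be checked against the SHA256 listed above; a small sketch:

# Sketch: verify the LFS-tracked sample against the SHA256 shown in this commit.
import hashlib

expected = "aeb32f667b0f5b4967393101fa5b09085162754e0b911752ebf658748cc93cc3"
with open("sample/street.jpeg", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print("match:", digest == expected)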
sample/street2.webp ADDED
src/datasets/pascal_voc/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .dataset import YOLODataset
src/datasets/pascal_voc/dataset.py ADDED
@@ -0,0 +1,117 @@
1
+ """
2
+ Creates a PyTorch dataset to load the Pascal VOC & MS COCO datasets
3
+ """
4
+
5
+ import os
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import torch
10
+ from PIL import Image, ImageFile
11
+ from torch.utils.data import Dataset
12
+
13
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
14
+
15
+
16
+ class YOLODataset(Dataset):
17
+ def __init__(
18
+ self,
19
+ csv_file,
20
+ img_dir,
21
+ label_dir,
22
+ anchors,
23
+ image_size=416,
24
+ S=[13, 26, 52],
25
+ transform=None,
26
+ load_mosaic=True,
27
+ ):
28
+ self.annotations = pd.read_csv(csv_file)
29
+ self.img_dir = img_dir
30
+ self.label_dir = label_dir
31
+ self.image_size = image_size
32
+ self.transform = transform
33
+ self.S = S
34
+ self.load_mosaic = load_mosaic
35
+
36
+ # shape: [number of scales, number of anchors per scale, 2]
37
+ self.anchors = torch.tensor(anchors)
38
+ self.num_anchors_per_scale = self.anchors.shape[1]
39
+
40
+ def __len__(self):
41
+ return len(self.annotations)
42
+
43
+ @staticmethod
44
+ def iou(box, anchors):
45
+ """
46
+ box:
47
+ tensor shape: [2]
48
+ anchors:
49
+ tensor shape: [number of scales, number of anchors per scale, 2]
50
+
51
+ * 2 above is for width and height
52
+ """
53
+
54
+ intersection = torch.prod(torch.min(box, anchors), dim=-1)
55
+ union = torch.prod(box) + torch.prod(anchors, dim=-1) - intersection
56
+ return intersection / union
57
+
58
+ def __getitem__(self, index):
59
+ label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
60
+
61
+ # bboxes = np.loadtxt(fname=label_path, delimiter=" ", ndmin=2)
62
+ bboxes = np.roll(
63
+ np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1
64
+ )
65
+ img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
66
+ image = np.array(Image.open(img_path).convert("RGB"))
67
+
68
+ if self.transform:
69
+ augmentations = self.transform(image=image, bboxes=bboxes)
70
+ image = augmentations["image"]
71
+ bboxes = augmentations["bboxes"]
72
+
73
+ """
74
+ Below assumes 3 scale predictions (as in the paper) and the same number of anchors per scale
75
+ 6 = [objectness, cx, cy, w, h, class]
76
+ """
77
+ targets = [torch.zeros((self.num_anchors_per_scale, S, S, 6)) for S in self.S]
78
+
79
+ for bbox in bboxes:
80
+ iou = self.iou(torch.tensor(bbox[2:4]), self.anchors)
81
+
82
+ idx = torch.argsort(iou, descending=True, dim=-1)
83
+ idx = idx[:, 0].tolist()
84
+
85
+ dimensions, class_ = np.array(bbox[:-1]), bbox[-1] # +1
86
+
87
+ for scale_idx, anchor_id in enumerate(idx):
88
+ scale_dim = self.S[scale_idx]
89
+ scale_cx, scale_cy, scale_w, scale_h = dimensions * scale_dim
90
+
91
+ row, col = int(scale_cy), int(scale_cx)
92
+
93
+ # fill values
94
+ scale_cx = scale_cx - col
95
+ scale_cy = scale_cy - row
96
+
97
+ box_target = torch.tensor(
98
+ [1, scale_cx, scale_cy, scale_w, scale_h, class_]
99
+ )
100
+
101
+ targets[scale_idx][anchor_id, row, col] = box_target
102
+
103
+ return image, targets
104
+
105
+
106
+ if __name__ == "__main__":
107
+ from src.run.yolov3 import config
108
+
109
+ IMAGE_SIZE = config.IMAGE_SIZE
110
+ train_dataset = YOLODataset(
111
+ config.DATASET + "/2examples.csv",
112
+ transform=config.train_transforms,
113
+ S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
114
+ img_dir=config.IMG_DIR,
115
+ label_dir=config.LABEL_DIR,
116
+ anchors=config.ANCHORS,
117
+ )
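The cell assignment in `__getitem__` above is plain grid arithmetic; a standalone sketch with a made-up normalized box (values are illustrative, not from the dataset):

# Sketch of the target arithmetic: one normalized box mapped onto the 13x13 grid.
S = 13
cx, cy, w, h = 0.5, 0.4, 0.3, 0.2           # normalized [0, 1] box, as in the label files
scale_cx, scale_cy = cx * S, cy * S         # 6.5, 5.2 in grid units
row, col = int(scale_cy), int(scale_cx)     # cell (5, 6) owns this box
offset_x, offset_y = scale_cx - col, scale_cy - row  # ~0.5, ~0.2 cell-relative offsets
scale_w, scale_h = w * S, h * S             # 3.9, 2.6: width/height in grid units
print(row, col, offset_x, offset_y, scale_w, scale_h)

The anchor with the highest width/height IoU (from `iou`) at each scale decides which of the three anchor slots at that cell receives `[1, offset_x, offset_y, scale_w, scale_h, class]`.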
src/datasets/pascal_voc/mosaic.py ADDED
File without changes
src/loss/yolov3/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .loss import YoloLoss
src/loss/yolov3/loss.py ADDED
@@ -0,0 +1,191 @@
1
+ """
2
+ Implementation of the YOLO loss function, similar to the one in the YOLOv3 paper;
3
+ the difference, from what I can tell, is that I use CrossEntropy for the classes
4
+ instead of BinaryCrossEntropy.
5
+ """
6
+ import random
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ class YoloLoss(nn.Module):
13
+ def __init__(self, nclasses):
14
+ super().__init__()
15
+ self.mse = nn.MSELoss()
16
+ self.bce = nn.BCEWithLogitsLoss()
17
+ self.entropy = nn.CrossEntropyLoss()
18
+ self.sigmoid = nn.Sigmoid()
19
+
20
+ # Constants signifying how much to pay for each respective part of the loss
21
+ self.lambda_class = 5 # 1.5
22
+ self.lambda_noobj = 2
23
+ self.lambda_obj = 1
24
+ self.lambda_box = 2
25
+
26
+ self.nclasses = nclasses
27
+
28
+ # intersection over union
29
+ @staticmethod
30
+ def iou(box1, box2):
31
+ """
32
+ box1, box2 shape = [any shape, 4] i.e. [4] or [3, 4] or [2, 3, 4] etc.
33
+
34
+ * 4 = [x, y, w, h]
35
+
36
+ output shape = [batch]
37
+ """
38
+ # box1 x1, x2
39
+ box1_x1 = box1[..., 0] - box1[..., 2] / 2
40
+ box1_x2 = box1[..., 0] + box1[..., 2] / 2
41
+
42
+ # box2 x1, x2
43
+ box2_x1 = box2[..., 0] - box2[..., 2] / 2
44
+ box2_x2 = box2[..., 0] + box2[..., 2] / 2
45
+
46
+ # the width of intersection (x)
47
+ x1 = torch.max(box1_x1, box2_x1)
48
+ x2 = torch.min(box1_x2, box2_x2)
49
+
50
+ x = (x2 - x1).clamp(0)
51
+
52
+ # box1 y1, y2
53
+ box1_y1 = box1[..., 1] - box1[..., 3] / 2
54
+ box1_y2 = box1[..., 1] + box1[..., 3] / 2
55
+
56
+ # box2 y1, y2
57
+ box2_y1 = box2[..., 1] - box2[..., 3] / 2
58
+ box2_y2 = box2[..., 1] + box2[..., 3] / 2
59
+
60
+ # the height of intersection (y)
61
+ y1 = torch.max(box1_y1, box2_y1)
62
+ y2 = torch.min(box1_y2, box2_y2)
63
+
64
+ y = (y2 - y1).clamp(0)
65
+
66
+ # intersection
67
+ intersection = x * y
68
+
69
+ # union
70
+ area_box1 = (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
71
+ area_box2 = (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
72
+
73
+ union = area_box1 + area_box2 - intersection + 1e-6
74
+
75
+ return intersection / union
76
+
77
+ def forward(self, predictions, target, anchors):
78
+ """
79
+ predictions: [batch, 3, 13, 13, 25] where 25 = [objectness, cx, cy, w, h] + 20 classes
80
+ target: [batch, 3, 13, 13, 6] where 6 = [objectness, cx, cy, w, h, true class]
81
+ anchors: [3, 2]
82
+
83
+ * 13 is S
84
+ * 3 is number of anchors
85
+ """
86
+ obj = target[..., 0] == 1
87
+ noobj = target[..., 0] == 0
88
+
89
+ """
90
+ no-object loss (the [noobj] mask below restricts this term to cells without objects;
91
+ without it, the BCE would cover both object and no-object cells)
92
+
93
+ predictions shape: [batch, 3, 13, 13]
94
+ target shape: [batch, 3, 13, 13]
95
+ """
96
+ no_object_loss = self.bce(predictions[..., 0][noobj], target[..., 0][noobj])
97
+
98
+ """
99
+ object loss
100
+
101
+ predictions[..., 0][obj] shape: [total_object_in_batch]
102
+ target[..., 0][obj] shape: [total_object_in_batch]
103
+ """
104
+ object_loss = self.bce(predictions[..., 0][obj], target[..., 0][obj])
105
+
106
+ anchors = anchors.reshape(1, 3, 1, 1, 2)
107
+ box_preds = torch.cat(
108
+ [
109
+ self.sigmoid(predictions[..., 1:3]),
110
+ torch.exp(predictions[..., 3:5]) * anchors,
111
+ ],
112
+ dim=-1,
113
+ )
114
+
115
+ ious = self.iou(box_preds[obj], target[..., 1:5][obj]).detach()
116
+ object_loss += self.mse(
117
+ self.sigmoid(predictions[..., 0][obj]), ious * target[..., 0][obj]
118
+ )
119
+
120
+ """
121
+ coordinate loss or box loss
122
+
123
+ predictions[..., 1:5][obj] shape: [total_obj_in_batch, 4]
124
+ target[..., 1:5][obj] shape: [total_obj_in_batch, 4]
125
+ """
126
+ # x, y coordinates
127
+ predictions[..., 1:3] = self.sigmoid(predictions[..., 1:3])
128
+
129
+ # width, height coordinates
130
+ target[..., 3:5] = torch.log((1e-16 + target[..., 3:5] / anchors))
131
+ box_loss = self.mse(predictions[..., 1:5][obj], target[..., 1:5][obj])
132
+
133
+ """
134
+ classification loss : cross entropy
135
+
136
+ predictions[..., 5:][obj] shape: [total_obj_in_batch, nclasses]
137
+ target[..., 5][obj].long() shape: [total_obj_in_batch]
138
+ """
139
+ class_loss = self.entropy(
140
+ (predictions[..., 5:][obj]),
141
+ (target[..., 5][obj].long()),
142
+ )
143
+
144
+ """
145
+ classification loss : binary cross entropy
146
+
147
+ This is my innovation: could be wrong
148
+ Train and test without it as well.
149
+ """
150
+ binary_class_loss = self.bce(
151
+ predictions[..., 5:][obj],
152
+ F.one_hot(target[..., 5][obj].long(), num_classes=self.nclasses).float(),
153
+ )
154
+
155
+ return (
156
+ self.lambda_box * box_loss
157
+ + self.lambda_obj * object_loss
158
+ + self.lambda_noobj * no_object_loss
159
+ + self.lambda_class * class_loss
160
+ + self.lambda_class * binary_class_loss
161
+ )
162
+
163
+
164
+ if __name__ == "__main__":
165
+ from src.run.yolov3 import config
166
+ from src.datasets.pascal_voc import YOLODataset
167
+
168
+ S = 13
169
+ yl = YoloLoss(nclasses=20)
170
+
171
+ predictions = torch.rand((20, 3, S, S, 25))
172
+
173
+ # build target
174
+ IMAGE_SIZE = config.IMAGE_SIZE
175
+
176
+ train_dataset = YOLODataset(
177
+ config.DATASET + "/train.csv",
178
+ transform=None, # config.train_transforms,
179
+ S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
180
+ img_dir=config.IMG_DIR,
181
+ label_dir=config.LABEL_DIR,
182
+ anchors=config.ANCHORS,
183
+ )
184
+ _, target = train_dataset[3]
185
+ target = target[0].unsqueeze(0) # target[0] if S=13
186
+ target = torch.cat([target, target] * 10)
187
+
188
+ # anchor
189
+ anchor = S * train_dataset.anchors[0]
190
+
191
+ print(yl(predictions, target, anchor))
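As a quick sanity check of the center-format `iou` above, two hand-picked boxes whose overlap is easy to compute by hand (intersection 0.12, union 0.20, so IoU ≈ 0.6); a sketch:

# Sketch: check YoloLoss.iou on two [cx, cy, w, h] boxes with a known answer.
import torch
from src.loss.yolov3 import YoloLoss

box1 = torch.tensor([0.5, 0.5, 0.4, 0.4])
box2 = torch.tensor([0.6, 0.5, 0.4, 0.4])
print(YoloLoss.iou(box1, box2))  # ~0.6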
src/model/yolov3/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .model import YOLOv3
src/model/yolov3/model.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ Implementation of YOLOv3 architecture
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ """
9
+ Information about architecture config:
10
+ Tuple is structured by (filters, kernel_size, stride)
11
+ Every conv is a same convolution.
12
+ List is structured by "B" indicating a residual block followed by the number of repeats
13
+ "S" is for scale prediction block and computing the yolo loss
14
+ "U" is for upsampling the feature map and concatenating with a previous layer
15
+ """
16
+ model_config = [
17
+ (32, 3, 1),
18
+ (64, 3, 2),
19
+ ["B", 1],
20
+ (128, 3, 2),
21
+ ["B", 2],
22
+ (256, 3, 2),
23
+ ["B", 8],
24
+ (512, 3, 2),
25
+ ["B", 8],
26
+ (1024, 3, 2),
27
+ ["B", 4], # To this point is Darknet-53
28
+ (512, 1, 1),
29
+ (1024, 3, 1),
30
+ "S",
31
+ (256, 1, 1),
32
+ "U",
33
+ (256, 1, 1),
34
+ (512, 3, 1),
35
+ "S",
36
+ (128, 1, 1),
37
+ "U",
38
+ (128, 1, 1),
39
+ (256, 3, 1),
40
+ "S",
41
+ ]
42
+
43
+
44
+ class CNNBlock(nn.Module):
45
+ def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
46
+ super().__init__()
47
+ self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
48
+ self.bn = nn.BatchNorm2d(out_channels)
49
+ self.leaky = nn.LeakyReLU(0.1)
50
+ self.use_bn_act = bn_act
51
+
52
+ def forward(self, x):
53
+ if self.use_bn_act:
54
+ return self.leaky(self.bn(self.conv(x)))
55
+ else:
56
+ return self.conv(x)
57
+
58
+
59
+ class ResidualBlock(nn.Module):
60
+ def __init__(self, channels, use_residual=True, num_repeats=1):
61
+ super().__init__()
62
+ self.layers = nn.ModuleList()
63
+ for repeat in range(num_repeats):
64
+ self.layers += [
65
+ nn.Sequential(
66
+ CNNBlock(channels, channels // 2, kernel_size=1),
67
+ CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
68
+ )
69
+ ]
70
+
71
+ self.use_residual = use_residual
72
+ self.num_repeats = num_repeats
73
+
74
+ def forward(self, x):
75
+ for layer in self.layers:
76
+ if self.use_residual:
77
+ x = x + layer(x)
78
+ else:
79
+ x = layer(x)
80
+
81
+ return x
82
+
83
+
84
+ class ScalePrediction(nn.Module):
85
+ def __init__(self, in_channels, num_classes):
86
+ super().__init__()
87
+ self.pred = nn.Sequential(
88
+ CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
89
+ CNNBlock(
90
+ 2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1
91
+ ),
92
+ )
93
+ self.num_classes = num_classes
94
+
95
+ def forward(self, x):
96
+ x = (
97
+ self.pred(x)
98
+ .reshape(x.shape[0], 3, self.num_classes + 5, x.shape[2], x.shape[3])
99
+ .permute(0, 1, 3, 4, 2)
100
+ )
101
+ return x
102
+
103
+
104
+ class YOLOv3(nn.Module):
105
+ def __init__(self, in_channels=3, num_classes=80):
106
+ super().__init__()
107
+ self.num_classes = num_classes
108
+ self.in_channels = in_channels
109
+ self.layers = self._create_conv_layers()
110
+
111
+ def forward(self, x):
112
+ outputs = [] # for each scale
113
+ route_connections = []
114
+
115
+ for layer in self.layers:
116
+ if isinstance(layer, ScalePrediction):
117
+ outputs.append(layer(x))
118
+ continue
119
+
120
+ x = layer(x)
121
+
122
+ if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
123
+ route_connections.append(x)
124
+
125
+ elif isinstance(layer, nn.Upsample):
126
+ x = torch.cat([x, route_connections[-1]], dim=1)
127
+ route_connections.pop()
128
+
129
+ return outputs
130
+
131
+ def _create_conv_layers(self):
132
+ layers = nn.ModuleList()
133
+ in_channels = self.in_channels
134
+
135
+ for module in model_config:
136
+ if isinstance(module, tuple):
137
+ out_channels, kernel_size, stride = module
138
+ layers.append(
139
+ CNNBlock(
140
+ in_channels,
141
+ out_channels,
142
+ kernel_size=kernel_size,
143
+ stride=stride,
144
+ padding=1 if kernel_size == 3 else 0,
145
+ )
146
+ )
147
+ in_channels = out_channels
148
+
149
+ elif isinstance(module, list):
150
+ num_repeats = module[1]
151
+ layers.append(
152
+ ResidualBlock(
153
+ in_channels,
154
+ num_repeats=num_repeats,
155
+ )
156
+ )
157
+
158
+ elif isinstance(module, str):
159
+ if module == "S":
160
+ layers += [
161
+ ResidualBlock(in_channels, use_residual=False, num_repeats=1),
162
+ CNNBlock(in_channels, in_channels // 2, kernel_size=1),
163
+ ScalePrediction(in_channels // 2, num_classes=self.num_classes),
164
+ ]
165
+ in_channels = in_channels // 2
166
+
167
+ elif module == "U":
168
+ layers.append(
169
+ nn.Upsample(scale_factor=2),
170
+ )
171
+ in_channels = in_channels * 3
172
+
173
+ return layers
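A quick shape check of the three ScalePrediction heads; a sketch assuming a 416×416 input and the 20 Pascal VOC classes:

# Sketch: the model returns one tensor per scale, coarsest first.
import torch
from src.model.yolov3 import YOLOv3

model = YOLOv3(num_classes=20)
x = torch.randn(2, 3, 416, 416)
for out in model(x):
    print(out.shape)
# expected: [2, 3, 13, 13, 25], [2, 3, 26, 26, 25], [2, 3, 52, 52, 25]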
src/run/__init__.py ADDED
File without changes
src/run/yolov3/config.py ADDED
@@ -0,0 +1,196 @@
1
+ import albumentations as A
2
+ import cv2
3
+ import torch
4
+ from albumentations.pytorch import ToTensorV2
5
+
6
+
7
+ DATASET = "PASCAL_VOC"
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+
10
+ NUM_WORKERS = 0
11
+ BATCH_SIZE = 32
12
+ IMAGE_SIZE = 416
13
+ NUM_CLASSES = 20
14
+ LEARNING_RATE = 1e-5
15
+ WEIGHT_DECAY = 1e-4
16
+ NUM_EPOCHS = 100
17
+ CONF_THRESHOLD = 0.05
18
+ MAP_IOU_THRESH = 0.5
19
+ NMS_IOU_THRESH = 0.45
20
+ S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]
21
+ PIN_MEMORY = True
22
+ LOAD_MODEL = False
23
+ SAVE_MODEL = True
24
+ CHECKPOINT_FILE = "checkpoint.pth.tar"
25
+ IMG_DIR = DATASET + "/images/"
26
+ LABEL_DIR = DATASET + "/labels/"
27
+
28
+ ANCHORS = [
29
+ [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
30
+ [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
31
+ [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
32
+ ] # Note these have been rescaled to be between [0, 1]
33
+
34
+ means = [0.485, 0.456, 0.406]
35
+
36
+ scale = 1.1
37
+ train_transforms = A.Compose(
38
+ [
39
+ A.LongestMaxSize(max_size=int(IMAGE_SIZE * scale)),
40
+ A.PadIfNeeded(
41
+ min_height=int(IMAGE_SIZE * scale),
42
+ min_width=int(IMAGE_SIZE * scale),
43
+ border_mode=cv2.BORDER_CONSTANT,
44
+ ),
45
+ A.Rotate(limit=10, interpolation=1, border_mode=4),
46
+ A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
47
+ A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
48
+ A.OneOf(
49
+ [
50
+ A.ShiftScaleRotate(
51
+ rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
52
+ ),
53
+ # A.Affine(shear=15, p=0.5, mode="constant"),
54
+ ],
55
+ p=1.0,
56
+ ),
57
+ A.HorizontalFlip(p=0.5),
58
+ A.Blur(p=0.1),
59
+ A.CLAHE(p=0.1),
60
+ A.Posterize(p=0.1),
61
+ A.ToGray(p=0.1),
62
+ A.ChannelShuffle(p=0.05),
63
+ A.Normalize(
64
+ mean=[0, 0, 0],
65
+ std=[1, 1, 1],
66
+ max_pixel_value=255,
67
+ ),
68
+ ToTensorV2(),
69
+ ],
70
+ bbox_params=A.BboxParams(
71
+ format="yolo",
72
+ min_visibility=0.4,
73
+ label_fields=[],
74
+ ),
75
+ )
76
+ test_transforms = A.Compose(
77
+ [
78
+ A.LongestMaxSize(max_size=IMAGE_SIZE),
79
+ A.PadIfNeeded(
80
+ min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
81
+ ),
82
+ A.Normalize(
83
+ mean=[0, 0, 0],
84
+ std=[1, 1, 1],
85
+ max_pixel_value=255,
86
+ ),
87
+ ToTensorV2(),
88
+ ],
89
+ bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
90
+ )
91
+
92
+ PASCAL_CLASSES = [
93
+ "aeroplane",
94
+ "bicycle",
95
+ "bird",
96
+ "boat",
97
+ "bottle",
98
+ "bus",
99
+ "car",
100
+ "cat",
101
+ "chair",
102
+ "cow",
103
+ "diningtable",
104
+ "dog",
105
+ "horse",
106
+ "motorbike",
107
+ "person",
108
+ "pottedplant",
109
+ "sheep",
110
+ "sofa",
111
+ "train",
112
+ "tvmonitor",
113
+ ]
114
+
115
+ COCO_LABELS = [
116
+ "person",
117
+ "bicycle",
118
+ "car",
119
+ "motorcycle",
120
+ "airplane",
121
+ "bus",
122
+ "train",
123
+ "truck",
124
+ "boat",
125
+ "traffic light",
126
+ "fire hydrant",
127
+ "stop sign",
128
+ "parking meter",
129
+ "bench",
130
+ "bird",
131
+ "cat",
132
+ "dog",
133
+ "horse",
134
+ "sheep",
135
+ "cow",
136
+ "elephant",
137
+ "bear",
138
+ "zebra",
139
+ "giraffe",
140
+ "backpack",
141
+ "umbrella",
142
+ "handbag",
143
+ "tie",
144
+ "suitcase",
145
+ "frisbee",
146
+ "skis",
147
+ "snowboard",
148
+ "sports ball",
149
+ "kite",
150
+ "baseball bat",
151
+ "baseball glove",
152
+ "skateboard",
153
+ "surfboard",
154
+ "tennis racket",
155
+ "bottle",
156
+ "wine glass",
157
+ "cup",
158
+ "fork",
159
+ "knife",
160
+ "spoon",
161
+ "bowl",
162
+ "banana",
163
+ "apple",
164
+ "sandwich",
165
+ "orange",
166
+ "broccoli",
167
+ "carrot",
168
+ "hot dog",
169
+ "pizza",
170
+ "donut",
171
+ "cake",
172
+ "chair",
173
+ "couch",
174
+ "potted plant",
175
+ "bed",
176
+ "dining table",
177
+ "toilet",
178
+ "tv",
179
+ "laptop",
180
+ "mouse",
181
+ "remote",
182
+ "keyboard",
183
+ "cell phone",
184
+ "microwave",
185
+ "oven",
186
+ "toaster",
187
+ "sink",
188
+ "refrigerator",
189
+ "book",
190
+ "clock",
191
+ "vase",
192
+ "scissors",
193
+ "teddy bear",
194
+ "hair drier",
195
+ "toothbrush",
196
+ ]
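The ANCHORS above are normalized to [0, 1]; the loss and inference code work in grid units, so each scale's anchors are multiplied by its S. A sketch mirroring what `src/run/yolov3/inference.py` does:

# Sketch: rescale the normalized anchors to cell units for each of the three scales.
import torch
from src.run.yolov3 import config

scaled_anchors = torch.tensor(config.ANCHORS) * torch.tensor(config.S).reshape(-1, 1, 1)
print(scaled_anchors.shape)  # torch.Size([3, 3, 2])
print(scaled_anchors[0])     # the three anchors for the 13x13 scale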
src/run/yolov3/dataloader.py ADDED
@@ -0,0 +1,58 @@
1
+ from torch.utils.data import DataLoader
2
+ from src.run.yolov3 import config
3
+ from src.datasets.pascal_voc import YOLODataset
4
+
5
+
6
+ def get_loaders(train_csv_path, test_csv_path):
7
+ IMAGE_SIZE = config.IMAGE_SIZE
8
+ train_dataset = YOLODataset(
9
+ train_csv_path,
10
+ transform=config.train_transforms,
11
+ S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
12
+ img_dir=config.IMG_DIR,
13
+ label_dir=config.LABEL_DIR,
14
+ anchors=config.ANCHORS,
15
+ )
16
+ test_dataset = YOLODataset(
17
+ test_csv_path,
18
+ transform=config.test_transforms,
19
+ S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
20
+ img_dir=config.IMG_DIR,
21
+ label_dir=config.LABEL_DIR,
22
+ anchors=config.ANCHORS,
23
+ )
24
+ train_loader = DataLoader(
25
+ dataset=train_dataset,
26
+ batch_size=config.BATCH_SIZE,
27
+ num_workers=config.NUM_WORKERS,
28
+ pin_memory=config.PIN_MEMORY,
29
+ shuffle=True,
30
+ drop_last=False,
31
+ )
32
+ test_loader = DataLoader(
33
+ dataset=test_dataset,
34
+ batch_size=config.BATCH_SIZE,
35
+ num_workers=config.NUM_WORKERS,
36
+ pin_memory=config.PIN_MEMORY,
37
+ shuffle=False,
38
+ drop_last=False,
39
+ )
40
+
41
+ train_eval_dataset = YOLODataset(
42
+ train_csv_path,
43
+ transform=config.test_transforms,
44
+ S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
45
+ img_dir=config.IMG_DIR,
46
+ label_dir=config.LABEL_DIR,
47
+ anchors=config.ANCHORS,
48
+ )
49
+ train_eval_loader = DataLoader(
50
+ dataset=train_eval_dataset,
51
+ batch_size=config.BATCH_SIZE,
52
+ num_workers=config.NUM_WORKERS,
53
+ pin_memory=config.PIN_MEMORY,
54
+ shuffle=False,
55
+ drop_last=False,
56
+ )
57
+
58
+ return train_loader, test_loader, train_eval_loader
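A usage sketch, assuming the PASCAL_VOC directory from config.py is in place; `train.csv` appears elsewhere in this commit, while `test.csv` is an assumed name:

# Sketch: build the three loaders and inspect one training batch.
from src.run.yolov3 import config
from src.run.yolov3.dataloader import get_loaders

train_loader, test_loader, train_eval_loader = get_loaders(
    train_csv_path=config.DATASET + "/train.csv",
    test_csv_path=config.DATASET + "/test.csv",  # assumed file name
)
images, targets = next(iter(train_loader))
print(images.shape)                # [BATCH_SIZE, 3, 416, 416]
print([t.shape for t in targets])  # one [BATCH_SIZE, 3, S, S, 6] tensor per scale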
src/run/yolov3/example/yolov3.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
src/run/yolov3/inference.py ADDED
@@ -0,0 +1,333 @@
1
+ import os
2
+
3
+ import albumentations as A
4
+ import cv2
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ import torch
8
+ from albumentations.pytorch import ToTensorV2
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+
12
+ from src.loss.yolov3 import YoloLoss
13
+ from src.model.yolov3 import YOLOv3 as Model
14
+ from src.run.yolov3 import config
15
+
16
+
17
+ class YoloInfer:
18
+ def __init__(self, model_path):
19
+ self.model = self.load_model(model_path=model_path)
20
+ self.transform = A.Compose(
21
+ [
22
+ A.Resize(config.IMAGE_SIZE, config.IMAGE_SIZE),
23
+ A.Normalize(
24
+ mean=[0, 0, 0],
25
+ std=[1, 1, 1],
26
+ max_pixel_value=255,
27
+ ),
28
+ ToTensorV2(),
29
+ ]
30
+ )
31
+
32
+ self.scaled_anchors = (
33
+ torch.tensor(config.ANCHORS) * torch.tensor(config.S).reshape(-1, 1, 1)
34
+ ).to(config.DEVICE)
35
+
36
+ def load_model(self, model_path):
37
+ model = Model(num_classes=config.NUM_CLASSES).to(config.DEVICE)
38
+
39
+ if os.path.isfile(model_path):
40
+ checkpoint = torch.load(model_path, map_location=config.DEVICE)
41
+ model.load_state_dict(checkpoint["model_state_dict"], strict=False)
42
+
43
+ return model
44
+
45
+ @staticmethod
46
+ def pred_to_boxes(prediction, anchors):
47
+ """
48
+ prediction tensor = [batch, num_anchors_per_scale, scale, scale, 5 + num_classes]
49
+ 5 = [objectness, cx, cy, w, h]
50
+ anchors tensor = [num_anchors_per_scale, 2]
51
+
52
+ Note: The operation below could have been done entirely in place.
53
+ The implementation is left slightly unoptimised to maintain readability.
54
+
55
+ Output shape: [batch, num_anchors_per_scale, scale, scale, 7]
56
+ 7: [predicted class idx, objectness score, cx, cy, width, height, predicted class probability]
57
+ """
58
+ scale = prediction.shape[2]
59
+
60
+ # reversing the equations of box loss and obj in the loss function
61
+ anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
62
+ cx_cy = torch.sigmoid(prediction[..., 1:3])
63
+ width_height = anchors * torch.exp(prediction[..., 3:5])
64
+
65
+ # reversing the equations we wrote while making training data
66
+ arange = torch.arange(scale, device=config.DEVICE)
67
+ cx = (arange.reshape(1, 1, 1, scale, 1) + cx_cy[..., 0:1]) / scale
68
+ cy = (arange.reshape(1, 1, scale, 1, 1) + cx_cy[..., 1:2]) / scale
69
+ width_height = width_height / scale
70
+
71
+ # class prediction
72
+ class_predictions = torch.softmax(prediction[..., 5:], dim=-1)
73
+ class_score, class_idx = torch.max(class_predictions, dim=-1)
74
+ class_score, class_idx = class_score.unsqueeze(-1), class_idx.unsqueeze(-1)
75
+
76
+ # objectness score
77
+ obj_score = torch.sigmoid(prediction[..., 0:1])
78
+ return torch.cat(
79
+ [class_idx, obj_score, cx, cy, width_height, class_score], dim=-1
80
+ )
81
+
82
+ @staticmethod
83
+ def sort_3Dtensor_rows_on_two_columns(
84
+ tensor, index1, index2, descending1=True, descending2=True
85
+ ):
86
+ """
87
+ tensor = tensor([[[1, 2, 3],
88
+ [1, 3, 4],
89
+ [0, 2, 1]],
90
+
91
+ [[0, 2, 3],
92
+ [1, 4, 5],
93
+ [0, 1, 2]]])
94
+
95
+ sort_tensor_rows_on_two_columns(tensor,
96
+ index1=0,
97
+ index2=1,
98
+ descending1=False,
99
+ descending2=True)
100
+
101
+ output = tensor([[[0, 2, 1],
102
+ [1, 3, 4],
103
+ [1, 2, 3]],
104
+
105
+ [[0, 2, 3],
106
+ [0, 1, 2],
107
+ [1, 4, 5]]])
108
+ """
109
+ inner_sorting = torch.argsort(tensor[..., index2], descending=descending2)
110
+ inner_sorted = torch.gather(
111
+ tensor, 1, inner_sorting.unsqueeze(-1).expand(-1, -1, tensor.size(2))
112
+ )
113
+
114
+ outer_sorting = torch.argsort(
115
+ inner_sorted[:, :, index1], stable=True, descending=descending1
116
+ )
117
+ outer_sorted = torch.gather(
118
+ inner_sorted,
119
+ 1,
120
+ outer_sorting.unsqueeze(-1).expand(-1, -1, inner_sorted.size(2)),
121
+ )
122
+ return outer_sorted
123
+
124
+ @staticmethod
125
+ def non_max_supression(
126
+ prediction, iou_threshold, object_threshold, class_threshold
127
+ ):
128
+ """
129
+ prediction = [batch, summation(num_anchors_per_scale * scale * scale), 7]
130
+ i.e. [batch, (3 * 13 * 13 + 3 * 26 * 26 + 3 * 52 * 52), 7]
131
+
132
+ 7: [class_pred, obj_score, cx, cy, width, height, class_score]
133
+ """
134
+ """
135
+ inside each batch output,
136
+ first sort by class prediction,
137
+ and inside each class sort objectness in descending
138
+ """
139
+ prediction = YoloInfer.sort_3Dtensor_rows_on_two_columns(
140
+ tensor=prediction, index1=0, index2=1, descending1=True, descending2=True
141
+ )
142
+
143
+ """
144
+ remove predictions with object threshold below the given threshold
145
+ and split prediction to get a list of tensors
146
+
147
+ length of list = batch size
148
+ each element in the list = results/output of 1 image
149
+ """
150
+ # objectness condition [threshold]
151
+ objectness = (prediction[..., 1] > object_threshold) & (
152
+ prediction[..., 6] > class_threshold
153
+ )
154
+ indices = torch.nonzero(objectness)
155
+ batch_boxes = torch.split(
156
+ tensor=prediction[objectness],
157
+ split_size_or_sections=torch.bincount(indices[:, 0]).tolist(),
158
+ dim=0,
159
+ )
160
+
161
+ # iterate for output
162
+ output = []
163
+
164
+ for boxes in tqdm(batch_boxes, disable=True):
165
+ # boxes shape = [-1, 7]
166
+ boxes = boxes.tolist()
167
+ final_boxes = []
168
+
169
+ while boxes:
170
+ top_box = boxes.pop(0)
171
+
172
+ idx = 0
173
+
174
+ while idx < len(boxes):
175
+ box = boxes[idx]
176
+
177
+ # class match
178
+ if box[0] != top_box[0]:
179
+ break
180
+
181
+ # iou match
182
+ if (
183
+ YoloLoss.iou(torch.tensor(top_box[2:6]), torch.tensor(box[2:6]))
184
+ > iou_threshold
185
+ ):
186
+ del boxes[idx]
187
+
188
+ idx -= 1
189
+
190
+ idx += 1
191
+
192
+ final_boxes.append(top_box)
193
+
194
+ output.append(final_boxes)
195
+
196
+ return output
197
+
198
+ @staticmethod
199
+ def draw_bounding_boxes(image, boxes, font_size=1):
200
+ """Draws bounding boxes on the image using OpenCV"""
201
+ cmap = plt.get_cmap("tab20b")
202
+ class_labels = (
203
+ config.COCO_LABELS if config.DATASET == "COCO" else config.PASCAL_CLASSES
204
+ )
205
+ colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
206
+ im = np.array(image)
207
+ height, width, _ = im.shape
208
+
209
+ # font = ImageFont.truetype("DejaVuSans.ttf", 20) # Load the DejaVuSans font
210
+
211
+ for box in boxes:
212
+ assert (
213
+ len(box) == 7
214
+ ), "box should contain class pred, confidence, x, y, width, height, class score"
215
+ class_pred = box[0]
216
+ class_score = round(box[-1], 2)
217
+
218
+ upper_left_x = int((box[2] - box[4] / 2) * width)
219
+ upper_left_y = int((box[3] - box[5] / 2) * height)
220
+ lower_right_x = int((box[2] + box[4] / 2) * width)
221
+ lower_right_y = int((box[3] + box[5] / 2) * height)
222
+
223
+ color = colors[int(class_pred)]
224
+ color_rgb = (int(color[0] * 255), int(color[1] * 255), int(color[2] * 255))
225
+ thickness = max(
226
+ int((0.0005 * (image.shape[0] + image.shape[1]) / 2) + 1), 1
227
+ )
228
+
229
+ cv2.rectangle(
230
+ im,
231
+ (upper_left_x, upper_left_y),
232
+ (lower_right_x, lower_right_y),
233
+ color_rgb,
234
+ thickness=thickness,
235
+ )
236
+
237
+ # label
238
+ font_scale = font_size
239
+ label = f"{class_labels[int(class_pred)]} {class_score}"
240
+ text_size = cv2.getTextSize(
241
+ label,
242
+ fontFace=cv2.FONT_HERSHEY_SIMPLEX,
243
+ fontScale=font_scale,
244
+ thickness=1,
245
+ )[0]
246
+
247
+ # Draw rectangle background
248
+ cv2.rectangle(
249
+ im,
250
+ (upper_left_x, upper_left_y),
251
+ (upper_left_x + text_size[0], upper_left_y - text_size[1]),
252
+ color_rgb,
253
+ thickness=-1,
254
+ )
255
+ cv2.putText(
256
+ im,
257
+ label,
258
+ (upper_left_x, upper_left_y),
259
+ fontFace=cv2.FONT_HERSHEY_SIMPLEX,
260
+ fontScale=font_scale,
261
+ color=[0, 0, 0],
262
+ thickness=1,
263
+ lineType=cv2.LINE_AA,
264
+ )
265
+
266
+ return im
267
+
268
+ def infer(
269
+ self,
270
+ image: np.ndarray,
271
+ iou_threshold=0.75,
272
+ object_threshold=0.75,
273
+ class_threshold=0.5,
274
+ font_size=1,
275
+ ):
276
+ self.model.eval()
277
+ input_tensor = self.transform(image=image)["image"].unsqueeze(0)
278
+
279
+ with torch.no_grad():
280
+ """
281
+ output = list of tensors
282
+ tensor shape=[batch, num_anchors_per_scale, scale, scale, 5 + num_classes]
283
+ """
284
+ output = self.model(input_tensor.to(config.DEVICE))
285
+
286
+ # convert model prediction to actual box prediction
287
+ output = torch.cat(
288
+ [
289
+ self.pred_to_boxes(out, self.scaled_anchors[idx]).reshape(
290
+ out.shape[0], -1, 7
291
+ )
292
+ for idx, out in enumerate(output)
293
+ ],
294
+ dim=1,
295
+ )
296
+
297
+ # non max suppression
298
+ output = self.non_max_supression(
299
+ prediction=output,
300
+ iou_threshold=iou_threshold,
301
+ object_threshold=object_threshold,
302
+ class_threshold=class_threshold,
303
+ )
304
+
305
+ return self.draw_bounding_boxes(image, output[0], font_size=font_size)
306
+
307
+ @staticmethod
308
+ def load_image_as_array(image_path):
309
+ # Load a PIL image
310
+ pil_image = Image.open(image_path)
311
+
312
+ # Convert PIL image to NumPy array
313
+ return np.array(pil_image.convert("RGB"))
314
+
315
+ @staticmethod
316
+ def plot_array(array: np.ndarray, figsize=(10, 10)):
317
+ plt.figure(figsize=figsize)
318
+ plt.imshow(array)
319
+ plt.show()
320
+
321
+ @staticmethod
322
+ def save_numpy_as_image(numpy_array, image_path):
323
+ """
324
+ Saves a NumPy array as an image.
325
+ Args:
326
+ numpy_array (numpy.ndarray): The NumPy array to be saved as an image.
327
+ image_path (str): The path where the image will be saved.
328
+ """
329
+ # Convert the NumPy array to a PIL image
330
+ image = Image.fromarray(numpy_array)
331
+
332
+ # Save the PIL image to the specified path
333
+ image.save(image_path)
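The sorting helper's docstring example can be replayed directly (the static methods do not need a checkpoint); a sketch:

# Sketch: replay the docstring example of sort_3Dtensor_rows_on_two_columns.
import torch
from src.run.yolov3.inference import YoloInfer

t = torch.tensor([[[1, 2, 3], [1, 3, 4], [0, 2, 1]],
                  [[0, 2, 3], [1, 4, 5], [0, 1, 2]]])
out = YoloInfer.sort_3Dtensor_rows_on_two_columns(
    t, index1=0, index2=1, descending1=False, descending2=True
)
print(out)
# expected, per the docstring:
# [[[0, 2, 1], [1, 3, 4], [1, 2, 3]],
#  [[0, 2, 3], [0, 1, 2], [1, 4, 5]]]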
src/run/yolov3/train.py ADDED
File without changes