makiisthebes
committed on
Upload 9 files
- .gitattributes +2 -0
- best_model.txt +1 -0
- le_net_learning_mnist.py +266 -0
- lenet_mnist_model.pth +3 -0
- let_net_arch.png +0 -0
- mnist_dataset/t10k-images.idx3-ubyte +3 -0
- mnist_dataset/t10k-labels.idx1-ubyte +0 -0
- mnist_dataset/train-images.idx3-ubyte +3 -0
- mnist_dataset/train-labels.idx1-ubyte +0 -0
- utils.py +62 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+mnist_dataset/t10k-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
+mnist_dataset/train-images.idx3-ubyte filter=lfs diff=lfs merge=lfs -text
best_model.txt
ADDED
@@ -0,0 +1 @@
+0.9906
le_net_learning_mnist.py
ADDED
@@ -0,0 +1,266 @@
# Rewriting the LeNet model to learn the MNIST dataset and save the model parameters.
# This is considered something we should do in Week 3 of the Deep Learning and Computer Vision course.

# We will implement the LeNet-5 architecture to learn the MNIST dataset.

from torchvision.transforms import ToTensor
# from torchvision.transforms import v2
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets
import matplotlib.pyplot as plt
from PIL import Image
from time import time
from torch import nn
import pandas as pd
import numpy as np
import torch, os
from utils import ApplyEnhancementFilter

# Load device first (GPU or CPU)
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device for training/inference.")
if device == "cuda":
    print(f"GPU being used: {torch.cuda.get_device_name(0)}")


train_transform = transforms.Compose([
    # Data augmentation transformations
    # ApplyEnhancementFilter(out_channels=1, kernel_size=3, stride=1, padding=1),
    transforms.RandomAffine(degrees=35, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.RandomRotation(degrees=35),
    # Convert images to tensors and normalize
    transforms.ToTensor(),
    transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
    # Pad the image to make it 32x32
    transforms.Pad(2, fill=0, padding_mode='constant'),
])

# For the test dataset, you should not apply these augmentations.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.13066047430038452,), (0.30810782313346863,)),
    transforms.Pad(2, fill=0, padding_mode='constant'),
])

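# Illustrative sanity check (not part of the uploaded script): ToTensor turns a
# 28x28 grayscale PIL image into a 1x28x28 float tensor in [0, 1], and Pad(2)
# adds two pixels on every side, giving the 1x32x32 input LeNet-5 expects:
#   dummy = Image.fromarray(np.zeros((28, 28), dtype=np.uint8))
#   print(test_transform(dummy).shape)  # torch.Size([1, 32, 32])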
# Load the MNIST dataset, which is 28x28x1 images (black and white ~ 1 channel);
# the transforms above pad them to 32x32, the input size LeNet-5 expects.

# http://yann.lecun.com/exdb/mnist/
# datasets.MNIST

# Loading from Dataset and DataLoader, https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
# We can load using the known datasets, but what if we have our own dataset?
# training_data = datasets.MNIST(
#     root="data",
#     train=True,
#     download=True,
#     transform=ToTensor()
# )
#
# test_data = datasets.MNIST(
#     root="data",
#     train=False,
#     download=True,
#     transform=ToTensor()
# )

# Loading from a custom dataset
import idx2numpy
class CustomImageDataset(Dataset):
    """
    This class must inherit from the torch.utils.data.Dataset class
    and contain the functions __init__, __len__, and __getitem__.
    """
    def __init__(self, annotations_file, image_file, transform=None, target_transform=None):
        self.img_labels = idx2numpy.convert_from_file(annotations_file)
        self.images = idx2numpy.convert_from_file(image_file)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Get the image and label at the index idx."""
        label = self.img_labels[idx]
        img = self.images[idx]
        img = Image.fromarray(img)

        if self.transform:
            img = self.transform(img)
        if self.target_transform:
            label = self.target_transform(label)
        # Zero padding to make the image 32x32 (as the model expects) is applied by the transforms.

        # img = img.unsqueeze(0)  # Add channel dimension, as the model expects this.
        return img, label  # Return the image as float32 and the label as an int; this solves the dtype issue.


# Make the LeNet-5 model
class LeNet5Model(nn.Module):
    def __init__(self):
        super().__init__()
        # Define activation and sequential layers, then make the forward pass.
        self.tanh = nn.Tanh()
        # Convolutional layers, https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
        # Avg Pooling, https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html
        self.le_stack = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1),
            self.tanh,
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            self.tanh,
            nn.AvgPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1),
            self.tanh
        )
        # Fully connected layers, https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
        self.fc_stack = nn.Sequential(
            nn.Linear(in_features=120, out_features=84),
            self.tanh,
            nn.Linear(in_features=84, out_features=10)
        )

    def forward(self, x):
        """Forward pass of the model."""
        x = self.le_stack(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc_stack(x)
        return x
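# Shape walkthrough for a 1x32x32 input (added for clarity; N = batch size):
#   Conv2d(1->6, k=5)    -> N x 6 x 28 x 28
#   AvgPool2d(2, 2)      -> N x 6 x 14 x 14
#   Conv2d(6->16, k=5)   -> N x 16 x 10 x 10
#   AvgPool2d(2, 2)      -> N x 16 x 5 x 5
#   Conv2d(16->120, k=5) -> N x 120 x 1 x 1, reshaped to N x 120
#   Linear(120->84) -> Linear(84->10) produces the class logits.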


def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, saved_model=None):
    """
    Given a model, train it using the train_loader and test_loader, show metrics,
    and save the best model parameters seen so far.
    """
    # Once we have the model, we need the loss function and optimizer we will use.
    # Loss function, https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    loss_fn = nn.CrossEntropyLoss()  # because we are calculating class probabilities and this is a classification problem.
    # Optimizer, https://pytorch.org/docs/stable/optim.html
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-6)  # learning rate of 0.001
    best_accuracy = 0.0
    # See if a best accuracy is saved; if so, get the current best accuracy.
    if os.path.exists("best_model.txt"):
        with open("best_model.txt", "r") as file:
            best_accuracy = float(file.read())

    if saved_model is not None:  # Load the model parameters if they exist.
        model.load_state_dict(torch.load(saved_model))

    # Training loop
    for i in range(epochs):
        model.train()
        print("Epoch ", i)
        for batch, (x, y) in enumerate(train_loader):

            x, y = x.to(device), y.to(device)
            # Forward pass

            # print(x.shape, y.shape)
            # Shape of x is [64, 28, 28] and y is [64,]
            # But x needs to include the channels, so the shape should be [64, 1, 28, 28]
            # x = x.view(-1, 1, 32, 32)

            y_pred = model(x)
            # Compute loss
            loss = loss_fn(y_pred, y)
            # Zero gradients, backward pass, and update weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Print loss
            if batch % 250 == 0:
                print(f"Epoch {i} batch {batch} loss: {loss.item()}")
        # Evaluate the model
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for x, y in test_loader:
                x, y = x.to(device), y.to(device)
                # x = x.view(-1, 1, 32, 32)
                y_pred = model(x)
                _, predicted = torch.max(y_pred, 1)
                total += y.size(0)
                correct += (predicted == y).sum().item()
        print(f"Epoch {i} accuracy: {correct/total}")
        if correct/total > best_accuracy:
            best_accuracy = correct/total
            torch.save(model.state_dict(), "lenet_mnist_model.pth")
            with open("best_model.txt", "w") as file:
                file.write(f"{best_accuracy}")
    print("Training complete.")

def init_weights(m):
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            m.bias.data.fill_(0.01)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

if __name__ == "__main__":
    # Testing conversion from ubyte idx to numpy array

    # file_name = "t10k-images.idx3-ubyte"
    # label_file = "t10k-labels.idx1-ubyte"
    # file_path = os.path.join("mnist_dataset", label_file)
    # image_array = idx2numpy.convert_from_file(file_path)
    # print(image_array.shape)  # (10000, 28, 28) # 10000 images of 28x28 pixels


    test_data = CustomImageDataset("mnist_dataset/t10k-labels.idx1-ubyte", "mnist_dataset/t10k-images.idx3-ubyte", transform=test_transform)
    print((test_data[0])[0].shape, "label value", test_data[0][1])  # Getting image from dataset.
    train_data = CustomImageDataset("mnist_dataset/train-labels.idx1-ubyte", "mnist_dataset/train-images.idx3-ubyte", transform=train_transform)

    # Create a DataLoader, so we can iterate through the dataset in batches.
    test_loader = DataLoader(test_data, batch_size=64, shuffle=True)
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

    # print(f"Output shape of train function, ", next(iter(test_loader))[0].shape)  # [64x28x28] [64,] Images and labels.

    # Display image and label. - From docs.
    # train_features, train_labels = next(iter(train_loader))
    # print(f"Feature batch shape: {train_features.size()}")
    # print(f"Labels batch shape: {train_labels.size()}")
    # img = train_features[0].squeeze()
    # label = train_labels[0]
    # plt.imshow(img, cmap="gray")
    # plt.show()
    # print(f"Label: {label}")

    model = LeNet5Model().to(device)
    model.apply(init_weights)  # Apply Xavier initialisation to the model.
    print(model)


    # Training the model
    train_model(model, train_loader, test_loader, epochs=1000, learning_rate=0.001)
    # Save the model parameters
    torch.save(model.state_dict(), "lenet_mnist_model.pth")

# Current errors include:
# - RuntimeError: Input type (unsigned char) and bias type (float) should be the same
#   - I solved this by converting the image from the custom loader to float32 values.
# - RuntimeError: Calculated padded input size per channel: (4 x 4). Kernel size: (5 x 5). Kernel size can't be greater than actual input size
#   - I solved this by adding padding to make the input 32x32, as the model expects this and the dataset is 28x28.
# - The model also had problems when evaluating; it is important that dims are batch x channels x height x width, and labels are ints.

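# Illustrative reproduction of the two fixes above (not part of the uploaded script):
#   img = Image.fromarray(np.zeros((28, 28), dtype=np.uint8))  # raw MNIST pixels are unsigned char
#   x = transforms.ToTensor()(img)   # -> float32 in [0, 1], fixing the input/bias dtype error
#   x = transforms.Pad(2)(x)         # -> 1x32x32, large enough for the 5x5 kernels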
# Ways to improve accuracy:
# We will try to normalise the dataset via z-score, so values which are brighter are not given more importance. [98.99% accuracy]
# We can apply rotations and affine transformations to potentially improve the model by making it learn more abstractly from patterns rather than the exact same orientation.
# Xavier initialisation of the CNN and FC layers, to prevent vanishing gradients.
# Increase the angle of rotation and affine transformations to see if it improves the model.
# We could potentially help the model by applying an enhancement filter (negative Laplacian) from computer vision to the image.

# We do not know whether the model is overfitting, as we do not have a graph of the training and validation loss.
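That closing comment points at an easy addition: if train_model also recorded the mean training loss and the test accuracy each epoch, overfitting would show up as the two curves diverging. A minimal sketch of such a plot (illustrative only; plot_history and its inputs are not part of this upload):

import matplotlib.pyplot as plt

def plot_history(train_losses, test_accuracies):
    """Plot per-epoch mean training loss and test accuracy side by side."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(train_losses)
    ax1.set_xlabel("epoch")
    ax1.set_ylabel("mean training loss")
    ax2.plot(test_accuracies)
    ax2.set_xlabel("epoch")
    ax2.set_ylabel("test accuracy")
    fig.tight_layout()
    plt.show()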
lenet_mnist_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05ff80605ac574e7e667ec532c8c4b94845e2b11c0c69c06feccd7d86dbab95f
+size 250431
let_net_arch.png
ADDED
mnist_dataset/t10k-images.idx3-ubyte
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fa7898d509279e482958e8ce81c8e77db3f2f8254e26661ceb7762c4d494ce7
+size 7840016
mnist_dataset/t10k-labels.idx1-ubyte
ADDED
Binary file (10 kB)
mnist_dataset/train-images.idx3-ubyte
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba891046e6505d7aadcbbe25680a0738ad16aec93bde7f9b65e87a2fc25776db
+size 47040016
mnist_dataset/train-labels.idx1-ubyte
ADDED
Binary file (60 kB)
utils.py
ADDED
@@ -0,0 +1,62 @@
import idx2numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
from PIL import Image


class ApplyEnhancementFilter:
    def __init__(self, out_channels, kernel_size, stride=1, padding=0, bias=False):
        """
        Initialize the convolution parameters.
        """
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.bias = bias
        # Define the convolutional layer (not trained here)
        self.conv = nn.Conv2d(in_channels=1,  # Adjust this based on your image channels (1 for grayscale, 3 for RGB)
                              out_channels=out_channels,
                              kernel_size=kernel_size,
                              stride=stride,
                              padding=padding,
                              bias=bias)

        # Example: manually defining a simple sharpening (negative-Laplacian enhancement) kernel.
        # For a real use case, the kernel weights would be learned or defined according to the filter you need.
        # Note that the weight below is hard-coded as a 1x1x3x3 tensor, so this class only works
        # as written with out_channels=1 and kernel_size=3.
        edge_detection_kernel = torch.tensor([[0., -1., 0.],
                                              [-1., 5., -1.],
                                              [0., -1., 0.]]).unsqueeze(0).unsqueeze(0)
        self.conv.weight = nn.Parameter(edge_detection_kernel.float())

    def __call__(self, img):
        """
        Apply the convolution transformation.
        """
        # Convert PIL image to tensor
        img_tensor = transforms.functional.to_tensor(img).unsqueeze(0)  # Add batch dimension
        # Apply convolution
        conv_img = self.conv(img_tensor)
        # Remove batch dimension and convert back to PIL image for further transformations or visualization
        conv_img_pil = transforms.functional.to_pil_image(conv_img.squeeze(0))
        return conv_img_pil


if __name__ == "__main__":
    # It is important to normalise the dataset, so no specific input affects the model more than others based purely on input values.
    # As values can range from 0-255, this can cause problems, so a z-score will be used via Transforms.

    # First we need the mean and standard deviation of the train dataset.

    train_images = idx2numpy.convert_from_file("mnist_dataset/train-images.idx3-ubyte")

    # Convert the training images to a PyTorch tensor and scale values to [0, 1]
    train_images_tensor = torch.tensor(train_images, dtype=torch.float32) / 255.0

    train_mean = train_images_tensor.mean()
    train_std = train_images_tensor.std()

    print(f"Mean: {train_mean}, Std: {train_std}")
    # Mean: 0.13066047430038452, Std: 0.30810782313346863
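As a quick check on these statistics (a sketch continuing from the variables above; not part of the upload), applying the same shift and scale that transforms.Normalize performs should leave the training set with roughly zero mean and unit standard deviation:

    normalized = (train_images_tensor - train_mean) / train_std  # what Normalize does per channel
    print(normalized.mean(), normalized.std())  # ~0 and ~1, since the stats come from this same tensor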