Adal Abilbekov committed
Commit c9f5661
1 Parent(s): 580eae2

Adding CNN model
Browse files:
- __pycache__/model.cpython-39.pyc (+0 -0)
- app.py (+44 -3)
- cnn_class_17.pt (+3 -0)
- model.py (+67 -0)
- try.ipynb (+0 -0)
__pycache__/model.cpython-39.pyc
ADDED
Binary file (1.65 kB)
app.py
CHANGED
@@ -1,7 +1,48 @@
 import gradio as gr
+import torch
+# from lr_ed.model import CNNEmotinoalClassifier
+import torchaudio
+import IPython.display as ipd
+from torch import nn
+from model import CNNEmotinoalClassifier
 
-
-
+model = CNNEmotinoalClassifier()
+model.load_state_dict(torch.load('./cnn_class_17.pt'))
+model.eval()
 
-
+to_melspec = torchaudio.transforms.MelSpectrogram(
+    sample_rate=22050,
+    n_fft=1024,
+    hop_length=512,
+    n_mels=64
+)
+
+def _get_right_pad(target_waveform, waveform):
+    target_waveform = target_waveform
+    waveform_samples_number = waveform.shape[1]
+    if waveform_samples_number < target_waveform:
+        right_pad = target_waveform - waveform_samples_number
+        padding_touple = (0, right_pad)
+        waveform_padded = nn.functional.pad(waveform, padding_touple)
+    else:
+        waveform_padded = waveform
+    return waveform_padded
+
+def get_probs(audio):
+    emotions = ['happy', 'angry', 'sad', 'neutral', 'surprised', 'fear']
+    emotions = sorted(emotions)
+
+    sr, waveform = audio
+    waveform = _get_right_pad(400384, waveform)
+    input_x = to_melspec(waveform)
+    input_x = torch.unsqueeze(input_x, dim=1)
+
+    probs = model(input_x)
+    prediction = emotions[probs.argmax(dim=1).item()]
+    return dict(zip(emotions, list(map(float, probs[0]))))
+
+mic = gr.Audio(source="microphone", type="numpy", label="Speak here...")
+label = gr.Label()
+
+iface = gr.Interface(fn=get_probs, inputs=mic, outputs=label)
 iface.launch()
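A note on the new get_probs pipeline: gr.Audio(source="microphone", type="numpy") delivers audio as a (sample_rate, numpy_array) pair, typically int16 and at whatever rate the browser recorded, while _get_right_pad and to_melspec expect a float tensor of shape (1, num_samples) at 22050 Hz; the commit passes the raw array straight through (the self-assignment target_waveform = target_waveform and the unused prediction variable are harmless leftovers). A minimal conversion sketch that would bridge that gap (the int16 normalization, mono mixdown, and resampling step are assumptions on my part, not part of the commit):

import numpy as np
import torch
import torchaudio

def to_model_input(audio, target_sr=22050):
    # Gradio's numpy microphone input is a (sample_rate, int16 array) pair.
    sr, wav = audio
    wav = torch.from_numpy(wav.astype(np.float32) / 32768.0)  # int16 -> [-1.0, 1.0]
    if wav.dim() == 2:                       # (samples, channels): mix stereo to mono
        wav = wav.mean(dim=1)
    wav = wav.unsqueeze(0)                   # -> (1, num_samples)
    if sr != target_sr:                      # match the MelSpectrogram sample rate
        wav = torchaudio.functional.resample(wav, sr, target_sr)
    return wav

get_probs could then start with waveform = _get_right_pad(400384, to_model_input(audio)) instead of unpacking the tuple directly.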
cnn_class_17.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:776cb5ff06d6d2ddacef3ee8fdb61c67f241f82777c3f8a830d7c338256f174e
+size 16823379
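Only this three-line Git LFS pointer lives in the repository; the 16.8 MB checkpoint itself is fetched from LFS storage at clone or download time. One loading detail worth noting: app.py calls torch.load('./cnn_class_17.pt') without map_location, which fails on a CPU-only machine if the state dict was saved from GPU tensors. A hedged variant (map_location is standard torch.load; everything else mirrors app.py):

import torch
from model import CNNEmotinoalClassifier

model = CNNEmotinoalClassifier()
# map_location='cpu' makes the checkpoint load regardless of where it was saved.
model.load_state_dict(torch.load('./cnn_class_17.pt', map_location='cpu'))
model.eval()  # switch to inference mode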
model.py
ADDED
@@ -0,0 +1,67 @@
+import torch
+import torchaudio
+import numpy as np
+from torch import nn
+from torchsummary import summary
+
+class CNNEmotinoalClassifier(nn.Module):
+    def __init__(self):
+        super(CNNEmotinoalClassifier, self).__init__()
+
+        # conv : 4, flatten, linear, softmax
+
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2)
+        )
+
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2)
+        )
+
+        self.conv3 = nn.Sequential(
+            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=2),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2)
+        )
+
+        self.conv4 = nn.Sequential(
+            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=2),
+            nn.ReLU(),
+            nn.MaxPool2d(kernel_size=2)
+        )
+
+        self.flatten = nn.Flatten()
+
+        self.fully_connected = nn.Sequential(
+            nn.Linear(128 * 5 * 50, 128),
+            nn.ReLU(),
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Linear(64, 32),
+            nn.ReLU(),
+            nn.Linear(32, 16),
+            nn.ReLU(),
+            nn.Linear(16, 6)
+        )
+
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.conv4(x)
+        x = self.flatten(x)
+        logits = self.fully_connected(x)
+        probs = self.softmax(logits)
+        return probs
+
+
+if __name__ == '__main__':
+    device = ('cuda' if torch.cuda.is_available() else 'cpu')
+    model = CNNEmotinoalClassifier().to(device)
+    summary(model, (1, 64, 783))
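The nn.Linear(128 * 5 * 50, 128) input size is tied to the (1, 64, 783) shape passed to summary: app.py pads every clip to 400384 samples, and with hop_length=512 and torchaudio's centered framing that yields floor(400384 / 512) + 1 = 783 frames over 64 mel bins; each MaxPool2d then halves both axes, while the padding=2 convolutions add 2 to each axis before pooling, ending at 128 x 5 x 50 = 32000 features. A quick sanity check of that arithmetic (the dummy tensor is mine, not from the commit):

import torch
from model import CNNEmotinoalClassifier

# 64 mel bins x 783 frames, as produced for one 400384-sample padded clip.
x = torch.randn(1, 1, 64, 783)

m = CNNEmotinoalClassifier().eval()
with torch.no_grad():
    h = m.conv4(m.conv3(m.conv2(m.conv1(x))))
print(h.shape)  # torch.Size([1, 128, 5, 50]) -> flattens to 32000

Note also that forward already applies Softmax, which is why app.py can hand the output straight to gr.Label as an emotion-to-probability dict without extra normalization.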
try.ipynb
ADDED
The diff for this file is too large to render.