saronium committed · verified
Commit 6abefd9 · 1 Parent(s): 66ca40c

Audio: https://cdn-uploads.huggingface.co/production/uploads/6586f45c65df457a558c1a24/ZQ0AmeGvvtVf02fGsHMuW.mpga

Files changed (1)
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
+ from flask import Flask, request, jsonify, render_template
+ import os
+ import torch
+ import librosa
+ import numpy as np
+ from torchvision import models
+ from scipy.ndimage import zoom
+ import joblib
+
+ # Mapping from language name to the class index used during training;
+ # the 'ann_model' and 'pca' artifacts come from the earlier training code.
+ language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3, 'kannada': 4, 'telugu': 5}
+
+ app = Flask(__name__)
+
+ # Load the trained classifier and the fitted PCA instance
+ ann_model = torch.load('ann_model.pth')
+ pca = joblib.load('pca.pkl')
+
+ # Load the VGG16 convolutional layers as a fixed feature extractor
+ vgg16 = models.vgg16(pretrained=True).features
+
+
+ def preprocess_single_audio_vgg16(audio_file, vgg16_model, pca_instance):
+     # Load the audio and compute a log-scaled, normalized Mel spectrogram
+     y, sr = librosa.load(audio_file, sr=None)
+     mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
+     log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
+     norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)
+
+     # Resize the Mel spectrogram to the target shape (128, 128) using zoom
+     target_shape = (128, 128)
+     resized_mel_spec = zoom(norm_mel_spec,
+                             (target_shape[0] / norm_mel_spec.shape[0],
+                              target_shape[1] / norm_mel_spec.shape[1]),
+                             mode='nearest')
+
+     # Stack the single channel three times to match VGG16's RGB input layout
+     mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)
+
+     # HWC -> CHW, add a batch dimension, and cast to float for the model
+     mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()
+
+     # Extract convolutional features with VGG16 (no gradients needed at inference)
+     vgg16_model.eval()
+     with torch.no_grad():
+         features = vgg16_model(mel_spec_tensor)
+
+     # Flatten the feature map into a single row vector
+     features_np = features.squeeze().detach().numpy()
+     features_flattened = features_np.flatten().reshape(1, -1)
+
+     # Reduce dimensionality with the PCA fitted at training time
+     features_pca = pca_instance.transform(features_flattened)
+
+     # Convert back to a PyTorch tensor for the classifier
+     features_tensor = torch.from_numpy(features_pca).float()
+
+     return features_tensor
+
+
+ @app.route('/')
+ def home():
+     return render_template('index.html')
+
+
+ @app.route('/predict', methods=['POST'])
+ def predict():
+     try:
+         # Get the uploaded audio file and save it to a temporary path
+         audio_file = request.files['file']
+         audio_file_path = 'temp.wav'
+         audio_file.save(audio_file_path)
+
+         # Preprocess the audio into a PCA-reduced feature vector
+         preprocessed_features = preprocess_single_audio_vgg16(audio_file_path, vgg16, pca)
+
+         # Run the classifier and take the most likely class
+         ann_model.eval()
+         with torch.no_grad():
+             output = ann_model(preprocessed_features)
+         _, predicted_class = torch.max(output, 1)
+
+         # Map the predicted class index back to its language label
+         predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]
+
+         # Delete the temporary audio file
+         os.remove(audio_file_path)
+
+         # Return the prediction as JSON
+         return jsonify({'prediction': predicted_label})
+     except KeyError:
+         return jsonify({'error': 'Audio file not found in the request'}), 400
+     except Exception as e:
+         return jsonify({'error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     app.run(host='0.0.0.0', port=8000)
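The app assumes 'ann_model.pth' and 'pca.pkl' already exist from the earlier training code. Below is a minimal sketch of how compatible artifacts might be produced; the LanguageANN class, the 256-component PCA, and the random stand-in features are all illustrative assumptions, not the actual training pipeline. Note that torch.load() on a full pickled module only succeeds if the module's class is importable where app.py runs (and, on recent PyTorch versions, it may require weights_only=False).

import numpy as np
import torch
import torch.nn as nn
import joblib
from sklearn.decomposition import PCA

# Hypothetical classifier head; the real architecture lives in the training code.
class LanguageANN(nn.Module):
    def __init__(self, input_dim=256, num_classes=6):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        return self.net(x)

# For a (1, 3, 128, 128) input, vgg16.features emits a (1, 512, 4, 4) map,
# i.e. 512 * 4 * 4 = 8192 values once flattened; random stand-in features here.
X_features = np.random.randn(300, 8192).astype(np.float32)

pca = PCA(n_components=256).fit(X_features)
joblib.dump(pca, 'pca.pkl')

ann_model = LanguageANN(input_dim=256, num_classes=6)
# ... training loop omitted ...

# Saving the whole module (not just a state_dict) is what lets app.py restore it
# with a bare torch.load('ann_model.pth'); the class must be importable at load time.
torch.save(ann_model, 'ann_model.pth')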
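Once the server is running, the /predict endpoint can be exercised with any HTTP client. A minimal sketch follows, assuming a local speech clip exists ('clip.wav' is a hypothetical filename); the multipart form field must be named 'file' to match request.files['file'] in app.py.

import requests

# Any short speech clip that librosa can decode; 'clip.wav' is a placeholder
with open('clip.wav', 'rb') as f:
    response = requests.post('http://localhost:8000/predict', files={'file': f})

print(response.status_code)  # 200 on success, 400 if the 'file' field is missing
print(response.json())       # e.g. {'prediction': 'tamil'}

The same request from the command line: curl -F 'file=@clip.wav' http://localhost:8000/predict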