import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

# The trained 'ann_model' weights and the fitted 'pca' instance come from the
# training code; both are loaded from disk below.
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3}

class ANNModel(nn.Module):
    """Feed-forward classifier: 300 PCA features -> 256 -> 64 -> 4 language logits."""
    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 4)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        return x

# Create an instance of your model
ann_model = ANNModel()

# Load the trained weights (mapped to the CPU so the demo also runs without a GPU)
ann_model.load_state_dict(torch.load('ann_model_256_01_94.pth', map_location='cpu'))

# Load the PCA instance
pca = load('pca_256_01_94.pkl')

# Pretrained VGG16 convolutional layers, used as a fixed feature extractor
vgg16 = models.vgg16(pretrained=True).features
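
# For a 224x224 input the VGG16 .features block produces a 512x7x7 map
# (25,088 values once flattened); the saved PCA is expected to reduce this to
# the 300 features that ANNModel.fc1 takes as input.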
# Preprocess a single audio clip: mel spectrogram -> VGG16 features -> PCA features
def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    y = audio_data
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # Compute Mel spectrogram
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)  # Apply log transformation
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / np.std(log_mel_spec)  # Normalize

    # Resize the mel spectrogram to VGG16's expected input size (224, 224)
    target_shape = (224, 224)
    resized_mel_spec = zoom(norm_mel_spec, (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]), mode='nearest')

    # Stack the resized mel spectrogram along the third axis to create 3 channels
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # Convert the preprocessed audio data into a format suitable for the VGG16 model
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()  # Add batch dimension and change channel order

    # Extract features using VGG16
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Convert the features to numpy array and flatten them
    features_np = features.squeeze().detach().numpy()
    features_flattened = features_np.flatten().reshape(1, -1)

    # Apply PCA transformation
    features_pca = pca_instance.transform(features_flattened)

    # Convert to PyTorch tensor
    features_tensor = torch.from_numpy(features_pca).float()
    return features_tensor


def predict_language(audio_input):
    if isinstance(audio_input, str):
        # Input is a file path: load the audio at a fixed sample rate
        audio, sr = librosa.load(audio_input, sr=22050)
    else:
        # Input from Gradio is a (sample_rate, numpy array) tuple; convert to float
        sr, audio = audio_input
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            # Collapse stereo recordings to mono before computing the spectrogram
            audio = audio.mean(axis=1)

    # Preprocess the single audio file using VGG16 for feature extraction
    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)

    # Make predictions using the trained model
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    # Map predicted class index to actual label
    predicted_label = {v: k for k, v in language_mapping.items()}[predicted_class.item()]

    return predicted_label
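
# Optional quick check without the Gradio UI ('sample_clip.wav' below is only a
# hypothetical example path; substitute any local recording):
#   print(predict_language('sample_clip.wav'))  # -> e.g. 'english'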

iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")

iface.launch()