import torch
import torch.nn as nn
import librosa
import numpy as np
from torchvision import models
from scipy.ndimage import zoom
import gradio as gr
from joblib import load

# Assumes 'ann_model_256_01_94.pth' and 'pca_256_01_94.pkl' were produced by the
# training code from earlier (ANN classifier + PCA fitted on VGG16 features).
language_mapping = {'malayalam': 0, 'english': 1, 'tamil': 2, 'hindi': 3}
index_to_language = {v: k for k, v in language_mapping.items()}


class ANNModel(nn.Module):
    """Small feed-forward classifier over PCA-reduced VGG16 features."""

    def __init__(self):
        super(ANNModel, self).__init__()
        self.fc1 = nn.Linear(300, 256)  # 300 = number of PCA components
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 4)     # 4 target languages

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)


# Instantiate the classifier and load the trained weights (on CPU).
ann_model = ANNModel()
ann_model.load_state_dict(torch.load('ann_model_256_01_94.pth', map_location='cpu'))

# Load the fitted PCA instance.
pca = load('pca_256_01_94.pkl')

# Pretrained VGG16 convolutional backbone, used purely as a feature extractor.
# (The 'weights=' API replaces the deprecated 'pretrained=True' in torchvision >= 0.13.)
vgg16 = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features


def preprocess_single_audio_vgg16(audio_data, sr, vgg16_model, pca_instance):
    """Turn a mono waveform into a PCA-reduced VGG16 feature vector."""
    y = audio_data

    # Compute the Mel spectrogram, convert to dB, and standardize it.
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    norm_mel_spec = (log_mel_spec - np.mean(log_mel_spec)) / (np.std(log_mel_spec) + 1e-8)  # epsilon guards against silent clips

    # Resize the spectrogram to VGG16's expected input size (224, 224) using zoom.
    target_shape = (224, 224)
    resized_mel_spec = zoom(
        norm_mel_spec,
        (target_shape[0] / norm_mel_spec.shape[0], target_shape[1] / norm_mel_spec.shape[1]),
        mode='nearest',
    )

    # Replicate the single channel three times to match VGG16's RGB input.
    mel_spec_rgb = np.stack([resized_mel_spec] * 3, axis=-1)

    # To a (1, 3, 224, 224) float tensor: channels first, plus a batch dimension.
    mel_spec_tensor = torch.from_numpy(mel_spec_rgb).permute(2, 0, 1).unsqueeze(0).float()

    # Extract convolutional features with VGG16 (no gradients needed at inference).
    vgg16_model.eval()
    with torch.no_grad():
        features = vgg16_model(mel_spec_tensor)

    # Flatten the (512, 7, 7) feature map into a single row vector.
    features_flattened = features.squeeze().numpy().flatten().reshape(1, -1)

    # Reduce to 300 dimensions with the fitted PCA, then convert back to a tensor.
    features_pca = pca_instance.transform(features_flattened)
    return torch.from_numpy(features_pca).float()


def predict_language(audio_input):
    if isinstance(audio_input, str):
        # A file path: let librosa load and resample it to mono at 22.05 kHz.
        audio, sr = librosa.load(audio_input, sr=22050)
    else:
        # A Gradio (sample_rate, samples) tuple: scale integer PCM to float in
        # [-1, 1], then down-mix stereo to mono.
        sr, audio = audio_input
        if np.issubdtype(audio.dtype, np.integer):
            audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
        else:
            audio = audio.astype(np.float32)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

    # VGG16 feature extraction + PCA reduction.
    preprocessed_features = preprocess_single_audio_vgg16(audio, sr, vgg16, pca)

    # Classify and map the predicted index back to its language name.
    ann_model.eval()
    with torch.no_grad():
        output = ann_model(preprocessed_features)
        _, predicted_class = torch.max(output, 1)

    return index_to_language[predicted_class.item()]


iface = gr.Interface(fn=predict_language, inputs="audio", outputs="text")
iface.launch()
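
# A quick way to sanity-check the pipeline without the web UI is to call the
# predictor directly on a file path (hypothetical path below; any format
# librosa can decode should work). Comment out iface.launch() above if you
# only want this check:
#
#   print(predict_language("samples/tamil_clip.wav"))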