lilyhof committed on
Commit
3ce86ea
1 Parent(s): c779966

Update app.py

Files changed (1)
  1. app.py +33 -29
app.py CHANGED
@@ -7,6 +7,7 @@ import datasets
 from datasets import load_dataset, DatasetDict, Audio
 from huggingface_hub import PyTorchModelHubMixin
 import numpy as np
+import librosa
 
 # Ensure you have the device setup (cuda or cpu)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -56,16 +57,24 @@ class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
 
 # Prepare data function
 def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-base"):
+
+    # Resample audio data to 16000 Hz
+    audio_data_resampled = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=16000)
+
+    # Initialize the feature extractor
     feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
-    inputs = feature_extractor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
-    input_features = inputs.input_features
-    decoder_input_ids = torch.tensor([[1, 1]])  # Modify as per your model's requirements
-    return input_features.to(device), decoder_input_ids.to(device)
+
+    # Use Dataset class
+    dataset = SpeechInferenceDataset([{"audio": {"array": audio_data_resampled, "sampling_rate": 16000}}],
+                                     text_processor=feature_extractor)
+
+    return dataset
+
 
 # Prediction function
 def predict(audio_data, sampling_rate, config):
     input_features, decoder_input_ids = prepare_data(audio_data, sampling_rate, config["encoder"])
-
+
     model = SpeechClassifier(config).to(device)
     # Here we load the model from Hugging Face Hub
     model.load_state_dict(torch.hub.load_state_dict_from_url("https://huggingface.co/jcho02/whisper_cleft/resolve/main/pytorch_model.bin", map_location=device))
@@ -86,34 +95,29 @@ def gradio_file_interface(uploaded_file):
     return label
 
 def gradio_mic_interface(mic_input):
-    # mic_input is a dictionary with 'data' and 'sample_rate' keys
-    prediction = predict(mic_input['data'], mic_input['sample_rate'], config)
+    # mic_input is a tuple with sample_rate and data as entries
+    # (44100, array([   0,    0,    0, ..., -153, -140, -120], dtype=int16))
+    prediction = predict(mic_input[1], mic_input[0], config)
     label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
     return label
 
-# Initialize Blocks
-demo = gr.Blocks()
-
-# Tab 1: Upload File
-file_interface = gr.Interface(
-    fn=gradio_file_interface,
-    inputs=gr.Audio(sources="upload", type="filepath"),  # Use filepath for uploaded audio files
-    outputs=gr.Textbox(label="Prediction")
-)
-
-# Tab 2: Record with Mic
-mic_interface = gr.Interface(
-    fn=gradio_mic_interface,
-    inputs=gr.Audio(sources="microphone", type="numpy"),  # Use numpy for real-time audio like microphone
-    outputs=gr.Textbox(label="Prediction")
-)
-
 # Define the interfaces inside the Blocks context
-with demo:
-    gr.TabbedInterface(
-        [file_interface, mic_interface],
-        ["Upload File", "Record Using Microphone"]
-    )
+with gr.Blocks() as demo:
+    # File Upload Tab
+    with gr.Tab("Upload File"):
+        gr.Interface(
+            fn=gradio_file_interface,
+            inputs=gr.Audio(sources="upload", type="filepath"),  # Use filepath for uploaded audio files
+            outputs=gr.Textbox(label="Prediction")
+        )
+
+    # Mic Tab
+    with gr.Tab("Record Using Microphone"):
+        gr.Interface(
+            fn=gradio_mic_interface,
+            inputs=gr.Audio(sources="microphone", type="numpy"),  # Use numpy for real-time audio like microphone
+            outputs=gr.Textbox(label="Prediction")
+        )
 
 # Launch the demo with debugging enabled
 demo.launch(debug=True)
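A note on the new preprocessing path: librosa.resample expects floating-point audio, while Gradio's numpy microphone input arrives as int16 (see the sample shown in the gradio_mic_interface comment). Below is a minimal standalone sketch of the same resample-then-extract flow with the dtype conversion made explicit; the helper name extract_features and the int16 normalization are illustrative, not part of this commit.

import numpy as np
import librosa
from transformers import WhisperFeatureExtractor

def extract_features(audio_data, sampling_rate, checkpoint="openai/whisper-base"):
    # Gradio's numpy microphone input is int16; librosa.resample requires
    # float audio, so normalize to [-1.0, 1.0] first.
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    # Whisper feature extractors expect 16 kHz input, hence the fixed target_sr
    resampled = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=16000)
    extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
    inputs = extractor(resampled, sampling_rate=16000, return_tensors="pt")
    return inputs.input_features  # log-mel features, (1, 80, 3000) for whisper-base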
 
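One thing the diff leaves open: prepare_data now returns a SpeechInferenceDataset, but the unchanged line in predict still unpacks input_features, decoder_input_ids from its return value. Presumably each dataset item yields that pair; the class definition sits outside this diff, so the following bridge is a sketch under that assumption.

# Assumption: SpeechInferenceDataset[i] yields an (input_features, decoder_input_ids)
# pair; verify against the class definition elsewhere in app.py before relying on this.
dataset = prepare_data(audio_data, sampling_rate, config["encoder"])
input_features, decoder_input_ids = dataset[0]
input_features = input_features.to(device)
decoder_input_ids = decoder_input_ids.to(device)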
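Worth noting while reading predict: since SpeechClassifier inherits PyTorchModelHubMixin, a checkpoint that was pushed with the mixin's save_pretrained/push_to_hub could be reloaded with from_pretrained instead of the manual torch.hub.load_state_dict_from_url call. Whether that applies to jcho02/whisper_cleft depends on how the weights were uploaded, so treat this as a sketch, not a drop-in replacement.

# Sketch, assuming the Hub checkpoint was saved via the mixin's save_pretrained()
# or push_to_hub(); otherwise keep the explicit load_state_dict_from_url call.
model = SpeechClassifier.from_pretrained("jcho02/whisper_cleft")
model.to(device)
model.eval()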