Update custom_interface.py
custom_interface.py (CHANGED, +8 -132)
@@ -1,7 +1,6 @@
 import torch
 from speechbrain.inference.interfaces import Pretrained
 import librosa
-import numpy as np


 class ASR(Pretrained):
@@ -84,139 +83,16 @@ class ASR(Pretrained):
             seq.append(token)
         output = []
         return seq
-
-
-    # def classify_file(self, path):
-    #     # waveform = self.load_audio(path)
-    #     waveform, sr = librosa.load(path, sr=16000)
-    #     waveform = torch.tensor(waveform)
-
-    #     # Fake a batch:
-    #     batch = waveform.unsqueeze(0)
-    #     rel_length = torch.tensor([1.0])
-    #     outputs = self.encode_batch(batch, rel_length)
-
-    #     return outputs


     def classify_file(self, path, device):
-        # Load the audio file
-        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)

-        # Get audio length in seconds
-        audio_length = len(waveform) / sr
-
-        if audio_length >= 20:
-            print(f"Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
-
-            segments = []
-            current_segment = []
-            current_length = 0
-            max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
-
-            for interval in non_silent_intervals:
-                start, end = interval
-                segment_part = waveform[start:end]
-
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
-
-            # Process each segment
-            outputs = []
-            for i, segment in enumerate(segments):
-                print(f"Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
-
-                segment_tensor = torch.tensor(segment).to(device)
-
-                # Fake a batch for the segment
-                batch = segment_tensor.unsqueeze(0).to(device)
-                rel_length = torch.tensor([1.0]).to(device)  # Adjust if necessary
-
-                # Pass the segment through the ASR model
-                segment_output = self.encode_batch(device, batch, rel_length)
-                yield segment_output
-        else:
-            waveform = torch.tensor(waveform).to(device)
-            waveform = waveform.to(device)
-            # Fake a batch:
-            batch = waveform.unsqueeze(0)
-            rel_length = torch.tensor([1.0]).to(device)
-            outputs = self.encode_batch(device, batch, rel_length)
-            yield outputs
-
-
-    def classify_file_whisper(self, path, pipe, device):
-        waveform, sr = librosa.load(path, sr=16000)
-        transcription = pipe(waveform, generate_kwargs={"language": "macedonian"})["text"]
-        return transcription
-
-
-    def classify_file_mms(self, path, processor, model, device):
-        # Load the audio file
-        waveform, sr = librosa.load(path, sr=16000)
-
-        # Get audio length in seconds
-        audio_length = len(waveform) / sr
-
-        if audio_length >= 20:
-            print(f"MMS Audio is too long ({audio_length:.2f} seconds), splitting into segments")
-            # Detect non-silent segments
-            non_silent_intervals = librosa.effects.split(waveform, top_db=20)  # Adjust top_db for sensitivity
-
-            segments = []
-            current_segment = []
-            current_length = 0
-            max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
-
-
-            for interval in non_silent_intervals:
-                start, end = interval
-                segment_part = waveform[start:end]
-
-                # If adding the next part exceeds max duration, store the segment and start a new one
-                if current_length + len(segment_part) > max_duration:
-                    segments.append(np.concatenate(current_segment))
-                    current_segment = []
-                    current_length = 0
-
-                current_segment.append(segment_part)
-                current_length += len(segment_part)
-
-            # Append the last segment if it's not empty
-            if current_segment:
-                segments.append(np.concatenate(current_segment))
-
-            # Process each segment
-            outputs = []
-            for i, segment in enumerate(segments):
-                print(f"MMS Processing segment {i + 1}/{len(segments)}, length: {len(segment) / sr:.2f} seconds")
-
-                segment_tensor = torch.tensor(segment).to(device)
-
-                # Pass the segment through the ASR model
-                inputs = processor(segment_tensor, sampling_rate=16_000, return_tensors="pt").to(device)
-                outputs = model(**inputs).logits
-                ids = torch.argmax(outputs, dim=-1)[0]
-                segment_output = processor.decode(ids)
-                yield segment_output
-        else:
-            waveform = torch.tensor(waveform).to(device)
-            inputs = processor(waveform, sampling_rate=16_000, return_tensors="pt").to(device)
-            outputs = model(**inputs).logits
-            ids = torch.argmax(outputs, dim=-1)[0]
-            transcription = processor.decode(ids)
-            yield transcription
-
+        waveform = torch.tensor(waveform).to(device)
+        waveform = waveform.to(device)
+        # Fake a batch:
+        batch = waveform.unsqueeze(0)
+        rel_length = torch.tensor([1.0]).to(device)
+        outputs = self.encode_batch(device, batch, rel_length)
+        outputs = " ".join(outputs[0])
+        return outputs