Update README.md
Browse files
README.md
CHANGED
@@ -257,7 +257,14 @@ This version of ZeroSwot is trained with ASR data from CommonVoice, and adapting
|
|
257 |
|
258 |
```python
|
259 |
from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
|
260 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
# Load processors and tokenizers
|
263 |
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
|
@@ -277,8 +284,7 @@ nllb_model.eval()
|
|
277 |
nllb_model.to("cuda")
|
278 |
|
279 |
# Load sample .wav
|
280 |
-
audio
|
281 |
-
assert sr == 16000, "Input of wav2vec2.0 is expected to have sampling rate of 16,000"
|
282 |
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
|
283 |
|
284 |
# translation to German
|
|
|
257 |
|
258 |
```python
|
259 |
from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
|
260 |
+
import torchaudio
|
261 |
+
|
262 |
+
def load_and_resample_audio(audio_path, target_sr=16000):
    """Load an audio file and return it as a mono 1-D NumPy array at ``target_sr``.

    Args:
        audio_path: Path of the audio file, passed straight to ``torchaudio.load``.
        target_sr: Sampling rate (Hz) the returned waveform should have.

    Returns:
        A 1-D NumPy array with the waveform samples at ``target_sr``.
    """
    # torchaudio.load yields a (channels, frames) tensor plus the file's sampling rate.
    audio, orig_freq = torchaudio.load(audio_path)
    if orig_freq != target_sr:
        audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=target_sr)
    # Average over the channel axis instead of squeeze(0): squeeze is a no-op for
    # multi-channel files and would hand a 2-D array to the processor, which treats
    # it as a batch of separate utterances. For mono input the result is unchanged.
    audio = audio.mean(dim=0).numpy()
    return audio
|
268 |
|
269 |
# Load processors and tokenizers
|
270 |
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
|
|
|
284 |
nllb_model.to("cuda")
|
285 |
|
286 |
# Load sample .wav
|
287 |
+
audio = load_and_resample_audio("sample.wav")
|
|
|
288 |
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").cuda()
|
289 |
|
290 |
# translation to German
|