ixxan commited on
Commit
544e017
·
verified ·
1 Parent(s): 20aa839

Create asr.py

Browse files
Files changed (1) hide show
  1. asr.py +43 -0
asr.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torchaudio
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Checkpoint: Whisper-small fine-tuned for Uyghur (Common Voice 15).
_MODEL_ID = "ixxan/whisper-small-ug-cv-15"

# Load the processor (feature extractor + tokenizer) and the seq2seq model
# once at import time so repeated transcribe() calls reuse the same objects.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)
def transcribe(audio_path: str) -> str:
    """
    Transcribe an audio file to text using the Uyghur Whisper model.

    Args:
        audio_path (str): Path to the audio file to transcribe.

    Returns:
        str: The transcription of the audio.
    """
    # Load the audio file; torchaudio returns (waveform, sample_rate) with
    # waveform shaped [channels, frames].
    audio_input, sampling_rate = torchaudio.load(audio_path)

    # Downmix multi-channel (e.g. stereo) audio to mono. Without this, the
    # .squeeze() below would leave a 2-D tensor for stereo files and the
    # feature extractor would receive a malformed waveform.
    if audio_input.dim() > 1 and audio_input.size(0) > 1:
        audio_input = audio_input.mean(dim=0, keepdim=True)

    # Resample to the rate the model expects, if the file differs.
    target_rate = processor.feature_extractor.sampling_rate
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_input = resampler(audio_input)

    # Preprocess the waveform. Pass the feature extractor's own rate instead
    # of a hard-coded 16000 so this stays consistent with the resampling
    # branch above even if the checkpoint's expected rate changes.
    inputs = processor(audio_input.squeeze(), sampling_rate=target_rate, return_tensors="pt")

    # Run inference on GPU when available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate token ids without gradient tracking (inference only).
    with torch.no_grad():
        generated_ids = model.generate(inputs["input_features"], max_length=225)

    # Decode the first (and only) sequence, dropping special tokens.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return transcription