Irpan committed on
Commit
3c9ecf2
·
1 Parent(s): d6d3fa1
Files changed (2) hide show
  1. asr.py +1 -1
  2. util.py +14 -3
asr.py CHANGED
@@ -15,7 +15,7 @@ asr_model = asr_model.to(device)
15
 
16
  def asr(user_audio):
17
  # Load and resample user audio
18
- audio_input, sampling_rate = util.load_and_resample_audio(file_path = user_audio, target_rate=16000)
19
 
20
  # Process audio through ASR model
21
  inputs = asr_processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
 
15
 
16
  def asr(user_audio):
17
  # Load and resample user audio
18
+ audio_input, sampling_rate = util.load_and_resample_audio(audio_data = user_audio, target_rate=16000)
19
 
20
  # Process audio through ASR model
21
  inputs = asr_processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
util.py CHANGED
@@ -4,6 +4,7 @@ import torchaudio
4
  import string
5
  import epitran
6
  from difflib import SequenceMatcher
 
7
 
8
  # Lists of Uyghur short and long texts
9
  short_texts = [
@@ -12,7 +13,7 @@ short_texts = [
12
  long_texts = [
13
  "مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
14
  "يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
15
- "بىزنىڭ ئۆيدەپ تۆت تەكچە تۆتىلىسى تەكتەكچە"
16
  ]
17
 
18
  # Front-End Utils
@@ -35,12 +36,22 @@ def generate_long_text(script_choice):
35
  return text
36
 
37
  # ASR Utils
38
- def load_and_resample_audio(file_path, target_rate):
39
  """Load audio and resample based on target sample rate"""
40
- audio_input, sampling_rate = torchaudio.load(file_path)
 
 
 
 
 
 
 
 
 
41
  if sampling_rate != target_rate:
42
  resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
43
  audio_input = resampler(audio_input)
 
44
  return audio_input, target_rate
45
 
46
  def calculate_pronunciation_accuracy(reference_text, output_text, language_code='uig-Arab'):
 
4
  import string
5
  import epitran
6
  from difflib import SequenceMatcher
7
+ import numpy as np
8
 
9
  # Lists of Uyghur short and long texts
10
  short_texts = [
 
13
  long_texts = [
14
  "مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
15
  "يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
16
+ "بىزنىڭ ئۆيدە تۆت تەكچە تۆتىلىسى تەكتەكچە"
17
  ]
18
 
19
  # Front-End Utils
 
36
  return text
37
 
38
  # ASR Utils
39
def load_and_resample_audio(audio_data, target_rate):
    """Load audio and resample it to the target sample rate.

    Args:
        audio_data: Either a ``(sampling_rate, samples)`` tuple (microphone
            input, where ``samples`` is a NumPy array) or a ``str`` path to
            an audio file (file upload).
        target_rate: Sample rate, in Hz, that the returned audio must have.

    Returns:
        A ``(audio_input, target_rate)`` tuple. ``audio_input`` is a float32
        torch tensor for both input kinds, so the resampler and the caller
        see one consistent type.

    Raises:
        TypeError: If ``audio_data`` is neither a tuple nor a ``str``. The
            function previously returned an error string here, which broke
            callers that unpack two values from the result.
    """
    # Imported locally so this function does not depend on the module
    # having a top-level ``import torch`` (torchaudio already requires it).
    import torch

    if isinstance(audio_data, tuple):
        # microphone
        sampling_rate, samples = audio_data
        samples = np.asarray(samples)
        if np.issubdtype(samples.dtype, np.integer):
            # Scale integer PCM into [-1.0, 1.0); for int16 the divisor is
            # 32768, matching the previous hard-coded constant.
            scale = float(np.iinfo(samples.dtype).max) + 1.0
            samples = (samples / scale).astype(np.float32)
        else:
            # Floating-point samples are taken as already normalised.
            samples = samples.astype(np.float32)
        # NOTE(review): assumes microphone data is (samples,) or
        # (samples, channels) — confirm against the UI component. Convert to
        # the (channels, samples) layout torchaudio.load produces, because
        # the resampler works along the last dimension.
        if samples.ndim == 1:
            samples = samples[np.newaxis, :]
        elif samples.ndim == 2:
            samples = samples.T
        audio_input = torch.from_numpy(np.ascontiguousarray(samples))
    elif isinstance(audio_data, str):
        # file upload
        audio_input, sampling_rate = torchaudio.load(audio_data)
    else:
        raise TypeError(
            "Invalid Audio Input Instance: {}".format(type(audio_data))
        )

    # Resample if needed
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_input = resampler(audio_input)

    return audio_input, target_rate
56
 
57
  def calculate_pronunciation_accuracy(reference_text, output_text, language_code='uig-Arab'):