Spaces:
Sleeping
Sleeping
Irpan
committed on
Commit
·
3c9ecf2
1
Parent(s):
d6d3fa1
app
Browse files
asr.py
CHANGED
@@ -15,7 +15,7 @@ asr_model = asr_model.to(device)
|
|
15 |
|
16 |
def asr(user_audio):
|
17 |
# Load and resample user audio
|
18 |
-
audio_input, sampling_rate = util.load_and_resample_audio(
|
19 |
|
20 |
# Process audio through ASR model
|
21 |
inputs = asr_processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
|
|
|
15 |
|
16 |
def asr(user_audio):
|
17 |
# Load and resample user audio
|
18 |
+
audio_input, sampling_rate = util.load_and_resample_audio(audio_data = user_audio, target_rate=16000)
|
19 |
|
20 |
# Process audio through ASR model
|
21 |
inputs = asr_processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt", padding=True)
|
util.py
CHANGED
@@ -4,6 +4,7 @@ import torchaudio
|
|
4 |
import string
|
5 |
import epitran
|
6 |
from difflib import SequenceMatcher
|
|
|
7 |
|
8 |
# Lists of Uyghur short and long texts
|
9 |
short_texts = [
|
@@ -12,7 +13,7 @@ short_texts = [
|
|
12 |
long_texts = [
|
13 |
"مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
|
14 |
"يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
|
15 |
-
"بىزنىڭ
|
16 |
]
|
17 |
|
18 |
# Front-End Utils
|
@@ -35,12 +36,22 @@ def generate_long_text(script_choice):
|
|
35 |
return text
|
36 |
|
37 |
# ASR Utils
|
38 |
-
def load_and_resample_audio(
|
39 |
"""Load audio and resample based on target sample rate"""
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
if sampling_rate != target_rate:
|
42 |
resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
|
43 |
audio_input = resampler(audio_input)
|
|
|
44 |
return audio_input, target_rate
|
45 |
|
46 |
def calculate_pronunciation_accuracy(reference_text, output_text, language_code='uig-Arab'):
|
|
|
4 |
import string
|
5 |
import epitran
|
6 |
from difflib import SequenceMatcher
|
7 |
+
import numpy as np
|
8 |
|
9 |
# Lists of Uyghur short and long texts
|
10 |
short_texts = [
|
|
|
13 |
long_texts = [
|
14 |
"مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
|
15 |
"يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
|
16 |
+
"بىزنىڭ ئۆيدە تۆت تەكچە تۆتىلىسى تەكتەكچە"
|
17 |
]
|
18 |
|
19 |
# Front-End Utils
|
|
|
36 |
return text
|
37 |
|
38 |
# ASR Utils
|
39 |
+
def load_and_resample_audio(audio_data, target_rate):
    """Load audio and resample it to the target sample rate.

    Args:
        audio_data: Either a ``(sampling_rate, samples)`` tuple (the
            microphone case) or a ``str`` path to an audio file (the
            file-upload case).
        target_rate: Sample rate, in Hz, that the returned audio must have.

    Returns:
        A ``(audio_input, target_rate)`` tuple. ``audio_input`` is a float32
        ``torch.Tensor`` of shape ``(1, frames)`` for both input kinds, so
        callers can treat microphone and file audio identically.

    Raises:
        ValueError: If ``audio_data`` is neither a tuple nor a string.
    """
    # Imported locally so the function does not depend on a file-level
    # ``import torch`` being present.
    import torch

    if isinstance(audio_data, tuple):
        # microphone
        sampling_rate, samples = audio_data
        samples = np.asarray(samples)
        if np.issubdtype(samples.dtype, np.integer):
            # Scale integer PCM by its own full-scale value (32768 for int16)
            # instead of assuming 16-bit samples.
            full_scale = float(np.iinfo(samples.dtype).max) + 1.0
            samples = samples.astype(np.float32) / full_scale
        else:
            # Non-integer input is presumed to be normalised already.
            samples = samples.astype(np.float32)
        if samples.ndim > 1:
            # assumes multi-channel recordings are laid out as
            # (samples, channels) - TODO confirm against the UI component.
            samples = samples.mean(axis=1)
        # The resampler works on tensors, so convert here and add a channel
        # dimension to match the layout torchaudio.load() produces.
        audio_input = torch.from_numpy(np.ascontiguousarray(samples)).unsqueeze(0)
    elif isinstance(audio_data, str):
        # file upload
        audio_input, sampling_rate = torchaudio.load(audio_data)
        if audio_input.dim() > 1 and audio_input.size(0) > 1:
            # Down-mix (channels, frames) to a single channel.
            audio_input = audio_input.mean(dim=0, keepdim=True)
    else:
        # Raise instead of returning a message string: callers unpack two
        # values, so a returned string surfaces as a confusing unpack error.
        raise ValueError(
            "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
        )

    # Resample if needed
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_input = resampler(audio_input)

    return audio_input, target_rate
|
56 |
|
57 |
def calculate_pronunciation_accuracy(reference_text, output_text, language_code='uig-Arab'):
|