Spaces:
Runtime error
Runtime error
jaekookang
committed on
Commit
•
63da1fa
1
Parent(s):
3d0edc0
added librosa
Browse files
gradio_asr_en_libri100_word_vs_bpe.py
CHANGED
@@ -44,17 +44,17 @@ logger.info('model loaded')
|
|
44 |
def predict(wav_file):
|
45 |
logger.info('wav file loaded')
|
46 |
# Load audio
|
47 |
-
|
48 |
-
rate, speech = wav_file
|
49 |
-
if len(speech.shape) == 2:
|
50 |
-
speech = speech.T[0]
|
51 |
# wav = wavio.read(wav_file)
|
52 |
# speech = wav.data
|
53 |
# rate = wav.rate
|
54 |
logger.info(f'speech.shape: {speech.shape}')
|
55 |
logger.info(f'speech.dtype: {speech.dtype}')
|
56 |
logger.info(f'speech rate: {rate}')
|
57 |
-
speech = speech.astype('float32')
|
58 |
|
59 |
# number_of_samples = round(len(speech_raw) * float(16000) / rate)
|
60 |
# speech = sps.resample(speech_raw, number_of_samples)
|
@@ -76,7 +76,7 @@ iface = gr.Interface(
|
|
76 |
title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
|
77 |
description='Two models were trained on Librispeech (clean-100h)',
|
78 |
inputs=[
|
79 |
-
gr.inputs.Audio(label='wav file', source='microphone', type='numpy')
|
80 |
],
|
81 |
outputs=[
|
82 |
gr.outputs.Textbox(label='Decoding result (word-token model)'),
|
|
|
44 |
def predict(wav_file):
|
45 |
logger.info('wav file loaded')
|
46 |
# Load audio
|
47 |
+
speech, rate = librosa.load(wav_file, sr=16000)
|
48 |
+
# rate, speech = wav_file
|
49 |
+
# if len(speech.shape) == 2:
|
50 |
+
# speech = speech.T[0]
|
51 |
# wav = wavio.read(wav_file)
|
52 |
# speech = wav.data
|
53 |
# rate = wav.rate
|
54 |
logger.info(f'speech.shape: {speech.shape}')
|
55 |
logger.info(f'speech.dtype: {speech.dtype}')
|
56 |
logger.info(f'speech rate: {rate}')
|
57 |
+
# speech = speech.astype('float32')
|
58 |
|
59 |
# number_of_samples = round(len(speech_raw) * float(16000) / rate)
|
60 |
# speech = sps.resample(speech_raw, number_of_samples)
|
|
|
76 |
title='Comparison between word vs BPE tokens based on ESPNet2 ASR models',
|
77 |
description='Two models were trained on Librispeech (clean-100h)',
|
78 |
inputs=[
|
79 |
+
gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
|
80 |
],
|
81 |
outputs=[
|
82 |
gr.outputs.Textbox(label='Decoding result (word-token model)'),
|