Spaces:
Running
Running
from espnet2.bin.asr_inference import Speech2Text | |
import resampy | |
from espnet_model_zoo.downloader import ModelDownloader | |
# Maps a user-facing model tag to the file name of its packed model archive.
# The archive name is joined onto the model directory in DemoASR.__init__.
TAGS_TO_MODELS = {
    "phones": "asr_tts-phn_en.zip",
    "STT": "asr_stt_en.zip",
    "TTS": "asr_tts_en.zip",
}
class DemoASR:
    """Demo wrapper that loads an ESPnet ``Speech2Text`` model and runs it.

    The model archive is selected by ``model_tag`` through ``TAGS_TO_MODELS``
    and unpacked with ``ModelDownloader`` before the decoder is built.
    """

    # Sample rate (Hz) that input audio is brought to before decoding.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_path, model_tag, device):
        """Unpack the archive for *model_tag* and configure the decoder.

        Args:
            model_path: Directory holding the model archives. It must support
                the ``/`` operator with a string (e.g. ``pathlib.Path``).
            model_tag: Key of ``TAGS_TO_MODELS`` selecting the archive.
            device: Device for inference; passed on as ``str(device)``.

        Raises:
            KeyError: If *model_tag* is not a key of ``TAGS_TO_MODELS``.
        """
        self.model_tag = model_tag
        downloader = ModelDownloader()
        archive = model_path / TAGS_TO_MODELS[self.model_tag]
        self.speech2text = Speech2Text(
            **downloader.download_and_unpack(str(archive)),
            device=str(device),
            minlenratio=0.0,
            maxlenratio=0.0,
            ctc_weight=0.4,
            beam_size=15,
            batch_size=1,
            nbest=1,
        )

    def recognize_speech(self, audio, sr):
        """Transcribe *audio* and return the text of the best hypothesis.

        Args:
            audio: Array-like with a ``shape`` attribute. A 2-D input is
                reduced to ``audio.T[0]``; this presumably means the layout
                is (samples, channels) and the first channel is kept --
                TODO confirm against the caller.
            sr: Sample rate of *audio* in Hz.

        Returns:
            The first element of the top-ranked hypothesis tuple produced
            by the model.
        """
        if len(audio.shape) == 2:
            audio = audio.T[0]
        # Resample only when needed: audio already at the target rate would
        # otherwise be pushed through the resampling filter for no benefit.
        if sr != self.TARGET_SAMPLE_RATE:
            audio = resampy.resample(audio, sr, self.TARGET_SAMPLE_RATE)
        nbests = self.speech2text(audio)
        text, *_ = nbests[0]
        return text