# Hugging Face Space (status: Running)
# Copyright (c) 2022 Horizon Robotics. (authors: Binbin Zhang)
#               2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gradio as gr | |
import torch | |
from wenet.cli.model import load_model | |
def process_cat_embs(cat_embs, device="cpu"):
    """Parse a comma-separated string of numbers into a 1-D tensor.

    Args:
        cat_embs: Comma-separated numeric values, e.g. ``"1,0"``. Each
            field is converted with ``float``, so whitespace around a
            value is tolerated.
        device: Device the resulting tensor is moved to. Defaults to
            ``"cpu"``, the value that used to be hard-coded here.

    Returns:
        A ``torch.Tensor`` with one float per comma-separated field.

    Raises:
        ValueError: If any field cannot be parsed as a float (this
            includes empty fields such as a trailing comma).
    """
    values = [float(field) for field in cat_embs.split(',')]
    return torch.tensor(values).to(device)
def download_rev_models(
        model_path="/Users/natalie/NERD-2941/reginald/10.jit.zip",
        units_path="/Users/natalie/NERD-2941/reginald/tk.units.txt"):
    """Load the Reginald model from files on the local filesystem.

    Despite the name, nothing is downloaded: the two paths are handed
    straight to ``load_model``.

    Args:
        model_path: Path passed as the first argument to ``load_model``.
            The default is the developer-machine path that was
            previously hard-coded.
        units_path: Path passed as the second argument to ``load_model``.
            The default is the previously hard-coded path.

    Returns:
        Whatever ``load_model(model_path, units_path)`` returns.
    """
    # TODO: fetch the model files from the Hugging Face Hub (repo
    # "Revai/reginald") with huggingface_hub.hf_hub_download instead of
    # relying on machine-specific local paths.
    return load_model(model_path, units_path)
# Load the model once at import time; recognition() reads this
# module-level name on every request.
model = download_rev_models()
def recognition(audio, style=0):
    """Transcribe ``audio`` and return the recognized text.

    Args:
        audio: Audio input forwarded unchanged to ``model.transcribe``;
            ``None`` means no audio was supplied.
        style: Number used to build the two-element embedding
            ``(1 - style, style)`` sent to the model along with the
            audio. NOTE(review): which end of the range corresponds to
            verbatim output is not visible here -- confirm.

    Returns:
        The ``'text'`` entry of the transcription result, or a
        human-readable error message when there is no input audio or
        the model returns ``None``.
    """
    if audio is None:
        return "Input Error! Please enter one audio!"

    # NOTE: model supports 16k sample_rate
    weights = (1 - style, style)
    encoded = ','.join(map(str, weights))
    embedding = process_cat_embs(encoded)

    result = model.transcribe(audio, cat_embs=embedding)
    if result is None:
        return "ERROR! No text output! Please try again!"
    return result['text']
# Input components: a microphone recording handed to the callback as a file
# path, plus a 0..1 slider that is passed as the `style` argument.
# The top-level gr.Audio / gr.Textbox classes are used instead of the
# deprecated gr.inputs / gr.outputs namespaces, matching gr.Slider below.
inputs = [
    gr.Audio(source="microphone", type="filepath", label='Input audio'),
    gr.Slider(0, 1, value=0, label="Style", info="Choose between verbatim and NV"),
]
output = gr.Textbox(label="Output Text")

# Page title.
text = "Reginald Demo"

# Description shown under the title.
description = (
    "This is a speech recognition demo that supports verbatim and non-verbatim transcription. Try recording an audio with disfluencies (ex: 'uh', 'um') and testing both transcription styles."  # noqa
)

# HTML footer rendered below the interface.
article = (
    "<p style='text-align: center'>"
    "<a href='https://rev.com' target='_blank'>Github: Learn more about Rev</a>"  # noqa
    "</p>")

interface = gr.Interface(
    fn=recognition,
    inputs=inputs,
    outputs=output,
    title=text,
    description=description,
    article=article,
    theme='huggingface',
)

# Serve requests through the queue. `launch(enable_queue=True)` is
# deprecated in favour of calling `.queue()` before `.launch()`.
interface.queue()
interface.launch()