Spaces:
Sleeping
Sleeping
import json | |
import requests | |
from datetime import datetime | |
import time | |
import traceback | |
# Base endpoint of the Hugging Face Inference API; a model id is appended to
# it to form the request URL.
API_URL = "https://api-inference.huggingface.co/models/"
def date_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current = datetime.now()
    return f"{current:%Y-%m-%d %H:%M:%S}"
def record_opt(msg):
    """Build one history line: the current timestamp, a space, *msg*, a newline."""
    timestamp = date_now()
    return "{} {}\n".format(timestamp, msg)
def speech_recognize(audio, model_name, hf_token, opt):
    """Transcribe a recorded audio file via the Hugging Face Inference API.

    This is a generator: it first yields a "please wait" placeholder so the
    caller can show progress, then yields the final outcome.

    Args:
        audio: Path of the audio file to upload; its raw bytes are sent as
            the request body.
        model_name: Model identifier appended to ``API_URL``.
        hf_token: Token sent in a ``Bearer`` authorization header.
        opt: Operation-history text accumulated so far; new log lines are
            appended to it.

    Yields:
        ``(text, opt)`` tuples, where ``text`` is the status message, the
        transcription, or a failure report, and ``opt`` is the updated
        history text.
    """
    opt += record_opt("Transcription starts ...")
    yield "Transcribing, please wait..", opt

    start = time.monotonic()
    try:
        # Reading the file sits inside the ``try`` so that a missing or
        # unreadable recording (e.g. ``audio`` is None) is reported to the
        # user instead of crashing the handler.
        with open(audio, "rb") as f:
            data = f.read()

        url = API_URL + model_name
        print(f">>> url is {url}")
        headers = {"Authorization": f"Bearer {hf_token}"}
        # (connect, read) timeout keeps a stalled connection from hanging
        # the handler indefinitely.
        response = requests.post(
            url, headers=headers, data=data, timeout=(10, 300)
        )
        result = response.json()
        print(f">>> text is {result}")

        if isinstance(result, dict) and "text" in result:
            text = result["text"]
        else:
            # The API answered, but not with a transcription (for example an
            # error payload); show what it said rather than a KeyError trace.
            text = (
                "Transcription failed:\n"
                f"HTTP {response.status_code}: {result}"
            )
    except Exception:
        # UI boundary: surface any failure to the user as text. Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate.
        text = f"Transcription failed:\n{traceback.format_exc()}"

    cost = time.monotonic() - start
    opt += record_opt(f"Transcription ends, time consuming{cost:.3f}s")
    yield text, opt
import gradio as gr | |
# Whisper checkpoints offered in the model dropdown; the first is the default.
whisper_models = [
    "openai/whisper-large-v3",
    "openai/whisper-large-v2",
    "openai/whisper-large",
    "openai/whisper-medium",
    "openai/whisper-small",
    "openai/whisper-base",
    "openai/whisper-tiny",
]

# Introductory text shown under the page title.
intro_markdown = (
    "🤗 Call the huggingface API and use the OpenAI Whisper model for speech recognition, which can also be called speech to text(Speech to Text, STT)\n"
    "👉 The purpose is to practice using the Gradio Audio component and explore using the Huggingface Inference API\n"
    "> 💡Tip: You need to fill in the Huggingface token to call the Huggingface Inference API\n"
)


def _history_appender(label):
    """Return a handler that appends a timestamped *label* line to the history."""
    return lambda x: x + record_opt(label)


with gr.Blocks() as demo:
    gr.HTML("""<h2 align="center">Automatic Speech Recognition (OpenAI Whisper with Inference API)</h2>""")
    with gr.Row():
        gr.Markdown(intro_markdown)
    with gr.Row():
        # Left column: recording widget and request settings.
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath")
            model_name = gr.Dropdown(
                label="Select model",
                choices=whisper_models,
                value=whisper_models[0],
            )
            hf_token = gr.Textbox(label="Huggingface token")
        # Right column: transcription output and the event log.
        with gr.Column():
            output = gr.Textbox(label="Transcription results")
            operation = gr.Textbox(label="Component operation history")

    # Each of these audio events only appends one line to the history box.
    history_events = (
        (audio.start_recording, "Start recording ..."),
        (audio.play, "Play recording"),
        (audio.pause, "Pause playback"),
        (audio.stop, "Stop play"),
        (audio.end, "Finished playing"),
    )
    for bind_event, label in history_events:
        bind_event(_history_appender(label), inputs=operation, outputs=operation)

    # Finishing a recording triggers the transcription itself.
    audio.stop_recording(
        speech_recognize,
        inputs=[audio, model_name, hf_token, operation],
        outputs=[output, operation],
    )

demo.queue(max_size=4, concurrency_count=4)
demo.launch()