import time
import traceback
from datetime import datetime

import gradio as gr
import requests

# Base URL of the Hugging Face Inference API; the model name is appended.
API_URL = "https://api-inference.huggingface.co/models/"


def date_now():
    """Return the current time formatted for log lines."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def record_opt(msg):
    """Return a timestamped, newline-terminated log entry."""
    return f"{date_now()} {msg}\n"
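
# Example (timestamp illustrative):
#     record_opt("Start recording ...") == "2024-01-01 12:00:00 Start recording ...\n"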


def speech_recognize(audio, model_name, access_token, opt):
    """POST the recorded audio to the Inference API and yield the transcript.

    Yields an interim status first so Gradio can update the UI while the
    request is in flight.
    """
    opt += record_opt("Transcription starts ...")
    yield "Transcribing, please wait...", opt
    start = time.monotonic()

    with open(audio, "rb") as f:
        data = f.read()
    try:
        url = API_URL + model_name
        print(f">>> url is {url}")
        headers = {"Authorization": f"Bearer {access_token}"}
        response = requests.post(url, headers=headers, data=data)
        result = response.json()
        print(f">>> response is {result}")
        text = result["text"]
    except Exception:
        text = f"Transcription failed:\n{traceback.format_exc()}"

    cost = time.monotonic() - start
    opt += record_opt(f"Transcription finished in {cost:.3f}s")
    yield text, opt
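
# For reference, a successful ASR response from the Inference API is JSON
# shaped like {"text": "..."}. While a model is cold-starting, the API may
# instead return an error payload such as {"error": ..., "estimated_time": ...},
# which the try/except above surfaces as a failure message. A minimal
# standalone call (a sketch; the hf_xxx token and sample.wav are placeholders):
#
#     resp = requests.post(
#         API_URL + "openai/whisper-tiny",
#         headers={"Authorization": "Bearer hf_xxx"},
#         data=open("sample.wav", "rb").read(),
#     )
#     print(resp.json().get("text"))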

# Build the Gradio UI: recorder, model selector, token box, and output panes.
with gr.Blocks() as demo:
    gr.HTML("""<h2 align="center">Automatic Speech Recognition (OpenAI Whisper with Inference API)</h2>""")
    with gr.Row():
        gr.Markdown(
            """🤗 Call the huggingface API and use the OpenAI Whisper model for speech recognition, which can also be called speech to text(Speech to Text, STT)

            👉 The purpose is to practice using the Gradio Audio component and explore using the Huggingface Inference API

            > 💡Tip: You need to fill in the Huggingface access token to call the Huggingface Inference API
            """
        )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath")
            model_name = gr.Dropdown(
                label="Select model",
                choices=[
                    "openai/whisper-large-v3",
                    "openai/whisper-large-v2",
                    "openai/whisper-large",
                    "openai/whisper-medium",
                    "openai/whisper-small",
                    "openai/whisper-base",
                    "openai/whisper-tiny",
                ],
                value="openai/whisper-large-v3",
            )
            access_token = gr.Textbox(label="Huggingface access token")
        with gr.Column():
            output = gr.Textbox(label="Transcription results")
            operation = gr.Textbox(label="Component operation history")
    audio.start_recording(
        lambda x: x + record_opt("Start recording ..."),
        inputs=operation, outputs=operation
    )
    audio.play(
        lambda x: x + record_opt("Play recording"),
        inputs=operation, outputs=operation
    )
    audio.pause(
        lambda x: x + record_opt("Pause playback"),
        inputs=operation, outputs=operation
    )
    audio.stop(
        lambda x: x + record_opt("Stop play"),
        inputs=operation, outputs=operation
    )
    audio.end(
        lambda x: x + record_opt("Finished playing"),
        inputs=operation, outputs=operation
    )
    audio.stop_recording(speech_recognize, inputs=[audio, model_name, access_token, operation], outputs=[output, operation])

demo.queue(max_size=4, concurrency_count=4)  # queue up to 4 requests, served by 4 workers
demo.launch()
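
# To run locally (a sketch): save this file as app.py and execute
# `python app.py`; Gradio serves the UI at http://127.0.0.1:7860 by default.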