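"""Gradio demo pairing speech recognition (openai/whisper-large-v2 via the
Hugging Face Inference API) with a streaming Meta-Llama-3-8B-Instruct chatbot."""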
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
import os
import requests
import scipy.io.wavfile

# Chat model served through the Hugging Face Inference API; the access token
# is read from the `hf_token` environment variable.
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=os.getenv('hf_token')
)

# Streaming chat handler for gr.ChatInterface: rebuilds the model's message
# list from the conversation history, then yields the growing reply.
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the completion so the UI can render tokens as they arrive.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content

        # The final streamed chunk can carry an empty/None delta; skip it.
        if token:
            response += token
            yield response

# Transcribe recorded audio: write the numpy samples to a WAV file, then send
# it to the hosted Whisper model over the Inference API.
def process_audio(audio_data):
    if audio_data is None:
        return "No audio provided"
    
    print("audio_data:", audio_data)  # 添加这行代码

    # 检查 audio_data 是否是元组,并提取数据
    if isinstance(audio_data, tuple):
        sample_rate, data = audio_data
        print("Sample rate:", sample_rate)
        print("Data type:", type(data))
    else:
        return "Invalid audio data format"

    # Define the local file path to save the WAV file
    local_wav_file = "converted_audio.wav"

    # Save the audio data as a WAV file
    scipy.io.wavfile.write(local_wav_file, sample_rate, data)
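    # (gr.Audio with type="numpy" typically yields int16 samples, which
    # scipy.io.wavfile.write can serialize directly.)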
    
    API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def query(filename):
        with open(filename, "rb") as f:
            file_data = f.read()
        response = requests.post(API_URL, headers=headers, data=file_data)
        return response.json()

    # Call the API to process the audio
    output = query(local_wav_file)

    print(output)
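    # While the hosted model is cold-starting, the Inference API may return an
    # {"error": ...} payload instead of {"text": ...}.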

    # Check the API response
    if 'text' in output:
        return output['text']
    else:
        return "Error in processing audio"

# Disable the button and show the loading indicator while the ASR model runs.
def disable_components():
    # Update recognized_text to tell the user that processing has started.
    recognized_text_update = gr.update(value='Processing, please wait...')
    # Disable process_button so the request cannot be re-triggered.
    process_button_update = gr.update(interactive=False)
    # Show the loading animation.
    loading_animation_update = gr.update(visible=True)
    return recognized_text_update, process_button_update, loading_animation_update

# Re-enable the button and hide the loading indicator once processing is done.
def enable_components(recognized_text):
    # recognized_text has already been filled in by process_audio; pass it through.
    process_button_update = gr.update(interactive=True)
    # Hide the loading animation.
    loading_animation_update = gr.update(visible=False)
    return recognized_text, process_button_update, loading_animation_update
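
# The two helpers above are chained around process_audio via .click().then()
# in create_interface() below, so the controls lock while the ASR request is
# in flight and unlock when it finishes.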


# Build the Gradio UI.
def create_interface():
    with gr.Blocks() as demo:
        # Title
        gr.Markdown("# Speech Recognition and Chat System")

        # Audio input section
        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",  # deliver (sample_rate, data) to the handler
                label="Record audio"
            )

        # Recognized-text output section
        with gr.Row():
            recognized_text = gr.Textbox(label="Recognized text")

        # Button that triggers transcription
        process_button = gr.Button("Process audio")

        # Loading indicator
        loading_animation = gr.HTML(
            value='<div style="text-align: center;"><span style="font-size: 18px;">ASR Model is running...</span></div>',
            visible=False
        )
        
        # On click: disable the controls, run ASR, then re-enable them.
        process_button.click(
            fn=disable_components,
            inputs=[],
            outputs=[recognized_text, process_button, loading_animation]
        ).then(
            fn=process_audio,
            inputs=[audio_input],
            outputs=recognized_text
        ).then(
            fn=enable_components,
            inputs=[recognized_text],
            outputs=[recognized_text, process_button, loading_animation]
        )
        
        # Chatbot interface backed by the streaming respond() generator.
        # respond() also expects max_tokens, temperature and top_p, so expose
        # them as sliders (the default values below are illustrative).
        chatbot = gr.ChatInterface(
            fn=respond,
            additional_inputs=[
                gr.Textbox(value="You are a helpful chatbot that answers questions.", label="System message"),
                gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
                gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
                gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
            ]
        )
                
    return demo



if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
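
# To run locally (a sketch; assumes this file is saved as app.py and a valid
# Hugging Face access token is available):
#   export hf_token=<your token>
#   python app.py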