import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
from gradio_client import Client, file  # used by tts_part() to call the TTS Space
import os
import requests
import scipy.io.wavfile
import io
import time

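# Chat LLM served through the Hugging Face Inference API; the access token is
# read from the "hf_token" environment variable (e.g. a Space secret).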
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=os.getenv('hf_token')
)


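# Transcribe microphone audio: gr.Audio hands us (sample_rate, ndarray), which
# is written to an in-memory WAV and posted to the hosted Whisper model.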
def process_audio(audio_data):
    if audio_data is None:
        return "No audio provided.", ""

    # gr.Audio with type="numpy" returns a (sample_rate, data) tuple; unpack it
    if isinstance(audio_data, tuple):
        sample_rate, data = audio_data
    else:
        return "Invalid audio data format.", ""

    # Convert the audio data to WAV format in memory
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, sample_rate, data)
    wav_bytes = buf.getvalue()
    buf.close()

    API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def query(wav_data):
        response = requests.post(API_URL, headers=headers, data=wav_data)
        return response.json()

    # Call the API to process the audio
    output = query(wav_bytes)

    print(output)  # Log the raw API response (visible in the Space logs)

    # A successful response carries a 'text' field; error payloads (e.g. while
    # the model is still loading) do not
    if 'text' in output:
        recognized_text = output['text']
        return recognized_text, recognized_text
    else:
        recognized_text = "The ASR model is still loading, please press the button again!"
        return recognized_text, ""

# Disable the button and show the loading indicator while ASR is running
def disable_components():
    # Tell the user in the textbox that recognition is in progress
    recognized_text_update = gr.update(value='Voice Recognition Running...')
    # Disable process_button so it cannot be clicked again mid-request
    process_button_update = gr.update(interactive=False)
    # Show the loading animation
    loading_animation_update = gr.update(visible=True)
    return recognized_text_update, process_button_update, loading_animation_update

# Re-enable the button and hide the loading indicator once ASR has finished
def enable_components(recognized_text):
    process_button_update = gr.update(interactive=True)
    # Hide the loading animation
    loading_animation_update = gr.update(visible=False)
    return recognized_text, process_button_update, loading_animation_update

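# Cross-callback handshake: respond() sets these globals once the LLM answer is
# complete; update_response_display() polls the flag and tts_part() consumes
# the answer.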
llama_responded = 0
responded_answer = ""

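# Chat callback for gr.ChatInterface: rebuilds the message list from history
# and streams a completion from Llama 3.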
def respond(
    message,
    history: list[tuple[str, str]]
):
    global llama_responded
    global responded_answer
    system_message = "You are a helpful chatbot that answers questions. Keep each answer within 50 words."
    messages = [{"role": "system", "content": system_message}]

    # Replay prior turns so the model sees the full conversation
    for user_msg, assistant_msg in history:
        if user_msg is None:
            continue  # skip turns with no user message
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the completion; "chunk" avoids shadowing the message argument
    for chunk in client.chat_completion(
        messages,
        stream=True,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final streamed chunk may carry no content
            response += token

    # Publish the finished answer for the polling/TTS callbacks
    llama_responded = 1
    responded_answer = response
    return response

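# Block until respond() has finished, so event chains that .then() off a chat
# submission only continue once the answer text is available.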
def update_response_display():
    while not llama_responded:
        time.sleep(1)

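# Speak the latest answer: send it to the public "tonyassi/voice-clone" Space,
# using the local siri.wav sample as the reference voice, then reset the
# handshake globals.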
def tts_part():
    global llama_responded
    global responded_answer
    result = ""
    if responded_answer:
        text = responded_answer

        # Local name so we do not shadow the module-level InferenceClient
        tts_client = Client("tonyassi/voice-clone")
        result = tts_client.predict(
            text,
            audio=file('siri.wav'),
            api_name="/predict"
        )
        llama_responded = 0
        responded_answer = ""
    return result

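# Build the Blocks UI: a chat panel on top, microphone input plus recognized
# text below it, and an audio player for the synthesized reply.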
def create_interface():
    with gr.Blocks() as demo:

        # Chat interface driven by the streaming respond() callback
        chatbot = gr.ChatInterface(
            title="Exodia AI Assistant",
            fill_height=True,
            fn=respond,
            submit_btn="Start Chatting"
        )
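        # Both submit paths (Enter key and button) first run the blocking
        # update_response_display, so the chained tts_part fires only after
        # respond() has finished streaming the answer.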
        user_start = chatbot.textbox.submit(
            fn=update_response_display, 
            inputs=[],
            outputs=[],
        )
        user_click = chatbot.submit_btn.click(
            fn=update_response_display, 
            inputs=[],
            outputs=[],
        )


        # Audio input section
        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",  # deliver (sample_rate, data) to process_audio
                label="Say Something..."
            )
            recognized_text = gr.Textbox(label="Recognized Text", interactive=False)
        
        # Process audio button
        process_button = gr.Button("Process Audio")
        
        # Loading animation
        loading_animation = gr.HTML(
            value='<div style="text-align: center;"><span style="font-size: 18px;">ASR Model is running...</span></div>',
            visible=False
        )

        text_speaker = gr.Audio(
            label="Generated Audio"
        )
        
        # Associate audio processing function and update component states on click
        process_button.click(
            fn=disable_components,
            inputs=[],
            outputs=[recognized_text, process_button, loading_animation]
        ).then(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[recognized_text, chatbot.textbox]
        ).then(
            fn=enable_components,
            inputs=[recognized_text],
            outputs=[recognized_text, process_button, loading_animation]
        )

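        # Once the chat round-trip completes, synthesize speech for the answer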
        user_start.then(
            fn=tts_part,
            inputs=[], 
            outputs=text_speaker
        )

        user_click.then(
            fn=tts_part,
            inputs=[], 
            outputs=text_speaker
        )
                
    return demo



if __name__ == "__main__":
    demo = create_interface()
    demo.launch()