File size: 5,279 Bytes
d4a5e8c
97cc5f3
d4a5e8c
d0a28d9
9d03774
 
867343a
68f0d8d
94023b9
d4a5e8c
8b2c674
c191795
 
 
 
 
9d03774
 
68f0d8d
9338b19
282a1f2
9d03774
 
 
68f0d8d
c949392
867343a
 
 
 
 
68f0d8d
9d03774
 
 
867343a
 
9d03774
 
 
867343a
9d03774
1b35ec9
9d03774
 
1b35ec9
 
68f0d8d
9d03774
1b35ec9
 
 
68f0d8d
9d03774
1b35ec9
0a1d26c
 
4fe53d8
8084a16
4fe53d8
8084a16
 
 
0a1d26c
 
 
 
 
 
 
 
2189879
0a1d26c
 
 
4fe53d8
0a1d26c
 
 
 
 
 
 
 
 
 
2189879
0a1d26c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bbd0bd
 
 
 
98f4015
 
 
 
 
 
80d8e52
0a1d26c
 
 
cade13f
 
 
0a1d26c
 
1bbd0bd
 
 
 
d01e2e3
1bbd0bd
 
8084a16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
import os
import requests
import scipy.io.wavfile
import io
import time
from gradio_client import Client, file


# Shared chat-completion client used by the domain classifier and the
# "others" fallback; the HF API token is read from the "hf_token"
# environment variable (set as a secret in the hosting Space).
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct", token=os.getenv("hf_token")
)


def process_audio(audio_data):
    """Transcribe recorded audio via the HF Whisper inference API.

    Args:
        audio_data: Gradio ``numpy`` audio value — a ``(sample_rate, data)``
            tuple — or ``None`` when nothing was recorded.

    Returns:
        A ``(display_text, recognized_text)`` tuple. ``recognized_text`` is
        empty when transcription failed, so callers can detect failure.
    """
    if audio_data is None:
        return "No audio provided.", ""

    # Gradio's numpy audio component yields (sample_rate, ndarray).
    if isinstance(audio_data, tuple):
        sample_rate, data = audio_data
    else:
        return "Invalid audio data format.", ""

    # Serialize the samples to an in-memory WAV container for the API.
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, sample_rate, data)
    wav_bytes = buf.getvalue()
    buf.close()

    API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def query(wav_data):
        # The inference endpoint can return non-JSON (HTML error pages) or the
        # request itself can fail; degrade to an empty dict so the caller shows
        # the "still loading" retry message instead of crashing.
        try:
            response = requests.post(API_URL, headers=headers, data=wav_data)
            return response.json()
        except (requests.RequestException, ValueError):
            return {}

    # Call the API to process the audio
    output = query(wav_bytes)

    print(output)  # Check output in console (logs in HF space)

    # A successful transcription is a dict carrying a "text" field; anything
    # else (model warming up, error payload) falls through to the retry hint.
    if isinstance(output, dict) and "text" in output:
        recognized_text = output["text"]
        return recognized_text, recognized_text
    else:
        recognized_text = (
            "The ASR module is still loading, please press the button again!"
        )
        return recognized_text, ""


def master_decision(message):
    """Route a user query to the movies, music, or general-chat backend.

    Uses the shared LLM client as a one-word domain classifier, then
    dispatches to the matching downstream Space ("movies"/"music") or
    answers directly for everything else.

    Args:
        message: The user's query text.

    Returns:
        A ``(domain_label, response)`` tuple: the raw classifier output and
        the answer produced by the selected backend.
    """
    decision_response = ""
    judge_system_message = """You are helpful assistant. You will be given queries from the user and you decide on which domain the query belongs to. You have three domains : ["movies","music","others"]. If you don't know about the domain of a query, it is to be classified as "others". Please give a one word answer in smaller caps."""

    m_message = [
        {"role": "system", "content": judge_system_message},
        {"role": "user", "content": message},
    ]
    for m in client.chat_completion(
        m_message,
        stream=True,
    ):
        token = m.choices[0].delta.content
        # The final streamed chunk can carry a None delta; skip it instead of
        # raising TypeError on string concatenation.
        if token:
            decision_response += token
    print(decision_response)

    if "movies" in decision_response:
        movie_client = Client("ironserengety/movies-recommender")
        result = movie_client.predict(
            message=message,
            system_message="You are a movie recommender named 'Exodia'. You are extremely reliable. You always mention your name in the beginning of conversation. You will provide me with answers from the given info. Give not more than 3 choices and make sure that answers are complete sentences. Give short one-line descriptions of each sentence.",
            max_tokens=512,
            temperature=0.7,
            top_p=0.95,
            api_name="/chat",
        )
        print(result)

        return decision_response, result

    elif "music" in decision_response:
        music_client = Client("ironserengety/MusicRetriever")
        result = music_client.predict(message=message, api_name="/respond")

        return decision_response, result
    else:
        # "others": answer directly with the general-purpose chat model.
        system_message = "You are a helpful chatbot that answers questions. Give any answer within 50 words."
        messages = [{"role": "system", "content": system_message}]
        messages.append({"role": "user", "content": message})

        response = ""
        print(messages)

        # Iterate with a distinct name — the original shadowed the `message`
        # parameter here.
        for chunk in client.chat_completion(
            messages,
            stream=True,
        ):
            token = chunk.choices[0].delta.content
            if token:
                response += token

        return decision_response, response


def tts_part_new(response):
    """Synthesize speech for *response* with the voice-clone Space.

    Args:
        response: Text to vocalize.

    Returns:
        The audio result as returned by the Space's ``/predict`` endpoint.
    """
    # Use a distinct local name so we don't shadow the module-level
    # InferenceClient named `client`.
    tts_client = Client("tonyassi/voice-clone")
    return tts_client.predict(response, audio=file("siri.wav"), api_name="/predict")


def get_chatbot_response(audio_data):
    """Full pipeline: speech → text → routed answer → (text, audio) outputs.

    Args:
        audio_data: Raw microphone value from the Gradio Audio component.

    Returns:
        ``(response_text, response_audio)`` for the two output widgets; the
        music path already returns audio, every other path goes through TTS.
    """
    response_text, _ = process_audio(audio_data)
    domain, response = master_decision(response_text)
    # Match the substring check used inside master_decision — the streamed
    # classifier output may carry extra whitespace or punctuation, so a
    # strict equality test can miss the music branch.
    if "music" in domain:
        return response, response
    else:
        return response, tts_part_new(response)


def chat_interface():
    """Build and return the Gradio Blocks UI for the voice chatbot demo."""
    with gr.Blocks() as demo:
        audio_input = gr.Audio(
            sources="microphone",
            type="numpy",  # deliver (sample_rate, ndarray) to the handler
            label="Say Something...",
        )
        send_button = gr.Button(value="Send")
        response_textbox = gr.Textbox(label="Response Text")
        audio_output = gr.Audio(label="Response Audio")

        # Wire the button to the end-to-end pipeline: microphone audio in,
        # transcribed/answered text plus synthesized speech out.
        send_button.click(
            get_chatbot_response,
            inputs=[audio_input],
            outputs=[response_textbox, audio_output],
        )

    return demo


if __name__ == "__main__":
    # Build the UI and serve it; show_error surfaces handler exceptions in
    # the browser, which helps when debugging the hosted Space.
    demo = chat_interface()
    demo.launch(show_error=True)