File size: 5,279 Bytes
d4a5e8c
97cc5f3
d4a5e8c
d0a28d9
9d03774
 
867343a
68f0d8d
94023b9
d4a5e8c
8b2c674
c191795
 
 
 
 
9d03774
 
68f0d8d
9338b19
282a1f2
9d03774
 
 
68f0d8d
c949392
867343a
 
 
 
 
68f0d8d
9d03774
 
 
867343a
 
9d03774
 
 
867343a
9d03774
1b35ec9
9d03774
 
1b35ec9
 
68f0d8d
9d03774
1b35ec9
 
 
68f0d8d
9d03774
1b35ec9
0a1d26c
 
4fe53d8
8084a16
4fe53d8
8084a16
 
 
0a1d26c
 
 
 
 
 
 
 
2189879
0a1d26c
 
 
4fe53d8
0a1d26c
 
 
 
 
 
 
 
 
 
2189879
0a1d26c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1bbd0bd
 
 
 
98f4015
 
 
 
 
 
80d8e52
0a1d26c
 
 
cade13f
 
 
0a1d26c
 
1bbd0bd
 
 
 
d01e2e3
1bbd0bd
 
8084a16
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
import numpy as np
from huggingface_hub import InferenceClient
import os
import requests
import scipy.io.wavfile
import io
import time
from gradio_client import Client, file


# Shared chat-completion client used by the domain classifier and the
# "others" fallback; the HF API token is read from the "hf_token"
# environment variable (set as a secret in the hosting Space).
client = InferenceClient(
    "meta-llama/Meta-Llama-3-8B-Instruct", token=os.getenv("hf_token")
)


def process_audio(audio_data):
    """Transcribe recorded audio via the HF Whisper inference API.

    Args:
        audio_data: Gradio ``numpy`` audio value — a ``(sample_rate, data)``
            tuple — or ``None`` when nothing was recorded.

    Returns:
        A ``(display_text, recognized_text)`` tuple. ``recognized_text`` is
        empty when transcription failed, so callers can detect failure.
    """
    if audio_data is None:
        return "No audio provided.", ""

    # Gradio's numpy audio component yields (sample_rate, ndarray).
    if isinstance(audio_data, tuple):
        sample_rate, data = audio_data
    else:
        return "Invalid audio data format.", ""

    # Serialize the samples to an in-memory WAV container for the API.
    buf = io.BytesIO()
    scipy.io.wavfile.write(buf, sample_rate, data)
    wav_bytes = buf.getvalue()
    buf.close()

    API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v2"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def query(wav_data):
        # The inference endpoint can return non-JSON (HTML error pages) or the
        # request itself can fail; degrade to an empty dict so the caller shows
        # the "still loading" retry message instead of crashing.
        try:
            response = requests.post(API_URL, headers=headers, data=wav_data)
            return response.json()
        except (requests.RequestException, ValueError):
            return {}

    # Call the API to process the audio
    output = query(wav_bytes)

    print(output)  # Check output in console (logs in HF space)

    # A successful transcription is a dict carrying a "text" field; anything
    # else (model warming up, error payload) falls through to the retry hint.
    if isinstance(output, dict) and "text" in output:
        recognized_text = output["text"]
        return recognized_text, recognized_text
    else:
        recognized_text = (
            "The ASR module is still loading, please press the button again!"
        )
        return recognized_text, ""


def master_decision(message):
    """Route a user query to the movies, music, or general-chat backend.

    Uses the shared LLM client as a one-word domain classifier, then
    dispatches to the matching downstream Space ("movies"/"music") or
    answers directly for everything else.

    Args:
        message: The user's query text.

    Returns:
        A ``(domain_label, response)`` tuple: the raw classifier output and
        the answer produced by the selected backend.
    """
    decision_response = ""
    judge_system_message = """You are helpful assistant. You will be given queries from the user and you decide on which domain the query belongs to. You have three domains : ["movies","music","others"]. If you don't know about the domain of a query, it is to be classified as "others". Please give a one word answer in smaller caps."""

    m_message = [
        {"role": "system", "content": judge_system_message},
        {"role": "user", "content": message},
    ]
    for m in client.chat_completion(
        m_message,
        stream=True,
    ):
        token = m.choices[0].delta.content
        # The final streamed chunk can carry a None delta; skip it instead of
        # raising TypeError on string concatenation.
        if token:
            decision_response += token
    print(decision_response)

    if "movies" in decision_response:
        movie_client = Client("ironserengety/movies-recommender")
        result = movie_client.predict(
            message=message,
            system_message="You are a movie recommender named 'Exodia'. You are extremely reliable. You always mention your name in the beginning of conversation. You will provide me with answers from the given info. Give not more than 3 choices and make sure that answers are complete sentences. Give short one-line descriptions of each sentence.",
            max_tokens=512,
            temperature=0.7,
            top_p=0.95,
            api_name="/chat",
        )
        print(result)

        return decision_response, result

    elif "music" in decision_response:
        music_client = Client("ironserengety/MusicRetriever")
        result = music_client.predict(message=message, api_name="/respond")

        return decision_response, result
    else:
        # "others": answer directly with the general-purpose chat model.
        system_message = "You are a helpful chatbot that answers questions. Give any answer within 50 words."
        messages = [{"role": "system", "content": system_message}]
        messages.append({"role": "user", "content": message})

        response = ""
        print(messages)

        # Iterate with a distinct name — the original shadowed the `message`
        # parameter here.
        for chunk in client.chat_completion(
            messages,
            stream=True,
        ):
            token = chunk.choices[0].delta.content
            if token:
                response += token

        return decision_response, response


def tts_part_new(response):
    """Synthesize speech for *response* with the voice-clone Space.

    Args:
        response: Text to vocalize.

    Returns:
        The audio result as returned by the Space's ``/predict`` endpoint.
    """
    # Use a distinct local name so we don't shadow the module-level
    # InferenceClient named `client`.
    tts_client = Client("tonyassi/voice-clone")
    return tts_client.predict(response, audio=file("siri.wav"), api_name="/predict")


def get_chatbot_response(audio_data):
    """Full pipeline: speech → text → routed answer → (text, audio) outputs.

    Args:
        audio_data: Raw microphone value from the Gradio Audio component.

    Returns:
        ``(response_text, response_audio)`` for the two output widgets; the
        music path already returns audio, every other path goes through TTS.
    """
    response_text, _ = process_audio(audio_data)
    domain, response = master_decision(response_text)
    # Match the substring check used inside master_decision — the streamed
    # classifier output may carry extra whitespace or punctuation, so a
    # strict equality test can miss the music branch.
    if "music" in domain:
        return response, response
    else:
        return response, tts_part_new(response)


def chat_interface():
    """Build and return the Gradio Blocks UI for the voice chatbot demo."""
    with gr.Blocks() as demo:
        audio_input = gr.Audio(
            sources="microphone",
            type="numpy",  # deliver (sample_rate, ndarray) to the handler
            label="Say Something...",
        )
        send_button = gr.Button(value="Send")
        response_textbox = gr.Textbox(label="Response Text")
        audio_output = gr.Audio(label="Response Audio")

        # Wire the button to the end-to-end pipeline: microphone audio in,
        # transcribed/answered text plus synthesized speech out.
        send_button.click(
            get_chatbot_response,
            inputs=[audio_input],
            outputs=[response_textbox, audio_output],
        )

    return demo


if __name__ == "__main__":
    # Build the UI and serve it; show_error surfaces handler exceptions in
    # the browser, which helps when debugging the hosted Space.
    demo = chat_interface()
    demo.launch(show_error=True)