Spaces:

sasan
/

KITT

Build error

File size: 17,206 Bytes

import os
import subprocess
import sys

# Install flash-attn at startup with FLASH_ATTENTION_SKIP_CUDA_BUILD set.
# `env=` replaces the child's entire environment, so the flag is merged into a
# copy of the current environment to keep PATH, HOME, proxy settings, etc.
# Arguments are passed as a list (no shell) and pip is run through the current
# interpreter so the package lands in the environment this script runs in.
# Best effort: a non-zero exit status is intentionally not treated as fatal.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)
# Run the `unidic` package's download command with the same interpreter.
subprocess.run([sys.executable, "-m", "unidic", "download"])

import gradio as gr
import spaces
from langchain.tools import tool
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.utils.function_calling import convert_to_openai_tool
from loguru import logger

from kitt.core import tts_gradio
from kitt.core import utils as kitt_utils
from kitt.core import voice_options
from kitt.core.model import generate_function_call as process_query
from kitt.core.stt import transcribe_audio
from kitt.core.tts import prep_for_tts, run_melo_tts, run_tts_replicate
from kitt.skills import (
    code_interpreter,
    date_time_info,
    do_anything_else,
    extract_func_args,
    find_route,
    get_forecast,
    get_weather,
    get_weather_current_location,
    search_along_route_w_coordinates,
    search_points_of_interest,
    set_vehicle_destination,
    set_vehicle_speed,
)
from kitt.skills.common import config, vehicle
from kitt.skills.routing import calculate_route, find_address

# Default trip endpoints; they seed the Origin/Destination textboxes and the
# first route that is computed when the UI is built.
ORIGIN = "Luxembourg, Luxembourg"
DESTINATION = "Paris, France"
# Defaults for the settings that the UI exposes (LLM backend, conversation
# history, text-to-speech and its backend, free-text user preferences).
DEFAULT_LLM_BACKEND = "local"
ENABLE_HISTORY = True
ENABLE_TTS = True
TTS_BACKEND = "local"
USER_PREFERENCES = "User prefers italian food."

# Module-level mutable state read and written by the handlers in this file.
# Unlike the per-session `gr.State` dict, this dict lives at module scope.
global_context = {
    "vehicle": vehicle,
    "query": "How is the weather?",
    "route_points": [],  # filled in once a route has been calculated
    "origin": ORIGIN,
    "destination": DESTINATION,
    "enable_history": ENABLE_HISTORY,
    "tts_enabled": ENABLE_TTS,
    "tts_backend": TTS_BACKEND,
    "llm_backend": DEFAULT_LLM_BACKEND,
    "map_origin": ORIGIN,
    "map_destination": DESTINATION,
    "update_proxy": 0,  # counter incremented by run_model after every query
    "map": None,  # most recently drawn route plot
}

# Cache object handed to tts_gradio on every call.
speaker_embedding_cache = {}
# Conversation history passed to the LLM on every query.
history = ChatMessageHistory()


# Choices for the time dropdown: each full hour of the day, "00:00:00" .. "23:00:00".
hour_options = [f"{hour:02d}:00:00" for hour in range(24)]


@tool
def search_along_route(query="restaurant"):
    """Search for points of interest along the route/way to the destination.

    Args:
        query (str, optional): The type of point of interest to search for. Defaults to "restaurant".

    """
    # The docstring text above is deliberately unchanged: this function is
    # wrapped by `@tool`, so its wording is part of what the tool exposes.
    # The default argument now matches the "restaurant" default it advertises
    # (it used to be an empty string).
    route_points = global_context["route_points"]
    # NOTE(review): the points are forwarded as stored; whether the helper
    # needs them reshaped was an open question in the original code.
    return search_along_route_w_coordinates(route_points, query)


def set_time(time_picker):
    """Store the selected time on the vehicle and return its status for display.

    Args:
        time_picker: Value chosen in the time dropdown.

    Returns:
        dict: ``vehicle.model_dump()``, the same serialized form that
        ``run_model`` and ``calculate_route_gradio`` send to the vehicle
        status JSON panel (previously the raw ``vehicle`` object was returned).
    """
    vehicle.time = time_picker
    return vehicle.model_dump()


# Skills the LLM is allowed to call.
functions = [
    # set_vehicle_speed,
    set_vehicle_destination,
    get_weather,
    find_route,
    search_points_of_interest,
    search_along_route,
]
# One converted tool description per skill, in the same order as `functions`.
openai_tools = list(map(convert_to_openai_tool, functions))


def clear_history():
    """Remove every message from the module-level conversation history."""
    logger.info("Clearing the conversation history...")
    history.clear()


@spaces.GPU
def run_llama3_model(query, voice_character, state):
    """Answer ``query`` with the LLM and optionally synthesize the reply as speech.

    Args:
        query: User request text.
        voice_character: Selected voice name. Names containing "Fast" are
            routed to ``run_melo_tts``.
        state: Session settings dict. Reads "user_preferences" and
            "llm_backend", plus "tts_enabled" and "tts_backend" when present.

    Returns:
        tuple: ``(output_text, voice_out)``. ``voice_out`` is ``None`` when
        text-to-speech is disabled.
    """
    assert len(functions) > 0, "No functions to call"
    assert len(openai_tools) > 0, "No openai tools to call"

    output_text = process_query(
        query,
        history=history,
        user_preferences=state["user_preferences"],
        tools=openai_tools,
        functions=functions,
        backend=state["llm_backend"],
    )
    gr.Info(f"Output text: {output_text}\nGenerating voice output...")
    output_text_tts = prep_for_tts(output_text)

    # Read the TTS settings from the session state, like the LLM settings
    # above, so one session's choice does not leak into another. The
    # module-level values remain the fallback when a key is missing.
    tts_enabled = state.get("tts_enabled", global_context["tts_enabled"])
    tts_backend = state.get("tts_backend", global_context["tts_backend"])

    if not tts_enabled:
        return output_text, None

    if "Fast" in voice_character:
        voice_out = run_melo_tts(output_text_tts, voice_character)
    elif tts_backend == "replicate":
        voice_out = run_tts_replicate(output_text_tts, voice_character)
    else:
        # tts_gradio returns a sequence; only its first element is used here.
        voice_out = tts_gradio(
            output_text_tts, voice_character, speaker_embedding_cache
        )[0]
    return output_text, voice_out


def run_model(query, voice_character, state):
    """Run one text query end to end and package the results for the UI.

    Args:
        query: Raw user text. Surrounding whitespace is stripped and
            apostrophes are removed before it is used.
        voice_character: Voice name forwarded to ``run_llama3_model``.
        state: Session settings dict. Read for "model", "llm_backend",
            "tts_enabled" and "enable_history".

    Returns:
        tuple: ``(text, voice, vehicle.model_dump(), state, update_proxy_dict)``
        where ``update_proxy_dict`` carries the incremented update counter.
    """
    query = query.strip().replace("'", "")
    model = state.get("model", "llama3")
    logger.info(
        f"Running model: {model} with query: {query}, voice_character: {voice_character} and llm_backend: {state['llm_backend']}, tts_enabled: {state['tts_enabled']}"
    )
    global_context["query"] = query

    text, voice = run_llama3_model(query, voice_character, state)

    # When the session opted out of history, wipe it after every turn.
    keep_history = state["enable_history"]
    if not keep_history:
        history.clear()

    # The incremented counter is returned for the `update_proxy` output.
    update_count = global_context["update_proxy"] + 1
    global_context["update_proxy"] = update_count

    vehicle_status = vehicle.model_dump()
    return text, voice, vehicle_status, state, {"update_proxy": update_count}


def calculate_route_gradio(origin, destination):
    """Compute a route, move the vehicle to its first point and draw the map.

    Args:
        origin: Start of the route, passed to ``calculate_route``.
        destination: End of the route, passed to ``calculate_route``.

    Returns:
        tuple: ``(plot, vehicle.model_dump(), 0)``. The trailing ``0`` is the
        value for the third output (the trip-progress slider at the call
        sites in ``create_demo``).

    Raises:
        gr.Error: If ``calculate_route`` yields no route points.
    """
    _, points = calculate_route(origin, destination)
    if not points:
        # Surface a readable message instead of an IndexError on points[0].
        raise gr.Error(f"No route found from {origin} to {destination}.")

    # Move the vehicle to the start of the route *before* plotting, so the
    # marker on the map matches the vehicle status returned below.
    start = points[0]
    vehicle.location_coordinates = start["latitude"], start["longitude"]

    plot = kitt_utils.plot_route(points, vehicle=vehicle.location_coordinates)
    global_context["map"] = plot
    global_context["route_points"] = points
    return plot, vehicle.model_dump(), 0


def update_vehicle_status(trip_progress, origin, destination, state):
    """Move the vehicle along the cached route according to the trip progress.

    Args:
        trip_progress: Progress along the route in percent (the slider in
            ``create_demo`` ranges from 0 to 100).
        origin: Route start; only used when no route points are cached yet.
        destination: Route end; also stored in ``global_context``.
        state: Session settings dict, returned unchanged.

    Returns:
        tuple: ``(vehicle.model_dump(), plot, state)``.

    Raises:
        gr.Error: If no route points are available.
    """
    if not global_context["route_points"]:
        _, points = calculate_route(origin, destination)
        global_context["route_points"] = points
    global_context["destination"] = destination

    route_points = global_context["route_points"]
    n_points = len(route_points)
    if n_points == 0:
        # Without this guard the index below would be -1 on an empty list.
        raise gr.Error(f"No route found from {origin} to {destination}.")

    # Map the percentage onto a point index, clamped to the last point so
    # that 100% does not run past the end of the route.
    index = min(int(trip_progress / 100 * n_points), n_points - 1)
    logger.info(f"Trip progress: {trip_progress} len: {n_points}, index: {index}")
    new_coords = route_points[index]
    new_coords = new_coords["latitude"], new_coords["longitude"]
    logger.info(
        f"Trip progress: {trip_progress}, len: {n_points}, new_coords: {new_coords}"
    )

    vehicle.location_coordinates = new_coords
    vehicle.location = find_address(new_coords[0], new_coords[1])
    plot = kitt_utils.plot_route(route_points, vehicle=vehicle.location_coordinates)
    # Return the serialized vehicle, like the other handlers that feed the
    # vehicle status JSON panel.
    return vehicle.model_dump(), plot, state


@spaces.GPU
def save_and_transcribe_run_model(audio, voice_character, state):
    """Transcribe recorded audio and run the transcript through ``run_model``.

    Args:
        audio: Recorded audio as delivered by the audio input component.
        voice_character: Voice name forwarded to ``run_model``.
        state: Session settings dict forwarded to ``run_model``.

    Returns:
        tuple: ``(None, transcript, reply_text, reply_voice, vehicle_status,
        state, update_proxy)``.
    """
    transcript = transcribe_audio(audio)
    reply_text, reply_voice, vehicle_json, new_state, proxy = run_model(
        transcript, voice_character, state
    )
    # The leading None is the value for the first output, which is the audio
    # input component itself at the call site in create_demo.
    return None, transcript, reply_text, reply_voice, vehicle_json, new_state, proxy


def set_tts_enabled(tts_enabled, state):
    """Turn text-to-speech on or off from the "Yes"/"No" radio value.

    Args:
        tts_enabled: Radio value; only the exact string "Yes" enables TTS.
        state: Session settings dict, updated in place.

    Returns:
        dict: The updated ``state``.
    """
    new_tts_enabled = tts_enabled == "Yes"
    message = f"TTS enabled was {state['tts_enabled']} and changed to {new_tts_enabled}"
    logger.info(message)
    # Keep the session state and the module-level context in step.
    state["tts_enabled"] = global_context["tts_enabled"] = new_tts_enabled
    return state


def set_llm_backend(llm_backend, state):
    """Select the LLM backend from the radio value.

    Args:
        llm_backend: One of "Ollama", "Replicate" or "Local".
        state: Session settings dict, updated in place.

    Returns:
        dict: The updated ``state``, with the backend stored in lower case.

    Raises:
        ValueError: If ``llm_backend`` is not one of the accepted names.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if llm_backend not in ("Ollama", "Replicate", "Local"):
        raise ValueError(f"Invalid LLM backend: {llm_backend!r}")
    new_llm_backend = llm_backend.lower()
    logger.info(
        f"LLM backend was {state['llm_backend']} and changed to {new_llm_backend}"
    )
    state["llm_backend"] = new_llm_backend
    global_context["llm_backend"] = new_llm_backend
    return state


def set_user_preferences(preferences, state):
    """Store the free-text user preferences.

    Args:
        preferences: Text from the preferences textbox.
        state: Session settings dict, updated in place.

    Returns:
        dict: The updated ``state``.
    """
    new_preferences = preferences
    logger.info(f"User preferences changed to: {new_preferences}")
    # Keep the session state and the module-level context in step.
    state["user_preferences"] = global_context["user_preferences"] = new_preferences
    return state


def set_enable_history(enable_history, state):
    """Turn conversation history on or off from the "Yes"/"No" radio value.

    Args:
        enable_history: Radio value; only the exact string "Yes" enables history.
        state: Session settings dict, updated in place.

    Returns:
        dict: The updated ``state``.
    """
    new_enable_history = enable_history == "Yes"
    message = f"Enable history was {state['enable_history']} and changed to {new_enable_history}"
    logger.info(message)
    # Keep the session state and the module-level context in step.
    state["enable_history"] = global_context["enable_history"] = new_enable_history
    return state


def set_tts_backend(tts_backend, state):
    """Select the text-to-speech backend from the radio value.

    Args:
        tts_backend: Backend name; it is stored in lower case.
        state: Session settings dict, updated in place.

    Returns:
        dict: The updated ``state``.
    """
    new_tts_backend = tts_backend.lower()
    message = f"TTS backend was {state['tts_backend']} and changed to {new_tts_backend}"
    logger.info(message)
    # Keep the session state and the module-level context in step.
    state["tts_backend"] = global_context["tts_backend"] = new_tts_backend
    return state


def conditional_update():
    """Return the map plot, redrawing it only when the trip endpoints changed.

    The vehicle's current location and destination are copied into
    ``global_context``. The route is recomputed when they differ from the
    endpoints the cached map was drawn for, or when no query has been run yet
    (``update_proxy`` is still 0).

    Returns:
        The plot stored in ``global_context["map"]``.
    """
    # Snapshot first so the values compared and the values remembered agree.
    location = vehicle.location
    destination = vehicle.destination

    global_context["destination"] = destination
    global_context["origin"] = location

    needs_redraw = (
        global_context["map_origin"] != location
        or global_context["map_destination"] != destination
        or global_context["update_proxy"] == 0
    )
    if needs_redraw:
        logger.info("Updating the map plot... in conditional_update")
        map_plot, _, _ = calculate_route_gradio(location, destination)
        global_context["map"] = map_plot
        # Remember which endpoints the cached map shows. Without this the
        # comparison above stays true forever once the vehicle has moved and
        # the route is recomputed on every call.
        global_context["map_origin"] = location
        global_context["map_destination"] = destination
    return global_context["map"]


# To use the microphone in Chrome when the app is served over plain HTTP, open chrome://flags/#unsafely-treat-insecure-origin-as-secure,
# add the app's origin (e.g. http://10.186.115.21:7860/) under "Insecure origins treated as secure", enable the flag, and relaunch Chrome.

# Example questions:
# What's the weather like outside?
# What's the closest restaurant from here?


def create_demo(tts_server: bool = False, model="llama3"):
    """Build the KITT Gradio interface and wire its event handlers.

    Args:
        tts_server: Only reported in the startup message; it is not used
            otherwise in this function.
        model: Model name stored in the session state under "model".

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    print(f"Running the demo with model: {model} and TTSServer: {tts_server}")
    with gr.Blocks(theme=gr.themes.Default(), title="KITT") as demo:
        # Per-session settings, passed to and returned from the handlers.
        state = gr.State(
            value={
                # "context": initial_context,
                "query": "",
                "route_points": [],
                "model": model,
                "tts_enabled": ENABLE_TTS,
                "llm_backend": DEFAULT_LLM_BACKEND,
                "user_preferences": USER_PREFERENCES,
                "enable_history": ENABLE_HISTORY,
                "tts_backend": TTS_BACKEND,
                "destination": DESTINATION,
            }
        )

        # Compute the default route once so the map has an initial plot.
        plot, _, _ = calculate_route_gradio(ORIGIN, DESTINATION)
        global_context["map"] = plot

        with gr.Row():
            # with gr.Row():
            #     gr.Text("KITT", interactive=False)
            with gr.Column(scale=1, min_width=300):
                vehicle_status = gr.JSON(
                    value=vehicle.model_dump(), label="Vehicle status"
                )
                time_picker = gr.Dropdown(
                    choices=hour_options,
                    label="What time is it? (HH:MM)",
                    value="08:00:00",
                    interactive=True,
                )
                voice_character = gr.Radio(
                    choices=voice_options,
                    label="Choose a voice",
                    value=voice_options[0],
                    show_label=True,
                )
                # voice_character = gr.Textbox(
                #     label="Choose a voice",
                #     value="freeman",
                #     show_label=True,
                # )
                origin = gr.Textbox(
                    value=ORIGIN,
                    label="Origin",
                    interactive=True,
                )
                destination = gr.Textbox(
                    value=DESTINATION,
                    label="Destination",
                    interactive=True,
                )
                preferences = gr.Textbox(
                    value=USER_PREFERENCES,
                    label="User preferences",
                    lines=3,
                    interactive=True,
                )

            with gr.Column(scale=2, min_width=600):
                map_plot = gr.Plot(value=plot, label="Map")
                trip_progress = gr.Slider(
                    0, 100, step=5, label="Trip progress", interactive=True
                )

            # with gr.Column(scale=1, min_width=300):
            #     gr.Image("linkedin-1.png", label="Linkedin - Sasan Jafarnejad")
            #     gr.Image(
            #         "team-ubix.png",
            #         label="Research Team - UBIX - University of Luxembourg",
            #     )

        with gr.Row():
            with gr.Column():
                input_audio = gr.Audio(
                    type="numpy",
                    sources=["microphone"],
                    label="Input audio",
                    elem_id="input_audio",
                )
                input_text = gr.Textbox(
                    value="How is the weather?", label="Input text", interactive=True
                )
                with gr.Accordion("Debug"):
                    # The debug recorder gets its own element id; it used to
                    # reuse "input_audio", producing two components with the
                    # same DOM id.
                    input_audio_debug = gr.Audio(
                        type="numpy",
                        sources=["microphone"],
                        label="Input audio",
                        elem_id="input_audio_debug",
                    )
                    input_text_debug = gr.Textbox(
                        value="How is the weather?",
                        label="Input text",
                        interactive=True,
                    )
                    # Receives the counter returned by run_model; its change
                    # event triggers the map refresh wired at the bottom.
                    update_proxy = gr.JSON(
                        value=dict(update_proxy=0),
                        label="Global context",
                    )
                with gr.Accordion("Config"):
                    tts_enabled = gr.Radio(
                        ["Yes", "No"],
                        label="Enable TTS",
                        value="Yes" if ENABLE_TTS else "No",
                        interactive=True,
                    )
                    tts_backend = gr.Radio(
                        ["Local"],
                        label="TTS Backend",
                        value=TTS_BACKEND.title(),
                        interactive=True,
                    )
                    llm_backend = gr.Radio(
                        choices=["Ollama", "Local"],
                        label="LLM Backend",
                        value=DEFAULT_LLM_BACKEND.title(),
                        interactive=True,
                    )
                    enable_history = gr.Radio(
                        ["Yes", "No"],
                        label="Maintain the conversation history?",
                        value="Yes" if ENABLE_HISTORY else "No",
                        interactive=True,
                    )
                # Push button
                clear_history_btn = gr.Button(value="Clear History")
            with gr.Column():
                output_audio = gr.Audio(label="output audio", autoplay=True)
                output_text = gr.TextArea(
                    value="", label="Output text", interactive=False
                )

        # Update plot based on the origin and destination
        # Sets the current location and destination
        origin.submit(
            fn=calculate_route_gradio,
            inputs=[origin, destination],
            outputs=[map_plot, vehicle_status, trip_progress],
        )
        destination.submit(
            fn=calculate_route_gradio,
            inputs=[origin, destination],
            outputs=[map_plot, vehicle_status, trip_progress],
        )
        preferences.submit(
            fn=set_user_preferences, inputs=[preferences, state], outputs=[state]
        )

        # Update time based on the time picker
        time_picker.select(fn=set_time, inputs=[time_picker], outputs=[vehicle_status])

        # Run the model when the input text is submitted
        input_text.submit(
            fn=run_model,
            inputs=[input_text, voice_character, state],
            outputs=[output_text, output_audio, vehicle_status, state, update_proxy],
        )
        input_text_debug.submit(
            fn=run_model,
            inputs=[input_text_debug, voice_character, state],
            outputs=[output_text, output_audio, vehicle_status, state, update_proxy],
        )

        # Set the vehicle status based on the trip progress
        trip_progress.release(
            fn=update_vehicle_status,
            inputs=[trip_progress, origin, destination, state],
            outputs=[vehicle_status, map_plot, state],
        )

        # Save and transcribe the audio, then run the model on the transcript
        input_audio.stop_recording(
            fn=save_and_transcribe_run_model,
            inputs=[input_audio, voice_character, state],
            outputs=[
                input_audio,
                input_text,
                output_text,
                output_audio,
                vehicle_status,
                state,
                update_proxy,
            ],
        )
        # The debug recorder only transcribes into the debug textbox.
        input_audio_debug.stop_recording(
            fn=transcribe_audio,
            inputs=[input_audio_debug],
            outputs=[input_text_debug],
        )

        # Clear the history
        clear_history_btn.click(fn=clear_history, inputs=[], outputs=[])

        # Config
        tts_enabled.change(
            fn=set_tts_enabled, inputs=[tts_enabled, state], outputs=[state]
        )
        tts_backend.change(
            fn=set_tts_backend, inputs=[tts_backend, state], outputs=[state]
        )
        llm_backend.change(
            fn=set_llm_backend, inputs=[llm_backend, state], outputs=[state]
        )
        enable_history.change(
            fn=set_enable_history, inputs=[enable_history, state], outputs=[state]
        )
        # Refresh the map whenever the update counter component changes.
        update_proxy.change(fn=conditional_update, inputs=[], outputs=[map_plot])

    return demo


# Close any Gradio interfaces that are still open so the port is available.
gr.close_all()


# Build the UI and serve it on every network interface, port 7860.
demo = create_demo(tts_server=False, model="llama3")
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    ssl_verify=False,
    debug=True,
)