import os
from threading import Thread
from typing import Iterator
import gradio as gr
import torch
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Known checkpoint ids; the first entry serves as the default when MODEL_ID
# is not set.
MODEL_LIST = ["LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"]
# Hugging Face access token read from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# MODEL_ID overrides the checkpoint. Fall back to the default so that an unset
# (or empty) variable never hands None to from_pretrained().
MODEL = os.environ.get("MODEL_ID") or MODEL_LIST[0]

# Markdown/HTML header for the demo page. Link targets are quoted because an
# unquoted attribute value may not contain "=" (the blog URL has a query string).
DESCRIPTION = """\
# EXAONE 3.0 7.8B Instruct

<span class="We-hope-EXAONE-continues-to-advance-Expert-AI-with-its-effectiveness-and-bilingual-skills">We hope EXAONE continues to advance Expert AI with its effectiveness and bilingual skills.</span>

<center>This is an official demo of <a href="https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct">LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct</a>, fine-tuned for instruction following.</center>

<center>👋 For more details, please check <a href="https://www.lgresearch.ai/blog/view?seq=460">our blog</a> or <a href="https://arxiv.org/abs/2408.03541">technical report</a></center>
"""

# Default number of tokens to generate per reply.
DEFAULT_MAX_NEW_TOKENS = 128
# Largest number of new tokens a user may request.
MAX_MAX_NEW_TOKENS = 4096
# Prompts longer than this many tokens are trimmed before generation; the
# limit can be overridden through the environment variable of the same name.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "3840"))

# First CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and weights both come from the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Load in bfloat16 with automatic device placement, then switch to inference
# mode (nn.Module.eval() returns the module itself, so the call can be chained).
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).eval()


@spaces.GPU()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` as a growing string.

    The conversation is rebuilt as a system turn, every past
    (user, assistant) pair and finally the new user message, then rendered
    with the tokenizer's chat template. Generation runs in a background
    thread and its decoded text is consumed through a
    ``TextIteratorStreamer``.

    Args:
        message: The new user message.
        chat_history: Earlier turns as ``(user, assistant)`` pairs.
        system_prompt: Content of the leading system message.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k cutoff; a value of 1 switches sampling off.

    Yields:
        The full reply generated so far, once per decoded chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    messages = [{"role": "system", "content": system_prompt}]
    print(f'message: {message}')
    print(f'chat_history: {chat_history}')
    for user, assistant in chat_history:
        messages.extend(
            [
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
            ]
        )
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        # Keep only the most recent tokens; the oldest part of the prompt
        # (starting with the system turn) is what gets cut off.
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from messages as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=top_k != 1,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=1.0,
    )

    worker_errors: list[Exception] = []

    def _run_generation() -> None:
        """Run generation; on failure record the error and close the stream."""
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Without this the consumer below would wait for the streamer
            # timeout and then fail with an unrelated queue.Empty.
            streamer.end()

    t = Thread(target=_run_generation)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

    # The stream is finished, so the worker is done (or about to be); wait for
    # it and surface any failure to the caller instead of dropping it.
    t.join()
    if worker_errors:
        raise worker_errors[0]


# Image file used as the assistant's avatar.
BOT_AVATAR = "EXAONE_logo.png"

# Chat display component: bubble layout, bubbles not stretched to full width,
# and an avatar pair of [None, BOT_AVATAR].
chatbot = gr.Chatbot(
    layout="bubble",
    bubble_full_width=False,
    avatar_images=[None, BOT_AVATAR],
    label="EXAONE-3.0-7.8B-Instruct",
)

# Extra controls passed to generate() after the message and chat history, in
# this order: system prompt, max new tokens, temperature, top-p, top-k.
_generation_controls = [
    gr.Textbox(
        label="System Prompt",
        value="You are EXAONE model from LG AI Research, a helpful assistant.",
        render=False,
    ),
    gr.Slider(
        minimum=1,
        maximum=MAX_MAX_NEW_TOKENS,
        step=1,
        value=DEFAULT_MAX_NEW_TOKENS,
        label="Max new tokens",
    ),
    gr.Slider(
        minimum=0.1,
        maximum=2.0,
        step=0.1,
        value=0.7,
        label="Temperature",
    ),
    gr.Slider(
        minimum=0.05,
        maximum=1.0,
        step=0.05,
        value=0.9,
        label="Top-p (nucleus sampling)",
    ),
    gr.Slider(
        minimum=1,
        maximum=1000,
        step=1,
        value=50,
        label="Top-k",
    ),
]

# Chat UI wired to generate(); examples are not cached and no stop button is
# configured.
chat_interface = gr.ChatInterface(
    fn=generate,
    chatbot=chatbot,
    additional_inputs=_generation_controls,
    examples=[
        ["Explain who you are"],
        ["너의 소원을 말해봐"],
    ],
    cache_examples=False,
    stop_btn=None,
)

# Page layout: the description header followed by the chat interface.
demo = gr.Blocks(fill_height=True, css="style.css")
with demo:
    gr.Markdown(DESCRIPTION)
    chat_interface.render()

if __name__ == "__main__":
    # Allow at most 20 queued requests, then start the server.
    demo.queue(max_size=20)
    demo.launch()