from huggingface_hub import InferenceClient
import os

# Hugging Face Inference Endpoint that receives the generation request.
endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud"

# Access token, read from the TOKEN_HF environment variable (None when unset).
hf_token = os.getenv("TOKEN_HF")

# Client used for the streaming text-generation request below.
client = InferenceClient(endpoint_url, token=hf_token)

# Generation parameters forwarded to client.text_generation().
gen_kwargs = dict(
    max_new_tokens=512,
    top_k=30,
    top_p=0.9,
    temperature=0.2,
    repetition_penalty=1.02,
    stop_sequences=["\nUser:", "<|endoftext|>", "</s>"],
)

# Example prompt sent to the endpoint.
prompt = "What can you do in Nuremberg, Germany? Give me 3 Tips"

# stream=True yields results incrementally; the loop below relies on each
# streamed item exposing a `.token` object (requested via details=True).
stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)

# Print each generated token as soon as it arrives.
for r in stream:
    # Skip special tokens.
    if r.token.special:
        continue
    # Stop as soon as a token equals one of the configured stop sequences.
    if r.token.text in gen_kwargs["stop_sequences"]:
        break
    # flush=True: without it, stdout buffering can hold the tokens back until
    # a newline or the end of the stream, which defeats the streaming display.
    print(r.token.text, end="", flush=True)


#######################################################################
# Presentation with Gradio

# NOTE(review): `gr`, `small_and_beautiful_theme`, `title`, `description_top`,
# `description`, `predict`, `reset_textbox`, `transfer_input` and `reset_state`
# are neither imported nor defined in the visible part of this file; they must
# be provided elsewhere for this section to run.

# Load the custom stylesheet that is applied to the Blocks app below.
with open("custom.css", "r", encoding="utf-8") as f:
    customCSS = f.read()

# css=customCSS: the stylesheet was previously read but never handed to
# Gradio, so it had no effect on the components (several carry an elem_id).
with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
    # Per-session state: the conversation history and the last submitted question.
    history = gr.State([])
    user_question = gr.State("")
    gr.Markdown("KIs am LI - wähle aus, was du bzgl. KI-Bots ausprobieren möchtest!")
    with gr.Tabs():
        # --- Tab 1: chat ------------------------------------------------
        with gr.TabItem("LI-Chat"):
            with gr.Row():
                gr.HTML(title)
                status_display = gr.Markdown("Erfolg", elem_id="status_display")
            gr.Markdown(description_top)
            with gr.Row(scale=1).style(equal_height=True):
                # Left column: chat window, input box and buttons.
                with gr.Column(scale=5):
                    with gr.Row(scale=1):
                        chatbotGr = gr.Chatbot(elem_id="LI_chatbot").style(height="100%")
                    with gr.Row(scale=1):
                        with gr.Column(scale=12):
                            user_input = gr.Textbox(
                                show_label=False, placeholder="Gib deinen Text / Frage ein."
                            ).style(container=False)
                        with gr.Column(min_width=100, scale=1):
                            submitBtn = gr.Button("Absenden")
                        with gr.Column(min_width=100, scale=1):
                            cancelBtn = gr.Button("Stoppen")
                    with gr.Row(scale=1):
                        emptyBtn = gr.Button(
                            "🧹 Neuer Chat",
                        )
                # Right column: generation parameters (labelled as test-only in the UI).
                with gr.Column():
                    with gr.Column(min_width=50, scale=1):
                        with gr.Tab(label="Nur zum Testen:"):
                            gr.Markdown("# Parameter")
                            top_p = gr.Slider(
                                minimum=0,
                                maximum=1.0,
                                value=0.95,
                                step=0.05,
                                interactive=True,
                                label="Top-p",
                            )
                            temperature = gr.Slider(
                                minimum=0.1,
                                maximum=2.0,
                                value=1,
                                step=0.1,
                                interactive=True,
                                label="Temperature",
                            )
                            max_length_tokens = gr.Slider(
                                minimum=0,
                                maximum=512,
                                value=512,
                                step=8,
                                interactive=True,
                                label="Max Generation Tokens",
                            )
                            max_context_length_tokens = gr.Slider(
                                minimum=0,
                                maximum=4096,
                                value=2048,
                                step=128,
                                interactive=True,
                                label="Max History Tokens",
                            )
            gr.Markdown(description)

        # --- Tab 2: translations (placeholder only) ----------------------
        with gr.TabItem("Übersetzungen"):
            with gr.Row():
                gr.Textbox(
                    show_label=False, placeholder="Ist noch in Arbeit..."
                ).style(container=False)

        # --- Tab 3: code generation (placeholder only) -------------------
        with gr.TabItem("Code-Generierungen"):
            with gr.Row():
                gr.Textbox(
                    show_label=False, placeholder="Ist noch in Arbeit..."
                ).style(container=False)

    # Arguments for the prediction step: `predict` receives the stored
    # question, the chat widget, the history and the slider values, and
    # writes to the chat widget, the history and the status line.
    predict_args = dict(
        fn=predict,
        inputs=[
            user_question,
            chatbotGr,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbotGr, history, status_display],
        show_progress=True,
    )

    # Arguments for the "new chat" action: `reset_textbox` writes to the
    # input box and the status line.
    reset_args = dict(
        fn=reset_textbox, inputs=[], outputs=[user_input, status_display]
    )

    # Arguments for the first step of a submission: `transfer_input` reads
    # the input box and writes to user_question, user_input and submitBtn.
    transfer_input_args = dict(
        fn=transfer_input,
        inputs=[user_input],
        outputs=[user_question, user_input, submitBtn],
        show_progress=True,
    )

    # Start a prediction on Return in the textbox or on a click on the
    # submit button: first transfer the input, then run the prediction.
    predict_event1 = user_input.submit(**transfer_input_args).then(**predict_args)
    predict_event2 = submitBtn.click(**transfer_input_args).then(**predict_args)

    # Wire the stop button: previously it had no listener and the two
    # prediction events above were never used. Cancelling events requires
    # the queue, which is enabled before launch below.
    cancelBtn.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[predict_event1, predict_event2],
    )

    # "New chat": two listeners on the same button, one calling reset_state
    # and one calling reset_textbox (via reset_args).
    emptyBtn.click(
        reset_state,
        outputs=[chatbotGr, history, status_display],
        show_progress=True,
    )
    emptyBtn.click(**reset_args)

demo.title = "LI Chat"
# Enable the queue (one concurrent worker) and start the app in debug mode.
# Pass share=True to launch() instead to request a public share link.
demo.queue(concurrency_count=1).launch(debug=True)