"""Stream a completion from a Hugging Face Inference Endpoint, then serve a Gradio chat UI.

The script has two parts that both run at import time:

1. A streaming text-generation request against an Inference Endpoint whose
   tokens are printed to stdout as they arrive.
2. A Gradio ``Blocks`` application ("LI Chat") with a chat tab and two
   placeholder tabs, launched with a single-worker queue.

NOTE(review): ``small_and_beautiful_theme``, ``title``, ``description_top``,
``description``, ``predict``, ``reset_textbox``, ``transfer_input`` and
``reset_state`` are referenced below but not defined in this file's visible
source -- they must be provided elsewhere (confirm where they come from).
"""
import os

import gradio as gr
from huggingface_hub import InferenceClient

# HF Inference Endpoints parameters
endpoint_url = "https://YOUR_ENDPOINT.endpoints.huggingface.cloud"
hf_token = os.getenv("TOKEN_HF")

# Streaming client
client = InferenceClient(endpoint_url, token=hf_token)

# Generation parameters
# NOTE(review): the original list ended with an empty string "". An empty stop
# sequence cannot identify a real stop marker, so it was dropped; it was
# possibly an end-of-sequence tag lost in a copy/paste -- confirm and re-add
# the intended marker if so.
gen_kwargs = dict(
    max_new_tokens=512,
    top_k=30,
    top_p=0.9,
    temperature=0.2,
    repetition_penalty=1.02,
    stop_sequences=["\nUser:", "<|endoftext|>"],
)

# Prompt
prompt = "What can you do in Nuremberg, Germany? Give me 3 Tips"

stream = client.text_generation(prompt, stream=True, details=True, **gen_kwargs)

# Print each generated token as it arrives
for r in stream:
    # Skip special tokens
    if r.token.special:
        continue
    # Stop as soon as a token equals one of the stop sequences
    if r.token.text in gen_kwargs["stop_sequences"]:
        break
    # Emit the generated token without a trailing newline
    print(r.token.text, end="")

#######################################################################
# UI with Gradio

with open("custom.css", "r", encoding="utf-8") as f:
    customCSS = f.read()

# The stylesheet read above is handed to Blocks so it is actually applied
# (previously it was loaded but never used).
with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
    # Per-session state: conversation history and the pending user question
    history = gr.State([])
    user_question = gr.State("")
    gr.Markdown("KIs am LI - wähle aus, was du bzgl. KI-Bots ausprobieren möchtest!")
    with gr.Tabs():
        with gr.TabItem("LI-Chat"):
            with gr.Row():
                gr.HTML(title)
                status_display = gr.Markdown("Erfolg", elem_id="status_display")
            gr.Markdown(description_top)

            with gr.Row(scale=1).style(equal_height=True):
                # Left column: chat window, input box and action buttons
                with gr.Column(scale=5):
                    with gr.Row(scale=1):
                        chatbotGr = gr.Chatbot(elem_id="LI_chatbot").style(height="100%")
                    with gr.Row(scale=1):
                        with gr.Column(scale=12):
                            user_input = gr.Textbox(
                                show_label=False,
                                placeholder="Gib deinen Text / Frage ein.",
                            ).style(container=False)
                        with gr.Column(min_width=100, scale=1):
                            submitBtn = gr.Button("Absenden")
                        with gr.Column(min_width=100, scale=1):
                            cancelBtn = gr.Button("Stoppen")
                    with gr.Row(scale=1):
                        emptyBtn = gr.Button(
                            "🧹 Neuer Chat",
                        )

                # Right column: generation parameters (for testing only)
                with gr.Column():
                    with gr.Column(min_width=50, scale=1):
                        with gr.Tab(label="Nur zum Testen:"):
                            gr.Markdown("# Parameter")
                            top_p = gr.Slider(
                                minimum=0,
                                maximum=1.0,
                                value=0.95,
                                step=0.05,
                                interactive=True,
                                label="Top-p",
                            )
                            temperature = gr.Slider(
                                minimum=0.1,
                                maximum=2.0,
                                value=1,
                                step=0.1,
                                interactive=True,
                                label="Temperature",
                            )
                            max_length_tokens = gr.Slider(
                                minimum=0,
                                maximum=512,
                                value=512,
                                step=8,
                                interactive=True,
                                label="Max Generation Tokens",
                            )
                            max_context_length_tokens = gr.Slider(
                                minimum=0,
                                maximum=4096,
                                value=2048,
                                step=128,
                                interactive=True,
                                label="Max History Tokens",
                            )
            gr.Markdown(description)

        # Placeholder tab: translations (work in progress)
        with gr.TabItem("Übersetzungen"):
            with gr.Row():
                gr.Textbox(
                    show_label=False,
                    placeholder="Ist noch in Arbeit...",
                ).style(container=False)

        # Placeholder tab: code generation (work in progress)
        with gr.TabItem("Code-Generierungen"):
            with gr.Row():
                gr.Textbox(
                    show_label=False,
                    placeholder="Ist noch in Arbeit...",
                ).style(container=False)

    # Arguments for the prediction step of the event chain
    predict_args = dict(
        fn=predict,
        inputs=[
            user_question,
            chatbotGr,
            history,
            top_p,
            temperature,
            max_length_tokens,
            max_context_length_tokens,
        ],
        outputs=[chatbotGr, history, status_display],
        show_progress=True,
    )

    # New chat: clear the input box and update the status line
    reset_args = dict(
        fn=reset_textbox,
        inputs=[],
        outputs=[user_input, status_display],
    )

    # Chatbot: move the typed text into the question state before predicting
    transfer_input_args = dict(
        fn=transfer_input,
        inputs=[user_input],
        outputs=[user_question, user_input, submitBtn],
        show_progress=True,
    )

    # Listeners: start on button click or on Return in the text box
    predict_event1 = user_input.submit(**transfer_input_args).then(**predict_args)
    predict_event2 = submitBtn.click(**transfer_input_args).then(**predict_args)

    # Listeners for resetting the conversation
    emptyBtn.click(
        reset_state,
        outputs=[chatbotGr, history, status_display],
        show_progress=True,
    )
    emptyBtn.click(**reset_args)

    # The stop button previously had no listener; cancel any running
    # prediction chain (requires the queue, which is enabled below).
    cancelBtn.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[predict_event1, predict_event2],
    )

demo.title = "LI Chat"

# Pass share=True to launch() instead of debug=True to expose a public link.
demo.queue(concurrency_count=1).launch(debug=True)