import gradio as gr
import time
import traceback

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "ClosedCharacter/Peach-9B-8k-Roleplay"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True)

"""
Reference example (not executed):

messages = [
    {"role": "system", "content": "You are a mature lady in black stockings"},
    {"role": "user", "content": "Hello, who are you?"},
]

input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, return_tensors="pt")
output = model.generate(
    inputs=input_ids.to("cpu"),
    do_sample=True,
    temperature=0.3,
    top_p=0.5,
    no_repeat_ngram_size=6,
    repetition_penalty=1.1,
    max_new_tokens=512)

generated_response = tokenizer.decode(output[0])
print("Generated response:", generated_response)
print("Example first response:", "Hello, I am your mature lady in black stockings.")
"""


def slow_echo(system_message, user_message):
    """Generate a full reply, then stream it back character by character."""
    try:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]

        input_ids = tokenizer.apply_chat_template(
            conversation=messages, tokenize=True, return_tensors="pt")

        output = model.generate(
            inputs=input_ids.to("cpu"),  # model is loaded on CPU by default
            do_sample=True,
            temperature=0.3,
            top_p=0.5,
            no_repeat_ngram_size=6,
            repetition_penalty=1.1,
            max_new_tokens=512)

        # Decode only the newly generated tokens so the prompt and special
        # tokens are not echoed back to the user.
        generated_response = tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True)

        # Reveal the reply one character at a time for a typing effect.
        for i in range(len(generated_response)):
            time.sleep(0.05)
            yield generated_response[: i + 1]
    except Exception as e:
        error_message = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        yield error_message


iface = gr.Interface(
    fn=slow_echo,
    inputs=[
        gr.Textbox(label="System Message"),
        gr.Textbox(label="User Message"),
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="Roleplay Chatbot",
)

if __name__ == "__main__":
    iface.launch()
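
# --- Illustrative alternative: token-level streaming ----------------------
# slow_echo generates the complete reply before replaying it character by
# character. The sketch below streams text as the model produces it, using
# transformers' TextIteratorStreamer: generate() runs in a background
# thread while the foreground loop yields partial output. This is an
# assumed variant, not part of the original app; the name stream_reply is
# hypothetical. To try it, move the definition above the gr.Interface call
# and pass fn=stream_reply instead of fn=slow_echo.
from threading import Thread

from transformers import TextIteratorStreamer


def stream_reply(system_message, user_message):
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    input_ids = tokenizer.apply_chat_template(
        conversation=messages, tokenize=True, return_tensors="pt")

    # skip_prompt drops the echoed prompt; skip_special_tokens strips
    # chat-template markers from the decoded text.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until completion, so run it in a worker thread and
    # consume the streamer incrementally here.
    thread = Thread(target=model.generate, kwargs=dict(
        inputs=input_ids.to("cpu"),
        streamer=streamer,
        do_sample=True,
        temperature=0.3,
        top_p=0.5,
        no_repeat_ngram_size=6,
        repetition_penalty=1.1,
        max_new_tokens=512))
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial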