import time
import traceback

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name_or_path = "ClosedCharacter/Peach-9B-8k-Roleplay"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Check if a GPU is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU.")

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to(device)


def slow_echo(system_message, user_message):
    try:
        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message},
        ]
        # Build the prompt with the model's chat template; add_generation_prompt
        # makes generation start from the assistant's turn rather than
        # continuing the user's message.
        input_ids = tokenizer.apply_chat_template(
            conversation=messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(device)
        output = model.generate(
            inputs=input_ids,
            do_sample=True,
            temperature=0.3,
            top_p=0.5,
            no_repeat_ngram_size=6,
            repetition_penalty=1.1,
            max_new_tokens=512,
        )
        # Decode only the newly generated tokens, dropping the echoed
        # prompt and any special tokens from the response.
        generated_response = tokenizer.decode(
            output[0][input_ids.shape[-1]:], skip_special_tokens=True
        )
        # Stream the response character by character for a typing effect.
        for i in range(len(generated_response)):
            time.sleep(0.05)
            yield generated_response[: i + 1]
    except Exception as e:
        error_message = (
            f"An error occurred: {e}\n\nTraceback:\n{traceback.format_exc()}"
        )
        yield error_message


iface = gr.Interface(
    fn=slow_echo,
    inputs=[
        gr.Textbox(label="System Message"),
        gr.Textbox(label="User Message"),
    ],
    outputs=gr.Textbox(label="Generated Response"),
    title="Roleplay Chatbot",
)

if __name__ == "__main__":
    iface.launch()