"""Gradio chat demo for the Starling-LM-7B-alpha model."""

import logging
from typing import List, Optional, Tuple

# NOTE(review): `spaces` stays ahead of `torch`, as in the original file;
# presumably it needs to be imported before CUDA is touched -- confirm.
import spaces
import gradio as gr
import torch
from gradio import State
from transformers import AutoTokenizer, AutoModelForCausalLM

logger = logging.getLogger(__name__)

MODEL_ID = "berkeley-nest/Starling-LM-7B-alpha"

# Markers of the chat template used to build the prompt:
#   GPT4 Correct User: <msg><|end_of_turn|>GPT4 Correct Assistant: <reply><|end_of_turn|>
USER_PREFIX = "GPT4 Correct User: "
ASSISTANT_PREFIX = "GPT4 Correct Assistant:"
END_OF_TURN = "<|end_of_turn|>"

MAX_HISTORY_CHARS = 1024  # characters (not tokens) of transcript kept between turns
MAX_PROMPT_TOKENS = 1024  # token budget for the prompt fed to the model
MAX_NEW_TOKENS = 512  # token budget for each generated reply

# Select the device (GPU if available, else CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# When a prompt exceeds the token budget, drop the oldest text rather than
# the newest question and the trailing assistant marker.
tokenizer.truncation_side = "left"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
model.eval()  # Set the model to evaluation mode


def _trim_history(history: str, limit: int = MAX_HISTORY_CHARS) -> str:
    """Return at most the last *limit* characters of *history*.

    When trimming is needed, the cut is moved forward to the next user-turn
    marker (if one exists in the kept tail) so the transcript does not start
    in the middle of a turn.
    """
    if len(history) <= limit:
        return history
    tail = history[-limit:]
    turn_start = tail.find(USER_PREFIX)
    return tail[turn_start:] if turn_start != -1 else tail


@spaces.GPU
def generate_response(
    user_input: str, chat_history: Optional[str]
) -> Tuple[str, Optional[str]]:
    """Generate the assistant's reply to *user_input*.

    Args:
        user_input: The new user message.
        chat_history: Transcript of the previous turns in prompt format, or
            ``None``/``""`` at the start of a conversation.

    Returns:
        ``(response, new_history)``. ``response`` holds only the newly
        generated text. ``new_history`` is the updated, length-capped
        transcript. If generation fails, ``response`` is an error message and
        the history is returned unchanged.
    """
    # gr.State may hand over None before the first turn.
    history = chat_history or ""
    prompt = f"{history}{USER_PREFIX}{user_input}{END_OF_TURN}{ASSISTANT_PREFIX}"

    try:
        # A single sequence needs no padding; truncation keeps the prompt
        # within MAX_PROMPT_TOKENS.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_PROMPT_TOKENS,
        ).to(device)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            output = model.generate(
                **inputs,
                # max_new_tokens budgets the reply only; max_length would also
                # count the prompt tokens.
                max_new_tokens=MAX_NEW_TOKENS,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
            )

        # The returned sequence starts with the prompt; decode only what
        # follows it.
        response = tokenizer.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()
    except Exception as e:  # UI boundary: report the failure instead of crashing
        logger.exception("Generation failed")
        return f"Error occurred: {e}", chat_history

    # `prompt` already contains the earlier history, so append only the reply.
    new_history = _trim_history(f"{prompt} {response}{END_OF_TURN}")
    return response, new_history


def respond(
    user_input: Optional[str],
    chat_messages: Optional[List],
    chat_history: Optional[str],
) -> Tuple[str, List, Optional[str]]:
    """Handle a click on "Send".

    Returns the new textbox value (always empty), the updated list of turns
    for the Chatbot, and the updated transcript state.
    """
    message = (user_input or "").strip()
    messages = list(chat_messages or [])
    if not message:
        # Nothing to send; leave the conversation untouched.
        return "", messages, chat_history

    response, new_history = generate_response(message, chat_history)
    # NOTE(review): [user, bot] pairs are the Chatbot's classic value format;
    # confirm against the Gradio version pinned for this Space.
    messages.append([message, response])
    return "", messages, new_history


# Gradio Interface
def clear_chat():
    """Reset the Chatbot display and the stored transcript."""
    return [], ""


with gr.Blocks(theme=gr.themes.Soft()) as app:
    with gr.Row():
        gr.Markdown("## Starling Chatbot")
        gr.Markdown("Run with your own hardware. This application exceeds 24GB VRAM")
        gr.Markdown(
            "```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all "
            "registry.hf.space/macadeliccc-starling-lm-7b-alpha-chat:latest "
            "python app.py```"
        )
    with gr.Row():
        chatbot = gr.Chatbot()
    with gr.Row():
        user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
        send = gr.Button("Send")
        clear = gr.Button("Clear")
    chat_history = gr.State("")  # Holds the chat transcript in prompt format

    send.click(
        respond,
        inputs=[user_input, chatbot, chat_history],
        outputs=[user_input, chatbot, chat_history],
    )
    clear.click(clear_chat, outputs=[chatbot, chat_history])

app.launch()