import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

# Load the model and tokenizer.
# Llama 3.1 uses a tokenizer that the legacy LlamaTokenizer class cannot load,
# so the Auto* classes are used instead. The Instruct variant is used here to
# match the chatbot use case; the model is gated on the Hub, so access and an
# HF token are required.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # Update this if using a custom LLaMA model
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model on {DEVICE}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Half precision to fit the 8B model in GPU memory
    device_map="auto"           # Automatically place layers on available devices (requires accelerate)
)

# Define a function for generating responses
def generate_response(prompt):
    # With device_map="auto", move inputs to wherever the model's first layer lives
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=512,   # Budget for generated tokens, independent of prompt length
            do_sample=True,       # Required for temperature/top_p to take effect
            temperature=0.7,      # Adjust creativity level
            top_p=0.95,           # Top-p (nucleus) sampling
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id  # Llama defines no pad token; avoids the generate() warning
        )
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response

# Gradio UI
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=3, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="LLaMA Response"),
    title="LLaMA 3.1 8B Chatbot",
    description="An interactive demo of the LLaMA 3.1 8B model using Hugging Face Spaces."
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
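
# A minimal sketch of chat-template prompting, assuming the Instruct variant above.
# Llama 3.1 Instruct models expect the Llama 3 chat format, which
# tokenizer.apply_chat_template() produces. The commented-out wrapper below
# (a hypothetical generate_chat_response helper, not part of the original app)
# shows how the plain-prompt path above could be adapted.
#
# def generate_chat_response(prompt):
#     messages = [{"role": "user", "content": prompt}]
#     input_ids = tokenizer.apply_chat_template(
#         messages,
#         add_generation_prompt=True,  # Append the assistant header so the model starts replying
#         return_tensors="pt"
#     ).to(model.device)
#     with torch.no_grad():
#         outputs = model.generate(
#             input_ids,
#             max_new_tokens=512,
#             do_sample=True,
#             temperature=0.7,
#             top_p=0.95,
#             pad_token_id=tokenizer.eos_token_id
#         )
#     return tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
#
# Passing fn=generate_chat_response to gr.Interface above would use this path instead.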