# app.py
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
from dotenv import load_dotenv

# Load environment variables (e.g. an HF_TOKEN for the gated Llama 2 weights)
load_dotenv()

# Set Hugging Face cache locations if needed
# os.environ["HF_HOME"] = "/path/to/huggingface"
# os.environ["TRANSFORMERS_CACHE"] = "/path/to/transformers/cache"

# Streamlit app setup
st.title("Llama2 Chatbot Deployment on Hugging Face Spaces")
st.write("This chatbot is powered by the Llama2 model. Ask me anything!")

MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"


@st.cache_resource
def load_model():
    """Load the tokenizer and model once and reuse them across Streamlit reruns.

    Note: meta-llama/Llama-2-7b-chat-hf is a gated model; accept the license on
    Hugging Face and authenticate (e.g. `huggingface-cli login` or an HF_TOKEN
    entry in your .env file) before running the app.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,  # Use float16 for reduced memory usage
        device_map="auto",          # Requires `accelerate`; maps layers to available devices
    )
    return tokenizer, model


# User input
user_input = st.text_input("You:", "")

if user_input:
    with st.spinner("Generating response..."):
        try:
            # Load (or reuse the cached) tokenizer and model
            tokenizer, model = load_model()

            # Wrap the input in the Llama 2 chat prompt format and encode it
            prompt = f"[INST] {user_input} [/INST]"
            inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

            # Generate a response
            output = model.generate(
                inputs,
                max_length=1000,  # Total length, including the prompt tokens
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
            )

            # Decode only the newly generated tokens, skipping the echoed prompt
            response = tokenizer.decode(output[0][inputs.shape[-1]:], skip_special_tokens=True)

            # Display the response
            st.text_area("Bot:", value=response, height=200)

        except Exception as e:
            st.error(f"An error occurred: {e}")
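
# ---------------------------------------------------------------------------
# A minimal sketch of the supporting files for deploying this app as a
# Hugging Face Space, assuming the Space runs app.py with Streamlit. The file
# name and the unpinned package list below are illustrative, not taken from
# the original; they simply mirror what this script imports.
#
# requirements.txt:
#   streamlit
#   transformers
#   torch
#   accelerate      # needed for device_map="auto"
#   python-dotenv
#
# Run locally with:
#   streamlit run app.py
# ---------------------------------------------------------------------------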