# app.py
import os

import streamlit as st
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load environment variables from a local .env file (Spaces exposes Secrets as environment variables)
load_dotenv()

# Retrieve the Hugging Face API token (required for gated models such as Llama 2)
HF_API_TOKEN = os.getenv("HF_API_TOKEN")  # Set this in your Space's Settings -> Secrets
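
# For local development, python-dotenv reads a .env file placed next to app.py.
# A minimal sketch (the value below is a placeholder, not a real token):
#
#   HF_API_TOKEN=hf_xxxxxxxxxxxxxxxx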

# Streamlit app setup
st.title('Llama2 Chatbot Deployment on Hugging Face Spaces')
st.write("This chatbot is powered by the Llama2 model. Ask me anything!")

@st.cache_resource
def load_model():
    """
    Load the tokenizer and model from Hugging Face.
    Cached with st.cache_resource so the weights are loaded once,
    not re-loaded on every interaction.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        token=HF_API_TOKEN,  # Remove if the model is public
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        torch_dtype=torch.float16,  # Use float16 for reduced memory usage
        device_map="auto",          # Place layers on available devices (needs accelerate)
        token=HF_API_TOKEN,  # Remove if the model is public
    )
    return tokenizer, model

# Load the model and tokenizer
tokenizer, model = load_model()

# Initialize session state for conversation history
if "conversation" not in st.session_state:
    st.session_state.conversation = []

# User input
user_input = st.text_input("You:", "")

if user_input:
    st.session_state.conversation.append({"role": "user", "content": user_input})
    with st.spinner("Generating response..."):
        try:
            # Prepare the conversation history for the model
            conversation_text = ""
            for message in st.session_state.conversation:
                if message["role"] == "user":
                    conversation_text += f"User: {message['content']}\n"
                elif message["role"] == "assistant":
                    conversation_text += f"Assistant: {message['content']}\n"

            # Encode the input
            inputs = tokenizer.encode(conversation_text + "Assistant:", return_tensors="pt").to(model.device)
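
            # Note: the plain "User:/Assistant:" prompt above works, but Llama 2
            # chat models were fine-tuned on an [INST]-tagged format. On a recent
            # transformers release (>= 4.34 is an assumption about this environment),
            # the tokenizer's built-in chat template produces that format directly:
            #
            #   inputs = tokenizer.apply_chat_template(
            #       st.session_state.conversation,
            #       add_generation_prompt=True, return_tensors="pt"
            #   ).to(model.device)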

            # Generate a response
            output = model.generate(
                inputs,
                max_new_tokens=512,  # Cap the reply length regardless of how long the prompt has grown
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,  # Llama 2 defines no pad token; reuse EOS to avoid warnings
            )

            # Decode only the newly generated tokens; slicing the decoded string
            # by prompt length is fragile, since decoding an encoded prompt does
            # not always reproduce it character for character
            new_tokens = output[0][inputs.shape[-1]:]
            assistant_reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            # Append the assistant's reply to the conversation history
            st.session_state.conversation.append({"role": "assistant", "content": assistant_reply})

            # Display the updated conversation
            conversation_display = ""
            for message in st.session_state.conversation:
                if message["role"] == "user":
                    conversation_display += f"**You:** {message['content']}\n\n"
                elif message["role"] == "assistant":
                    conversation_display += f"**Bot:** {message['content']}\n\n"
            st.markdown(conversation_display)
        except Exception as e:
            st.error(f"An error occurred: {e}")
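
# ---------------------------------------------------------------------------
# Deployment note: a Space running this script also needs a requirements.txt.
# A minimal sketch based on the imports above (versions left unpinned here;
# accelerate is included because device_map="auto" depends on it):
#
#   streamlit
#   transformers
#   torch
#   accelerate
#   python-dotenv
# ---------------------------------------------------------------------------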