# app.py

import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Retrieve Hugging Face API token from environment variables
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
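# The token is read from a .env file next to app.py (or a Space secret named
# HF_API_TOKEN). A hypothetical .env entry would look like:
#   HF_API_TOKEN=hf_xxxxxxxxxxxxxxxx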

# Streamlit app setup
st.title('Llama2 Chatbot Deployment on Hugging Face Spaces')
st.write("This chatbot is powered by the Llama2 model. Ask me anything!")

@st.cache_resource
def load_model():
    """
    Load the tokenizer and model from Hugging Face.
    This function is cached to prevent re-loading on every interaction.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        use_auth_token=HF_API_TOKEN  # Use the secret token
    )
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        torch_dtype=torch.float16,  # Use float16 for reduced memory usage
        device_map="auto",
        use_auth_token=HF_API_TOKEN  # Use the secret token
    )
    return tokenizer, model

# Load the model and tokenizer
tokenizer, model = load_model()

# Initialize session state for conversation history
if "conversation" not in st.session_state:
    st.session_state.conversation = []

# User input
user_input = st.text_input("You:", "")

if user_input:
    st.session_state.conversation.append({"role": "user", "content": user_input})
    with st.spinner("Generating response..."):
        try:
            # Prepare the conversation history for the model
            conversation_text = ""
            for message in st.session_state.conversation:
                if message["role"] == "user":
                    conversation_text += f"User: {message['content']}\n"
                elif message["role"] == "assistant":
                    conversation_text += f"Assistant: {message['content']}\n"

            # Encode the input
            inputs = tokenizer.encode(conversation_text + "Assistant:", return_tensors="pt").to(model.device)

            # Generate a response
            output = model.generate(
                inputs,
                max_new_tokens=256,  # cap the reply length (not prompt + reply) so long conversations still get full responses
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id  # To avoid warnings
            )

            # Decode only the newly generated tokens; slicing the decoded string
            # by prompt length is fragile because decoding does not always
            # reproduce the prompt text exactly
            generated_tokens = output[0][inputs.shape[-1]:]
            assistant_reply = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

            # Append the assistant's reply to the conversation history
            st.session_state.conversation.append({"role": "assistant", "content": assistant_reply})

            # Display the updated conversation
            conversation_display = ""
            for message in st.session_state.conversation:
                if message["role"] == "user":
                    conversation_display += f"**You:** {message['content']}\n\n"
                elif message["role"] == "assistant":
                    conversation_display += f"**Bot:** {message['content']}\n\n"

            st.markdown(conversation_display)

        except Exception as e:
            st.error(f"An error occurred: {e}")
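
# ------------------------------------------------------------------
# Local usage (a minimal sketch; exact package versions are assumptions):
#   pip install streamlit transformers torch python-dotenv accelerate
#   echo "HF_API_TOKEN=<your Hugging Face token>" > .env
#   streamlit run app.py
# `accelerate` is required because the model is loaded with device_map="auto",
# and your Hugging Face account must have been granted access to the gated
# meta-llama/Llama-2-7b-chat-hf repository.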