""" fine_tuning_app.py | |
Running a basic chatbot app that can compare base and fine-tuned models from Hugging face. | |
Note: | |
- run using streamlit run fine_tuning_app.py | |
- use free -h then sudo sysctl vm.drop_caches=2 to ensure I have cache space but this can mess up the venv | |
- may need to run huggingface-cli login in terminal to enable access to model | |
- Or: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/130 for above | |
- Hugging face can use up a lot of disc space - cd ~/.cache/huggingface/hub then rm -rf <subdir> | |
""" | |
import time

import streamlit as st
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo  # from nvidia-ml-py3; may need an IDE restart after install
# ---------------------------------------------------------------------------------------
# GENERAL SETUP:
# ---------------------------------------------------------------------------------------
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
hf_token = ""  # set this (or run huggingface-cli login) if the model repo is gated

# Select the model to load:
# model_name = "thebigoed/PreFineLlama-3.1-8B"       # works badly as it does not know the chat structure
# model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # this is what we were fine-tuning - also bad without chat instruct
# model_name = "Qwen/Qwen2.5-7B-Instruct"            # working well now
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"   # very effective. NB: if using a fine-grained access token, make sure it can access gated repos

st.title("Fine Tuning Testing")
col1, col2 = st.columns(2)

if 'conversation' not in st.session_state:
    st.session_state.conversation = []
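# st.session_state.conversation holds the chat history as a list of role/content
# dictionaries, the message format that tokenizer.apply_chat_template expects,
# e.g. (illustrative values only):
#   [{"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello, how can I help?"}]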
user_input = st.text_input("You:", "")  # user input
def print_gpu_utilization():
    # Used for basic resource monitoring.
    if not torch.cuda.is_available():
        return  # nothing to report on CPU-only machines
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024**2} MB.")
# ---------------------------------------------------------------------------------------
# MODEL SETUP:
# ---------------------------------------------------------------------------------------
def load_model():
    """ Load the model and tokenizer from Hugging Face. """
    print_gpu_utilization()
    # See https://huggingface.co/mlabonne/FineLlama-3.1-8B for how to run.
    # See https://huggingface.co/docs/transformers/main/en/chat_templating to decide on how we do templating.
    success_placeholder = st.empty()
    with st.spinner("Loading model... please wait"):
        # torch_dtype/device_map only apply to the model, not the tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token or None)
        model = AutoModelForCausalLM.from_pretrained(model_name,
                                                     torch_dtype="auto",
                                                     device_map="auto",
                                                     token=hf_token or None,
                                                     )
        # Not using terminators at the moment.
        # terminator = tokenizer.eos_token if tokenizer.eos_token else "<|endoftext|>"
    success_placeholder.success("Model loaded successfully!", icon="🔥")
    time.sleep(2)
    success_placeholder.empty()
    print_gpu_utilization()
    return model, tokenizer
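# Optional caching sketch (an assumption, not wired into the app): Streamlit re-runs the
# whole script on every widget interaction, so the model is reloaded on each button press.
# Wrapping the loader in st.cache_resource would keep a single copy in memory across
# reruns; load_model_cached is a hypothetical name, shown commented out as a sketch only.
#
# @st.cache_resource(show_spinner=False)
# def load_model_cached():
#     return load_model()
#
# model, tokenizer = load_model_cached()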
def generate_response():
    """ Query the model. """
    success_placeholder = st.empty()
    with st.spinner("Thinking..."):
        # Tokenise the conversation.
        if tokenizer.chat_template:
            text = tokenizer.apply_chat_template(st.session_state.conversation,
                                                 tokenize=True,
                                                 add_generation_prompt=True,
                                                 return_tensors="pt").to(DEVICE)
        else:  # base models do not have chat templates
            print("Assuming base model.")
            model_input = ""
            for entry in st.session_state.conversation:
                model_input += f"{entry['role']}: {entry['content']}\n"
            text = tokenizer(model_input + "assistant: ", return_tensors="pt")["input_ids"].to(DEVICE)
        outputs = model.generate(text,
                                 max_new_tokens=512,
                                 )
        # Decode only the newly generated tokens, not the prompt.
        outputs = tokenizer.batch_decode(outputs[:, text.shape[1]:], skip_special_tokens=True)[0]
        print_gpu_utilization()
    success_placeholder.success("Response generated!", icon="✅")
    time.sleep(2)
    success_placeholder.empty()
    return outputs
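# Generation-parameter sketch (assumptions, not the settings used above): the call above
# relies on the model's default generation config. If explicit sampling behaviour or a
# terminator is wanted, standard transformers arguments such as these could be passed:
#
# outputs = model.generate(text,
#                          max_new_tokens=512,
#                          do_sample=True,                        # sample instead of greedy decoding
#                          temperature=0.7,                       # softmax temperature
#                          top_p=0.9,                             # nucleus sampling cutoff
#                          eos_token_id=tokenizer.eos_token_id,   # stop at the end-of-sequence token
#                          )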
# ---------------------------------------------------------------------------------------
# RUNTIME EVENTS:
# ---------------------------------------------------------------------------------------
model, tokenizer = load_model()

# Submit button to send the query.
with col1:
    if st.button("send"):
        if user_input:
            st.session_state.conversation.append({"role": "user", "content": user_input})
            st.session_state.conversation.append({"role": "assistant", "content": generate_response()})

# Clear button to reset the conversation.
with col2:
    if st.button("clear chat"):
        st.session_state.conversation = []

# Display conversation history.
for chat in st.session_state.conversation:
    if chat['role'] == 'user':
        st.write(f"You: {chat['content']}")
    else:
        st.write(f"Assistant: {chat['content']}")