# zetavg
# add inference_ui and stuff
# d754e91 unverified
# raw
# history blame
# 2.08 kB
import os
import sys
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from .globals import Global
def get_device():
    """Return the name of the torch device to run on.

    Returns:
        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
        ``"mps"`` when the installed torch exposes ``torch.backends.mps``
        and reports it as available, otherwise ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"

    # Older torch builds have no ``torch.backends.mps`` attribute at all, so
    # probe for it instead of relying on a catch-all ``except``.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"

    return "cpu"


# Resolved once at import time; the loaders below branch on this value.
device = get_device()
def get_base_model():
    """Return ``Global.loaded_base_model`` after calling ``load_base_model()``."""
    load_base_model()
    base = Global.loaded_base_model
    return base
def get_model_with_lora(lora_weights: str = "tloen/alpaca-lora-7b"):
    """Wrap the base model with LoRA weights via ``PeftModel.from_pretrained``.

    Args:
        lora_weights: Identifier of the LoRA weights, passed straight through
            to ``PeftModel.from_pretrained``.

    Returns:
        Whatever ``PeftModel.from_pretrained`` returns for the base model
        obtained from ``get_base_model()``.
    """
    # Keyword arguments that differ per device; any device other than
    # "cuda" / "mps" only gets an explicit device map.
    pinned_to_device = {"": device}
    per_device_kwargs = {
        "cuda": {"torch_dtype": torch.float16},
        "mps": {"device_map": pinned_to_device, "torch_dtype": torch.float16},
    }
    extra_kwargs = per_device_kwargs.get(device, {"device_map": pinned_to_device})

    return PeftModel.from_pretrained(
        get_base_model(),
        lora_weights,
        **extra_kwargs,
    )
def get_tokenizer():
    """Return ``Global.loaded_tokenizer`` after calling ``load_base_model()``."""
    load_base_model()
    tokenizer = Global.loaded_tokenizer
    return tokenizer
def load_base_model():
    """Populate ``Global.loaded_tokenizer`` and ``Global.loaded_base_model``.

    Each one is loaded from ``Global.base_model`` only while the matching
    ``Global`` attribute is still ``None``, so repeated calls reuse whatever
    is already stored there. The keyword arguments passed to
    ``LlamaForCausalLM.from_pretrained`` depend on the module-level
    ``device`` value.
    """
    if Global.loaded_tokenizer is None:
        Global.loaded_tokenizer = LlamaTokenizer.from_pretrained(
            Global.base_model)

    if Global.loaded_base_model is not None:
        return

    if device == "cuda":
        Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
            Global.base_model,
            load_in_8bit=Global.load_8bit,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    elif device == "mps":
        Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
            Global.base_model,
            device_map={"": device},
            torch_dtype=torch.float16,
        )
    else:
        # Store the result on Global like the other branches; the previous
        # code read an undefined ``base_model`` name and kept the model in a
        # throwaway local, so nothing was ever cached on this path.
        Global.loaded_base_model = LlamaForCausalLM.from_pretrained(
            Global.base_model,
            device_map={"": device},
            low_cpu_mem_usage=True,
        )