from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
import os
import torch

# Select which model to load: "pretrained" or "fine-tuned".
PREFERRED_MODEL = "pretrained"

if torch.cuda.is_available():
    print("Using GPU")
    device = torch.device("cuda")
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")
    device = torch.device("cpu")

# Log in to the Hugging Face Hub with the token from the environment.
token = os.getenv("HF_TOKEN")
if token is None:
    raise ValueError("HF_TOKEN is not set")
login(token=token)

# Resolve the model ID for the chosen variant.
if PREFERRED_MODEL == "pretrained":
    print("Using pretrained model")
    model_id = "mattshumer/Llama-3-8B-16K"
elif PREFERRED_MODEL == "fine-tuned":
    print("Using fine-tuned model")
    model_id = os.getenv("MODEL_ID")
    if model_id is None:
        raise ValueError("MODEL_ID is not set")
else:
    raise ValueError(f"Unknown PREFERRED_MODEL: {PREFERRED_MODEL}")

# Load the model in 8-bit to reduce memory use. device_map="auto" places the
# weights on the GPU when one is available instead of hard-coding "cuda".
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("Model loaded.")


def answer(prompt):
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs,
        max_new_tokens=150,  # max_length counts the prompt; max_new_tokens bounds only the completion
        do_sample=True,
        top_p=0.95,
        top_k=60,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt by token count rather than by character count of the
    # decoded prompt, which misaligns when special tokens appear in the output.
    generated = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
    return generated


if __name__ == "__main__":
    prompt = "Who is Leonardo Da Vinci?"
    print(answer(prompt))
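
# Example invocation (the filename "generate.py" is illustrative; assumes
# HF_TOKEN is exported and, for the fine-tuned path, MODEL_ID names a model
# on the Hub):
#   HF_TOKEN=hf_... python generate.py
#   HF_TOKEN=hf_... MODEL_ID=user/model python generate.py  # with PREFERRED_MODEL = "fine-tuned"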