---
license: mit
language:
- en
base_model:
- NousResearch/Hermes-3-Llama-3.1-8B
---

## Inference

#### Installing dependencies

```py
!git clone https://github.com/huggingface/transformers.git
%cd transformers
# Check out the branch or commit you want to build from source
!git checkout
!pip install .

!pip install -q accelerate==0.34.2 bitsandbytes==0.44.1 peft==0.13.1
```

#### Importing libs

```py
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
```

#### BitsAndBytes Config

```py
# Load the model weights in 4-bit precision
use_4bit = True

# Compute dtype for the 4-bit base model
bnb_4bit_compute_dtype = "float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Nested (double) quantization trades a little speed for extra memory savings
use_nested_quant = False

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
```

#### Loading Model

```py
# Load the model with the 4-bit quantization config
model_name = 'Ahanaas/HermesWithYou'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=0,  # place the model on GPU 0
)
```

#### Loading Tokenizer

```py
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = "right"
```

## Predictions

```py
# Run a text-generation pipeline with the loaded model
system_prompt = ''''''
prompt = ''''''

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,      # Increase this to allow for longer outputs
    temperature=0.5,         # Sampling temperature; higher values give more varied outputs
    top_k=50,                # Limits sampling to the 50 most likely tokens
    do_sample=True,          # Enables sampling
    return_full_text=False,  # Return only the generated response, not the prompt
)

# Build the ChatML-style prompt the Hermes models expect
result = pipe(
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{prompt}<|im_end|>\n"
    f"<|im_start|>assistant\n"
)

# Print the extracted response text
generated_text = result[0]['generated_text']
print(generated_text)
```
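
#### Prompting via the chat template (optional)

The ChatML prompt above can also be built by the tokenizer itself. This is a minimal sketch, assuming the tokenizer inherits the ChatML chat template from the Hermes 3 base model; `system_prompt`, `prompt`, and `pipe` are the same objects defined in the Predictions section.

```py
# Describe the conversation as a list of messages instead of hand-formatting the tags
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt},
]

# add_generation_prompt=True appends the trailing "<|im_start|>assistant\n"
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

result = pipe(chat_prompt)
print(result[0]['generated_text'])
```

Using the template keeps the special tokens in sync with the tokenizer config, so the prompt format does not drift if the upstream template changes.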