File size: 6,216 Bytes

fe1ad0f
31c7ef6
fe1ad0f
c1ec7d4
 
 
5e7c6b0
fe1ad0f
31c7ef6
 
7c4cfa7
31c7ef6
 
 
 
c1ec7d4
31c7ef6
 
 
b1db2e4
31c7ef6
fe1ad0f
b2cb6fd
fe1ad0f
b2cb6fd
d4fef3a
 
b1db2e4
d4fef3a
b1db2e4
d4fef3a
b1db2e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4fef3a
b1db2e4
 
 
 
 
 
 
 
d4fef3a
a85c56a
 
b1db2e4
 
 
 
 
405bbd1
b1db2e4
 
 
 
d4fef3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1db2e4
 
 
 
d4fef3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d965a7
 
 
 
 
 
d4fef3a
 
b1db2e4
d4fef3a
 
 
 
 
 
 
 
 
 
 
b2cb6fd

---
library_name: peft
tags:
- autotrain
- meta-llama
- meta-llama/Llama-2-7b-hf
inference: false
widget:
- text: 'instruction: "If you are a doctor, please answer the medical questions based
    on the patient''s description."

    input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
    I am having some major bilateral temple pain along with numbness that comes and
    goes in my left arm/hand/fingers. I have had headaches since the aneurysm, but
    this is different. Also, my moods have been horrible for the past few weeks."

    response: ''''

    '
pipeline_tag: text-generation
base_model: meta-llama/Llama-2-7b-hf
---
llama-2-7b-hf model finetuned for medical consultation. Works on T4 GPU (16GB VRAM), as well as CPU (32GB RAM)

**To run on GPU :**

```python
import transformers
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from torch import cuda, bfloat16

base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


hf_auth = "your-huggingface-access-token"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)



```



```python
def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)


```

**To run on CPU**


```python


import torch
import transformers
from torch import cuda, bfloat16
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


base_model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload = True
)

import torch
hf_auth = "YOUR-HUGGINGFACE-ACCESS-TOKEN"
model_config = transformers.AutoConfig.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    # device_map='auto',
    use_auth_token=hf_auth
)

config = PeftConfig.from_pretrained("Ashishkr/llama-2-medical-consultation")
model = PeftModel.from_pretrained(model, "Ashishkr/llama-2-medical-consultation").to(device)

model.eval()
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_id,
    use_auth_token=hf_auth
)

def llama_generate(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    prompt: str,
    max_new_tokens: int = 128,
    temperature: float = 0.92):

    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(
        device
    )

    # Check if bfloat16 is supported, otherwise use float16
    dtype_to_use = torch.float32
    with torch.autocast("cuda", dtype=dtype_to_use):
        response = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    decoded_output = tokenizer.decode(
        response["sequences"][0],
        skip_special_tokens=True,
    )

    return decoded_output[len(prompt) :]

prompt = """
 instruction: "If you are a doctor, please answer the medical questions based on the patient's description." \n

input: "Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year.
I am having some major bilateral temple pain along with numbness that comes and
goes in my left arm/hand/fingers. I have had headaches since the aneurysm,
but this is different. Also, my moods have been horrible for the past few weeks.\n

response:  """
# You can use the function as before
response = llama_generate(
    model,
    tokenizer,
    prompt,
    max_new_tokens=100,
    temperature=0.92,
)

print(response)

```