import torch
from typing import Dict, List, Any
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

# initialise NVML so GPU memory usage can be logged before and after model loading
nvmlInit()
gpu_h1 = nvmlDeviceGetHandleByIndex(0)

print('loaded_imports')
# use bfloat16 on Ampere-or-newer GPUs (compute capability >= 8), otherwise fall back to float16
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print('chose dtype', dtype)


class EndpointHandler:
    # Hugging Face Inference Endpoints custom handler: __init__ loads the model and
    # tokenizer once at container start-up, __call__ serves each request.
    def __init__(self, path=""):
        # load the tokenizer from the local repository snapshot
        print('starting to load tokenizer')
        tokenizer = LlamaTokenizer.from_pretrained("/repository/orca_tokenizer", local_files_only=True)
        print('loaded tokenizer')
        # log GPU memory before loading the model weights
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')
        model = LlamaForCausalLM.from_pretrained(
            "/repository/pytorch_model",
            device_map="auto",          # let accelerate place layers on the available GPU(s)/CPU
            torch_dtype=dtype,
            offload_folder="offload",   # spill weights to disk if they do not fit in memory
            local_files_only=True
        )
        # log GPU memory again after the weights are loaded
        gpu_info1 = nvmlDeviceGetMemoryInfo(gpu_h1)
        print(f'vram total {gpu_info1.total} used {gpu_info1.used} free {gpu_info1.free}')

        print('loaded model')
        # create inference pipeline
        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
        print('created pipeline')

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
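        # data is the parsed JSON request body, e.g. {"inputs": "...", "parameters": {...}}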
        print('starting to call')
        inputs = data.pop("inputs", data)
        print('inputs: ', inputs)
        parameters = data.pop("parameters", None)

        # forward any generation parameters supplied in the request to the pipeline
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(inputs)
        # return the raw pipeline output, a list of {"generated_text": ...} dicts
        return prediction
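

# --- illustrative local smoke test (not part of the deployed handler contract) ---
# A minimal sketch of how Inference Endpoints exercises this handler: the service
# instantiates EndpointHandler once, then calls it with each request payload. The prompt
# and generation parameters below are arbitrary examples, not values from this repository.
if __name__ == "__main__":
    handler = EndpointHandler(path="/repository")
    payload = {
        "inputs": "Explain why the sky is blue in one sentence.",
        "parameters": {"max_new_tokens": 64, "do_sample": True, "temperature": 0.7},
    }
    output = handler(payload)
    print(output)  # expected shape: [{"generated_text": "..."}]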