swapnice
/

swapnice-openorcaxopenchat-preview2-13b

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

swapnice-openorcaxopenchat-preview2-13b / handler.py

Conrad Lippert-Zajaczkowski

test

80ef686 about 1 year ago

1.86 kB

	import torch
	from typing import Dict, List, Any
	from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

	from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

	# Initialise NVML once at import time so GPU memory usage can be queried
	# while the model is loading. Only the device at index 0 is inspected.
	nvmlInit()
	gpu_h1 = nvmlDeviceGetHandleByIndex(0)

	print('loaded_imports')
	# Choose the half-precision dtype used for the model weights. bfloat16 is
	# selected for CUDA compute capability major version 8 and anything newer;
	# an exact "== 8" test would push later GPU generations back to float16.
	dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
	print('chose dtype', dtype)


	class EndpointHandler:
	    """Inference handler that wraps a text-generation pipeline.

	    The tokenizer and model are loaded from fixed local directories under
	    ``/repository`` when the handler is constructed; every call forwards a
	    request payload to the resulting pipeline.
	    """

	    def __init__(self, path=""):
	        """Load the tokenizer and model, then build the generation pipeline.

	        Args:
	            path: Accepted for interface compatibility. It is not used by
	                this implementation because the tokenizer and model
	                directories are hard-coded.
	        """
	        print('starting to load tokenizer')
	        tokenizer = LlamaTokenizer.from_pretrained(
	            "/repository/orca_tokenizer", local_files_only=True
	        )
	        print('loaded tokenizer')
	        # Report GPU memory before and after the model load.
	        self._log_vram()
	        model = LlamaForCausalLM.from_pretrained(
	            "/repository/pytorch_model",
	            device_map="auto",
	            torch_dtype=dtype,
	            offload_folder="offload",
	            local_files_only=True,
	        )
	        self._log_vram()

	        print('loaded model')
	        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
	        print('created pipeline')

	    @staticmethod
	    def _log_vram():
	        """Print the total, used and free memory NVML reports for ``gpu_h1``."""
	        mem = nvmlDeviceGetMemoryInfo(gpu_h1)
	        print(f'vram {mem.total} used {mem.used} free {mem.free}')

	    def __call__(self, data: Any) -> Any:
	        """Run the pipeline for one request payload.

	        Args:
	            data: Request payload. When it is a dict, the pipeline inputs are
	                read from its ``"inputs"`` key (falling back to the rest of
	                the payload when that key is absent) and optional pipeline
	                keyword arguments from its ``"parameters"`` key. Any other
	                value is passed to the pipeline as the inputs unchanged.

	        Returns:
	            Whatever the pipeline returns for the given inputs.
	        """
	        print('starting to call')
	        if isinstance(data, dict):
	            # Work on a shallow copy so the caller's payload is left intact.
	            payload = dict(data)
	            inputs = payload.pop("inputs", payload)
	        else:
	            # Non-dict payloads carry no parameters; treat them as the inputs.
	            payload = {}
	            inputs = data
	        print('inputs: ', inputs)
	        parameters = payload.pop("parameters", None)

	        # Forward the optional parameters as keyword arguments.
	        if parameters is not None:
	            return self.pipeline(inputs, **parameters)
	        return self.pipeline(inputs)