Spaces:

fxmarty
/

bettertransformer-demo

Running

Felix Marty

add demo

35e3254 about 2 years ago

3.53 kB

	import json

	from .defaults import SPAM_N_REQUESTS, ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER, HEADERS
	from .utils import ElapsedFuturesSession

	from datasets import load_dataset

	# Validation split of the GLUE SST-2 dataset, loaded once when this module is imported.
	# send_spam() draws the sentences it sends from this dataset.
	data = load_dataset("glue", "sst2", split="validation")

	# Template for the report returned by send_single(), filled positionally via str.format():
	#   {0} response status code, {1} prediction, {2} inference latency (ms),
	#   {3} peak GPU memory (MB), {4} end-to-end latency (ms).
	# The "Padding ratio" line is hard-coded to 0.0 %; it is not a placeholder.
	RETURN_MESSAGE_SINGLE = """
	Inference statistics:

	* Response status: {0}
	* Prediction: {1}
	* Inference latency (preprocessing/forward/postprocessing): {2} ms
	* Peak GPU memory usage: {3} MB
	* End-to-end latency (communication + pre/forward/post): {4} ms
	* Padding ratio: 0.0 %
	"""

	# Template for the report returned by send_spam(). SPAM_N_REQUESTS is baked into the
	# header when the module is imported; the numbered placeholders are filled later via
	# str.format():
	#   {0} promise resolution time (ms), {1} mean inference latency (ms),
	#   {2} mean peak GPU memory (MB), {3} mean padding ratio (%),
	#   {4} mean sequence length (tokens).
	# NOTE(review): the header is concatenated instead of using a single f-string,
	# presumably so the {N} placeholders reach str.format() untouched.
	RETURN_MESSAGE_SPAM = """
	Processing """ + f"{SPAM_N_REQUESTS}" + """ inputs sent asynchronously. Grab a coffee.

	Inference statistics:

	* Promise resolution time: {0} ms
	* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
	* Mean peak GPU memory: {2} MB
	* Mean padding ratio: {3} %
	* Mean sequence length: {4} tokens
	"""

	def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
	    """Render the report for a single request.

	    The five values are substituted, in order, into the positional placeholders
	    of ``RETURN_MESSAGE_SINGLE``. Extra keyword arguments are accepted and ignored.
	    """
	    ordered_fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
	    return RETURN_MESSAGE_SINGLE.format(*ordered_fields)

	def get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length, **kwargs):
	    """Render the report for a batch of asynchronous requests.

	    The five values are substituted, in order, into the positional placeholders
	    of ``RETURN_MESSAGE_SPAM``. Extra keyword arguments are accepted and ignored.
	    """
	    ordered_fields = (
	        resolution_time,
	        mean_inference_latency,
	        mean_peak_gpu_memory,
	        mean_padding_ratio,
	        mean_sequence_length,
	    )
	    return RETURN_MESSAGE_SPAM.format(*ordered_fields)

	# Module-wide session shared by send_single() and send_spam(). Both call
	# SESSION.post(...), resolve the returned object with .result(), and read
	# .elapsed from the response.
	# NOTE(review): ElapsedFuturesSession is defined in .utils, which is not visible
	# here -- presumably it records per-request elapsed time in ms; confirm there.
	SESSION = ElapsedFuturesSession()

	def send_single(input_model_vanilla, address: str):
	    """Send one input to an inference endpoint and return a formatted report.

	    Args:
	        input_model_vanilla: Text to send; it is UTF-8 encoded and used as the
	            request body.
	        address: Target endpoint. Must be ``ADDRESS_VANILLA`` or
	            ``ADDRESS_BETTERTRANSFORMER``.

	    Returns:
	        ``RETURN_MESSAGE_SINGLE`` filled with the response status code, the first
	        three entries of the JSON-decoded response body (prediction, inference
	        latency, peak GPU memory) and ``response.elapsed``.

	    Raises:
	        AssertionError: If ``address`` is not one of the two known endpoints.
	    """
	    assert address in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER)

	    # Resolve the future right away: there is only one request to wait for.
	    response = SESSION.post(
	        address,
	        headers=HEADERS,
	        data=input_model_vanilla.encode("utf-8"),
	    ).result()

	    status = response.status_code
	    payload = json.loads(response.text)

	    return get_message_single(
	        status=status,
	        prediction=payload[0],
	        inf_latency=payload[1],
	        peak_gpu_memory=payload[2],
	        end_to_end_latency=response.elapsed,
	    )

	def send_spam(address: str):
	    """Send SPAM_N_REQUESTS randomly sampled sentences and report aggregate statistics.

	    All requests are queued on the shared ``SESSION`` before any of them is
	    resolved; the responses are then collected and their statistics aggregated.

	    Args:
	        address: Target endpoint. Must be ``ADDRESS_VANILLA`` or
	            ``ADDRESS_BETTERTRANSFORMER``.

	    Returns:
	        ``RETURN_MESSAGE_SPAM`` filled with the largest ``response.elapsed`` seen,
	        the mean inference latency, the mean peak GPU memory, the overall padding
	        ratio (in percent) and the mean sequence length, each rounded to two
	        decimals.

	    Raises:
	        AssertionError: If ``address`` is not a known endpoint, or if
	            ``SPAM_N_REQUESTS`` exceeds the number of sentences in ``data``.
	    """
	    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]
	    assert SPAM_N_REQUESTS <= len(data)

	    sampled = data.shuffle().select(range(SPAM_N_REQUESTS))

	    # Queue every request before resolving any of them.
	    promises = [
	        SESSION.post(address, headers=HEADERS, data=row["sentence"].encode("utf-8"))
	        for row in sampled
	    ]

	    resolution_time = 0
	    total_inference_latency = 0
	    total_peak_gpu_memory = 0
	    n_pads = 0
	    n_elems = 0
	    total_sequence_length = 0

	    for promise in promises:
	        response = promise.result()
	        stats = json.loads(response.text)

	        # The batch is only done once its slowest request has resolved.
	        resolution_time = max(resolution_time, response.elapsed)

	        # Positions 1..5 of the decoded body: inference latency, peak GPU memory,
	        # padding count, element count, sequence length (stats[0] is unused here).
	        total_inference_latency += stats[1]
	        total_peak_gpu_memory += stats[2]
	        n_pads += stats[3]
	        n_elems += stats[4]
	        total_sequence_length += stats[5]

	    # Guard the divisions so an empty batch (or a zero element count) yields
	    # zeroed statistics instead of a ZeroDivisionError.
	    n_requests = max(SPAM_N_REQUESTS, 1)
	    padding_percent = n_pads / n_elems * 100 if n_elems else 0.0

	    mean_padding_ratio = f"{padding_percent:.2f}"
	    # Rounded like the other statistics so the report is formatted uniformly.
	    mean_sequence_length = round(total_sequence_length / n_requests, 2)

	    resolution_time = round(resolution_time, 2)
	    mean_inference_latency = round(total_inference_latency / n_requests, 2)
	    mean_peak_gpu_memory = round(total_peak_gpu_memory / n_requests, 2)

	    return get_message_spam(
	        resolution_time,
	        mean_inference_latency,
	        mean_peak_gpu_memory,
	        mean_padding_ratio,
	        mean_sequence_length,
	    )