"""Client helpers that query the inference servers and format their statistics."""

import json

from datasets import load_dataset

from .defaults import (
    ADDRESS_BETTERTRANSFORMER,
    ADDRESS_VANILLA,
    HEADERS,
    SPAM_N_REQUESTS,
)
from .utils import ElapsedFuturesSession

# Loaded once at import time; `send_spam` samples its inputs from this split.
data = load_dataset("glue", "sst2", split="validation")

# Template filled positionally by `get_message_single`.
RETURN_MESSAGE_SINGLE = """
Inference statistics:

* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

# Template filled positionally by `get_message_spam`. The request count is
# baked in here, before `.format` is ever called on the template.
RETURN_MESSAGE_SPAM = """
Processing """ + f"{SPAM_N_REQUESTS}" + """ inputs sent asynchronously. Grab a coffee.

Inference statistics:

* Promise resolution time: {0} ms
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
"""


def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
    """Render the statistics report for a single request.

    Extra keyword arguments are accepted and ignored.
    """
    return RETURN_MESSAGE_SINGLE.format(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def get_message_spam(
    resolution_time,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    **kwargs,
):
    """Render the aggregated statistics report for a batch of requests.

    Extra keyword arguments are accepted and ignored.
    """
    return RETURN_MESSAGE_SPAM.format(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )


SESSION = ElapsedFuturesSession()


def _check_address(address: str) -> None:
    """Raise ``AssertionError`` unless *address* is one of the two known servers."""
    assert address in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER), (
        f"Unknown server address: {address!r}"
    )


def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def send_single(input_model_vanilla: str, address: str) -> str:
    """Send one input to the server at *address* and return a formatted report.

    Args:
        input_model_vanilla: Text to classify; sent UTF-8 encoded as the request body.
        address: Target server; must be ``ADDRESS_VANILLA`` or
            ``ADDRESS_BETTERTRANSFORMER``.

    Returns:
        ``RETURN_MESSAGE_SINGLE`` filled with the response status, the prediction,
        the server-reported latency and peak GPU memory, and the elapsed time
        recorded on the response.

    Raises:
        AssertionError: If *address* is not one of the two known servers.
    """
    _check_address(address)

    promise = SESSION.post(
        address, headers=HEADERS, data=input_model_vanilla.encode("utf-8")
    )
    response = promise.result()  # resolve immediately

    status = response.status_code
    response_text = json.loads(response.text)

    # The response body is indexed positionally: prediction, latency, peak memory.
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]

    # Rounded to two decimals, as `send_spam` does for its resolution time.
    end_to_end_latency = round(response.elapsed, 2)

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )


def send_spam(address: str) -> str:
    """Send ``SPAM_N_REQUESTS`` shuffled dataset sentences and report aggregates.

    All requests are queued before any result is awaited. Each response body is
    read positionally: index 1 is the inference latency, 2 the peak GPU memory,
    3 the padding count, 4 the element count and 5 the sequence length.

    Args:
        address: Target server; must be ``ADDRESS_VANILLA`` or
            ``ADDRESS_BETTERTRANSFORMER``.

    Returns:
        ``RETURN_MESSAGE_SPAM`` filled with the largest elapsed time over all
        responses and the mean latency, peak memory, padding ratio and sequence
        length. Means fall back to 0 instead of dividing by zero.

    Raises:
        AssertionError: If *address* is unknown or the dataset holds fewer than
            ``SPAM_N_REQUESTS`` rows.
    """
    _check_address(address)

    # TODO: use dynamic data with padding

    assert SPAM_N_REQUESTS <= len(data), (
        f"SPAM_N_REQUESTS ({SPAM_N_REQUESTS}) exceeds the dataset size ({len(data)})"
    )

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    # Queue every request before resolving any of them.
    promises = [
        SESSION.post(address, headers=HEADERS, data=example["sentence"].encode("utf-8"))
        for example in inp
    ]

    resolution_time = 0
    total_inference_latency = 0
    total_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    for promise in promises:
        response = promise.result()
        response_text = json.loads(response.text)

        # The slowest response bounds the time needed to resolve every promise.
        resolution_time = max(resolution_time, response.elapsed)

        total_inference_latency += response_text[1]
        total_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    mean_padding_ratio = f"{_safe_div(n_pads, n_elems) * 100:.2f}"
    mean_sequence_length = round(_safe_div(sequence_length, SPAM_N_REQUESTS), 2)

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(
        _safe_div(total_inference_latency, SPAM_N_REQUESTS), 2
    )
    mean_peak_gpu_memory = round(_safe_div(total_peak_gpu_memory, SPAM_N_REQUESTS), 2)

    return get_message_spam(
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )