|
import json |
|
import time |
|
|
|
from datasets import Dataset |
|
from requests_futures.sessions import FuturesSession |
|
from transformers import AutoTokenizer |
|
|
|
from defaults import (ADDRESS_BETTERTRANSFORMER, ADDRESS_VANILLA, HEADERS, |
|
MODEL_NAME) |
|
|
|
# Template for a single-request report. Filled positionally by
# get_message_single(): response status, prediction, inference latency (ms),
# peak GPU memory (MB), end-to-end latency (ms). The padding ratio is
# hard-coded to 0.0 % here because a single request is never padded.
RETURN_MESSAGE_SINGLE = """

Inference statistics:



* Response status: {0}

* Prediction: {1}

* Inference latency (preprocessing/forward/postprocessing): {2} ms

* Peak GPU memory usage: {3} MB

* End-to-end latency (communication + pre/forward/post): {4} ms

* Padding ratio: 0.0 %

"""



# Template for the batched ("spam") report. Filled positionally by
# get_message_spam() with the aggregates computed in send_spam().
RETURN_MESSAGE_SPAM = """

Processing inputs sent asynchronously. Grab a coffee.



Inference statistics:



* Throughput: {0} samples/s

* Mean inference latency (preprocessing/forward/postprocessing): {1} ms

* Mean peak GPU memory: {2} MB

* Mean padding ratio: {3} %

* Mean sequence length: {4} tokens

* Effective mean batch size: {5}

"""
|
|
|
# Tokenizer loaded eagerly at import time for the shared MODEL_NAME.
# NOTE(review): not referenced anywhere in this chunk — presumably used by
# modules that import this file; confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
|
|
|
def get_message_single(
    status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs
):
    """Render the single-request statistics report.

    Positional fields are substituted into ``RETURN_MESSAGE_SINGLE`` in
    order. Extra keyword arguments are accepted and silently ignored so
    callers may pass a superset of fields.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
|
|
|
|
|
def get_message_spam(
    throughput,
    mean_inference_latency,
    mean_peak_gpu_memory,
    mean_padding_ratio,
    mean_sequence_length,
    effective_batch_size,
    **kwargs,
):
    """Render the batched ("spam") statistics report.

    Positional fields are substituted into ``RETURN_MESSAGE_SPAM`` in
    order. Extra keyword arguments are accepted and silently ignored so
    callers may pass a superset of fields.
    """
    stats = (
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )
    return RETURN_MESSAGE_SPAM.format(*stats)
|
|
|
|
|
# Shared session for asynchronous HTTP requests: .post() returns a future
# immediately; the response is fetched later via .result().
SESSION = FuturesSession()
|
|
|
|
|
def send_single(input_model_vanilla: str, address: str, timeout: float = 10):
    """Send one inference request and return a formatted statistics message.

    Args:
        input_model_vanilla: Raw text to classify; sent untokenized
            (``pre_tokenized=False``).
        address: Target endpoint; must be one of the two known backends.
        timeout: Per-request timeout in seconds. Defaults to the previously
            hard-coded 10 s, so existing callers are unaffected.

    Returns:
        The rendered single-request report on success, or the stringified
        exception when the request fails (timeout, connection error, ...).
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    inp = json.dumps({"text": input_model_vanilla, "pre_tokenized": False}).encode(
        "utf-8"
    )
    start = time.time()
    promise = SESSION.post(address, headers=HEADERS, data=inp, timeout=timeout)

    try:
        response = promise.result()
        # Measured right after the response arrives -> end-to-end latency
        # includes communication plus server-side pre/forward/post.
        end = time.time()
    except Exception as e:
        # Best-effort UI: surface the error as the returned message
        # instead of raising.
        return f"{e}"

    status = response.status_code

    # Server replies with a JSON list:
    # [prediction, inference_latency_ms, peak_gpu_memory_mb, ...]
    # NOTE(review): schema inferred from the indexing below — confirm
    # against the server implementation.
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = round((end - start) * 1e3, 2)

    return get_message_single(
        status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency
    )
|
|
|
|
|
def send_spam(inp: Dataset, address: str, timeout: float = 15):
    """Send every row of ``inp`` asynchronously and return aggregate stats.

    Args:
        inp: Dataset whose rows carry a ``"sentence"`` text field.
        address: Target endpoint; must be one of the two known backends.
        timeout: Per-request timeout in seconds. Defaults to the previously
            hard-coded 15 s, so existing callers are unaffected.

    Returns:
        The rendered batch report on success, the stringified exception if
        any request fails, or a short notice when the dataset is empty.
    """
    assert address in [ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER]

    n_inputs = len(inp)
    if n_inputs == 0:
        # Guard: every average below divides by n_inputs.
        return "Empty dataset: nothing to send."

    # Running sums; turned into means once all responses have arrived.
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0
    effective_batch_size = 0

    # Fire off all requests without waiting, so the server can batch them.
    start = time.time()
    promises = [
        SESSION.post(
            address,
            headers=HEADERS,
            data=inp[i]["sentence"].encode("utf-8"),
            timeout=timeout,
        )
        for i in range(n_inputs)
    ]

    # First pass waits for completion only, so `end` reflects network +
    # server time and is not inflated by local JSON parsing.
    end = 0
    for promise in promises:
        try:
            promise.result()
        except Exception as e:
            # Best-effort UI: surface the first failure as the message.
            return f"{e}"
        end = max(time.time(), end)

    # Second pass: results are cached on the futures, so .result() is free.
    for promise in promises:
        response = promise.result()
        # Response schema: [prediction, inference_latency_ms,
        # peak_gpu_memory_mb, n_pads, n_elems, sequence_length,
        # effective_batch_size]
        # NOTE(review): inferred from the indexing below — confirm against
        # the server implementation.
        response_text = json.loads(response.text)
        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]
        effective_batch_size += response_text[6]

    throughput = round(n_inputs / (end - start), 2)
    mean_padding_ratio = f"{n_pads / n_elems * 100:.2f}"
    mean_sequence_length = sequence_length / n_inputs
    effective_batch_size = effective_batch_size / n_inputs
    mean_inference_latency = round(mean_inference_latency / n_inputs, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / n_inputs, 2)

    return get_message_spam(
        throughput,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
        effective_batch_size,
    )
|
|