"""Load and merge the open-LLM and performance CSV reports.

Both reports are downloaded from the ``optimum/llm-perf-dataset`` dataset
repository on the Hugging Face Hub, merged on the model name, enriched with a
few derived columns and renamed to human-readable display labels.
"""

import os

import pandas as pd
from huggingface_hub import hf_hub_download

from .utils import process_quantization_scheme, process_arch

LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Maps raw report column names to the labels used in the returned dataframe.
# Only the keys listed here survive the final column filter, in this order.
COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "experiment_name": "Experiment 🧪",
    # primary measurements
    "forward.latency(s)": "Prefill (s)",
    "decode.throughput(tokens/s)": "Decode (tokens/s)",
    "generate.max_memory_allocated(MB)": "Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # additional measurements
    "Size": "Params (B)",
    "Arch": "Architecture 🏛️",
    "Score": "Open LLM Score (%)",
    "generate.latency(s)": "End-to-End (s)",
    "generate.throughput(tokens/s)": "End-to-End (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}

# Sort keys (display labels) and their per-key direction, applied in order.
# NOTE(review): as written, rows with equal score are ordered by *ascending*
# decode throughput and *descending* prefill latency, i.e. slowest first.
# The last two flags look swapped relative to "higher throughput / lower
# latency is better" - confirm the intent before changing the ordering.
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Decode (tokens/s)",
    "Prefill (s)",
]
SORTING_ASCENDING = [False, True, False]

# Benchmark settings that must be identical across every merged row.
_CONSTANT_BENCHMARK_COLUMNS = (
    "benchmark.input_shapes.batch_size",
    "benchmark.input_shapes.sequence_length",
    "benchmark.new_tokens",
)

# Dtype prefixes dropped from quantized experiment names. "bfloat16+" must be
# handled before "float16+": the latter is a substring of the former, so the
# reverse order would leave a stray "b" behind.
_DTYPE_PREFIXES = ("bfloat16+", "float16+", "float32+")

# (raw fragment, display fragment) pairs applied to every experiment name.
_EXPERIMENT_NAME_REPLACEMENTS = (
    ("bnb-4bit", "BnB-4bit"),
    ("bnb-8bit", "BnB-8bit"),
    ("awq-4bit", "AWQ-4bit"),
    ("gptq-4bit", "GPTQ-4bit"),
    ("bettertransformer", "SDPA"),
    ("flash-attention-v2", "FA-v2"),
)


def _download_csv(filename: str) -> pd.DataFrame:
    """Download ``filename`` from the dataset repo into ``dataset/`` and parse it."""
    # hf_hub_download returns the local path of the fetched file, so read from
    # that instead of rebuilding the path by hand.
    local_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=filename,
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(local_path)


def get_llm_df() -> pd.DataFrame:
    """Return the ``open-llm.csv`` report as a dataframe."""
    return _download_csv("open-llm.csv")


def get_perf_df(machine: str = "hf-dgx-01") -> pd.DataFrame:
    """Return the ``perf-report.csv`` of ``machine`` as a dataframe."""
    return _download_csv(f"{machine}/perf-report.csv")


def _optimization_label(row: pd.Series) -> str:
    """Name the attention optimization enabled for one benchmark row."""
    if row["backend.to_bettertransformer"]:
        return "BetterTransformer"
    if row["backend.use_flash_attention_2"]:
        return "FlashAttentionV2"
    return "None"


def _format_experiment_name(name: str) -> str:
    """Shorten a raw experiment name into its display form."""
    name = name.replace("pytorch+cuda+", "")
    # Only names mentioning a bit-width drop their dtype prefix.
    if "bit" in name:
        for prefix in _DTYPE_PREFIXES:
            name = name.replace(prefix, "")
    for raw, pretty in _EXPERIMENT_NAME_REPLACEMENTS:
        name = name.replace(raw, pretty)
    return name


def get_llm_perf_df(machine: str = "hf-dgx-01") -> pd.DataFrame:
    """Build the merged, display-ready dataframe for ``machine``.

    Args:
        machine: Name of the machine folder holding ``perf-report.csv``.

    Returns:
        A dataframe restricted to the columns of ``COLUMNS_MAPPING``, renamed
        to their display labels and sorted by ``SORTING_COLUMNS``.

    Raises:
        ValueError: If batch size, sequence length or number of new tokens is
            not the same single value across all merged rows.
    """
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    for column in _CONSTANT_BENCHMARK_COLUMNS:
        distinct = llm_perf_df[column].nunique()
        if distinct != 1:
            raise ValueError(
                f"expected exactly one distinct value in {column!r}, found {distinct}"
            )

    # Invert energy consumption (kWh/token -> tokens/kWh). Missing or
    # non-positive readings become <NA> in a nullable integer column, rather
    # than relying on a sentinel value or dividing by zero.
    kwh_per_token = llm_perf_df["generate.energy_consumption(kWh/token)"]
    tokens_per_kwh = 1 / kwh_per_token.where(kwh_per_token > 0)
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = tokens_per_kwh.floordiv(1).astype("Int64")

    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(_optimization_label, axis=1)

    # add quantization scheme
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.bits",
            "backend.quantization_config.version",
            "backend.quantization_config.load_in_4bit",
            "backend.quantization_config.load_in_8bit",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(process_quantization_scheme, axis=1)

    # process experiment name
    llm_perf_df["experiment_name"] = llm_perf_df["experiment_name"].apply(_format_experiment_name)

    # add arch
    llm_perf_df["Arch"] = llm_perf_df["Arch"].apply(process_arch)

    # Filter, rename and sort without `inplace=True`: the column filter yields
    # a derived frame, and mutating it in place can trigger pandas'
    # SettingWithCopyWarning.
    return (
        llm_perf_df[list(COLUMNS_MAPPING)]
        .rename(columns=COLUMNS_MAPPING)
        .sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)
    )