# Spaces: Intel/low_bit_open_llm_leaderboard (Running)
# File size: 10,322 Bytes

import json
import os
from datetime import datetime, timezone
import time

from huggingface_hub import ModelCard, snapshot_download

from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA, REPO, GIT_REQUESTS_PATH, GIT_STATUS_PATH, GLOBAL_COND
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
from src.submission.check_validity import (
    already_submitted_models,
    check_model_card,
    get_model_size,
    get_quantized_model_parameters_memory,
    is_model_on_hub,
    is_gguf_on_hub,
    user_submission_permission,
    get_model_tags
)

# Module-level caches. Both start out unset and are filled in by add_new_eval()
# from already_submitted_models(GIT_STATUS_PATH) the first time it runs.
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = None, None

def add_new_eval(
    model: str,
    revision: str,
    private: bool,
    compute_dtype: str = "float16",
    precision: str = "4bit",
    weight_dtype: str = "int4",
    gguf_ftype: str = "*Q4_0.gguf",
):
    """Validate a model submission and queue an evaluation request for it.

    The function checks the submitter's rate limit and the do-not-submit list,
    verifies that the repository is reachable (as a regular model and/or as a
    GGUF repository), derives the quantization settings from the model config,
    then writes the request as JSON to the eval-queue dataset and to the git
    requests/status directories.

    Args:
        model: Repository id, either ``"name"`` or ``"user/name"``.
        revision: Revision to evaluate; an empty string means ``"main"``.
        private: Stored in the request and embedded in the request file name.
        compute_dtype: Compute dtype stored in the request; ``"?"`` is
            replaced by ``"float16"``.
        precision: Accepted for interface compatibility only. The stored
            precision is always re-derived from the model configuration.
        weight_dtype: Used as-is only for quantized models whose derived
            precision is not 2, 3 or 4 bit; otherwise it is re-derived.
        gguf_ftype: GGUF file pattern passed to ``is_gguf_on_hub``; replaced
            by the value that helper reports when it reports one.

    Returns:
        The value produced by ``styled_error`` or ``styled_warning`` when the
        submission is rejected, otherwise the ``styled_message`` confirmation.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(GIT_STATUS_PATH)

    quant_type = None
    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Is the user rate limited?
    if user_name != "":
        user_can_submit, error_msg = user_submission_permission(
            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
        )
        if not user_can_submit:
            return styled_error(error_msg)

    # Did the model authors forbid its submission to the leaderboard?
    if model in DO_NOT_SUBMIT_MODELS:
        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    architecture = "?"
    downloads = 0
    created_at = ""
    # Bound up front so every later branch can rely on it being defined.
    quantization_config = None

    gguf_on_hub, error, gguf_files, new_gguf_ftype = is_gguf_on_hub(repo_id=model, filename=gguf_ftype)
    if new_gguf_ftype is not None:
        gguf_ftype = new_gguf_ftype

    # `error` is intentionally rebound: the message shown to the user is the
    # one reported by the regular model check.
    model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)

    # Is the model on the hub (either as a regular model or as GGUF files)?
    if (not model_on_hub or model_config is None) and (not gguf_on_hub or gguf_files is None):
        return styled_error(f'Model "{model}" {error}')

    if model_config is not None:
        architectures = getattr(model_config, "architectures", None)
        if architectures:
            architecture = ";".join(architectures)
        downloads = getattr(model_config, 'downloads', 0)
        created_at = getattr(model_config, 'created_at', '')
        quantization_config = getattr(model_config, 'quantization_config', None)

    if gguf_files is not None:
        # GGUF submissions take precedence over whatever the config reported.
        downloads = 0
        created_at = ""
        quantization_config = None
        quant_type = "llama.cpp"

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    # Were the model card and license filled?
    try:
        if model_info.cardData is None:
            model_license = "unknown"
        else:
            model_license = model_info.cardData.get("license", "unknown")
    except Exception:
        return styled_error("Please select a license for your model")

    # A missing or invalid model card is tolerated: the validity flag and the
    # error message returned here are deliberately ignored.
    _, _, model_card = check_model_card(model)

    tags = get_model_tags(model_card, model)

    # Seems good, creating the eval
    print("Adding new eval")

    # The stored precision is always derived from the config, never taken
    # from the caller-supplied `precision` argument.
    precision = "4bit"
    if quantization_config is not None:
        quant_method = quantization_config.get("quant_method", None)
        if "bnb_4bit_quant_type" in quantization_config:
            quant_method = "bitsandbytes"
            quant_type = "bitsandbytes"
            # A flag that is absent must not switch the precision; only an
            # explicit load_in_8bit selects 8 bit.
            if quantization_config.get("load_in_4bit", False):
                precision = "4bit"
            if quantization_config.get("load_in_8bit", False):
                precision = "8bit"

        if quant_method == "gptq":
            quant_type = "GPTQ"
            # Default is the bare number: the "bit" suffix is appended here.
            precision = f"{quantization_config.get('bits', 4)}bit"
        elif quant_method == "awq":
            quant_type = "AWQ"
            precision = f"{quantization_config.get('bits', 4)}bit"
        elif quant_method == "aqlm":
            quant_type = "AQLM"
            nbits_per_codebook = quantization_config.get('nbits_per_codebook')
            num_codebooks = quantization_config.get('num_codebooks')
            in_group_size = quantization_config.get('in_group_size')
            # Missing or zero fields would otherwise raise TypeError or
            # ZeroDivisionError in the computation below.
            if not (nbits_per_codebook and num_codebooks and in_group_size):
                return styled_error(
                    'AQLM quantization config must define "nbits_per_codebook", '
                    '"num_codebooks" and "in_group_size".'
                )
            bits = int(nbits_per_codebook * num_codebooks / in_group_size)
            precision = f"{bits}bit"

    # Other precisions (e.g. 8bit) keep the caller-supplied weight_dtype.
    weight_dtype = {"4bit": "int4", "3bit": "int3", "2bit": "int2"}.get(precision, weight_dtype)

    if not quant_type:
        # No quantization detected: evaluate as a plain fp32/fp16/bf16 model.
        quant_type = None

    if quant_type is None:
        weight_dtype = str(getattr(model_config, "torch_dtype", "float16"))
        if weight_dtype in ["torch.float16", "float16"]:
            weight_dtype = "float16"
            precision = "16bit"
        elif weight_dtype in ["torch.bfloat16", "bfloat16"]:
            weight_dtype = "bfloat16"
            precision = "16bit"
        else:
            # float32 and every unrecognised dtype are recorded as float32.
            weight_dtype = "float32"
            precision = "32bit"
        model_type = "original"
        model_params, model_size = get_model_size(model_info=model_info, precision=precision)
    else:
        model_params, model_size = get_quantized_model_parameters_memory(
            model_info,
            quant_method=quant_type.lower(),
            bits=precision,
        )
        model_type = "quantization"

    # Only llama.cpp (GGUF) submissions are queued for CPU; all others for GPU.
    if quant_type == "llama.cpp":
        hardware = "cpu"
        script = "llama_cpp"
        tags = "llama.cpp"
    else:
        hardware = "gpu"
        script = "ITREX"

    if compute_dtype == "?":
        compute_dtype = "float16"

    eval_entry = {
        "model": model,
        "revision": revision,
        "private": private,
        "params": model_size,
        "architectures": architecture,
        "quant_type": quant_type,
        "precision": precision,
        "model_params": model_params,
        "model_size": model_size,
        "weight_dtype": weight_dtype,
        "compute_dtype": compute_dtype,
        "gguf_ftype": gguf_ftype,
        "hardware": hardware,
        "status": "Pending",
        "submitted_time": current_time,
        "model_type": model_type,
        "job_id": -1,
        "job_start_time": None,
        "scripts": script
    }

    # NOTE(review): this dict is built but never written anywhere in this
    # function - confirm whether it should be persisted.
    supplementary_info = {
        "likes": model_info.likes,
        "license": model_license,
        "still_on_hub": True,
        "tags": tags,
        "downloads": downloads,
        "created_at": created_at
    }
    print(eval_entry)

    # ToDo: need open
    # Check for duplicate submission
    if f"{model}_{revision}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    # The same file name is used in the eval-queue, requests and status trees.
    request_file = f"{model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json"

    print("Creating huggingface/dataset eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{request_file}"

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    # Best effort: a failed dataset upload is logged and does not abort the
    # submission.
    try:
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model} to eval queue",
        )
    except Exception as e:
        print(str(e))
        print("upload error........")

    print("Creating git eval file")
    OUT_DIR = f"{GIT_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    req_out_path = f"{OUT_DIR}/{request_file}"
    # Drop the first path component to get the path used for `git add`.
    req_git_path = "/".join(req_out_path.split('/')[1:])

    print("Creating status file")
    OUT_DIR = f"{GIT_STATUS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    sta_out_path = f"{OUT_DIR}/{request_file}"
    sta_git_path = "/".join(sta_out_path.split('/')[1:])

    print("Uploading eval file")
    print("git-push get lock..............")
    # Acquire outside the try so the `finally` below only ever releases a lock
    # this call actually holds, and always releases it exactly once.
    GLOBAL_COND.acquire()
    try:
        branch = REPO.active_branch.name
        REPO.remotes.origin.pull(branch)

        REPO.index.remove("requests", False, r=True)

        with open(req_out_path, "w") as f:
            f.write(json.dumps(eval_entry, indent=4))
        with open(sta_out_path, "w") as f:
            f.write(json.dumps(eval_entry, indent=4))

        REPO.index.add([req_git_path, sta_git_path])
        REPO.index.commit(f"Add {model} to eval requests/status.")
        REPO.remotes.origin.push(branch)
        time.sleep(10)

        print("git-push release lock..............")
    except Exception as e:
        # NOTE(review): the failure is only logged and the success message is
        # still returned below - confirm this is intended.
        print(str(e))
        print("git-push error........")
    finally:
        GLOBAL_COND.release()

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to 3 hours for the model to show in the PENDING list."
    )