giskard-evaluator

Running

File size: 9,278 Bytes

8f809e2
58c39e0
 
 
0607989
53fe897
8c47a22
53fe897
666860b
53fe897
1b20cb8
cdcc48e
53fe897
cdcc48e
53fe897
cdcc48e
53fe897
cdcc48e
85a8a8b
0607989
d1e5b15
 
 
 
c50532c
d1e5b15
 
0607989
9e4233f
5b8d6d5
9e4233f
 
3573a39
85a8a8b
9e4233f
d1e5b15
5b8d6d5
9e4233f
be473e6
136af2d
8092547
136af2d
9e4233f
d1e5b15
 
 
 
 
 
 
 
9e4233f
8c47a22
 
 
 
 
 
 
3573a39
9e4233f
d1e5b15
 
 
 
 
 
3573a39
5559b52
d2a76c0
5559b52
7055d8b
5b8d6d5
 
d2a76c0
53fe897
5b8d6d5
53fe897
 
9e4233f
7055d8b
 
 
9e4233f
d1e5b15
 
 
666860b
3573a39
9e4233f
3573a39
 
 
be473e6
 
 
 
 
5b8d6d5
be473e6
 
3573a39
5b8d6d5
be473e6
 
3573a39
8f114e2
35be7f4
58c39e0
666860b
58c39e0
 
8c47a22
58c39e0
3573a39
0607989
c50532c
0607989
 
1b20cb8
0607989
 
 
8c47a22
8f114e2
1b0298e
 
 
666860b
58c39e0
8092547
 
 
0607989
8092547
 
02cf07d
d1e5b15
 
02cf07d
 
 
 
 
 
d1e5b15
58c39e0
 
 
9e4233f
 
 
 
 
5b8d6d5
9e4233f
 
3573a39
8f809e2
53fe897
0607989
8f114e2
53fe897
 
 
3573a39
136af2d
 
8c47a22
 
 
8f114e2
8c47a22
5311dba
 
 
 
7f86019
d1e5b15
35be7f4
666860b
35be7f4
 
d1e5b15
8c47a22
 
d1e5b15
 
 
 
 
666860b
cdcc48e
d1e5b15
 
 
 
 
cdcc48e
d1e5b15
cdcc48e
 
3573a39
 
8f809e2
3573a39
136af2d
3573a39
 
 
9e4233f
1c00552
 
 
 
 
 
 
 
 
 
3573a39
 
 
136af2d
3573a39
53fe897
 
5b8d6d5
 
 
 
 
 
53fe897
7055d8b
d1e5b15
7055d8b
 
5311dba
7055d8b
d1e5b15
 
 
53fe897
5b8d6d5
 
 
 
3573a39
5b8d6d5
3573a39
 
 
 
 
5b8d6d5
8c47a22
3573a39
 
7055d8b
5311dba
3573a39
 
5b8d6d5
7055d8b
3573a39
 
 
9e4233f
 
 
 
3573a39
9e4233f
8f809e2
3573a39
 
 
 
5b8d6d5
3573a39
1b0298e
3573a39
b56bfdc
d1e5b15
 
 
b56bfdc
 
 
 
d1e5b15
3573a39
 
8f809e2
 
5b8d6d5
 
3573a39
8f809e2
8c47a22
cdcc48e
d1e5b15
 
 
 
 
8c47a22
3573a39
 
 
8f809e2
5b8d6d5
8f809e2
8c47a22
cdcc48e
d1e5b15
 
 
 
 
8c47a22
3573a39

import uuid

import gradio as gr

from io_utils import read_scanners, write_scanners
from text_classification_ui_helpers import (
    get_related_datasets_from_leaderboard,
    align_columns_and_show_prediction,
    get_dataset_splits,
    check_dataset,
    show_hf_token_info,
    precheck_model_ds_enable_example_btn,
    try_submit,
    empty_column_mapping,
    write_column_mapping_to_config,
    enable_run_btn,
)

import logging
from wordings import (
    EXAMPLE_MODEL_ID,
    CONFIRM_MAPPING_DETAILS_MD,
    INTRODUCTION_MD,
    USE_INFERENCE_API_TIP,
    USE_INFERENCE_API_NOTICE,
    CHECK_LOG_SECTION_RAW,
    HF_TOKEN_INVALID_STYLED,
)

# Number of hidden label-mapping dropdown slots that get_demo() pre-creates.
MAX_LABELS = 40
# Number of hidden feature-mapping dropdown slots that get_demo() pre-creates.
MAX_FEATURES = 20

# NOTE(review): not referenced in the visible part of this module; presumably
# the location of the evaluator's YAML config -- confirm against other modules.
CONFIG_PATH = "./config.yaml"
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def get_demo():
    """Lay out the text-classification evaluation UI and register its events.

    The function only instantiates Gradio components and attaches event
    handlers to them; it takes no arguments and returns nothing. Components
    are created in display order, so the statement order below is also the
    on-screen order.

    NOTE(review): presumably meant to be called while a ``gr.Blocks`` context
    is active (the caller is not visible here) -- confirm.
    """
    # ------------------------------------------------------------------
    # Layout
    # ------------------------------------------------------------------
    with gr.Row():
        gr.Markdown(INTRODUCTION_MD)
        # The uuid4 *callable* (not a generated id) is passed as the value.
        uid_label = gr.Textbox(
            value=uuid.uuid4,
            label="Evaluation ID:",
            interactive=False,
            visible=False,
        )

    with gr.Row():
        with gr.Column():
            with gr.Row():
                model_id_input = gr.Textbox(
                    placeholder=f"e.g. {EXAMPLE_MODEL_ID}",
                    label="Hugging Face Model id",
                )
            with gr.Row():
                model_id_error_info = gr.HTML(visible=False)

        with gr.Column():
            dataset_id_input = gr.Dropdown(
                label="Hugging Face Dataset id",
                choices=[],
                value="",
                allow_custom_value=True,
            )

    # Config and split selectors start hidden and accept free-form values.
    hidden_free_choice = {"visible": False, "allow_custom_value": True}
    with gr.Row():
        dataset_config_input = gr.Dropdown(
            label="Dataset Config", **hidden_free_choice
        )
        dataset_split_input = gr.Dropdown(
            label="Dataset Split", **hidden_free_choice
        )

    with gr.Row():
        dataset_preview = gr.DataFrame(label="Dataset Preview", visible=False)
    with gr.Row():
        dataset_status = gr.HTML(visible=True)
    with gr.Row():
        validate_btn = gr.Button(
            "Validate Model & Dataset",
            variant="primary",
            interactive=False,
            visible=True,
        )
    with gr.Row():
        validation_status = gr.HTML(visible=True)
    with gr.Row():
        validation_result = gr.HTML(visible=False)
    with gr.Row():
        example_input = gr.Textbox(
            label="Example Input", interactive=False, visible=False
        )
        example_prediction = gr.Label(label="Model Sample Prediction", visible=False)

    with gr.Row(), gr.Accordion(
        label="Label and Feature Mapping", open=False, visible=False
    ) as column_mapping_accordion:
        with gr.Row():
            gr.Markdown(CONFIRM_MAPPING_DETAILS_MD)
        with gr.Row():
            # A fixed pool of hidden dropdowns is created up front: first the
            # label slots, then the feature slots.
            with gr.Column():
                gr.Markdown("# Label Mapping")
                label_dropdowns = [
                    gr.Dropdown(visible=False) for _ in range(MAX_LABELS)
                ]
            with gr.Column():
                gr.Markdown("# Feature Mapping")
                feature_dropdowns = [
                    gr.Dropdown(visible=False) for _ in range(MAX_FEATURES)
                ]
        column_mappings = label_dropdowns + feature_dropdowns

    with gr.Accordion(label="Model Wrap Advanced Config", open=True):
        gr.HTML(USE_INFERENCE_API_TIP)
        inference_token = gr.Textbox(
            label="HF Token for Inference API",
            placeholder="hf_xxxxxxxxxxxxxxxxxxxx",
            value="",
            interactive=True,
            visible=True,
        )
        token_warning = gr.HTML(value=HF_TOKEN_INVALID_STYLED, visible=False)
        gr.HTML(USE_INFERENCE_API_NOTICE)

        inference_token.change(
            fn=show_hf_token_info,
            inputs=[inference_token],
            outputs=[token_warning],
        )

    with gr.Accordion(label="Scanner Advanced Config (optional)", open=False):
        with gr.Group():
            verbose_mode_checkbox = gr.Checkbox(label="Verbose mode")

        scanners = gr.CheckboxGroup(visible=True)

        @gr.on(triggers=[uid_label.change], inputs=[uid_label], outputs=[scanners])
        def get_scanners(uid):
            # Every scanner below is offered as a choice; which ones are
            # pre-selected is whatever read_scanners(uid) returns.
            # Original author's note: data_leakage is removed from the
            # *default* scanners because it barely raises any issues and takes
            # too many requests when using the inference API, causing rate
            # limit errors.
            available_scanners = [
                "ethical_bias",
                "text_perturbation",
                "robustness",
                "performance",
                "underconfidence",
                "overconfidence",
                "spurious_correlation",
                "data_leakage",
            ]
            return gr.update(
                label="Scan Settings",
                choices=available_scanners,
                value=read_scanners(uid),
                visible=True,
            )

    with gr.Row():
        run_btn = gr.Button(
            "Get Evaluation Result",
            size="lg",
            variant="primary",
            interactive=False,
        )

    with gr.Row():
        log_box = gr.Textbox(
            label="Log",
            value=CHECK_LOG_SECTION_RAW,
            every=0.5,
            visible=False,
        )

    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------
    # Component groups that recur across handlers. They are kept as tuples and
    # unpacked into a fresh list at every call site.
    core_inputs = (
        model_id_input,
        dataset_id_input,
        dataset_config_input,
        dataset_split_input,
    )
    dataset_check_outputs = (
        dataset_config_input,
        dataset_split_input,
        dataset_status,
    )
    prediction_widgets = (
        validation_result,
        example_input,
        example_prediction,
        column_mapping_accordion,
    )
    run_btn_inputs = (uid_label, inference_token, *core_inputs)

    scanners.change(write_scanners, inputs=[scanners, uid_label])

    # A model id change first runs the leaderboard dataset lookup, then
    # re-checks the dataset.
    datasets_suggested = gr.on(
        triggers=[model_id_input.change],
        fn=get_related_datasets_from_leaderboard,
        inputs=[model_id_input, dataset_id_input],
        outputs=[dataset_id_input],
    )
    datasets_suggested.then(
        fn=check_dataset,
        inputs=[dataset_id_input],
        outputs=[*dataset_check_outputs],
    )

    gr.on(
        triggers=[dataset_id_input.input, dataset_id_input.select],
        fn=check_dataset,
        inputs=[dataset_id_input],
        outputs=[*dataset_check_outputs],
    )

    dataset_config_input.change(
        fn=get_dataset_splits,
        inputs=[dataset_id_input, dataset_config_input],
        outputs=[dataset_split_input],
    )

    gr.on(
        triggers=[
            model_id_input.change,
            dataset_id_input.change,
            dataset_config_input.change,
        ],
        fn=empty_column_mapping,
        inputs=[uid_label],
    )

    # The mapping is written on both events, registered separately:
    # label.change sometimes does not pass the changed value, so label.input
    # is hooked as well.
    for event_name in ("change", "input"):
        gr.on(
            triggers=[getattr(dropdown, event_name) for dropdown in column_mappings],
            fn=write_column_mapping_to_config,
            inputs=[uid_label, *column_mappings],
        )

    gr.on(
        triggers=[component.change for component in core_inputs],
        fn=precheck_model_ds_enable_example_btn,
        inputs=[*core_inputs],
        outputs=[
            validate_btn,
            dataset_preview,
            *prediction_widgets,
            model_id_error_info,
        ],
    )

    gr.on(
        triggers=[validate_btn.click],
        fn=align_columns_and_show_prediction,
        inputs=[*core_inputs, uid_label, inference_token],
        outputs=[
            *prediction_widgets,
            run_btn,
            validation_status,
            *column_mappings,
        ],
    )

    gr.on(
        triggers=[run_btn.click],
        fn=try_submit,
        inputs=[*core_inputs, inference_token, uid_label, verbose_mode_checkbox],
        outputs=[run_btn, log_box, uid_label, *prediction_widgets],
    )

    # The run button state is recomputed on token/scanner input and, as a
    # separate registration, on input in any mapping dropdown.
    # FIXME: marker carried over from the original on the second registration
    # (its intent was not explained there).
    for run_btn_triggers in (
        [inference_token.input, scanners.input],
        [dropdown.input for dropdown in column_mappings],
    ):
        gr.on(
            triggers=run_btn_triggers,
            fn=enable_run_btn,
            inputs=[*run_btn_inputs],
            outputs=[run_btn],
        )