# inoki-giskard's picture
# Use another process to run scan
# 60a5363
# raw
# history blame
# 9.91 kB
import gradio as gr
import datasets
import huggingface_hub
import os
import time
import subprocess
import logging
import json
from transformers.pipelines import TextClassificationPipeline
from text_classification import text_classification_fix_column_mapping
# Names of the environment variables looked up when an evaluation is submitted.
HF_REPO_ID = "HF_REPO_ID"
HF_SPACE_ID = "SPACE_ID"
HF_WRITE_TOKEN = "HF_WRITE_TOKEN"

# Theme handed to the Gradio Blocks UI.
theme = gr.themes.Soft(primary_hue="green")
def check_model(model_id):
    """Look up *model_id* on the Hugging Face Hub and try to build its pipeline.

    Returns:
        ``(None, None)`` when the model info cannot be fetched,
        ``(model_id, pipeline)`` when the pipeline was created, or
        ``(model_id, exception)`` when creating the pipeline raised.
    """
    try:
        info = huggingface_hub.model_info(model_id)
        pipeline_tag = info.pipeline_tag
    except Exception:
        return (None, None)

    try:
        # Imported lazily, inside the guarded section, so an import failure is
        # reported the same way as a pipeline construction failure.
        from transformers import pipeline as build_pipeline

        outcome = build_pipeline(task=pipeline_tag, model=model_id)
    except Exception as load_error:
        outcome = load_error

    return model_id, outcome
def check_dataset(dataset_id, dataset_config="default", dataset_split="test"):
    """Check that a dataset, its config and its split can be used.

    Args:
        dataset_id: Hugging Face dataset id.
        dataset_config: Name of the dataset config (subset) to check.
        dataset_split: Name of the split to check.

    Returns:
        A ``(dataset_id, config, split)`` tuple whose shape encodes the result:

        * ``(None, dataset_config, dataset_split)`` - the config names could
          not be listed or the dataset could not be loaded.
        * ``(dataset_id, [config names], dataset_split)`` - ``dataset_config``
          is not one of the dataset's configs.
        * ``(dataset_id, None, [split names])`` - ``dataset_split`` is not one
          of the loaded splits.
        * ``(dataset_id, None, None)`` - the loaded object is neither a
          ``DatasetDict`` nor a ``Dataset``.
        * ``(dataset_id, dataset_config, dataset_split)`` - everything matches.
    """
    try:
        configs = datasets.get_dataset_config_names(dataset_id)
    except Exception:
        # Dataset may not exist
        return None, dataset_config, dataset_split

    if dataset_config not in configs:
        # Need to choose dataset subset (config)
        return dataset_id, configs, dataset_split

    try:
        ds = datasets.load_dataset(dataset_id, dataset_config)
    except Exception:
        # Loading can still fail after the configs were listed; report it the
        # same way as an inaccessible dataset instead of crashing the caller.
        logging.exception(
            'Failed to load dataset "%s" with config "%s"', dataset_id, dataset_config
        )
        return None, dataset_config, dataset_split

    if isinstance(ds, datasets.DatasetDict):
        # Need to choose dataset split
        if dataset_split not in ds:
            return dataset_id, None, list(ds)
    elif not isinstance(ds, datasets.Dataset):
        # Unknown type
        return dataset_id, None, None

    return dataset_id, dataset_config, dataset_split
def _failure_outputs(config, split, prediction_update=None, column_update=None):
    """Build the six handler outputs for a validation that did not pass.

    The submit button is always disabled and the label mapping preview hidden.
    The prediction preview is hidden and the column mapping box is simply kept
    visible unless an explicit update is supplied for them.
    """
    if prediction_update is None:
        prediction_update = gr.update(visible=False)
    if column_update is None:
        column_update = gr.update(visible=True)
    return (
        config,
        split,
        gr.update(interactive=False),  # Submit button
        prediction_update,  # Model prediction preview
        gr.update(visible=False),  # Label mapping preview
        column_update,  # Column mapping
    )


def try_validate(model_id, dataset_id, dataset_config, dataset_split, column_mapping):
    """Validate the model/dataset pair and compute the UI updates to apply.

    Args:
        model_id: Hugging Face model id entered by the user.
        dataset_id: Hugging Face dataset id entered by the user.
        dataset_config: Selected dataset config (subset).
        dataset_split: Selected dataset split.
        column_mapping: Column mapping typed by the user, as JSON text.

    Returns:
        A 6-tuple of values/updates, in this order: dataset config, dataset
        split, submit button, model prediction preview, label mapping preview,
        column mapping. The submit button is only made interactive when every
        check passed.
    """
    # Validate model
    m_id, ppl = check_model(model_id=model_id)
    if m_id is None:
        gr.Warning(f'Model "{model_id}" is not accessible. Please set your HF_TOKEN if it is a private model.')
        return _failure_outputs(dataset_config, dataset_split)
    if isinstance(ppl, Exception):
        gr.Warning(f'Failed to load model "{model_id}": {ppl}')
        return _failure_outputs(dataset_config, dataset_split)

    # Validate dataset
    d_id, config, split = check_dataset(
        dataset_id=dataset_id,
        dataset_config=dataset_config,
        dataset_split=dataset_split,
    )

    dataset_ok = False
    if d_id is None:
        gr.Warning(f'Dataset "{dataset_id}" is not accessible. Please set your HF_TOKEN if it is a private dataset.')
    elif isinstance(config, list):
        gr.Warning(f'Dataset "{dataset_id}" does not have "{dataset_config}" config. Please choose a valid config.')
        config = gr.update(choices=config, value=config[0] if config else None)
    elif isinstance(split, list):
        gr.Warning(f'Dataset "{dataset_id}" does not have "{dataset_split}" split. Please choose a valid split.')
        split = gr.update(choices=split, value=split[0] if split else None)
        if config is None:
            # check_dataset reports no config when the split is wrong; keep the
            # user's (valid) selection instead of clearing the dropdown.
            config = dataset_config
    elif config is None or split is None:
        # check_dataset could not recognise the type of the loaded dataset.
        gr.Warning(f'Dataset "{dataset_id}" could not be loaded as a supported dataset type.')
        config, split = dataset_config, dataset_split
    else:
        dataset_ok = True

    if not dataset_ok:
        return _failure_outputs(config, split)

    # TODO: Validate column mapping by running once
    prediction_result = None
    id2label_df = None
    if isinstance(ppl, TextClassificationPipeline):
        try:
            column_mapping = json.loads(column_mapping)
        except Exception:
            # Empty or malformed JSON: start from an empty mapping.
            column_mapping = {}

        # Presumably returns the adjusted mapping, a sample prediction and a
        # label mapping table, with None marking a failed step - confirm
        # against text_classification.py.
        column_mapping, prediction_result, id2label_df = \
            text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, split)

        column_mapping = json.dumps(column_mapping, indent=2)

    # Drop the reference to the loaded pipeline as soon as it is not needed.
    del ppl

    column_update = gr.update(value=column_mapping, visible=True, interactive=True)

    if prediction_result is None:
        gr.Warning('The model failed to predict with the first row in the dataset. Please provide column mappings in "Advance" settings.')
        return _failure_outputs(config, split, column_update=column_update)

    prediction_update = gr.update(value=prediction_result, visible=True)

    if id2label_df is None:
        gr.Warning('The prediction result does not conform to the labels in the dataset. Please provide label mappings in "Advance" settings.')
        return _failure_outputs(
            config,
            split,
            prediction_update=prediction_update,
            column_update=column_update,
        )

    gr.Info("Model and dataset validations passed. You can submit the evaluation task.")

    return (
        config,
        split,
        gr.update(interactive=True),  # Submit button
        prediction_update,  # Model prediction preview
        gr.update(value=id2label_df, visible=True),  # Label mapping preview
        column_update,  # Column mapping
    )
def try_submit(m_id, d_id, config, split, local):
    """Run the evaluation CLI in a child process and wait for it to finish.

    Nothing happens unless *local* is truthy. The child process runs
    ``cli.py`` from the ``cicd`` directory located next to this file.

    Args:
        m_id: Hugging Face model id.
        d_id: Hugging Face dataset id.
        config: Dataset config (subset) name.
        split: Dataset split name.
        local: Whether to run the evaluation in this process' environment.

    Returns:
        None.
    """
    if not local:
        return

    hf_token = os.environ.get(HF_WRITE_TOKEN)
    discussion_repo = os.environ.get(HF_REPO_ID) or os.environ.get(HF_SPACE_ID)
    if not hf_token or not discussion_repo:
        # Popen raises TypeError on a None argument, so report the missing
        # configuration explicitly instead of crashing the handler.
        message = (
            f"Cannot submit the evaluation: environment variable {HF_WRITE_TOKEN} "
            f"and one of {HF_REPO_ID} / {HF_SPACE_ID} must be set."
        )
        logging.error(message)
        gr.Warning(message)
        return

    # NOTE(review): the token is passed as a command-line argument, which can
    # be visible to other processes on the same host; consider passing it via
    # the environment if cli.py supports that.
    command = [
        "python",
        "cli.py",
        "--loader", "huggingface",
        "--model", m_id,
        "--dataset", d_id,
        "--dataset_config", config,
        "--dataset_split", split,
        "--hf_token", hf_token,
        "--discussion_repo", discussion_repo,
        "--output_format", "markdown",
        "--output_portal", "huggingface",
    ]

    eval_str = f"[{m_id}]<{d_id}({config}, {split} set)>"
    start = time.time()
    logging.info("Start local evaluation on %s", eval_str)

    evaluator = subprocess.Popen(
        command,
        cwd=os.path.join(os.path.dirname(os.path.realpath(__file__)), "cicd"),
        stderr=subprocess.STDOUT,
    )
    result = evaluator.wait()

    logging.info(
        "Finished local evaluation exit code %s on %s: %.2fs",
        result,
        eval_str,
        time.time() - start,
    )
# Gradio UI: model and dataset inputs, an "Advance" accordion with extra
# settings, and two buttons wired to try_validate and try_submit.
# NOTE(review): indentation is missing in this copy of the file, so the
# intended nesting of the `with` contexts below must be confirmed against the
# original before this block can run.
with gr.Blocks(theme=theme) as iface:
with gr.Row():
with gr.Column():
# Model id typed by the user; first input of both click handlers.
model_id_input = gr.Textbox(
label="Hugging Face model id",
placeholder="cardiffnlp/twitter-roberta-base-sentiment-latest",
)
# TODO: Add supported model pairs: Text Classification - text-classification
# NOTE(review): model_type is not listed in the inputs of either click
# handler below, so its value is currently unused in the visible code.
model_type = gr.Dropdown(
label="Hugging Face model type",
choices=[
("Auto-detect", 0),
("Text Classification", 1),
],
value=0,
)
# Hidden at start; try_validate makes it visible with a sample prediction.
example_labels = gr.Label(label='Model pipeline test prediction result', visible=False)
with gr.Column():
dataset_id_input = gr.Textbox(
label="Hugging Face dataset id",
placeholder="tweet_eval",
)
# Config and split dropdowns accept custom values; try_validate replaces
# their choices when the entered value does not exist for the dataset.
dataset_config_input = gr.Dropdown(
label="Hugging Face dataset subset",
choices=[
"default",
],
allow_custom_value=True,
value="default",
)
dataset_split_input = gr.Dropdown(
label="Hugging Face dataset split",
choices=[
"test",
],
allow_custom_value=True,
value="test",
)
# Hidden at start; try_validate makes it visible with the label mapping.
id2label_mapping_dataframe = gr.DataFrame(visible=False)
with gr.Row():
with gr.Accordion("Advance", open=False):
# Passed to try_submit as `local`; try_submit only acts when it is truthy.
run_local = gr.Checkbox(value=True, label="Run in this Space")
# Free-text JSON that try_validate parses with json.loads (falling back to
# an empty mapping on parse errors).
# NOTE(review): the placeholder example uses integer keys ({0: ..., 1: ...}),
# which json.loads rejects; JSON object keys must be strings.
column_mapping_input = gr.Textbox(
value="",
lines=5,
label="Column mapping",
placeholder="Description of mapping of columns in model to dataset, in json format, e.g.:\n"
'{\n'
' "text": "context",\n'
' "label": {0: "Positive", 1: "Negative"}\n'
'}',
)
with gr.Row():
validate_btn = gr.Button("Validate model and dataset", variant="primary")
# Starts disabled; try_validate enables it once all validations pass.
run_btn = gr.Button(
"Submit evaluation task",
variant="primary",
interactive=False,
)
# The order of `outputs` must match the tuple returned by try_validate.
validate_btn.click(
try_validate,
inputs=[
model_id_input,
dataset_id_input,
dataset_config_input,
dataset_split_input,
column_mapping_input,
],
outputs=[
dataset_config_input,
dataset_split_input,
run_btn,
example_labels,
id2label_mapping_dataframe,
column_mapping_input,
],
)
# No outputs are bound: try_submit returns nothing to the UI.
run_btn.click(
try_submit,
inputs=[
model_id_input,
dataset_id_input,
dataset_config_input,
dataset_split_input,
run_local,
],
)
# Enable the request queue with at most 20 pending events, then start the app.
iface.queue(max_size=20)
iface.launch()