lewtun HF staff committed on
Commit
daea199
·
1 Parent(s): 9eeb453

Archive project

Browse files
Files changed (1) hide show
  1. app.py +548 -545
app.py CHANGED
@@ -131,6 +131,9 @@ SUPPORTED_METRICS = [
131
  # APP #
132
  #######
133
  st.title("Evaluation on the Hub")
 
 
 
134
  st.markdown(
135
  """
136
  Welcome to Hugging Face's automatic model evaluator 👋!
@@ -146,548 +149,548 @@ st.markdown(
146
  """
147
  )
148
 
149
- all_datasets = [d.id for d in list_datasets()]
150
- query_params = st.experimental_get_query_params()
151
- if "first_query_params" not in st.session_state:
152
- st.session_state.first_query_params = query_params
153
- first_query_params = st.session_state.first_query_params
154
- default_dataset = all_datasets[0]
155
- if "dataset" in first_query_params:
156
- if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
157
- default_dataset = first_query_params["dataset"][0]
158
-
159
- selected_dataset = st.selectbox(
160
- "Select a dataset",
161
- all_datasets,
162
- index=all_datasets.index(default_dataset),
163
- help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
164
- new metadata to a dataset card.""",
165
- )
166
- st.experimental_set_query_params(**{"dataset": [selected_dataset]})
167
-
168
- # Check if selected dataset can be streamed
169
- is_valid_dataset = http_get(
170
- path="/is-valid",
171
- domain=DATASETS_PREVIEW_API,
172
- params={"dataset": selected_dataset},
173
- ).json()
174
- if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
175
- st.error(
176
- """The dataset you selected is not currently supported. Open a \
177
- [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
178
- )
179
-
180
- metadata = get_metadata(selected_dataset, token=HF_TOKEN)
181
- print(f"INFO -- Dataset metadata: {metadata}")
182
- if metadata is None:
183
- st.warning("No evaluation metadata found. Please configure the evaluation job below.")
184
-
185
- with st.expander("Advanced configuration"):
186
- # Select task
187
- selected_task = st.selectbox(
188
- "Select a task",
189
- SUPPORTED_TASKS,
190
- index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
191
- help="""Don't see your favourite task here? Open a \
192
- [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
193
- )
194
- # Select config
195
- configs = get_dataset_config_names(selected_dataset)
196
- selected_config = st.selectbox(
197
- "Select a config",
198
- configs,
199
- help="""Some datasets contain several sub-datasets, known as _configurations_. \
200
- Select one to evaluate your models on. \
201
- See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
202
- """,
203
- )
204
- # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
205
- config_metadata = get_config_metadata(selected_config, metadata)
206
- print(f"INFO -- Config metadata: {config_metadata}")
207
-
208
- # Select splits
209
- splits_resp = http_get(
210
- path="/splits",
211
- domain=DATASETS_PREVIEW_API,
212
- params={"dataset": selected_dataset},
213
- )
214
- if splits_resp.status_code == 200:
215
- split_names = []
216
- all_splits = splits_resp.json()
217
- for split in all_splits["splits"]:
218
- if split["config"] == selected_config:
219
- split_names.append(split["split"])
220
-
221
- if config_metadata is not None:
222
- eval_split = config_metadata["splits"].get("eval_split", None)
223
- else:
224
- eval_split = None
225
- selected_split = st.selectbox(
226
- "Select a split",
227
- split_names,
228
- index=split_names.index(eval_split) if eval_split is not None else 0,
229
- help="Be wary when evaluating models on the `train` split.",
230
- )
231
-
232
- # Select columns
233
- rows_resp = http_get(
234
- path="/first-rows",
235
- domain=DATASETS_PREVIEW_API,
236
- params={
237
- "dataset": selected_dataset,
238
- "config": selected_config,
239
- "split": selected_split,
240
- },
241
- ).json()
242
- col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
243
-
244
- st.markdown("**Map your dataset columns**")
245
- st.markdown(
246
- """The model evaluator uses a standardised set of column names for the input examples and labels. \
247
- Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
248
- )
249
- col1, col2 = st.columns(2)
250
-
251
- # TODO: find a better way to layout these items
252
- # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
253
- col_mapping = {}
254
- if selected_task in ["binary_classification", "multi_class_classification"]:
255
- with col1:
256
- st.markdown("`text` column")
257
- st.text("")
258
- st.text("")
259
- st.text("")
260
- st.text("")
261
- st.markdown("`target` column")
262
- with col2:
263
- text_col = st.selectbox(
264
- "This column should contain the text to be classified",
265
- col_names,
266
- index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
267
- if config_metadata is not None
268
- else 0,
269
- )
270
- target_col = st.selectbox(
271
- "This column should contain the labels associated with the text",
272
- col_names,
273
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
274
- if config_metadata is not None
275
- else 0,
276
- )
277
- col_mapping[text_col] = "text"
278
- col_mapping[target_col] = "target"
279
-
280
- elif selected_task == "text_zero_shot_classification":
281
- with col1:
282
- st.markdown("`text` column")
283
- st.text("")
284
- st.text("")
285
- st.text("")
286
- st.text("")
287
- st.markdown("`classes` column")
288
- st.text("")
289
- st.text("")
290
- st.text("")
291
- st.text("")
292
- st.markdown("`target` column")
293
- with col2:
294
- text_col = st.selectbox(
295
- "This column should contain the text to be classified",
296
- col_names,
297
- index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
298
- if config_metadata is not None
299
- else 0,
300
- )
301
- classes_col = st.selectbox(
302
- "This column should contain the classes associated with the text",
303
- col_names,
304
- index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
305
- if config_metadata is not None
306
- else 0,
307
- )
308
- target_col = st.selectbox(
309
- "This column should contain the index of the correct class",
310
- col_names,
311
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
312
- if config_metadata is not None
313
- else 0,
314
- )
315
- col_mapping[text_col] = "text"
316
- col_mapping[classes_col] = "classes"
317
- col_mapping[target_col] = "target"
318
-
319
- if selected_task in ["natural_language_inference"]:
320
- config_metadata = get_config_metadata(selected_config, metadata)
321
- with col1:
322
- st.markdown("`text1` column")
323
- st.text("")
324
- st.text("")
325
- st.text("")
326
- st.text("")
327
- st.text("")
328
- st.markdown("`text2` column")
329
- st.text("")
330
- st.text("")
331
- st.text("")
332
- st.text("")
333
- st.text("")
334
- st.markdown("`target` column")
335
- with col2:
336
- text1_col = st.selectbox(
337
- "This column should contain the first text passage to be classified",
338
- col_names,
339
- index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
340
- if config_metadata is not None
341
- else 0,
342
- )
343
- text2_col = st.selectbox(
344
- "This column should contain the second text passage to be classified",
345
- col_names,
346
- index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
347
- if config_metadata is not None
348
- else 0,
349
- )
350
- target_col = st.selectbox(
351
- "This column should contain the labels associated with the text",
352
- col_names,
353
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
354
- if config_metadata is not None
355
- else 0,
356
- )
357
- col_mapping[text1_col] = "text1"
358
- col_mapping[text2_col] = "text2"
359
- col_mapping[target_col] = "target"
360
-
361
- elif selected_task == "entity_extraction":
362
- with col1:
363
- st.markdown("`tokens` column")
364
- st.text("")
365
- st.text("")
366
- st.text("")
367
- st.text("")
368
- st.markdown("`tags` column")
369
- with col2:
370
- tokens_col = st.selectbox(
371
- "This column should contain the array of tokens to be classified",
372
- col_names,
373
- index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
374
- if config_metadata is not None
375
- else 0,
376
- )
377
- tags_col = st.selectbox(
378
- "This column should contain the labels associated with each part of the text",
379
- col_names,
380
- index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
381
- if config_metadata is not None
382
- else 0,
383
- )
384
- col_mapping[tokens_col] = "tokens"
385
- col_mapping[tags_col] = "tags"
386
-
387
- elif selected_task == "translation":
388
- with col1:
389
- st.markdown("`source` column")
390
- st.text("")
391
- st.text("")
392
- st.text("")
393
- st.text("")
394
- st.markdown("`target` column")
395
- with col2:
396
- text_col = st.selectbox(
397
- "This column should contain the text to be translated",
398
- col_names,
399
- index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
400
- if config_metadata is not None
401
- else 0,
402
- )
403
- target_col = st.selectbox(
404
- "This column should contain the target translation",
405
- col_names,
406
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
407
- if config_metadata is not None
408
- else 0,
409
- )
410
- col_mapping[text_col] = "source"
411
- col_mapping[target_col] = "target"
412
-
413
- elif selected_task == "summarization":
414
- with col1:
415
- st.markdown("`text` column")
416
- st.text("")
417
- st.text("")
418
- st.text("")
419
- st.text("")
420
- st.markdown("`target` column")
421
- with col2:
422
- text_col = st.selectbox(
423
- "This column should contain the text to be summarized",
424
- col_names,
425
- index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
426
- if config_metadata is not None
427
- else 0,
428
- )
429
- target_col = st.selectbox(
430
- "This column should contain the target summary",
431
- col_names,
432
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
433
- if config_metadata is not None
434
- else 0,
435
- )
436
- col_mapping[text_col] = "text"
437
- col_mapping[target_col] = "target"
438
-
439
- elif selected_task == "extractive_question_answering":
440
- if config_metadata is not None:
441
- col_mapping = config_metadata["col_mapping"]
442
- # Hub YAML parser converts periods to hyphens, so we remap them here
443
- col_mapping = format_col_mapping(col_mapping)
444
- with col1:
445
- st.markdown("`context` column")
446
- st.text("")
447
- st.text("")
448
- st.text("")
449
- st.text("")
450
- st.markdown("`question` column")
451
- st.text("")
452
- st.text("")
453
- st.text("")
454
- st.text("")
455
- st.markdown("`answers.text` column")
456
- st.text("")
457
- st.text("")
458
- st.text("")
459
- st.text("")
460
- st.markdown("`answers.answer_start` column")
461
- with col2:
462
- context_col = st.selectbox(
463
- "This column should contain the question's context",
464
- col_names,
465
- index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
466
- )
467
- question_col = st.selectbox(
468
- "This column should contain the question to be answered, given the context",
469
- col_names,
470
- index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
471
- )
472
- answers_text_col = st.selectbox(
473
- "This column should contain example answers to the question, extracted from the context",
474
- col_names,
475
- index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
476
- )
477
- answers_start_col = st.selectbox(
478
- "This column should contain the indices in the context of the first character of each `answers.text`",
479
- col_names,
480
- index=col_names.index(get_key(col_mapping, "answers.answer_start"))
481
- if config_metadata is not None
482
- else 0,
483
- )
484
- col_mapping[context_col] = "context"
485
- col_mapping[question_col] = "question"
486
- col_mapping[answers_text_col] = "answers.text"
487
- col_mapping[answers_start_col] = "answers.answer_start"
488
- elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
489
- with col1:
490
- st.markdown("`image` column")
491
- st.text("")
492
- st.text("")
493
- st.text("")
494
- st.text("")
495
- st.markdown("`target` column")
496
- with col2:
497
- image_col = st.selectbox(
498
- "This column should contain the images to be classified",
499
- col_names,
500
- index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
501
- if config_metadata is not None
502
- else 0,
503
- )
504
- target_col = st.selectbox(
505
- "This column should contain the labels associated with the images",
506
- col_names,
507
- index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
508
- if config_metadata is not None
509
- else 0,
510
- )
511
- col_mapping[image_col] = "image"
512
- col_mapping[target_col] = "target"
513
-
514
- # Select metrics
515
- st.markdown("**Select metrics**")
516
- st.markdown("The following metrics will be computed")
517
- html_string = " ".join(
518
- [
519
- '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
520
- + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
521
- + 'padding-left:5px;color:white">'
522
- + metric
523
- + "</div></div>"
524
- for metric in TASK_TO_DEFAULT_METRICS[selected_task]
525
- ]
526
- )
527
- st.markdown(html_string, unsafe_allow_html=True)
528
- selected_metrics = st.multiselect(
529
- "(Optional) Select additional metrics",
530
- sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
531
- help="""User-selected metrics will be computed with their default arguments. \
532
- For example, `f1` will report results for binary labels. \
533
- Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
534
- )
535
-
536
- with st.form(key="form"):
537
- compatible_models = get_compatible_models(selected_task, [selected_dataset])
538
- selected_models = st.multiselect(
539
- "Select the models you wish to evaluate",
540
- compatible_models,
541
- help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
542
- [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
543
- )
544
- print("INFO -- Selected models before filter:", selected_models)
545
-
546
- hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
547
-
548
- submit_button = st.form_submit_button("Evaluate models 🚀")
549
-
550
- if submit_button:
551
- if len(hf_username) == 0:
552
- st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
553
- elif len(selected_models) == 0:
554
- st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
555
- elif len(selected_models) > 10:
556
- st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
557
- else:
558
- # Filter out previously evaluated models
559
- selected_models = filter_evaluated_models(
560
- selected_models,
561
- selected_task,
562
- selected_dataset,
563
- selected_config,
564
- selected_split,
565
- selected_metrics,
566
- )
567
- print("INFO -- Selected models after filter:", selected_models)
568
- if len(selected_models) > 0:
569
- project_payload = {
570
- "username": AUTOTRAIN_USERNAME,
571
- "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
572
- "task": TASK_TO_ID[selected_task],
573
- "config": {
574
- "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
575
- if selected_task in AUTOTRAIN_TASK_TO_LANG
576
- else "en",
577
- "max_models": 5,
578
- "instance": {
579
- "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
580
- "instance_type": AUTOTRAIN_MACHINE[selected_task]
581
- if selected_task in AUTOTRAIN_MACHINE.keys()
582
- else "p3",
583
- "max_runtime_seconds": 172800,
584
- "num_instances": 1,
585
- "disk_size_gb": 200,
586
- },
587
- "evaluation": {
588
- "metrics": selected_metrics,
589
- "models": selected_models,
590
- "hf_username": hf_username,
591
- },
592
- },
593
- }
594
- print(f"INFO -- Payload: {project_payload}")
595
- project_json_resp = http_post(
596
- path="/projects/create",
597
- payload=project_payload,
598
- token=HF_TOKEN,
599
- domain=AUTOTRAIN_BACKEND_API,
600
- ).json()
601
- print(f"INFO -- Project creation response: {project_json_resp}")
602
-
603
- if project_json_resp["created"]:
604
- data_payload = {
605
- "split": 4, # use "auto" split choice in AutoTrain
606
- "col_mapping": col_mapping,
607
- "load_config": {"max_size_bytes": 0, "shuffle": False},
608
- "dataset_id": selected_dataset,
609
- "dataset_config": selected_config,
610
- "dataset_split": selected_split,
611
- }
612
- data_json_resp = http_post(
613
- path=f"/projects/{project_json_resp['id']}/data/dataset",
614
- payload=data_payload,
615
- token=HF_TOKEN,
616
- domain=AUTOTRAIN_BACKEND_API,
617
- ).json()
618
- print(f"INFO -- Dataset creation response: {data_json_resp}")
619
- if data_json_resp["download_status"] == 1:
620
- train_json_resp = http_post(
621
- path=f"/projects/{project_json_resp['id']}/data/start_processing",
622
- token=HF_TOKEN,
623
- domain=AUTOTRAIN_BACKEND_API,
624
- ).json()
625
- # For local development we process and approve projects on-the-fly
626
- if "localhost" in AUTOTRAIN_BACKEND_API:
627
- with st.spinner("⏳ Waiting for data processing to complete ..."):
628
- is_data_processing_success = False
629
- while is_data_processing_success is not True:
630
- project_status = http_get(
631
- path=f"/projects/{project_json_resp['id']}",
632
- token=HF_TOKEN,
633
- domain=AUTOTRAIN_BACKEND_API,
634
- ).json()
635
- if project_status["status"] == 3:
636
- is_data_processing_success = True
637
- time.sleep(10)
638
-
639
- # Approve training job
640
- train_job_resp = http_post(
641
- path=f"/projects/{project_json_resp['id']}/start_training",
642
- token=HF_TOKEN,
643
- domain=AUTOTRAIN_BACKEND_API,
644
- ).json()
645
- st.success("✅ Data processing and project approval complete - go forth and evaluate!")
646
- else:
647
- # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
648
- print(f"INFO -- AutoTrain job response: {train_json_resp}")
649
- if train_json_resp["success"]:
650
- train_eval_index = {
651
- "train-eval-index": [
652
- {
653
- "config": selected_config,
654
- "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
655
- "task_id": selected_task,
656
- "splits": {"eval_split": selected_split},
657
- "col_mapping": col_mapping,
658
- }
659
- ]
660
- }
661
- selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
662
- dataset_card_url = get_dataset_card_url(selected_dataset)
663
- st.success("✅ Successfully submitted evaluation job!")
664
- st.markdown(
665
- f"""
666
- Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
667
-
668
- * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
669
- * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
670
- * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
671
- """ # noqa
672
- )
673
- st.markdown(
674
- f"""
675
- ```yaml
676
- {selected_metadata}
677
- """
678
- )
679
- print("INFO -- Pushing evaluation job logs to the Hub")
680
- evaluation_log = {}
681
- evaluation_log["project_id"] = project_json_resp["id"]
682
- evaluation_log["autotrain_env"] = (
683
- "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
684
- )
685
- evaluation_log["payload"] = project_payload
686
- evaluation_log["project_creation_response"] = project_json_resp
687
- evaluation_log["dataset_creation_response"] = data_json_resp
688
- evaluation_log["autotrain_job_response"] = train_json_resp
689
- commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
690
- else:
691
- st.error("🙈 Oh no, there was an error submitting your evaluation job!")
692
- else:
693
- st.warning("⚠️ No models left to evaluate! Please select other models and try again.")
 
131
  # APP #
132
  #######
133
  st.title("Evaluation on the Hub")
134
+ st.warning(
135
+ "**⚠️ This project has been archived. If you want to evaluate LLMs, checkout [this collection](https://huggingface.co/collections/clefourrier/llm-leaderboards-and-benchmarks-✨-64f99d2e11e92ca5568a7cce) of leaderboards.**"
136
+ )
137
  st.markdown(
138
  """
139
  Welcome to Hugging Face's automatic model evaluator 👋!
 
149
  """
150
  )
151
 
152
+ # all_datasets = [d.id for d in list_datasets()]
153
+ # query_params = st.experimental_get_query_params()
154
+ # if "first_query_params" not in st.session_state:
155
+ # st.session_state.first_query_params = query_params
156
+ # first_query_params = st.session_state.first_query_params
157
+ # default_dataset = all_datasets[0]
158
+ # if "dataset" in first_query_params:
159
+ # if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
160
+ # default_dataset = first_query_params["dataset"][0]
161
+
162
+ # selected_dataset = st.selectbox(
163
+ # "Select a dataset",
164
+ # all_datasets,
165
+ # index=all_datasets.index(default_dataset),
166
+ # help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
167
+ # new metadata to a dataset card.""",
168
+ # )
169
+ # st.experimental_set_query_params(**{"dataset": [selected_dataset]})
170
+
171
+ # # Check if selected dataset can be streamed
172
+ # is_valid_dataset = http_get(
173
+ # path="/is-valid",
174
+ # domain=DATASETS_PREVIEW_API,
175
+ # params={"dataset": selected_dataset},
176
+ # ).json()
177
+ # if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
178
+ # st.error(
179
+ # """The dataset you selected is not currently supported. Open a \
180
+ # [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
181
+ # )
182
+
183
+ # metadata = get_metadata(selected_dataset, token=HF_TOKEN)
184
+ # print(f"INFO -- Dataset metadata: {metadata}")
185
+ # if metadata is None:
186
+ # st.warning("No evaluation metadata found. Please configure the evaluation job below.")
187
+
188
+ # with st.expander("Advanced configuration"):
189
+ # # Select task
190
+ # selected_task = st.selectbox(
191
+ # "Select a task",
192
+ # SUPPORTED_TASKS,
193
+ # index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
194
+ # help="""Don't see your favourite task here? Open a \
195
+ # [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
196
+ # )
197
+ # # Select config
198
+ # configs = get_dataset_config_names(selected_dataset)
199
+ # selected_config = st.selectbox(
200
+ # "Select a config",
201
+ # configs,
202
+ # help="""Some datasets contain several sub-datasets, known as _configurations_. \
203
+ # Select one to evaluate your models on. \
204
+ # See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
205
+ # """,
206
+ # )
207
+ # # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
208
+ # config_metadata = get_config_metadata(selected_config, metadata)
209
+ # print(f"INFO -- Config metadata: {config_metadata}")
210
+
211
+ # # Select splits
212
+ # splits_resp = http_get(
213
+ # path="/splits",
214
+ # domain=DATASETS_PREVIEW_API,
215
+ # params={"dataset": selected_dataset},
216
+ # )
217
+ # if splits_resp.status_code == 200:
218
+ # split_names = []
219
+ # all_splits = splits_resp.json()
220
+ # for split in all_splits["splits"]:
221
+ # if split["config"] == selected_config:
222
+ # split_names.append(split["split"])
223
+
224
+ # if config_metadata is not None:
225
+ # eval_split = config_metadata["splits"].get("eval_split", None)
226
+ # else:
227
+ # eval_split = None
228
+ # selected_split = st.selectbox(
229
+ # "Select a split",
230
+ # split_names,
231
+ # index=split_names.index(eval_split) if eval_split is not None else 0,
232
+ # help="Be wary when evaluating models on the `train` split.",
233
+ # )
234
+
235
+ # # Select columns
236
+ # rows_resp = http_get(
237
+ # path="/first-rows",
238
+ # domain=DATASETS_PREVIEW_API,
239
+ # params={
240
+ # "dataset": selected_dataset,
241
+ # "config": selected_config,
242
+ # "split": selected_split,
243
+ # },
244
+ # ).json()
245
+ # col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
246
+
247
+ # st.markdown("**Map your dataset columns**")
248
+ # st.markdown(
249
+ # """The model evaluator uses a standardised set of column names for the input examples and labels. \
250
+ # Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
251
+ # )
252
+ # col1, col2 = st.columns(2)
253
+
254
+ # # TODO: find a better way to layout these items
255
+ # # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
256
+ # col_mapping = {}
257
+ # if selected_task in ["binary_classification", "multi_class_classification"]:
258
+ # with col1:
259
+ # st.markdown("`text` column")
260
+ # st.text("")
261
+ # st.text("")
262
+ # st.text("")
263
+ # st.text("")
264
+ # st.markdown("`target` column")
265
+ # with col2:
266
+ # text_col = st.selectbox(
267
+ # "This column should contain the text to be classified",
268
+ # col_names,
269
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
270
+ # if config_metadata is not None
271
+ # else 0,
272
+ # )
273
+ # target_col = st.selectbox(
274
+ # "This column should contain the labels associated with the text",
275
+ # col_names,
276
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
277
+ # if config_metadata is not None
278
+ # else 0,
279
+ # )
280
+ # col_mapping[text_col] = "text"
281
+ # col_mapping[target_col] = "target"
282
+
283
+ # elif selected_task == "text_zero_shot_classification":
284
+ # with col1:
285
+ # st.markdown("`text` column")
286
+ # st.text("")
287
+ # st.text("")
288
+ # st.text("")
289
+ # st.text("")
290
+ # st.markdown("`classes` column")
291
+ # st.text("")
292
+ # st.text("")
293
+ # st.text("")
294
+ # st.text("")
295
+ # st.markdown("`target` column")
296
+ # with col2:
297
+ # text_col = st.selectbox(
298
+ # "This column should contain the text to be classified",
299
+ # col_names,
300
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
301
+ # if config_metadata is not None
302
+ # else 0,
303
+ # )
304
+ # classes_col = st.selectbox(
305
+ # "This column should contain the classes associated with the text",
306
+ # col_names,
307
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
308
+ # if config_metadata is not None
309
+ # else 0,
310
+ # )
311
+ # target_col = st.selectbox(
312
+ # "This column should contain the index of the correct class",
313
+ # col_names,
314
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
315
+ # if config_metadata is not None
316
+ # else 0,
317
+ # )
318
+ # col_mapping[text_col] = "text"
319
+ # col_mapping[classes_col] = "classes"
320
+ # col_mapping[target_col] = "target"
321
+
322
+ # if selected_task in ["natural_language_inference"]:
323
+ # config_metadata = get_config_metadata(selected_config, metadata)
324
+ # with col1:
325
+ # st.markdown("`text1` column")
326
+ # st.text("")
327
+ # st.text("")
328
+ # st.text("")
329
+ # st.text("")
330
+ # st.text("")
331
+ # st.markdown("`text2` column")
332
+ # st.text("")
333
+ # st.text("")
334
+ # st.text("")
335
+ # st.text("")
336
+ # st.text("")
337
+ # st.markdown("`target` column")
338
+ # with col2:
339
+ # text1_col = st.selectbox(
340
+ # "This column should contain the first text passage to be classified",
341
+ # col_names,
342
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
343
+ # if config_metadata is not None
344
+ # else 0,
345
+ # )
346
+ # text2_col = st.selectbox(
347
+ # "This column should contain the second text passage to be classified",
348
+ # col_names,
349
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
350
+ # if config_metadata is not None
351
+ # else 0,
352
+ # )
353
+ # target_col = st.selectbox(
354
+ # "This column should contain the labels associated with the text",
355
+ # col_names,
356
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
357
+ # if config_metadata is not None
358
+ # else 0,
359
+ # )
360
+ # col_mapping[text1_col] = "text1"
361
+ # col_mapping[text2_col] = "text2"
362
+ # col_mapping[target_col] = "target"
363
+
364
+ # elif selected_task == "entity_extraction":
365
+ # with col1:
366
+ # st.markdown("`tokens` column")
367
+ # st.text("")
368
+ # st.text("")
369
+ # st.text("")
370
+ # st.text("")
371
+ # st.markdown("`tags` column")
372
+ # with col2:
373
+ # tokens_col = st.selectbox(
374
+ # "This column should contain the array of tokens to be classified",
375
+ # col_names,
376
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
377
+ # if config_metadata is not None
378
+ # else 0,
379
+ # )
380
+ # tags_col = st.selectbox(
381
+ # "This column should contain the labels associated with each part of the text",
382
+ # col_names,
383
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
384
+ # if config_metadata is not None
385
+ # else 0,
386
+ # )
387
+ # col_mapping[tokens_col] = "tokens"
388
+ # col_mapping[tags_col] = "tags"
389
+
390
+ # elif selected_task == "translation":
391
+ # with col1:
392
+ # st.markdown("`source` column")
393
+ # st.text("")
394
+ # st.text("")
395
+ # st.text("")
396
+ # st.text("")
397
+ # st.markdown("`target` column")
398
+ # with col2:
399
+ # text_col = st.selectbox(
400
+ # "This column should contain the text to be translated",
401
+ # col_names,
402
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
403
+ # if config_metadata is not None
404
+ # else 0,
405
+ # )
406
+ # target_col = st.selectbox(
407
+ # "This column should contain the target translation",
408
+ # col_names,
409
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
410
+ # if config_metadata is not None
411
+ # else 0,
412
+ # )
413
+ # col_mapping[text_col] = "source"
414
+ # col_mapping[target_col] = "target"
415
+
416
+ # elif selected_task == "summarization":
417
+ # with col1:
418
+ # st.markdown("`text` column")
419
+ # st.text("")
420
+ # st.text("")
421
+ # st.text("")
422
+ # st.text("")
423
+ # st.markdown("`target` column")
424
+ # with col2:
425
+ # text_col = st.selectbox(
426
+ # "This column should contain the text to be summarized",
427
+ # col_names,
428
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
429
+ # if config_metadata is not None
430
+ # else 0,
431
+ # )
432
+ # target_col = st.selectbox(
433
+ # "This column should contain the target summary",
434
+ # col_names,
435
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
436
+ # if config_metadata is not None
437
+ # else 0,
438
+ # )
439
+ # col_mapping[text_col] = "text"
440
+ # col_mapping[target_col] = "target"
441
+
442
+ # elif selected_task == "extractive_question_answering":
443
+ # if config_metadata is not None:
444
+ # col_mapping = config_metadata["col_mapping"]
445
+ # # Hub YAML parser converts periods to hyphens, so we remap them here
446
+ # col_mapping = format_col_mapping(col_mapping)
447
+ # with col1:
448
+ # st.markdown("`context` column")
449
+ # st.text("")
450
+ # st.text("")
451
+ # st.text("")
452
+ # st.text("")
453
+ # st.markdown("`question` column")
454
+ # st.text("")
455
+ # st.text("")
456
+ # st.text("")
457
+ # st.text("")
458
+ # st.markdown("`answers.text` column")
459
+ # st.text("")
460
+ # st.text("")
461
+ # st.text("")
462
+ # st.text("")
463
+ # st.markdown("`answers.answer_start` column")
464
+ # with col2:
465
+ # context_col = st.selectbox(
466
+ # "This column should contain the question's context",
467
+ # col_names,
468
+ # index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
469
+ # )
470
+ # question_col = st.selectbox(
471
+ # "This column should contain the question to be answered, given the context",
472
+ # col_names,
473
+ # index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
474
+ # )
475
+ # answers_text_col = st.selectbox(
476
+ # "This column should contain example answers to the question, extracted from the context",
477
+ # col_names,
478
+ # index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
479
+ # )
480
+ # answers_start_col = st.selectbox(
481
+ # "This column should contain the indices in the context of the first character of each `answers.text`",
482
+ # col_names,
483
+ # index=col_names.index(get_key(col_mapping, "answers.answer_start"))
484
+ # if config_metadata is not None
485
+ # else 0,
486
+ # )
487
+ # col_mapping[context_col] = "context"
488
+ # col_mapping[question_col] = "question"
489
+ # col_mapping[answers_text_col] = "answers.text"
490
+ # col_mapping[answers_start_col] = "answers.answer_start"
491
+ # elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
492
+ # with col1:
493
+ # st.markdown("`image` column")
494
+ # st.text("")
495
+ # st.text("")
496
+ # st.text("")
497
+ # st.text("")
498
+ # st.markdown("`target` column")
499
+ # with col2:
500
+ # image_col = st.selectbox(
501
+ # "This column should contain the images to be classified",
502
+ # col_names,
503
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
504
+ # if config_metadata is not None
505
+ # else 0,
506
+ # )
507
+ # target_col = st.selectbox(
508
+ # "This column should contain the labels associated with the images",
509
+ # col_names,
510
+ # index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
511
+ # if config_metadata is not None
512
+ # else 0,
513
+ # )
514
+ # col_mapping[image_col] = "image"
515
+ # col_mapping[target_col] = "target"
516
+
517
+ # # Select metrics
518
+ # st.markdown("**Select metrics**")
519
+ # st.markdown("The following metrics will be computed")
520
+ # html_string = " ".join(
521
+ # [
522
+ # '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
523
+ # + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
524
+ # + 'padding-left:5px;color:white">'
525
+ # + metric
526
+ # + "</div></div>"
527
+ # for metric in TASK_TO_DEFAULT_METRICS[selected_task]
528
+ # ]
529
+ # )
530
+ # st.markdown(html_string, unsafe_allow_html=True)
531
+ # selected_metrics = st.multiselect(
532
+ # "(Optional) Select additional metrics",
533
+ # sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
534
+ # help="""User-selected metrics will be computed with their default arguments. \
535
+ # For example, `f1` will report results for binary labels. \
536
+ # Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
537
+ # )
538
+
539
+ # with st.form(key="form"):
540
+ # compatible_models = get_compatible_models(selected_task, [selected_dataset])
541
+ # selected_models = st.multiselect(
542
+ # "Select the models you wish to evaluate",
543
+ # compatible_models,
544
+ # help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
545
+ # [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
546
+ # )
547
+ # print("INFO -- Selected models before filter:", selected_models)
548
+
549
+ # hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
550
+
551
+ # submit_button = st.form_submit_button("Evaluate models 🚀")
552
+
553
+ # if submit_button:
554
+ # if len(hf_username) == 0:
555
+ # st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
556
+ # elif len(selected_models) == 0:
557
+ # st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
558
+ # elif len(selected_models) > 10:
559
+ # st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
560
+ # else:
561
+ # # Filter out previously evaluated models
562
+ # selected_models = filter_evaluated_models(
563
+ # selected_models,
564
+ # selected_task,
565
+ # selected_dataset,
566
+ # selected_config,
567
+ # selected_split,
568
+ # selected_metrics,
569
+ # )
570
+ # print("INFO -- Selected models after filter:", selected_models)
571
+ # if len(selected_models) > 0:
572
+ # project_payload = {
573
+ # "username": AUTOTRAIN_USERNAME,
574
+ # "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
575
+ # "task": TASK_TO_ID[selected_task],
576
+ # "config": {
577
+ # "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
578
+ # if selected_task in AUTOTRAIN_TASK_TO_LANG
579
+ # else "en",
580
+ # "max_models": 5,
581
+ # "instance": {
582
+ # "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
583
+ # "instance_type": AUTOTRAIN_MACHINE[selected_task]
584
+ # if selected_task in AUTOTRAIN_MACHINE.keys()
585
+ # else "p3",
586
+ # "max_runtime_seconds": 172800,
587
+ # "num_instances": 1,
588
+ # "disk_size_gb": 200,
589
+ # },
590
+ # "evaluation": {
591
+ # "metrics": selected_metrics,
592
+ # "models": selected_models,
593
+ # "hf_username": hf_username,
594
+ # },
595
+ # },
596
+ # }
597
+ # print(f"INFO -- Payload: {project_payload}")
598
+ # project_json_resp = http_post(
599
+ # path="/projects/create",
600
+ # payload=project_payload,
601
+ # token=HF_TOKEN,
602
+ # domain=AUTOTRAIN_BACKEND_API,
603
+ # ).json()
604
+ # print(f"INFO -- Project creation response: {project_json_resp}")
605
+
606
+ # if project_json_resp["created"]:
607
+ # data_payload = {
608
+ # "split": 4, # use "auto" split choice in AutoTrain
609
+ # "col_mapping": col_mapping,
610
+ # "load_config": {"max_size_bytes": 0, "shuffle": False},
611
+ # "dataset_id": selected_dataset,
612
+ # "dataset_config": selected_config,
613
+ # "dataset_split": selected_split,
614
+ # }
615
+ # data_json_resp = http_post(
616
+ # path=f"/projects/{project_json_resp['id']}/data/dataset",
617
+ # payload=data_payload,
618
+ # token=HF_TOKEN,
619
+ # domain=AUTOTRAIN_BACKEND_API,
620
+ # ).json()
621
+ # print(f"INFO -- Dataset creation response: {data_json_resp}")
622
+ # if data_json_resp["download_status"] == 1:
623
+ # train_json_resp = http_post(
624
+ # path=f"/projects/{project_json_resp['id']}/data/start_processing",
625
+ # token=HF_TOKEN,
626
+ # domain=AUTOTRAIN_BACKEND_API,
627
+ # ).json()
628
+ # # For local development we process and approve projects on-the-fly
629
+ # if "localhost" in AUTOTRAIN_BACKEND_API:
630
+ # with st.spinner("⏳ Waiting for data processing to complete ..."):
631
+ # is_data_processing_success = False
632
+ # while is_data_processing_success is not True:
633
+ # project_status = http_get(
634
+ # path=f"/projects/{project_json_resp['id']}",
635
+ # token=HF_TOKEN,
636
+ # domain=AUTOTRAIN_BACKEND_API,
637
+ # ).json()
638
+ # if project_status["status"] == 3:
639
+ # is_data_processing_success = True
640
+ # time.sleep(10)
641
+
642
+ # # Approve training job
643
+ # train_job_resp = http_post(
644
+ # path=f"/projects/{project_json_resp['id']}/start_training",
645
+ # token=HF_TOKEN,
646
+ # domain=AUTOTRAIN_BACKEND_API,
647
+ # ).json()
648
+ # st.success("✅ Data processing and project approval complete - go forth and evaluate!")
649
+ # else:
650
+ # # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
651
+ # print(f"INFO -- AutoTrain job response: {train_json_resp}")
652
+ # if train_json_resp["success"]:
653
+ # train_eval_index = {
654
+ # "train-eval-index": [
655
+ # {
656
+ # "config": selected_config,
657
+ # "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
658
+ # "task_id": selected_task,
659
+ # "splits": {"eval_split": selected_split},
660
+ # "col_mapping": col_mapping,
661
+ # }
662
+ # ]
663
+ # }
664
+ # selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
665
+ # dataset_card_url = get_dataset_card_url(selected_dataset)
666
+ # st.success("✅ Successfully submitted evaluation job!")
667
+ # st.markdown(
668
+ # f"""
669
+ # Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
670
+
671
+ # * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
672
+ # * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
673
+ # * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
674
+ # """ # noqa
675
+ # )
676
+ # st.markdown(
677
+ # f"""
678
+ # ```yaml
679
+ # {selected_metadata}
680
+ # """
681
+ # )
682
+ # print("INFO -- Pushing evaluation job logs to the Hub")
683
+ # evaluation_log = {}
684
+ # evaluation_log["project_id"] = project_json_resp["id"]
685
+ # evaluation_log["autotrain_env"] = (
686
+ # "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
687
+ # )
688
+ # evaluation_log["payload"] = project_payload
689
+ # evaluation_log["project_creation_response"] = project_json_resp
690
+ # evaluation_log["dataset_creation_response"] = data_json_resp
691
+ # evaluation_log["autotrain_job_response"] = train_json_resp
692
+ # commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
693
+ # else:
694
+ # st.error("🙈 Oh no, there was an error submitting your evaluation job!")
695
+ # else:
696
+ # st.warning("⚠️ No models left to evaluate! Please select other models and try again.")