Sasha committed
Commit
4f31875
1 Parent(s): 74e9f8c

catching dataset-specific metrics

Files changed (1):
  1. app.py +62 -58
app.py CHANGED
@@ -31,6 +31,8 @@ tasks= ['classification', 'question answering', 'automatic speech recognition',
     'textual entailment', 'commonsense reasoning', 'summarization']
 metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
 
+dedicated_metric = False
+
 def find_task(dname):
     task = None
     dataset_builder = load_dataset_builder(dataset_name, dataset_config)
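
For context on the new `dedicated_metric` flag added above: a minimal, self-contained sketch of the gating pattern it enables. The `metrics` list is abridged and the dataset name is just an example; this is an illustration, not the app's exact code.

```python
# Sketch only: condensed version of the flag logic this commit adds.
metrics = ['glue', 'super_glue', 'squad', 'wer', 'cer', 'accuracy', 'f1']  # abridged

dataset_name = 'glue'  # example value; in app.py this comes from the UI

# A dataset "has a dedicated metric" when a metric shares its name.
dedicated_metric = dataset_name in metrics

if not dedicated_metric:
    # Only in this case does the app go on to suggest task-specific
    # and label-based metrics (see the hunks below).
    print("No dedicated metric; fall back to task/label-based suggestions.")
```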
@@ -83,28 +85,29 @@ if dataset_name in metrics:
     code = ''' from datasets import load_metric
 metric = load_metric(\"'''+dataset_name+'''\")'''
     st.code(code, language='python')
+    dedicated_metric = True
 else:
     st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
     dedicated_metric = False
 
-st.markdown("### Task-Specific Metrics")
-
-task = find_task(dataset_name)
-
-if task is not None:
-    st.markdown("The task associated to it this dataset is: " + task.replace('-',' '))
-    if task == 'automatic-speech-recognition':
-        st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
-        st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
-        wer_code = '''from datasets import load_metric
-metric = load_metric("wer")'''
-        st.code(wer_code, language='python')
-        st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
-        cer_code = '''from datasets import load_metric
-metric = load_metric("cer")'''
-        st.code(cer_code, language='python')
-    else:
-        st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
+if dedicated_metric == False:
+    st.markdown("### Task-Specific Metrics")
+    task = find_task(dataset_name)
+
+    if task is not None:
+        st.markdown("The task associated to it this dataset is: " + task.replace('-',' '))
+        if task == 'automatic-speech-recognition':
+            st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
+            st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
+            wer_code = '''from datasets import load_metric
+metric = load_metric("wer")'''
+            st.code(wer_code, language='python')
+            st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
+            cer_code = '''from datasets import load_metric
+metric = load_metric("cer")'''
+            st.code(cer_code, language='python')
+        else:
+            st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
 
 
 #print(dataset_builder.info.task_templates)
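
The hunk above only displays snippets for the ASR metrics it recommends. As a quick illustration of how those metrics are actually scored with the same `load_metric` API, here is a hedged usage sketch; the transcripts are toy examples, not from the commit.

```python
# Hedged usage sketch for the WER/CER suggestions above; transcripts are toy data.
from datasets import load_metric

predictions = ["the cat sat on the mat", "hello world"]
references  = ["the cat sat on a mat", "hello word"]

wer = load_metric("wer")  # word error rate: word-level edits / reference word count
cer = load_metric("cer")  # character error rate: same idea at character level

print(wer.compute(predictions=predictions, references=references))
print(cer.compute(predictions=predictions, references=references))
```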
@@ -119,44 +122,45 @@ else:
 #print(dataset_name, dataset_config, dataset_split)
 
 #print(labels.head())
-if dataset_name in ['glue','super_glue', 'paws']:
-    dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
-else:
-    dataset = load_dataset(dataset_name, split=dataset_split)
-
-try:
-    num_classes = dataset_builder.info.features['label'].num_classes
-    labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
-    labels = labels.rename(columns={"count_star()": "count"})
-    labels.index = dataset_builder.info.features['label'].names
-    st.markdown("### Labelled Metrics")
-    st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
-    #TODO : figure out how to make a label plot
-    st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
-    total = sum(c for c in labels['count'])
-    proportion = [c/total for c in labels['count']]
-    #proportion = [0.85, 0.15]
-    stdev_dataset= statistics.stdev(proportion)
-    if stdev_dataset <= balanced_stdev:
-        st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
-        st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
-        accuracy_code = '''from datasets import load_metric
-metric = load_metric("accuracy")'''
-        st.code(accuracy_code, language='python')
-
-    else:
-        st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
-        st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
-        accuracy_code = '''from datasets import load_metric
-metric = load_metric("accuracy")'''
-        st.code(accuracy_code, language='python')
-        st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
-except:
-    if task != 'automatic-speech-recognition':
-        st.markdown("### Unsupervised Metrics")
-        st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
-        st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
-        perplexity_code = '''from datasets import load_metric
-metric = load_metric("perplexity")'''
-        st.code(perplexity_code, language='python')
-        st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
+if dedicated_metric == False:
+    if dataset_name in ['glue','super_glue', 'paws', 'squad_es']:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+    else:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    try:
+        num_classes = dataset_builder.info.features['label'].num_classes
+        labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
+        labels = labels.rename(columns={"count_star()": "count"})
+        labels.index = dataset_builder.info.features['label'].names
+        st.markdown("### Labelled Metrics")
+        st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
+        #TODO : figure out how to make a label plot
+        st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
+        total = sum(c for c in labels['count'])
+        proportion = [c/total for c in labels['count']]
+        #proportion = [0.85, 0.15]
+        stdev_dataset= statistics.stdev(proportion)
+        if stdev_dataset <= balanced_stdev:
+            st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
+            st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
+            accuracy_code = '''from datasets import load_metric
+metric = load_metric("accuracy")'''
+            st.code(accuracy_code, language='python')
+
+        else:
+            st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
+            st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
+            accuracy_code = '''from datasets import load_metric
+metric = load_metric("accuracy")'''
+            st.code(accuracy_code, language='python')
+            st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
+    except:
+        if task != 'automatic-speech-recognition':
+            st.markdown("### Unsupervised Metrics")
+            st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
+            st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
+            perplexity_code = '''from datasets import load_metric
+metric = load_metric("perplexity")'''
+            st.code(perplexity_code, language='python')
+            st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
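
The balance check in this hunk decides between accuracy and F1 from the spread of the label proportions. Here is a minimal standalone sketch of that heuristic; the label counts are hypothetical and the `balanced_stdev` threshold is an assumed value (app.py defines its own elsewhere).

```python
# Sketch of the label-balance heuristic above; counts and threshold are assumptions.
import statistics

balanced_stdev = 0.2       # assumed threshold; not the value from app.py
label_counts = [850, 150]  # hypothetical per-class example counts

total = sum(label_counts)
proportion = [c / total for c in label_counts]  # [0.85, 0.15]
stdev_dataset = statistics.stdev(proportion)    # ~0.49 for this toy case

if stdev_dataset <= balanced_stdev:
    print("Roughly balanced: accuracy is a reasonable first metric.")
else:
    print("Imbalanced: prefer F1, which accounts for precision and recall.")
```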
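
The label metadata that the same hunk relies on can be read without downloading the full dataset. A short hedged example using `load_dataset_builder`; the `glue`/`sst2` pair is just an example configuration, not something this commit touches.

```python
# Hedged example: inspecting label metadata the way app.py does, for an example config.
from datasets import load_dataset_builder

builder = load_dataset_builder("glue", "sst2")  # example dataset/config
label_feature = builder.info.features["label"]  # a ClassLabel feature

print(label_feature.num_classes)  # 2
print(label_feature.names)        # ['negative', 'positive']
```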