Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Files Community

Mariusz Kossakowski committed on Sep 1, 2022

Commit

abb1c69

•

1 Parent(s): 997a159

Black formatting

Browse files

Files changed (4) hide show

clarin_datasets/aspectemo_dataset.py +7 -5
clarin_datasets/kpwr_ner_datasets.py +8 -4
clarin_datasets/nkjp_pos_dataset.py +11 -9
clarin_datasets/punctuation_restoration_dataset.py +5 -6

clarin_datasets/aspectemo_dataset.py CHANGED Viewed

@@ -37,7 +37,7 @@ class AspectEmoDataset(DatasetToShow):
             Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
             'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
             'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
-            """
         ]
     def load_data(self):
@@ -70,7 +70,9 @@ class AspectEmoDataset(DatasetToShow):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
-            selected_subset = st.selectbox(label="Select subset to see", options=self.subsets)
             df_to_show = self.data_dict[selected_subset].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -84,9 +86,9 @@ class AspectEmoDataset(DatasetToShow):
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
-                    .sort_index()
-                    .reset_index()
-                    .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(

             Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
             'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
             'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
+            """,
         ]
     def load_data(self):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
+            selected_subset = st.selectbox(
+                label="Select subset to see", options=self.subsets
+            )
             df_to_show = self.data_dict[selected_subset].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(

clarin_datasets/kpwr_ner_datasets.py CHANGED Viewed

@@ -33,7 +33,7 @@ class KpwrNerDataset(DatasetToShow):
             ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
             ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
             ‘B-nam_loc_gpe_country’, ‘O’]
-            """
         ]
     def load_data(self):
@@ -84,7 +84,9 @@ class KpwrNerDataset(DatasetToShow):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
-            selected_subset = st.selectbox(label="Select subset to see", options=self.subsets)
             df_to_show = self.data_dict[selected_subset].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -128,10 +130,12 @@ class KpwrNerDataset(DatasetToShow):
             full_df_unzipped = full_df_unzipped.loc[
                 (full_df_unzipped["ner"] != "O")
                 & ~(full_df_unzipped["ner"].str.startswith("I-"))
-                ]
             possible_options = sorted(full_df_unzipped["ner"].unique())
             with most_common_tokens:
-                st.header("10 most common tokens from selected class (without 'O' and 'I-*')")
                 selected_class = st.selectbox(
                     label="Select class to show", options=possible_options
                 )

             ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
             ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
             ‘B-nam_loc_gpe_country’, ‘O’]
+            """,
         ]
     def load_data(self):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
+            selected_subset = st.selectbox(
+                label="Select subset to see", options=self.subsets
+            )
             df_to_show = self.data_dict[selected_subset].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
             full_df_unzipped = full_df_unzipped.loc[
                 (full_df_unzipped["ner"] != "O")
                 & ~(full_df_unzipped["ner"].str.startswith("I-"))
+            ]
             possible_options = sorted(full_df_unzipped["ner"].unique())
             with most_common_tokens:
+                st.header(
+                    "10 most common tokens from selected class (without 'O' and 'I-*')"
+                )
                 selected_class = st.selectbox(
                     label="Select class to show", options=possible_options
                 )

clarin_datasets/nkjp_pos_dataset.py CHANGED Viewed

@@ -33,7 +33,7 @@ class NkjpPosDataset(DatasetToShow):
             Input (translated by DeepL): Register as unemployed.
             Output: ['impt', 'qub', 'conj', 'subst', 'interp']
-            """
         ]
     def load_data(self):
@@ -75,8 +75,12 @@ class NkjpPosDataset(DatasetToShow):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
-            subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
-            df_to_show = self.data_dict[subset_to_show].head(10).drop("id", axis="columns")
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -84,16 +88,14 @@ class NkjpPosDataset(DatasetToShow):
         for subset in self.subsets:
             all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
             all_labels_from_subset = [
-                x
-                for subarray in all_labels_from_subset
-                for x in subarray
             ]
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
-                    .sort_index()
-                    .reset_index()
-                    .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(

             Input (translated by DeepL): Register as unemployed.
             Output: ['impt', 'qub', 'conj', 'subst', 'interp']
+            """,
         ]
     def load_data(self):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
+            subset_to_show = st.selectbox(
+                label="Select subset to see", options=self.subsets
+            )
+            df_to_show = (
+                self.data_dict[subset_to_show].head(10).drop("id", axis="columns")
+            )
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
         for subset in self.subsets:
             all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
             all_labels_from_subset = [
+                x for subarray in all_labels_from_subset for x in subarray
             ]
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
                 all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(

clarin_datasets/punctuation_restoration_dataset.py CHANGED Viewed

@@ -36,7 +36,7 @@ class PunctuationRestorationDataset(DatasetToShow):
             """,
             "Task description",
             "The purpose of this task is to restore punctuation in the ASR recognition of texts read out loud.",
-            "clarin_datasets/punctuation_restoration_task.png"
         ]
     def load_data(self):
@@ -81,7 +81,9 @@ class PunctuationRestorationDataset(DatasetToShow):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
-            subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
             df_to_show = self.data_dict[subset_to_show].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -90,10 +92,7 @@ class PunctuationRestorationDataset(DatasetToShow):
         for subset in self.subsets:
             all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
             all_labels_from_subset = [
-                x
-                for subarray in all_labels_from_subset
-                for x in subarray
-                if x != "O"
             ]
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (

             """,
             "Task description",
             "The purpose of this task is to restore punctuation in the ASR recognition of texts read out loud.",
+            "clarin_datasets/punctuation_restoration_task.png",
         ]
     def load_data(self):
         with dataframe_head:
             st.header("First 10 observations of the chosen subset")
+            subset_to_show = st.selectbox(
+                label="Select subset to see", options=self.subsets
+            )
             df_to_show = self.data_dict[subset_to_show].head(10)
             st.dataframe(df_to_show)
             st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
         for subset in self.subsets:
             all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
             all_labels_from_subset = [
+                x for subarray in all_labels_from_subset for x in subarray if x != "O"
             ]
             all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (