Mariusz Kossakowski committed on
Commit
abb1c69
1 Parent(s): 997a159

Black formatting

Browse files
clarin_datasets/aspectemo_dataset.py CHANGED
@@ -37,7 +37,7 @@ class AspectEmoDataset(DatasetToShow):
37
  Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
38
  'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
39
  'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
40
- """
41
  ]
42
 
43
  def load_data(self):
@@ -70,7 +70,9 @@ class AspectEmoDataset(DatasetToShow):
70
 
71
  with dataframe_head:
72
  st.header("First 10 observations of the chosen subset")
73
- selected_subset = st.selectbox(label="Select subset to see", options=self.subsets)
 
 
74
  df_to_show = self.data_dict[selected_subset].head(10)
75
  st.dataframe(df_to_show)
76
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -84,9 +86,9 @@ class AspectEmoDataset(DatasetToShow):
84
  all_labels_from_subset = pd.Series(all_labels_from_subset)
85
  class_distribution_dict[subset] = (
86
  all_labels_from_subset.value_counts(normalize=True)
87
- .sort_index()
88
- .reset_index()
89
- .rename({"index": "class", 0: subset}, axis="columns")
90
  )
91
 
92
  class_distribution_df = pd.merge(
 
37
  Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
38
  'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
39
  'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
40
+ """,
41
  ]
42
 
43
  def load_data(self):
 
70
 
71
  with dataframe_head:
72
  st.header("First 10 observations of the chosen subset")
73
+ selected_subset = st.selectbox(
74
+ label="Select subset to see", options=self.subsets
75
+ )
76
  df_to_show = self.data_dict[selected_subset].head(10)
77
  st.dataframe(df_to_show)
78
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
 
86
  all_labels_from_subset = pd.Series(all_labels_from_subset)
87
  class_distribution_dict[subset] = (
88
  all_labels_from_subset.value_counts(normalize=True)
89
+ .sort_index()
90
+ .reset_index()
91
+ .rename({"index": "class", 0: subset}, axis="columns")
92
  )
93
 
94
  class_distribution_df = pd.merge(
clarin_datasets/kpwr_ner_datasets.py CHANGED
@@ -33,7 +33,7 @@ class KpwrNerDataset(DatasetToShow):
33
  ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
34
  ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
35
  ‘B-nam_loc_gpe_country’, ‘O’]
36
- """
37
  ]
38
 
39
  def load_data(self):
@@ -84,7 +84,9 @@ class KpwrNerDataset(DatasetToShow):
84
 
85
  with dataframe_head:
86
  st.header("First 10 observations of the chosen subset")
87
- selected_subset = st.selectbox(label="Select subset to see", options=self.subsets)
 
 
88
  df_to_show = self.data_dict[selected_subset].head(10)
89
  st.dataframe(df_to_show)
90
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -128,10 +130,12 @@ class KpwrNerDataset(DatasetToShow):
128
  full_df_unzipped = full_df_unzipped.loc[
129
  (full_df_unzipped["ner"] != "O")
130
  & ~(full_df_unzipped["ner"].str.startswith("I-"))
131
- ]
132
  possible_options = sorted(full_df_unzipped["ner"].unique())
133
  with most_common_tokens:
134
- st.header("10 most common tokens from selected class (without 'O' and 'I-*')")
 
 
135
  selected_class = st.selectbox(
136
  label="Select class to show", options=possible_options
137
  )
 
33
  ‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
34
  ‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’,
35
  ‘B-nam_loc_gpe_country’, ‘O’]
36
+ """,
37
  ]
38
 
39
  def load_data(self):
 
84
 
85
  with dataframe_head:
86
  st.header("First 10 observations of the chosen subset")
87
+ selected_subset = st.selectbox(
88
+ label="Select subset to see", options=self.subsets
89
+ )
90
  df_to_show = self.data_dict[selected_subset].head(10)
91
  st.dataframe(df_to_show)
92
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
 
130
  full_df_unzipped = full_df_unzipped.loc[
131
  (full_df_unzipped["ner"] != "O")
132
  & ~(full_df_unzipped["ner"].str.startswith("I-"))
133
+ ]
134
  possible_options = sorted(full_df_unzipped["ner"].unique())
135
  with most_common_tokens:
136
+ st.header(
137
+ "10 most common tokens from selected class (without 'O' and 'I-*')"
138
+ )
139
  selected_class = st.selectbox(
140
  label="Select class to show", options=possible_options
141
  )
clarin_datasets/nkjp_pos_dataset.py CHANGED
@@ -33,7 +33,7 @@ class NkjpPosDataset(DatasetToShow):
33
  Input (translated by DeepL): Register as unemployed.
34
 
35
  Output: ['impt', 'qub', 'conj', 'subst', 'interp']
36
- """
37
  ]
38
 
39
  def load_data(self):
@@ -75,8 +75,12 @@ class NkjpPosDataset(DatasetToShow):
75
 
76
  with dataframe_head:
77
  st.header("First 10 observations of the chosen subset")
78
- subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
79
- df_to_show = self.data_dict[subset_to_show].head(10).drop("id", axis="columns")
 
 
 
 
80
  st.dataframe(df_to_show)
81
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
82
 
@@ -84,16 +88,14 @@ class NkjpPosDataset(DatasetToShow):
84
  for subset in self.subsets:
85
  all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
86
  all_labels_from_subset = [
87
- x
88
- for subarray in all_labels_from_subset
89
- for x in subarray
90
  ]
91
  all_labels_from_subset = pd.Series(all_labels_from_subset)
92
  class_distribution_dict[subset] = (
93
  all_labels_from_subset.value_counts(normalize=True)
94
- .sort_index()
95
- .reset_index()
96
- .rename({"index": "class", 0: subset}, axis="columns")
97
  )
98
 
99
  class_distribution_df = pd.merge(
 
33
  Input (translated by DeepL): Register as unemployed.
34
 
35
  Output: ['impt', 'qub', 'conj', 'subst', 'interp']
36
+ """,
37
  ]
38
 
39
  def load_data(self):
 
75
 
76
  with dataframe_head:
77
  st.header("First 10 observations of the chosen subset")
78
+ subset_to_show = st.selectbox(
79
+ label="Select subset to see", options=self.subsets
80
+ )
81
+ df_to_show = (
82
+ self.data_dict[subset_to_show].head(10).drop("id", axis="columns")
83
+ )
84
  st.dataframe(df_to_show)
85
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
86
 
 
88
  for subset in self.subsets:
89
  all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
90
  all_labels_from_subset = [
91
+ x for subarray in all_labels_from_subset for x in subarray
 
 
92
  ]
93
  all_labels_from_subset = pd.Series(all_labels_from_subset)
94
  class_distribution_dict[subset] = (
95
  all_labels_from_subset.value_counts(normalize=True)
96
+ .sort_index()
97
+ .reset_index()
98
+ .rename({"index": "class", 0: subset}, axis="columns")
99
  )
100
 
101
  class_distribution_df = pd.merge(
clarin_datasets/punctuation_restoration_dataset.py CHANGED
@@ -36,7 +36,7 @@ class PunctuationRestorationDataset(DatasetToShow):
36
  """,
37
  "Task description",
38
  "The purpose of this task is to restore punctuation in the ASR recognition of texts read out loud.",
39
- "clarin_datasets/punctuation_restoration_task.png"
40
  ]
41
 
42
  def load_data(self):
@@ -81,7 +81,9 @@ class PunctuationRestorationDataset(DatasetToShow):
81
 
82
  with dataframe_head:
83
  st.header("First 10 observations of the chosen subset")
84
- subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
 
 
85
  df_to_show = self.data_dict[subset_to_show].head(10)
86
  st.dataframe(df_to_show)
87
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
@@ -90,10 +92,7 @@ class PunctuationRestorationDataset(DatasetToShow):
90
  for subset in self.subsets:
91
  all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
92
  all_labels_from_subset = [
93
- x
94
- for subarray in all_labels_from_subset
95
- for x in subarray
96
- if x != "O"
97
  ]
98
  all_labels_from_subset = pd.Series(all_labels_from_subset)
99
  class_distribution_dict[subset] = (
 
36
  """,
37
  "Task description",
38
  "The purpose of this task is to restore punctuation in the ASR recognition of texts read out loud.",
39
+ "clarin_datasets/punctuation_restoration_task.png",
40
  ]
41
 
42
  def load_data(self):
 
81
 
82
  with dataframe_head:
83
  st.header("First 10 observations of the chosen subset")
84
+ subset_to_show = st.selectbox(
85
+ label="Select subset to see", options=self.subsets
86
+ )
87
  df_to_show = self.data_dict[subset_to_show].head(10)
88
  st.dataframe(df_to_show)
89
  st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
 
92
  for subset in self.subsets:
93
  all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
94
  all_labels_from_subset = [
95
+ x for subarray in all_labels_from_subset for x in subarray if x != "O"
 
 
 
96
  ]
97
  all_labels_from_subset = pd.Series(all_labels_from_subset)
98
  class_distribution_dict[subset] = (