Mariusz Kossakowski committed on
Commit
997a159
1 Parent(s): 3362a6a

Add class distribution to cst wikinews

Browse files
clarin_datasets/cst_wikinews_dataset.py CHANGED
@@ -22,13 +22,35 @@ class CSTWikinewsDataset(DatasetToShow):
def show_dataset(self):
    """Render the dataset page: a title and a preview of one subset.

    Reads ``self.dataset_name``, ``self.subsets`` and ``self.data_dict``.
    The entry of ``self.data_dict`` chosen in the select box is expected to
    support ``.head()`` and ``.style.to_latex()`` (presumably a pandas
    DataFrame -- confirm against where ``data_dict`` is populated).
    """
    # Containers are created up front, in display order.
    title_area = st.container()
    preview_area = st.container()

    with title_area:
        st.title(self.dataset_name)

    with preview_area:
        st.header("First 10 observations of the chosen subset")
        chosen_subset = st.selectbox(
            label="Select subset to see",
            options=self.subsets,
        )
        preview = self.data_dict[chosen_subset].head(10)
        st.dataframe(preview)
        # Expose the same preview as LaTeX source in a text area.
        latex_source = preview.style.to_latex()
        st.text_area(label="LaTeX code", value=latex_source)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
def show_dataset(self):
    """Render the dataset page: title, subset preview and class distribution.

    Reads ``self.dataset_name``, ``self.subsets`` and ``self.data_dict``.
    ``self.data_dict`` must contain ``"train"`` and ``"test"`` entries, each
    with a ``"label"`` column; the entry chosen in the select box must
    support ``.head()`` and ``.style.to_latex()`` (presumably pandas
    DataFrames -- confirm against where ``data_dict`` is populated).
    """
    # Containers are created up front, in display order.
    header = st.container()
    dataframe_head = st.container()
    class_distribution = st.container()

    with header:
        st.title(self.dataset_name)

    with dataframe_head:
        st.header("First 10 observations of the chosen subset")
        subset_to_show = st.selectbox(
            label="Select subset to see", options=self.subsets
        )
        df_to_show = self.data_dict[subset_to_show].head(10)
        st.dataframe(df_to_show)
        st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

    def _label_share(split):
        """Return a two-column frame: ``class`` and the share of it in *split*."""
        # Name the index and the values explicitly instead of relying on the
        # column names produced by ``value_counts().reset_index()``: those
        # changed in pandas 2.0 ("index"/"label" became "label"/"proportion"),
        # which would leave no "class" column to merge on.
        return (
            self.data_dict[split]["label"]
            .value_counts(normalize=True)
            .rename(split)
            .rename_axis("class")
            .reset_index()
        )

    # Inner join: only classes present in both splits are listed.
    class_distribution_df = pd.merge(
        _label_share("train"),
        _label_share("test"),
        on="class",
    )

    with class_distribution:
        st.dataframe(class_distribution_df)