Mariusz Kossakowski committed on
Commit 2b9d84c
1 Parent(s): d3fc096

Add components to NKJP POS dataset

Files changed (1)
  1. clarin_datasets/nkjp_pos_dataset.py +91 -4
clarin_datasets/nkjp_pos_dataset.py CHANGED
@@ -8,16 +8,103 @@ from clarin_datasets.dataset_to_show import DatasetToShow
 class NkjpPosDataset(DatasetToShow):
     def __init__(self):
         DatasetToShow.__init__(self)
+        self.data_dict_named = None
         self.dataset_name = "clarin-pl/nkjp-pos"
-        self.description = """
-
-        """
+        self.description = [
+            """
+            NKJP-POS is a part of the National Corpus of Polish (Narodowy Korpus Języka Polskiego).
+            Its objective is part-of-speech tagging, e.g. nouns, verbs, adjectives, adverbs, etc. During the creation of
+            the corpus, texts from various sources, covering many domains and genres, were annotated by humans.
+            """,
+            "Tasks (input, output and metrics)",
+            """
+            Part-of-speech tagging (POS tagging) - tagging words in text with their corresponding part of speech.
+
+            Input ('tokens' column): sequence of tokens
+
+            Output ('pos_tags' column): sequence of predicted token classes (35 possible classes, described in detail in the annotation guidelines)
+
+            Measurements: F1-score (seqeval)
+
+            Example:
+
+            Input: ['Zarejestruj', 'się', 'jako', 'bezrobotny', '.']
+
+            Input (translated by DeepL): Register as unemployed.
+
+            Output: ['impt', 'qub', 'conj', 'subst', 'interp']
+            """
+        ]

     def load_data(self):
         raw_dataset = load_dataset(self.dataset_name)
         self.data_dict = {
             subset: raw_dataset[subset].to_pandas() for subset in self.subsets
         }
+        self.data_dict_named = {}
+        for subset in self.subsets:
+            references = raw_dataset[subset]["pos_tags"]
+            references_named = [
+                [
+                    raw_dataset[subset].features["pos_tags"].feature.names[label]
+                    for label in labels
+                ]
+                for labels in references
+            ]
+            self.data_dict_named[subset] = pd.DataFrame(
+                {
+                    "tokens": self.data_dict[subset]["tokens"],
+                    "tags": references_named,
+                }
+            )

     def show_dataset(self):
-        pass
+        header = st.container()
+        description = st.container()
+        dataframe_head = st.container()
+        class_distribution = st.container()
+
+        with header:
+            st.title(self.dataset_name)
+
+        with description:
+            st.header("Dataset description")
+            st.write(self.description[0])
+            st.subheader(self.description[1])
+            st.write(self.description[2])
+
+        with dataframe_head:
+            st.header("First 10 observations of the chosen subset")
+            subset_to_show = st.selectbox(label="Select subset to see", options=self.subsets)
+            df_to_show = self.data_dict[subset_to_show].head(10).drop("id", axis="columns")
+            st.dataframe(df_to_show)
+            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
+
+        class_distribution_dict = {}
+        for subset in self.subsets:
+            all_labels_from_subset = self.data_dict_named[subset]["tags"].tolist()
+            all_labels_from_subset = [
+                x
+                for subarray in all_labels_from_subset
+                for x in subarray
+            ]
+            all_labels_from_subset = pd.Series(all_labels_from_subset)
+            class_distribution_dict[subset] = (
+                all_labels_from_subset.value_counts(normalize=True)
+                .sort_index()
+                .reset_index()
+                .rename({"index": "class", 0: subset}, axis="columns")
+            )
+
+        class_distribution_df = pd.merge(
+            class_distribution_dict["train"],
+            class_distribution_dict["test"],
+            on="class",
+        )
+
+        with class_distribution:
+            st.header("Class distribution in each subset")
+            st.dataframe(class_distribution_df)
+            st.text_area(
+                label="LaTeX code", value=class_distribution_df.style.to_latex()
+            )