Mariusz Kossakowski committed on
Commit
2b9022f
·
1 Parent(s): abb1c69

Add datasets links

Browse files
clarin_datasets/abusive_clauses_dataset.py CHANGED
@@ -15,7 +15,9 @@ class AbusiveClausesDataset(DatasetToShow):
15
  DatasetToShow.__init__(self)
16
  self.dataset_name = "laugustyniak/abusive-clauses-pl"
17
  self.subsets = ["train", "validation", "test"]
18
- self.description = """
 
 
19
  ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
20
  Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
21
  But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
 
15
  DatasetToShow.__init__(self)
16
  self.dataset_name = "laugustyniak/abusive-clauses-pl"
17
  self.subsets = ["train", "validation", "test"]
18
+ self.description = f"""
19
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
20
+
21
  ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
22
  Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
23
  But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
clarin_datasets/aspectemo_dataset.py CHANGED
@@ -10,7 +10,9 @@ class AspectEmoDataset(DatasetToShow):
10
  DatasetToShow.__init__(self)
11
  self.dataset_name = "clarin-pl/aspectemo"
12
  self.description = [
13
- """
 
 
14
  AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
15
  corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
16
  analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
 
10
  DatasetToShow.__init__(self)
11
  self.dataset_name = "clarin-pl/aspectemo"
12
  self.description = [
13
+ f"""
14
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
15
+
16
  AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
17
  corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
18
  analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
clarin_datasets/cst_wikinews_dataset.py CHANGED
@@ -9,8 +9,8 @@ class CSTWikinewsDataset(DatasetToShow):
9
  def __init__(self):
10
  DatasetToShow.__init__(self)
11
  self.dataset_name = "clarin-pl/cst-wikinews"
12
- self.description = """
13
-
14
  """
15
 
16
  def load_data(self):
 
9
  def __init__(self):
10
  DatasetToShow.__init__(self)
11
  self.dataset_name = "clarin-pl/cst-wikinews"
12
+ self.description = f"""
13
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
14
  """
15
 
16
  def load_data(self):
clarin_datasets/kpwr_ner_datasets.py CHANGED
@@ -11,7 +11,9 @@ class KpwrNerDataset(DatasetToShow):
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/kpwr-ner"
13
  self.description = [
14
- """
 
 
15
  KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
16
  Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories
17
  of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 (
 
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/kpwr-ner"
13
  self.description = [
14
+ f"""
15
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
16
+
17
  KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
18
  Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories
19
  of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 (
clarin_datasets/nkjp_pos_dataset.py CHANGED
@@ -11,7 +11,9 @@ class NkjpPosDataset(DatasetToShow):
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/nkjp-pos"
13
  self.description = [
14
- """
 
 
15
  NKJP-POS is a part the National Corpus of Polish (Narodowy Korpus Języka Polskiego).
16
  Its objective is part-of-speech tagging, e.g. nouns, verbs, adjectives, adverbs, etc. During the creation of
17
  corpus, texts of were annotated by humans from various sources, covering many domains and genres.
 
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/nkjp-pos"
13
  self.description = [
14
+ f"""
15
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
16
+
17
  NKJP-POS is a part the National Corpus of Polish (Narodowy Korpus Języka Polskiego).
18
  Its objective is part-of-speech tagging, e.g. nouns, verbs, adjectives, adverbs, etc. During the creation of
19
  corpus, texts of were annotated by humans from various sources, covering many domains and genres.
clarin_datasets/polemo_dataset.py CHANGED
@@ -16,7 +16,10 @@ class PolemoDataset(DatasetToShow):
16
  DatasetToShow.__init__(self)
17
  self.dataset_name = "clarin-pl/polemo2-official"
18
  self.subsets = ["train", "validation", "test"]
19
- self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
 
 
 
20
  hotels, products, and university. It is human-annotated on a level of full reviews and individual
21
  sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
22
  sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
 
16
  DatasetToShow.__init__(self)
17
  self.dataset_name = "clarin-pl/polemo2-official"
18
  self.subsets = ["train", "validation", "test"]
19
+ self.description = f"""
20
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
21
+
22
+ The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
23
  hotels, products, and university. It is human-annotated on a level of full reviews and individual
24
  sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
25
  sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
clarin_datasets/punctuation_restoration_dataset.py CHANGED
@@ -11,7 +11,9 @@ class PunctuationRestorationDataset(DatasetToShow):
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/2021-punctuation-restoration"
13
  self.description = [
14
- """
 
 
15
  Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
16
  not contain any punctuation or capitalization. In longer stretches of automatically recognized speech,
17
  the lack of punctuation affects the general clarity of the output text [1]. The primary purpose of
 
11
  self.data_dict_named = None
12
  self.dataset_name = "clarin-pl/2021-punctuation-restoration"
13
  self.description = [
14
+ f"""
15
+ Dataset link: https://huggingface.co/datasets/{self.dataset_name}
16
+
17
  Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
18
  not contain any punctuation or capitalization. In longer stretches of automatically recognized speech,
19
  the lack of punctuation affects the general clarity of the output text [1]. The primary purpose of