Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
·
2b9022f
1
Parent(s):
abb1c69
Add datasets links
Browse files
- clarin_datasets/abusive_clauses_dataset.py +3 -1
- clarin_datasets/aspectemo_dataset.py +3 -1
- clarin_datasets/cst_wikinews_dataset.py +2 -2
- clarin_datasets/kpwr_ner_datasets.py +3 -1
- clarin_datasets/nkjp_pos_dataset.py +3 -1
- clarin_datasets/polemo_dataset.py +4 -1
- clarin_datasets/punctuation_restoration_dataset.py +3 -1
clarin_datasets/abusive_clauses_dataset.py
CHANGED
@@ -15,7 +15,9 @@ class AbusiveClausesDataset(DatasetToShow):
|
|
15 |
DatasetToShow.__init__(self)
|
16 |
self.dataset_name = "laugustyniak/abusive-clauses-pl"
|
17 |
self.subsets = ["train", "validation", "test"]
|
18 |
-
self.description = """
|
|
|
|
|
19 |
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
20 |
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
21 |
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
|
|
15 |
DatasetToShow.__init__(self)
|
16 |
self.dataset_name = "laugustyniak/abusive-clauses-pl"
|
17 |
self.subsets = ["train", "validation", "test"]
|
18 |
+
self.description = f"""
|
19 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
20 |
+
|
21 |
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
22 |
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
23 |
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
clarin_datasets/aspectemo_dataset.py
CHANGED
@@ -10,7 +10,9 @@ class AspectEmoDataset(DatasetToShow):
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/aspectemo"
|
12 |
self.description = [
|
13 |
-
"""
|
|
|
|
|
14 |
AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
|
15 |
corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
|
16 |
analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
|
|
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/aspectemo"
|
12 |
self.description = [
|
13 |
+
f"""
|
14 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
15 |
+
|
16 |
AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
|
17 |
corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
|
18 |
analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
|
clarin_datasets/cst_wikinews_dataset.py
CHANGED
@@ -9,8 +9,8 @@ class CSTWikinewsDataset(DatasetToShow):
|
|
9 |
def __init__(self):
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/cst-wikinews"
|
12 |
-
self.description = """
|
13 |
-
|
14 |
"""
|
15 |
|
16 |
def load_data(self):
|
|
|
9 |
def __init__(self):
|
10 |
DatasetToShow.__init__(self)
|
11 |
self.dataset_name = "clarin-pl/cst-wikinews"
|
12 |
+
self.description = f"""
|
13 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
14 |
"""
|
15 |
|
16 |
def load_data(self):
|
clarin_datasets/kpwr_ner_datasets.py
CHANGED
@@ -11,7 +11,9 @@ class KpwrNerDataset(DatasetToShow):
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/kpwr-ner"
|
13 |
self.description = [
|
14 |
-
"""
|
|
|
|
|
15 |
KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
|
16 |
Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories
|
17 |
of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 (
|
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/kpwr-ner"
|
13 |
self.description = [
|
14 |
+
f"""
|
15 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
16 |
+
|
17 |
KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka
|
18 |
Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories
|
19 |
of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 (
|
clarin_datasets/nkjp_pos_dataset.py
CHANGED
@@ -11,7 +11,9 @@ class NkjpPosDataset(DatasetToShow):
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/nkjp-pos"
|
13 |
self.description = [
|
14 |
-
"""
|
|
|
|
|
15 |
NKJP-POS is a part the National Corpus of Polish (Narodowy Korpus Języka Polskiego).
|
16 |
Its objective is part-of-speech tagging, e.g. nouns, verbs, adjectives, adverbs, etc. During the creation of
|
17 |
corpus, texts of were annotated by humans from various sources, covering many domains and genres.
|
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/nkjp-pos"
|
13 |
self.description = [
|
14 |
+
f"""
|
15 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
16 |
+
|
17 |
NKJP-POS is a part the National Corpus of Polish (Narodowy Korpus Języka Polskiego).
|
18 |
Its objective is part-of-speech tagging, e.g. nouns, verbs, adjectives, adverbs, etc. During the creation of
|
19 |
corpus, texts of were annotated by humans from various sources, covering many domains and genres.
|
clarin_datasets/polemo_dataset.py
CHANGED
@@ -16,7 +16,10 @@ class PolemoDataset(DatasetToShow):
|
|
16 |
DatasetToShow.__init__(self)
|
17 |
self.dataset_name = "clarin-pl/polemo2-official"
|
18 |
self.subsets = ["train", "validation", "test"]
|
19 |
-
self.description = """
|
|
|
|
|
|
|
20 |
hotels, products, and university. It is human-annotated on a level of full reviews and individual
|
21 |
sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
|
22 |
sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
|
|
|
16 |
DatasetToShow.__init__(self)
|
17 |
self.dataset_name = "clarin-pl/polemo2-official"
|
18 |
self.subsets = ["train", "validation", "test"]
|
19 |
+
self.description = f"""
|
20 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
21 |
+
|
22 |
+
The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
|
23 |
hotels, products, and university. It is human-annotated on a level of full reviews and individual
|
24 |
sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
|
25 |
sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
|
clarin_datasets/punctuation_restoration_dataset.py
CHANGED
@@ -11,7 +11,9 @@ class PunctuationRestorationDataset(DatasetToShow):
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/2021-punctuation-restoration"
|
13 |
self.description = [
|
14 |
-
"""
|
|
|
|
|
15 |
Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
|
16 |
not contain any punctuation or capitalization. In longer stretches of automatically recognized speech,
|
17 |
the lack of punctuation affects the general clarity of the output text [1]. The primary purpose of
|
|
|
11 |
self.data_dict_named = None
|
12 |
self.dataset_name = "clarin-pl/2021-punctuation-restoration"
|
13 |
self.description = [
|
14 |
+
f"""
|
15 |
+
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
|
16 |
+
|
17 |
Speech transcripts generated by Automatic Speech Recognition (ASR) systems typically do
|
18 |
not contain any punctuation or capitalization. In longer stretches of automatically recognized speech,
|
19 |
the lack of punctuation affects the general clarity of the output text [1]. The primary purpose of
|