File size: 7,622 Bytes
500fbd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7987659
500fbd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
{
    "tquad": {
        "name": "TQUAD",
        "task": "extractive_question_answering",
        "description": "This dataset is the Turkish Question & Answer dataset on Turkish & Islamic Science History within the scope of Teknofest 2018 Artificial Intelligence competition.",
        "url": "https://github.com/TQuad/turkish-nlp-qa-dataset",
        "hf_name": "mcemilg/tquad",
        "generative": false
    },
    "xquad_tr": {
        "name": "XQUAD",
        "task": "extractive_question_answering",
        "description": "XQuAD (Cross-lingual Question Answering Dataset) is a benchmark dataset for evaluating cross-lingual question answering performance. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 together with their professional translations into ten languages: Spanish, German, Greek, Russian, Turkish, Arabic, Vietnamese, Thai, Chinese, and Hindi..",
        "url": "https://github.com/google-deepmind/xquad",
        "hf_name": "google/xquad",
        "generative": false
    },
    "mkqa_tr": {
        "name": "MKQA",
        "task": "extractive_question_answering",
        "description": "MKQA: Multilingual Knowledge Questions & Answers. MKQA includes 10k open-domain question-answer pairs in 26 languages, resulting 260k examples in total.",
        "url": "https://github.com/apple/ml-mkqa",
        "hf_name": "mcemilg/mkqa_tr",
        "generative": false
    },
    "xlsum_tr": {
        "name": "XLSum",
        "task": "summarization",
        "description": "Abstractive summarization dataset for 44 languages.",
        "url": "https://github.com/csebuetnlp/xl-sum",
        "hf_name": "csebuetnlp/xlsum",
        "generative": true
    },
    "mlsum_tr": {
        "name": "MLSum",
        "task": "summarization",
        "description": "A multilingual summarization dataset collected from the newspapers' websites. MLSum contains 1.5M examples in 5 languages including Turkish.",
        "url": "https://huggingface.co/datasets/reciTAL/mlsum",
        "hf_name": "reciTAL/mlsum",
        "generative": true
    },
    "wiki_lingua_tr": {
        "name": "WikiLingua",
        "task": "summarization",
        "description": "A multilingual abstractive summarization dataset covering 17 languages.",
        "url": "https://github.com/esdurmus/Wikilingua",
        "hf_name": "GEM/wiki_lingua",
        "generative": true
    },
    "tr-wikihow-summ": {
        "name": "WikiHow",
        "task": "summarization",
        "description": "A summarization dataset obtained from WikiHow website.",
        "url": "https://huggingface.co/datasets/ardauzunoglu/tr-wikihow-summ",
        "hf_name": "ardauzunoglu/tr-wikihow-summ",
        "generative": true
    },
    "mnli_tr": {
        "name": "MNLI",
        "task": "natural_language_inference",
        "description": "Multi-Genre NLI (MNLI) dataset.",
        "url": "https://cims.nyu.edu/~sbowman/multinli/",
        "hf_name": "boun-tabi/nli_tr",
        "generative": false
    },
    "snli_tr": {
        "name": "SNLI",
        "task": "natural_language_inference",
        "description": "The Stanford NLI (SNLI) dataset.",
        "url": "https://nlp.stanford.edu/projects/snli/",
        "hf_name": "boun-tabi/nli_tr",
        "generative": false
    },
    "xnli_tr": {
        "name": "XNLI",
        "task": "natural_language_inference",
        "description": "The Cross-Lingual NLI (XNLI) dataset.",
        "url": "https://github.com/facebookresearch/XNLI",
        "hf_name": "boun-tabi/nli_tr",
        "generative": false
    },
    "xcopa_tr": {
        "name": "XCOPA",
        "task": "multiple_choice",
        "description": "A multilingual dataset for evaluating causal commonsense reasoning capabilities of language models.",
        "url": "https://github.com/cambridgeltl/xcopa",
        "hf_name": "cambridgeltl/xcopa",
        "generative": false
    },
    "exams_tr": {
        "name": "Exams",
        "task": "multiple_choice",
        "description": "A question answering dataset covering high school exams.",
        "url": "https://huggingface.co/datasets/exams",
        "hf_name": "exams",
        "generative": false
    },
    "belebele_tr": {
        "name": "Belebele",
        "task": "multiple_choice",
        "description": "A multiple choice question answering dataset to evaluate machine comprehension.",
        "url": "https://github.com/facebookresearch/belebele",
        "generative": false
    },
    "turkish_plu_goal_inference": {
        "name": "PLU-GI",
        "task": "multiple_choice",
        "description": "TurkishPLU - Goal Inference task.",
        "url": "https://github.com/GGLAB-KU/turkish-plu",
        "hf_name": "mcemilg/turkish-plu-goal-inference",
        "generative": false
    },
    "turkish_plu_next_event_prediction": {
        "name": "PLU-NE",
        "task": "multiple_choice",
        "description": "TurkishPLU - Next Event Prediction task.",
        "url": "https://github.com/GGLAB-KU/turkish-plu",
        "hf_name": "mcemilg/turkish-plu-next-event-prediction",
        "generative": false
    },
    "turkish_plu_step_inference": {
        "name": "PLU-SI",
        "task": "multiple_choice",
        "description": "TurkishPLU - Step Inference task.",
        "url": "https://github.com/GGLAB-KU/turkish-plu",
        "hf_name": "mcemilg/turkish-plu-step-inference",
        "generative": false
    },
    "turkish_plu_step_ordering": {
        "name": "PLU-SO",
        "task": "multiple_choice",
        "description": "TurkishPLU - Step Ordering task.",
        "url": "https://github.com/GGLAB-KU/turkish-plu",
        "hf_name": "mcemilg/turkish-plu-step-ordering",
        "generative": false
    },
    "sts_tr": {
        "name": "STS",
        "task": "text_classification",
        "description": "The machine-translated Semantic Textual Similarity dataset in Turkish.",
        "url": "https://github.com/emrecncelik/sts-benchmark-tr",
        "hf_name": "emrecan/stsb-mt-turkish",
        "generative": false
    },
    "offenseval_tr": {
        "name": "OffensEval",
        "task": "text_classification",
        "description": "A dataset for offensive speech recognition in Turkish.",
        "url": "https://sites.google.com/site/offensevalsharedtask/offenseval-2020",
        "hf_name": "coltekin/offenseval2020_tr",
        "generative": false
    },
    "news_cat": {
        "name": "NewsCat",
        "task": "text_classification",
        "description": "News classification dataset collected from Turkish newspapers websites.",
        "url": "http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html",
        "hf_name": "mcemilg/news-cat",
        "generative": false
    },
    "ironytr": {
        "name": "IronyTR",
        "task": "text_classification",
        "description": "Irony detection dataset in Turkish.",
        "url": "https://github.com/teghub/IronyTR",
        "hf_name": "mcemilg/IronyTR",
        "generative": false
    },
    "wmt-tr-en-prompt": {
        "name": "WMT",
        "task": "machine_translation",
        "description": "English-to-Turkish machine translation dataset.",
        "url": "http://www.aclweb.org/anthology/W/W16/W16-2301",
        "hf_name": "wmt/wmt16",
        "generative": true
    },
    "gecturk_generation": {
        "name": "GECTurk",
        "task": "grammatical_error_correction",
        "description": "A dataset for grammatical error correction.",
        "url": "https://github.com/GGLAB-KU/gecturk",
        "hf_name": "mcemilg/GECTurk-generation",
        "generative": true
    }
}