laithAzzam committed on
Commit
5394e90
·
1 Parent(s): 1a80bc8

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +356 -1
  2. arabic_billion_words.py +173 -0
  3. dataset_infos.json +1 -0
  4. gitattributes.txt +27 -0
README.md CHANGED
@@ -1,3 +1,358 @@
1
  ---
2
- license: openrail
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ annotations_creators:
3
+ - found
4
+ language_creators:
5
+ - found
6
+ language:
7
+ - ar
8
+ license:
9
+ - unknown
10
+ multilinguality:
11
+ - monolingual
12
+ size_categories:
13
+ - 100K<n<1M
14
+ - 10K<n<100K
15
+ - 1M<n<10M
16
+ source_datasets:
17
+ - original
18
+ task_categories:
19
+ - text-generation
20
+ - fill-mask
21
+ task_ids:
22
+ - language-modeling
23
+ - masked-language-modeling
24
+ paperswithcode_id: null
25
+ pretty_name: Arabic Billion Words
26
+ configs:
27
+ - Alittihad
28
+ - Almasryalyoum
29
+ - Almustaqbal
30
+ - Alqabas
31
+ - Echoroukonline
32
+ - Ryiadh
33
+ - Sabanews
34
+ - SaudiYoum
35
+ - Techreen
36
+ - Youm7
37
+ dataset_info:
38
+ - config_name: Alittihad
39
+ features:
40
+ - name: url
41
+ dtype: string
42
+ - name: head_line
43
+ dtype: string
44
+ - name: date
45
+ dtype: string
46
+ - name: text
47
+ dtype: string
48
+ splits:
49
+ - name: train
50
+ num_bytes: 1601790302
51
+ num_examples: 349342
52
+ download_size: 348259999
53
+ dataset_size: 1601790302
54
+ - config_name: Almasryalyoum
55
+ features:
56
+ - name: url
57
+ dtype: string
58
+ - name: head_line
59
+ dtype: string
60
+ - name: date
61
+ dtype: string
62
+ - name: text
63
+ dtype: string
64
+ splits:
65
+ - name: train
66
+ num_bytes: 1056197870
67
+ num_examples: 291723
68
+ download_size: 242604438
69
+ dataset_size: 1056197870
70
+ - config_name: Almustaqbal
71
+ features:
72
+ - name: url
73
+ dtype: string
74
+ - name: head_line
75
+ dtype: string
76
+ - name: date
77
+ dtype: string
78
+ - name: text
79
+ dtype: string
80
+ splits:
81
+ - name: train
82
+ num_bytes: 1545659336
83
+ num_examples: 446873
84
+ download_size: 350826797
85
+ dataset_size: 1545659336
86
+ - config_name: Alqabas
87
+ features:
88
+ - name: url
89
+ dtype: string
90
+ - name: head_line
91
+ dtype: string
92
+ - name: date
93
+ dtype: string
94
+ - name: text
95
+ dtype: string
96
+ splits:
97
+ - name: train
98
+ num_bytes: 2631729746
99
+ num_examples: 817274
100
+ download_size: 595274646
101
+ dataset_size: 2631729746
102
+ - config_name: Echoroukonline
103
+ features:
104
+ - name: url
105
+ dtype: string
106
+ - name: head_line
107
+ dtype: string
108
+ - name: date
109
+ dtype: string
110
+ - name: text
111
+ dtype: string
112
+ splits:
113
+ - name: train
114
+ num_bytes: 464386206
115
+ num_examples: 139732
116
+ download_size: 108184378
117
+ dataset_size: 464386206
118
+ - config_name: Ryiadh
119
+ features:
120
+ - name: url
121
+ dtype: string
122
+ - name: head_line
123
+ dtype: string
124
+ - name: date
125
+ dtype: string
126
+ - name: text
127
+ dtype: string
128
+ splits:
129
+ - name: train
130
+ num_bytes: 3101294859
131
+ num_examples: 858188
132
+ download_size: 691264971
133
+ dataset_size: 3101294859
134
+ - config_name: Sabanews
135
+ features:
136
+ - name: url
137
+ dtype: string
138
+ - name: head_line
139
+ dtype: string
140
+ - name: date
141
+ dtype: string
142
+ - name: text
143
+ dtype: string
144
+ splits:
145
+ - name: train
146
+ num_bytes: 198019614
147
+ num_examples: 92149
148
+ download_size: 38214558
149
+ dataset_size: 198019614
150
+ - config_name: SaudiYoum
151
+ features:
152
+ - name: url
153
+ dtype: string
154
+ - name: head_line
155
+ dtype: string
156
+ - name: date
157
+ dtype: string
158
+ - name: text
159
+ dtype: string
160
+ splits:
161
+ - name: train
162
+ num_bytes: 2723291416
163
+ num_examples: 888068
164
+ download_size: 605537923
165
+ dataset_size: 2723291416
166
+ - config_name: Techreen
167
+ features:
168
+ - name: url
169
+ dtype: string
170
+ - name: head_line
171
+ dtype: string
172
+ - name: date
173
+ dtype: string
174
+ - name: text
175
+ dtype: string
176
+ splits:
177
+ - name: train
178
+ num_bytes: 1103458209
179
+ num_examples: 314597
180
+ download_size: 252976781
181
+ dataset_size: 1103458209
182
+ - config_name: Youm7
183
+ features:
184
+ - name: url
185
+ dtype: string
186
+ - name: head_line
187
+ dtype: string
188
+ - name: date
189
+ dtype: string
190
+ - name: text
191
+ dtype: string
192
+ splits:
193
+ - name: train
194
+ num_bytes: 3004689464
195
+ num_examples: 1172136
196
+ download_size: 617708074
197
+ dataset_size: 3004689464
198
  ---
199
+
200
+ # Dataset Card for Arabic Billion Words Corpus
201
+
202
+ ## Table of Contents
203
+ - [Dataset Description](#dataset-description)
204
+ - [Dataset Summary](#dataset-summary)
205
+ - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
206
+ - [Languages](#languages)
207
+ - [Dataset Structure](#dataset-structure)
208
+ - [Data Instances](#data-instances)
209
+ - [Data Fields](#data-fields)
210
+ - [Data Splits](#data-splits)
211
+ - [Dataset Creation](#dataset-creation)
212
+ - [Curation Rationale](#curation-rationale)
213
+ - [Source Data](#source-data)
214
+ - [Annotations](#annotations)
215
+ - [Personal and Sensitive Information](#personal-and-sensitive-information)
216
+ - [Considerations for Using the Data](#considerations-for-using-the-data)
217
+ - [Social Impact of Dataset](#social-impact-of-dataset)
218
+ - [Discussion of Biases](#discussion-of-biases)
219
+ - [Other Known Limitations](#other-known-limitations)
220
+ - [Additional Information](#additional-information)
221
+ - [Dataset Curators](#dataset-curators)
222
+ - [Licensing Information](#licensing-information)
223
+ - [Citation Information](#citation-information)
224
+ - [Contributions](#contributions)
225
+
226
+ ## Dataset Description
227
+
228
+ - **Homepage:** http://www.abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus
229
+ - **Repository:**
230
+ - **Paper:** https://arxiv.org/pdf/1611.04033
231
+ - **Leaderboard:**
232
+ - **Point of Contact:** [Ibrahim Abu El-Khair](mailto:iabuelkhair@gmail.com)
233
+
234
+ ### Dataset Summary
235
+
236
+ The Abu El-Khair Corpus is an Arabic text corpus that includes more than five million newspaper articles.
237
+ It contains over a billion and a half words in total, of which about three million are unique.
238
+ The corpus is encoded in two encodings, namely UTF-8 and Windows CP-1256.
239
+ It is also marked up in two markup languages, namely SGML and XML.
240
+
241
+ ### Supported Tasks and Leaderboards
242
+
243
+ [More Information Needed]
244
+
245
+ ### Languages
246
+
247
+ Arabic
248
+
249
+ ## Dataset Structure
250
+
251
+ ### Data Instances
252
+
253
+ This is an example of the "Almasryalyoum" configuration subset:
254
+ ```python
255
+ {
256
+ "url": "http://today.almasryalyoum.com/printerfriendly.aspx?ArticleID=61300",
257
+ "head_line": "رئيس وزراء المجر: عنصرية جماهير أوجبيست جلبت العار للبلاد",
258
+ "date": "19/5/2007",
259
+ "text": """قال متحدث باسم الحكومة المجرية: إن رئيس الوزراء فيرنك جيوركساني رحب بقرار اتحاد كرة القدم المجري بخصم ثلاث نقاط من نادي أوجبيست بسبب السلوك العنصري الذي صدر من جماهيره.
260
+ وعاقب الاتحاد المجري فريق أوجبيست بعد أن سخرت جماهيره من إبراهيم سيديبي مهاجم فريق ديبرينسين الأسود أثناء مباراة الفريقين أوائل مايو الجاري.
261
+ يذكر أن الاتحاد فرض أيضا غرامة مالية قدرها 20 ألف دولار علي أوجبيست في عام 2005 بعد أن رددت جماهيره شعارات معادية للسامية خلال مباراة بالدوري المجري.
262
+ وأوضح جيوركساني في خطاب إلي إيستفان كيستليكي رئيس الاتحاد المجري لكرة القدم، أن هذا السلوك العنصري من الجماهير «جلب العار لكرة القدم وللمجر». يذكر أن المجر بها مجموعة من مشجعي كرة القدم المشاغبين «الهوليجانز»، وشارك الكثير منهم في أعمال شغب معادية للحكومة في العام الماضي.""",
263
+ }
264
+ ```
265
+
266
+ ### Data Fields
267
+
268
+ The data fields are:
269
+ - "url": string, original url of the article,
270
+ - "head_line": string, headline of the article,
271
+ - "date": string, date of the article,
272
+ - "text": string, text content of the article.
273
+
274
+ ### Data Splits
275
+
276
+ Each configuration subset has a single "train" split, containing the following number of examples:
277
+
278
+ | | Number of examples |
279
+ |:---------------|-------------------:|
280
+ | Alittihad | 349342 |
281
+ | Almasryalyoum | 291723 |
282
+ | Almustaqbal | 446873 |
283
+ | Alqabas | 817274 |
284
+ | Echoroukonline | 139732 |
285
+ | Ryiadh | 858188 |
286
+ | Sabanews | 92149 |
287
+ | SaudiYoum | 888068 |
288
+ | Techreen | 314597 |
289
+ | Youm7 | 1172136 |
290
+
291
+ ## Dataset Creation
292
+
293
+ ### Curation Rationale
294
+
295
+ [More Information Needed]
296
+
297
+ ### Source Data
298
+
299
+ #### Initial Data Collection and Normalization
300
+
301
+ [More Information Needed]
302
+
303
+ #### Who are the source language producers?
304
+
305
+ [More Information Needed]
306
+
307
+ ### Annotations
308
+
309
+ #### Annotation process
310
+
311
+ [More Information Needed]
312
+
313
+ #### Who are the annotators?
314
+
315
+ [More Information Needed]
316
+
317
+ ### Personal and Sensitive Information
318
+
319
+ [More Information Needed]
320
+
321
+ ## Considerations for Using the Data
322
+
323
+ ### Social Impact of Dataset
324
+
325
+ [More Information Needed]
326
+
327
+ ### Discussion of Biases
328
+
329
+ [More Information Needed]
330
+
331
+ ### Other Known Limitations
332
+
333
+ [More Information Needed]
334
+
335
+ ## Additional Information
336
+
337
+ ### Dataset Curators
338
+
339
+ [More Information Needed]
340
+
341
+ ### Licensing Information
342
+
343
+ [More Information Needed]
344
+
345
+ ### Citation Information
346
+
347
+ ```
348
+ @article{el20161,
349
+ title={1.5 billion words arabic corpus},
350
+ author={El-Khair, Ibrahim Abu},
351
+ journal={arXiv preprint arXiv:1611.04033},
352
+ year={2016}
353
+ }
354
+ ```
355
+
356
+ ### Contributions
357
+
358
+ Thanks to [@zaidalyafeai](https://github.com/zaidalyafeai) and [@albertvillanova](https://github.com/albertvillanova) for adding this dataset.
arabic_billion_words.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Arabic Billion Words Corpus"""
16
+
17
+
18
+ import os
19
+ import re
20
+
21
+ import datasets
22
+
23
+
24
+ _CITATION = """\
25
+ @article{el20161,
26
+ title={1.5 billion words arabic corpus},
27
+ author={El-Khair, Ibrahim Abu},
28
+ journal={arXiv preprint arXiv:1611.04033},
29
+ year={2016}
30
+ }
31
+ """
32
+
33
+ _DESCRIPTION = """\
34
+ Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.
35
+ It contains over a billion and a half words in total, out of which, there are about three million unique words.
36
+ The corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.
37
+ Also it was marked with two mark-up languages, namely: SGML, and XML.
38
+ """
39
+
40
+ _HOMEPAGE = "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus"
41
+
42
+ _URL = "http://abuelkhair.net/corpus/"
43
+ _URLs = {
44
+ "Alittihad": _URL + "Alittihad_XML_utf_8.rar",
45
+ "Almasryalyoum": _URL + "Almasryalyoum_XML_utf_8.rar",
46
+ "Almustaqbal": _URL + "Almustaqbal_XML_utf_8.rar",
47
+ "Alqabas": _URL + "Alqabas_XML_utf_8.rar",
48
+ "Echoroukonline": _URL + "Echoroukonline_XML_utf_8.rar",
49
+ "Ryiadh": _URL + "Ryiadh_XML_utf_8.rar",
50
+ "Sabanews": _URL + "Sabanews_XML_utf_8.rar",
51
+ "SaudiYoum": _URL + "SaudiYoum_XML_utf_8.rar",
52
+ "Techreen": _URL + "Techreen_XML_utf_8.rar",
53
+ "Youm7": _URL + "Youm7_XML_utf_8.rar",
54
+ }
55
+
56
+ # Some tags are misspelled in the source XML files (field-tag variants are listed in MISSPELLED_TAGS):
57
+ # - Misspelled article tags:
58
+ # - Alqabas: <Alqabas>, <Alqabas1>
59
+ # - Ryiadh: <Ryiadh>, <Ryiadh1>
60
+ MISSPELLED_TAGS = {
61
+ "Dateline": ["Dateline", "dateline"],
62
+ "Headline": ["Headline", "Healine"],
63
+ "Text": ["Text"],
64
+ "URL": ["URL"],
65
+ }
66
+
67
+ TAG_PATTERNS = {
68
+ tag: [re.compile(rf".*?<{label}>(.*?)</{label}>.*?", re.MULTILINE | re.DOTALL) for label in labels]
69
+ for tag, labels in MISSPELLED_TAGS.items()
70
+ }
71
+
72
+
73
+ class ArabicBillionWords(datasets.GeneratorBasedBuilder):
74
+ """Arabic Billion Words Corpus"""
75
+
76
+ VERSION = datasets.Version("1.1.0")
77
+
78
+ BUILDER_CONFIGS = [
79
+ datasets.BuilderConfig(
80
+ name="Alittihad", version=VERSION, description="This part of dataset covers Alittihad news paper"
81
+ ),
82
+ datasets.BuilderConfig(
83
+ name="Almasryalyoum", version=VERSION, description="This part of dataset covers Almasryalyoum news paper"
84
+ ),
85
+ datasets.BuilderConfig(
86
+ name="Almustaqbal", version=VERSION, description="This part of dataset covers Almustaqbal news paper"
87
+ ),
88
+ datasets.BuilderConfig(
89
+ name="Alqabas", version=VERSION, description="This part of dataset covers Alqabas news paper"
90
+ ),
91
+ datasets.BuilderConfig(
92
+ name="Echoroukonline", version=VERSION, description="This part of dataset covers Echoroukonline news paper"
93
+ ),
94
+ datasets.BuilderConfig(
95
+ name="Ryiadh", version=VERSION, description="This part of dataset covers Ryiadh news paper"
96
+ ),
97
+ datasets.BuilderConfig(
98
+ name="Sabanews", version=VERSION, description="This part of dataset covers Sabanews news paper"
99
+ ),
100
+ datasets.BuilderConfig(
101
+ name="SaudiYoum", version=VERSION, description="This part of dataset covers SaudiYoum news paper"
102
+ ),
103
+ datasets.BuilderConfig(
104
+ name="Techreen", version=VERSION, description="This part of dataset covers Techreen news paper"
105
+ ),
106
+ datasets.BuilderConfig(
107
+ name="Youm7", version=VERSION, description="This part of dataset covers Youm7 news paper"
108
+ ),
109
+ ]
110
+
111
+ def _info(self):
112
+ features = datasets.Features(
113
+ {
114
+ "url": datasets.Value("string"),
115
+ "head_line": datasets.Value("string"),
116
+ "date": datasets.Value("string"),
117
+ "text": datasets.Value("string"),
118
+ }
119
+ )
120
+ return datasets.DatasetInfo(
121
+ description=_DESCRIPTION,
122
+ features=features,
123
+ homepage=_HOMEPAGE,
124
+ citation=_CITATION,
125
+ )
126
+
127
+ def _split_generators(self, dl_manager):
128
+ """Returns SplitGenerators."""
129
+ my_urls = _URLs[self.config.name]
130
+ data_dir = dl_manager.download_and_extract(my_urls)
131
+ my_file_name = f"{self.config.name}_utf_8.xml"
132
+ return [
133
+ datasets.SplitGenerator(
134
+ name=datasets.Split.TRAIN,
135
+ gen_kwargs={
136
+ "filepath": os.path.join(data_dir, my_file_name),
137
+ },
138
+ ),
139
+ ]
140
+
141
+ def _generate_examples(self, filepath):
142
+ """Yields examples."""
143
+ data_tag = self.config.name
144
+ pattern = re.compile(rf".*?<{data_tag}(.*?)</{data_tag}.*?", re.MULTILINE | re.DOTALL)
145
+ key = 0
146
+ lines = ""
147
+ with open(filepath, mode="r", encoding="utf-8") as f:
148
+ for i, line in enumerate(f):
149
+ lines += line
150
+ if f"</{data_tag}" in line:
151
+ match = pattern.match(lines)
152
+ lines = ""
153
+ if match:
154
+ record = match.group(1)
155
+ text = self._clean_text(self._extract_tag("Text", record))
156
+ url = self._extract_tag("URL", record)
157
+ head_line = self._clean_text(self._extract_tag("Headline", record))
158
+ date = self._extract_tag("Dateline", record)
159
+ yield key, {"url": url, "head_line": head_line, "date": date, "text": text}
160
+ key += 1
161
+
162
+ @staticmethod
163
+ def _extract_tag(tag, text):
164
+ # check if the tag is misspelled
165
+ for pattern in TAG_PATTERNS[tag]:
166
+ match = pattern.match(text)
167
+ if match:
168
+ return match.group(1)
169
+ return ""
170
+
171
+ @staticmethod
172
+ def _clean_text(text):
173
+ return text.replace("?", "")
dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"Alittihad": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Alittihad", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1601790302, "num_examples": 349342, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Alittihad_XML_utf_8.rar": {"num_bytes": 348259999, "checksum": "6dd90f7ca98699e924e0ea423dc9f4f648c645379f8bffe15eeb97af00fd6fc0"}}, "download_size": 348259999, "post_processing_size": null, "dataset_size": 1601790302, "size_in_bytes": 1950050301}, "Almasryalyoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and 
XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Almasryalyoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1056197870, "num_examples": 291723, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Almasryalyoum_XML_utf_8.rar": {"num_bytes": 242604438, "checksum": "f88d24179fa97df8d179242cb564301be2c7a4ecd36a027815b8ce1563059e7a"}}, "download_size": 242604438, "post_processing_size": null, "dataset_size": 1056197870, "size_in_bytes": 1298802308}, "Almustaqbal": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, 
"_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Almustaqbal", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1545659336, "num_examples": 446873, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Almustaqbal_XML_utf_8.rar": {"num_bytes": 350826797, "checksum": "dff3361ad821f3bd3912cd7282db5c15a34919312b9bc7d708a8b30782c7fc36"}}, "download_size": 350826797, "post_processing_size": null, "dataset_size": 1545659336, "size_in_bytes": 1896486133}, "Alqabas": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Alqabas", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": 
"train", "num_bytes": 2631729746, "num_examples": 817274, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Alqabas_XML_utf_8.rar": {"num_bytes": 595274646, "checksum": "e5ea70add534220a8caf8d230959f134f49a822ce3612adb4f1bb537dc3cc6b4"}}, "download_size": 595274646, "post_processing_size": null, "dataset_size": 2631729746, "size_in_bytes": 3227004392}, "Echoroukonline": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Echoroukonline", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 464386206, "num_examples": 139732, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Echoroukonline_XML_utf_8.rar": {"num_bytes": 108184378, "checksum": "8f3e85bd99caeb9c5c4922edcd18720fc3700fd6751febfa7ee72e05a584a270"}}, "download_size": 108184378, "post_processing_size": null, "dataset_size": 464386206, "size_in_bytes": 
572570584}, "Ryiadh": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Ryiadh", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3101294859, "num_examples": 858188, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Ryiadh_XML_utf_8.rar": {"num_bytes": 691264971, "checksum": "c934867e53cb57d45ff99a8b5cfa991ae255a1ecb20e79309a41af2aa3e45c15"}}, "download_size": 691264971, "post_processing_size": null, "dataset_size": 3101294859, "size_in_bytes": 3792559830}, "Sabanews": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and 
XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Sabanews", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 198019614, "num_examples": 92149, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Sabanews_XML_utf_8.rar": {"num_bytes": 38214558, "checksum": "c9b2f1ac8ed2a5e89ab9a6bcd82a0d825569b813b53cd83419968782e9946dbe"}}, "download_size": 38214558, "post_processing_size": null, "dataset_size": 198019614, "size_in_bytes": 236234172}, "SaudiYoum": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, 
"date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "SaudiYoum", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2723291416, "num_examples": 888068, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/SaudiYoum_XML_utf_8.rar": {"num_bytes": 605537923, "checksum": "d4cbb5554acb03fb7ce271a0b708c1bc6bcf31593ae8c670bed7f8c22335a915"}}, "download_size": 605537923, "post_processing_size": null, "dataset_size": 2723291416, "size_in_bytes": 3328829339}, "Techreen": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Techreen", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 
1103458209, "num_examples": 314597, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Techreen_XML_utf_8.rar": {"num_bytes": 252976781, "checksum": "5e4ab520399069fd38d9d80f4429fc05efaae51a912e1467becfc2686e424770"}}, "download_size": 252976781, "post_processing_size": null, "dataset_size": 1103458209, "size_in_bytes": 1356434990}, "Youm7": {"description": "Abu El-Khair Corpus is an Arabic text corpus, that includes more than five million newspaper articles.\nIt contains over a billion and a half words in total, out of which, there are about three million unique words.\nThe corpus is encoded with two types of encoding, namely: UTF-8, and Windows CP-1256.\nAlso it was marked with two mark-up languages, namely: SGML, and XML.\n", "citation": "@article{el20161,\n title={1.5 billion words arabic corpus},\n author={El-Khair, Ibrahim Abu},\n journal={arXiv preprint arXiv:1611.04033},\n year={2016}\n}\n", "homepage": "http://abuelkhair.net/index.php/en/arabic/abu-el-khair-corpus", "license": "", "features": {"url": {"dtype": "string", "id": null, "_type": "Value"}, "head_line": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "arabic_billion_words", "config_name": "Youm7", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3004689464, "num_examples": 1172136, "dataset_name": "arabic_billion_words"}}, "download_checksums": {"http://abuelkhair.net/corpus/Youm7_XML_utf_8.rar": {"num_bytes": 617708074, "checksum": "cd81aa0b3d74e5d9a07377369ea473d8a7bd51cb5826e9809d700de2ddeffe23"}}, "download_size": 617708074, "post_processing_size": null, "dataset_size": 3004689464, "size_in_bytes": 3622397538}}
gitattributes.txt ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text