Alyosha11 committed on
Commit
6455306
·
verified ·
1 Parent(s): 5a5eaa3

Upload dataset_to_text.ipynb with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset_to_text.ipynb +269 -0
dataset_to_text.ipynb ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "ac3a07af-2b66-41c8-8548-6f951460aedb",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from datasets import load_dataset\n",
11
+ "\n",
12
+ "ACCESS_TOKEN = \n",
13
+ "NUM_SAMPLES = 500000\n",
14
+ "\n",
15
+ "dataset = load_dataset(\"uonlp/CulturaX\",\n",
16
+ " \"ur\",\n",
17
+ " split=f\"train[:{NUM_SAMPLES}]\",\n",
18
+ " token = ACCESS_TOKEN\n",
19
+ " )"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "b6515d96-1129-4aac-a670-796fee9302db",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/plain": [
31
+ "Dataset({\n",
32
+ " features: ['text', 'timestamp', 'url', 'source'],\n",
33
+ " num_rows: 500000\n",
34
+ "})"
35
+ ]
36
+ },
37
+ "execution_count": 2,
38
+ "metadata": {},
39
+ "output_type": "execute_result"
40
+ }
41
+ ],
42
+ "source": [
43
+ "dataset"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 3,
49
+ "id": "13831730-9fc1-4d89-b4fd-060ce0a976cb",
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "data": {
54
+ "text/plain": [
55
+ "Dataset({\n",
56
+ " features: ['text'],\n",
57
+ " num_rows: 500000\n",
58
+ "})"
59
+ ]
60
+ },
61
+ "execution_count": 3,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "# remove columns other than text\n",
68
+ "\n",
69
+ "dataset = dataset.remove_columns([col for col in dataset.column_names if col != 'text'])\n",
70
+ "dataset"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": 4,
76
+ "id": "69466306-5190-4581-82fc-c5839bf15a80",
77
+ "metadata": {},
78
+ "outputs": [
79
+ {
80
+ "data": {
81
+ "text/plain": [
82
+ "500000"
83
+ ]
84
+ },
85
+ "execution_count": 4,
86
+ "metadata": {},
87
+ "output_type": "execute_result"
88
+ }
89
+ ],
90
+ "source": [
91
+ "len(dataset)"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "markdown",
96
+ "id": "25249452-d545-45ce-8c47-a6ee4b20eee1",
97
+ "metadata": {},
98
+ "source": [
99
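+ "Note: running \"wc -l file.csv\" in a Linux terminal counts the number of lines (newline characters) in the file, not the number of CSV rows. The two differ whenever a quoted field contains embedded newlines, as long text documents usually do, so the line count can be larger than the row count. See the comments in https://stackoverflow.com/questions/32913151/is-it-possible-to-get-the-number-of-rows-in-a-csv-file-without-opening-it"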
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 6,
105
+ "id": "68b6087b-0c27-4a9f-bc6c-1317a87c3f3f",
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stderr",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 500000/500000 [00:31<00:00, 16019.97it/s]\n"
113
+ ]
114
+ }
115
+ ],
116
+ "source": [
117
+ "from tqdm import tqdm\n",
118
+ "\n",
119
+ "for idx in tqdm(range(NUM_SAMPLES)):\n",
120
+ " with open(f'data/culturaX_ur_500k/ur_sample_{idx}.txt', 'w') as file:\n",
121
+ " file.write(dataset[idx][\"text\"])"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "a3bd917f-a70f-421d-9680-33ee676f193b",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Bengali"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 1,
135
+ "id": "7c6a1722-7f4f-436b-a8b3-612f24483ee5",
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "application/vnd.jupyter.widget-view+json": {
141
+ "model_id": "3d90843e65214727bf8ccf27c76caac9",
142
+ "version_major": 2,
143
+ "version_minor": 0
144
+ },
145
+ "text/plain": [
146
+ "Resolving data files: 0%| | 0/18 [00:00<?, ?it/s]"
147
+ ]
148
+ },
149
+ "metadata": {},
150
+ "output_type": "display_data"
151
+ }
152
+ ],
153
+ "source": [
154
+ "from datasets import load_dataset\n",
155
+ "\n",
156
+ "ACCESS_TOKEN = \n",
157
+ "NUM_SAMPLES = 500000\n",
158
+ "\n",
159
+ "dataset = load_dataset(\"uonlp/CulturaX\",\n",
160
+ " \"bn\",\n",
161
+ " split=f\"train[:{NUM_SAMPLES}]\",\n",
162
+ " token = ACCESS_TOKEN\n",
163
+ " )"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 2,
169
+ "id": "1e431914-1402-4ac7-94ed-d2427da318c8",
170
+ "metadata": {},
171
+ "outputs": [
172
+ {
173
+ "data": {
174
+ "text/plain": [
175
+ "Dataset({\n",
176
+ " features: ['text', 'timestamp', 'url', 'source'],\n",
177
+ " num_rows: 500000\n",
178
+ "})"
179
+ ]
180
+ },
181
+ "execution_count": 2,
182
+ "metadata": {},
183
+ "output_type": "execute_result"
184
+ }
185
+ ],
186
+ "source": [
187
+ "dataset"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 3,
193
+ "id": "b2a55b87-a0e8-40ea-bef7-1349652337b7",
194
+ "metadata": {},
195
+ "outputs": [
196
+ {
197
+ "data": {
198
+ "text/plain": [
199
+ "Dataset({\n",
200
+ " features: ['text'],\n",
201
+ " num_rows: 500000\n",
202
+ "})"
203
+ ]
204
+ },
205
+ "execution_count": 3,
206
+ "metadata": {},
207
+ "output_type": "execute_result"
208
+ }
209
+ ],
210
+ "source": [
211
+ "# remove columns other than text\n",
212
+ "\n",
213
+ "dataset = dataset.remove_columns([col for col in dataset.column_names if col != 'text'])\n",
214
+ "dataset"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 4,
220
+ "id": "2f1b64c9-f568-42b2-81f5-fb6de504bcfc",
221
+ "metadata": {},
222
+ "outputs": [
223
+ {
224
+ "name": "stderr",
225
+ "output_type": "stream",
226
+ "text": [
227
+ "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 500000/500000 [00:25<00:00, 19827.65it/s]\n"
228
+ ]
229
+ }
230
+ ],
231
+ "source": [
232
+ "from tqdm import tqdm\n",
233
+ "\n",
234
+ "for idx in tqdm(range(NUM_SAMPLES)):\n",
235
+ " with open(f'data/culturaX_bn_500k/bn_sample_{idx}.txt', 'w') as file:\n",
236
+ " file.write(dataset[idx][\"text\"])"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "id": "cc1b8c26-22da-4a5b-ba1e-1557a191c218",
243
+ "metadata": {},
244
+ "outputs": [],
245
+ "source": []
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "kernelspec": {
250
+ "display_name": "Python 3 (ipykernel)",
251
+ "language": "python",
252
+ "name": "python3"
253
+ },
254
+ "language_info": {
255
+ "codemirror_mode": {
256
+ "name": "ipython",
257
+ "version": 3
258
+ },
259
+ "file_extension": ".py",
260
+ "mimetype": "text/x-python",
261
+ "name": "python",
262
+ "nbconvert_exporter": "python",
263
+ "pygments_lexer": "ipython3",
264
+ "version": "3.11.7"
265
+ }
266
+ },
267
+ "nbformat": 4,
268
+ "nbformat_minor": 5
269
+ }