marinone94 committed on
Commit
c9cb648
1 Parent(s): 412339d

add eda, clean script

Browse files
eda.ipynb ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "c9526c52",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import datasets\n",
11
+ "from datasets import DatasetDict, load_dataset, load_metric"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 44,
17
+ "id": "663ff92e",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import re"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 21,
27
+ "id": "cc9f1c45",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "dataset_name = \"mozilla-foundation/common_voice_7_0\"\n",
32
+ "dataset_config_name = \"sv-SE\"\n",
33
+ "train_split_name = \"train+validation\"\n",
34
+ "use_auth_token = True"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 22,
40
+ "id": "21fd7030",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "raw_datasets = DatasetDict()"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 35,
50
+ "id": "81a27912",
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "name": "stderr",
55
+ "output_type": "stream",
56
+ "text": [
57
+ "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
58
+ ]
59
+ }
60
+ ],
61
+ "source": [
62
+ "raw_datasets[\"train\"] = load_dataset(\n",
63
+ " dataset_name,\n",
64
+ " dataset_config_name,\n",
65
+ " split=train_split_name,\n",
66
+ " use_auth_token=use_auth_token,\n",
67
+ ")"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 28,
73
+ "id": "7945cada",
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "name": "stderr",
78
+ "output_type": "stream",
79
+ "text": [
80
+ "Reusing dataset common_voice (/Users/emiliomarinone/.cache/huggingface/datasets/mozilla-foundation___common_voice/sv-SE/7.0.0/33e08856cfa0d0665e837bcad73ffd920a0bc713ce8c5fffb55dbdf1c084d5ba)\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "raw_datasets[\"test\"] = load_dataset(\n",
86
+ " dataset_name,\n",
87
+ " dataset_config_name,\n",
88
+ " split=\"test\",\n",
89
+ " use_auth_token=use_auth_token,\n",
90
+ ")"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 36,
96
+ "id": "c98cb649",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "training_data = raw_datasets[\"train\"]"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 29,
106
+ "id": "1aead6a1",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "test_data = raw_datasets[\"test\"]"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 37,
116
+ "id": "97e9a626",
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "data": {
121
+ "text/plain": [
122
+ "Dataset({\n",
123
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
124
+ " num_rows: 11030\n",
125
+ "})"
126
+ ]
127
+ },
128
+ "execution_count": 37,
129
+ "metadata": {},
130
+ "output_type": "execute_result"
131
+ }
132
+ ],
133
+ "source": [
134
+ "training_data"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 30,
140
+ "id": "fc794e39",
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/plain": [
146
+ "Dataset({\n",
147
+ " features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],\n",
148
+ " num_rows: 4620\n",
149
+ "})"
150
+ ]
151
+ },
152
+ "execution_count": 30,
153
+ "metadata": {},
154
+ "output_type": "execute_result"
155
+ }
156
+ ],
157
+ "source": [
158
+ "test_data"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 31,
164
+ "id": "31b328fd",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "train_speakers_dict = {}\n",
169
+ "for record in training_data:\n",
170
+ " try:\n",
171
+ " train_speakers_dict[record[\"client_id\"]].append(record[\"path\"])\n",
172
+ " except:\n",
173
+ " train_speakers_dict[record[\"client_id\"]] = [record[\"path\"]]"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 32,
179
+ "id": "7eba5861",
180
+ "metadata": {},
181
+ "outputs": [
182
+ {
183
+ "data": {
184
+ "text/plain": [
185
+ "0"
186
+ ]
187
+ },
188
+ "execution_count": 32,
189
+ "metadata": {},
190
+ "output_type": "execute_result"
191
+ }
192
+ ],
193
+ "source": [
194
+ "print(f\"Speakers in training set: {len(train_speakers_dict)}\")"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 38,
200
+ "id": "17905c39",
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "test_speakers_dict = {}\n",
205
+ "for record in test_data:\n",
206
+ " try:\n",
207
+ " test_speakers_dict[record[\"client_id\"]].append(record[\"path\"])\n",
208
+ " except:\n",
209
+ " test_speakers_dict[record[\"client_id\"]] = [record[\"path\"]]"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 43,
215
+ "id": "25a25454",
216
+ "metadata": {},
217
+ "outputs": [
218
+ {
219
+ "data": {
220
+ "text/plain": [
221
+ "24"
222
+ ]
223
+ },
224
+ "execution_count": 43,
225
+ "metadata": {},
226
+ "output_type": "execute_result"
227
+ }
228
+ ],
229
+ "source": [
230
+ "print(f\"Speakers in test set: {len(test_speakers_dict)}\")"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 42,
236
+ "id": "f72bdb7a",
237
+ "metadata": {},
238
+ "outputs": [
239
+ {
240
+ "name": "stdout",
241
+ "output_type": "stream",
242
+ "text": [
243
+ "Speakers in both training and test sets: 0\n"
244
+ ]
245
+ }
246
+ ],
247
+ "source": [
248
+ "c = 0\n",
249
+ "for speaker in test_speakers_dict:\n",
250
+ " if speaker in train_speakers_dict:\n",
251
+ " c+=1\n",
252
+ "print(f\"Speakers in both training and test sets: {c}\")"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 45,
258
+ "id": "ed6bc20b",
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "chars_to_ignore_regex = '[,?.!\\-\\;\\:\"“%‘”�—’…–]'\n",
263
+ "def clean_text(text):\n",
264
+ " return re.sub(chars_to_ignore_regex, \"\", text.lower())"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 51,
270
+ "id": "16b289be",
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "Avg tokens training data: 7.243336355394379\n"
278
+ ]
279
+ }
280
+ ],
281
+ "source": [
282
+ "num_tokens_train = 0\n",
283
+ "for record in training_data:\n",
284
+ " num_tokens_train += len(clean_text(record[\"sentence\"]).split())\n",
285
+ "avg_tokens_train = num_tokens_train / training_data.num_rows\n",
286
+ "print(f\"Avg tokens training data: {avg_tokens_train}\")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "code",
291
+ "execution_count": 52,
292
+ "id": "364aff29",
293
+ "metadata": {},
294
+ "outputs": [
295
+ {
296
+ "name": "stdout",
297
+ "output_type": "stream",
298
+ "text": [
299
+ "Avg tokens training data: 7.074891774891775\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "num_tokens_test = 0\n",
305
+ "for record in test_data:\n",
306
+ " num_tokens_test += len(clean_text(record[\"sentence\"]).split())\n",
307
+ "avg_tokens_test = num_tokens_test / test_data.num_rows\n",
308
+ "print(f\"Avg tokens test data: {avg_tokens_test}\")"
309
+ ]
310
+ }
311
+ ],
312
+ "metadata": {
313
+ "kernelspec": {
314
+ "display_name": "Python 3 (ipykernel)",
315
+ "language": "python",
316
+ "name": "python3"
317
+ },
318
+ "language_info": {
319
+ "codemirror_mode": {
320
+ "name": "ipython",
321
+ "version": 3
322
+ },
323
+ "file_extension": ".py",
324
+ "mimetype": "text/x-python",
325
+ "name": "python",
326
+ "nbconvert_exporter": "python",
327
+ "pygments_lexer": "ipython3",
328
+ "version": "3.8.6"
329
+ }
330
+ },
331
+ "nbformat": 4,
332
+ "nbformat_minor": 5
333
+ }
run_speech_recognition_ctc.py CHANGED
@@ -43,7 +43,6 @@ from transformers import (
43
  Trainer,
44
  TrainingArguments,
45
  Wav2Vec2Processor,
46
- Wav2Vec2ProcessorWithLM,
47
  set_seed,
48
  )
49
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
 
43
  Trainer,
44
  TrainingArguments,
45
  Wav2Vec2Processor,
 
46
  set_seed,
47
  )
48
  from transformers.trainer_utils import get_last_checkpoint, is_main_process
train_n_gram_lm_with_KenLM.ipynb CHANGED
@@ -1,2262 +1,301 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# Train n-gram language model with KenLM on Colab"
8
- ]
9
- },
10
- {
11
- "cell_type": "markdown",
12
- "metadata": {
13
- "id": "PtkgQE7--Ufg"
14
- },
15
- "source": [
16
- "See https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Boosting_Wav2Vec2_with_n_grams_in_Transformers.ipynb#scrollTo=X9qg4FPt2zi8 for detailed explanation on how to use KenLM to boost wav2vec2 fine-tuned models on 🤗"
17
- ]
18
- },
19
- {
20
- "cell_type": "markdown",
21
- "metadata": {
22
- "id": "VBCqCboC6Soc"
23
- },
24
- "source": [
25
- "Install KenLM"
26
- ]
27
- },
28
- {
29
- "cell_type": "code",
30
- "execution_count": 4,
31
- "metadata": {
32
- "colab": {
33
- "base_uri": "https://localhost:8080/"
34
- },
35
- "id": "-CKLr9bI6GPE",
36
- "outputId": "0c6d917e-4896-4e35-c92f-4b085f77c893"
37
- },
38
- "outputs": [
39
- {
40
- "name": "stdout",
41
- "output_type": "stream",
42
- "text": [
43
- "The operation couldn’t be completed. Unable to locate a Java Runtime that supports apt.\r\n",
44
- "Please visit http://www.java.com for information on installing Java.\r\n",
45
- "\r\n"
46
- ]
47
- }
48
- ],
49
- "source": [
50
- "!sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev"
51
- ]
52
- },
53
- {
54
- "cell_type": "code",
55
- "execution_count": null,
56
- "metadata": {
57
  "colab": {
58
- "base_uri": "https://localhost:8080/"
 
 
59
  },
60
- "id": "TIlrFi3M6XO4",
61
- "outputId": "7c986a6f-f84c-4d29-b3f8-941cb85e6e8d"
62
- },
63
- "outputs": [],
64
- "source": [
65
- "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz"
66
- ]
67
- },
68
- {
69
- "cell_type": "code",
70
- "execution_count": 3,
71
- "metadata": {
72
- "colab": {
73
- "base_uri": "https://localhost:8080/"
74
  },
75
- "id": "KGwSg6Bl6a8Y",
76
- "outputId": "025562ab-679d-4986-9474-334dc8bd834e"
77
- },
78
- "outputs": [
79
- {
80
- "name": "stdout",
81
- "output_type": "stream",
82
- "text": [
83
- "-- The C compiler identification is GNU 7.5.0\n",
84
- "-- The CXX compiler identification is GNU 7.5.0\n",
85
- "-- Check for working C compiler: /usr/bin/cc\n",
86
- "-- Check for working C compiler: /usr/bin/cc -- works\n",
87
- "-- Detecting C compiler ABI info\n",
88
- "-- Detecting C compiler ABI info - done\n",
89
- "-- Detecting C compile features\n",
90
- "-- Detecting C compile features - done\n",
91
- "-- Check for working CXX compiler: /usr/bin/c++\n",
92
- "-- Check for working CXX compiler: /usr/bin/c++ -- works\n",
93
- "-- Detecting CXX compiler ABI info\n",
94
- "-- Detecting CXX compiler ABI info - done\n",
95
- "-- Detecting CXX compile features\n",
96
- "-- Detecting CXX compile features - done\n",
97
- "-- Looking for pthread.h\n",
98
- "-- Looking for pthread.h - found\n",
99
- "-- Looking for pthread_create\n",
100
- "-- Looking for pthread_create - not found\n",
101
- "-- Looking for pthread_create in pthreads\n",
102
- "-- Looking for pthread_create in pthreads - not found\n",
103
- "-- Looking for pthread_create in pthread\n",
104
- "-- Looking for pthread_create in pthread - found\n",
105
- "-- Found Threads: TRUE \n",
106
- "-- Boost version: 1.65.1\n",
107
- "-- Found the following Boost libraries:\n",
108
- "-- program_options\n",
109
- "-- system\n",
110
- "-- thread\n",
111
- "-- unit_test_framework\n",
112
- "-- chrono\n",
113
- "-- date_time\n",
114
- "-- atomic\n",
115
- "-- Check if compiler accepts -pthread\n",
116
- "-- Check if compiler accepts -pthread - yes\n",
117
- "-- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version \"1.2.11\") \n",
118
- "-- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version \"1.0.6\") \n",
119
- "-- Looking for BZ2_bzCompressInit\n",
120
- "-- Looking for BZ2_bzCompressInit - found\n",
121
- "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
122
- "-- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
123
- "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
124
- "-- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
125
- "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so\n",
126
- "-- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found\n",
127
- "-- Found LibLZMA: /usr/include (found version \"5.2.2\") \n",
128
- "-- Found OpenMP_C: -fopenmp (found version \"4.5\") \n",
129
- "-- Found OpenMP_CXX: -fopenmp (found version \"4.5\") \n",
130
- "-- Found OpenMP: TRUE (found version \"4.5\") \n",
131
- "-- Configuring done\n",
132
- "-- Generating done\n",
133
- "-- Build files have been written to: /content/kenlm/build\n",
134
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm_util\u001b[0m\n",
135
- "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o\u001b[0m\n",
136
- "[ 2%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o\u001b[0m\n",
137
- "[ 3%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o\u001b[0m\n",
138
- "[ 4%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/diy-fp.cc.o\u001b[0m\n",
139
- "[ 5%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-conversion.cc.o\u001b[0m\n",
140
- "[ 6%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o\u001b[0m\n",
141
- "[ 7%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o\u001b[0m\n",
142
- "[ 8%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o\u001b[0m\n",
143
- "[ 9%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o\u001b[0m\n",
144
- "[ 10%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o\u001b[0m\n",
145
- "[ 11%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o\u001b[0m\n",
146
- "[ 12%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o\u001b[0m\n",
147
- "[ 13%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o\u001b[0m\n",
148
- "[ 14%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o\u001b[0m\n",
149
- "[ 15%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o\u001b[0m\n",
150
- "[ 16%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o\u001b[0m\n",
151
- "[ 17%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o\u001b[0m\n",
152
- "[ 18%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o\u001b[0m\n",
153
- "[ 19%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o\u001b[0m\n",
154
- "[ 20%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o\u001b[0m\n",
155
- "[ 21%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o\u001b[0m\n",
156
- "[ 22%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o\u001b[0m\n",
157
- "[ 23%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o\u001b[0m\n",
158
- "[ 25%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o\u001b[0m\n",
159
- "[ 26%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o\u001b[0m\n",
160
- "[ 27%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o\u001b[0m\n",
161
- "[ 28%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o\u001b[0m\n",
162
- "[ 29%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o\u001b[0m\n",
163
- "[ 30%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o\u001b[0m\n",
164
- "[ 31%] \u001b[32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o\u001b[0m\n",
165
- "[ 32%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm_util.a\u001b[0m\n",
166
- "[ 32%] Built target kenlm_util\n",
167
- "\u001b[35m\u001b[1mScanning dependencies of target probing_hash_table_benchmark\u001b[0m\n",
168
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm\u001b[0m\n",
169
- "[ 33%] \u001b[32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o\u001b[0m\n",
170
- "[ 34%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o\u001b[0m\n",
171
- "[ 35%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o\u001b[0m\n",
172
- "[ 36%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o\u001b[0m\n",
173
- "[ 37%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o\u001b[0m\n",
174
- "[ 38%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o\u001b[0m\n",
175
- "[ 39%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o\u001b[0m\n",
176
- "[ 40%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o\u001b[0m\n",
177
- "[ 41%] \u001b[32m\u001b[1mLinking CXX executable ../bin/probing_hash_table_benchmark\u001b[0m\n",
178
- "[ 41%] Built target probing_hash_table_benchmark\n",
179
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm_filter\u001b[0m\n",
180
- "[ 42%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o\u001b[0m\n",
181
- "[ 43%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o\u001b[0m\n",
182
- "[ 44%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o\u001b[0m\n",
183
- "[ 45%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o\u001b[0m\n",
184
- "[ 46%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o\u001b[0m\n",
185
- "[ 47%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_filter.a\u001b[0m\n",
186
- "[ 47%] Built target kenlm_filter\n",
187
- "[ 48%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o\u001b[0m\n",
188
- "[ 50%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o\u001b[0m\n",
189
- "[ 51%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o\u001b[0m\n",
190
- "[ 52%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o\u001b[0m\n",
191
- "[ 53%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o\u001b[0m\n",
192
- "[ 54%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o\u001b[0m\n",
193
- "[ 55%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o\u001b[0m\n",
194
- "[ 56%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o\u001b[0m\n",
195
- "[ 57%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o\u001b[0m\n",
196
- "[ 58%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o\u001b[0m\n",
197
- "[ 59%] \u001b[32m\u001b[1mLinking CXX static library ../lib/libkenlm.a\u001b[0m\n",
198
- "[ 59%] Built target kenlm\n",
199
- "\u001b[35m\u001b[1mScanning dependencies of target build_binary\u001b[0m\n",
200
- "\u001b[35m\u001b[1mScanning dependencies of target fragment\u001b[0m\n",
201
- "[ 60%] \u001b[32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o\u001b[0m\n",
202
- "[ 61%] \u001b[32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o\u001b[0m\n",
203
- "[ 62%] \u001b[32m\u001b[1mLinking CXX executable ../bin/fragment\u001b[0m\n",
204
- "[ 63%] \u001b[32m\u001b[1mLinking CXX executable ../bin/build_binary\u001b[0m\n",
205
- "[ 63%] Built target fragment\n",
206
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm_benchmark\u001b[0m\n",
207
- "[ 64%] \u001b[32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o\u001b[0m\n",
208
- "[ 64%] Built target build_binary\n",
209
- "\u001b[35m\u001b[1mScanning dependencies of target query\u001b[0m\n",
210
- "[ 65%] \u001b[32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o\u001b[0m\n",
211
- "[ 66%] \u001b[32m\u001b[1mLinking CXX executable ../bin/query\u001b[0m\n",
212
- "[ 66%] Built target query\n",
213
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm_builder\u001b[0m\n",
214
- "[ 67%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o\u001b[0m\n",
215
- "[ 68%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o\u001b[0m\n",
216
- "[ 69%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o\u001b[0m\n",
217
- "[ 70%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o\u001b[0m\n",
218
- "[ 71%] \u001b[32m\u001b[1mLinking CXX executable ../bin/kenlm_benchmark\u001b[0m\n",
219
- "[ 72%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o\u001b[0m\n",
220
- "[ 72%] Built target kenlm_benchmark\n",
221
- "\u001b[35m\u001b[1mScanning dependencies of target phrase_table_vocab\u001b[0m\n",
222
- "[ 73%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o\u001b[0m\n",
223
- "[ 75%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/phrase_table_vocab\u001b[0m\n",
224
- "[ 75%] Built target phrase_table_vocab\n",
225
- "\u001b[35m\u001b[1mScanning dependencies of target filter\u001b[0m\n",
226
- "[ 76%] \u001b[32mBuilding CXX object lm/filter/CMakeFiles/filter.dir/filter_main.cc.o\u001b[0m\n",
227
- "[ 77%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o\u001b[0m\n",
228
- "[ 78%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_builder.a\u001b[0m\n",
229
- "[ 78%] Built target kenlm_builder\n",
230
- "\u001b[35m\u001b[1mScanning dependencies of target kenlm_interpolate\u001b[0m\n",
231
- "[ 79%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/backoff_reunification.cc.o\u001b[0m\n",
232
- "[ 80%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/bounded_sequence_encoding.cc.o\u001b[0m\n",
233
- "[ 81%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_probabilities.cc.o\u001b[0m\n",
234
- "[ 82%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/filter\u001b[0m\n",
235
- "[ 82%] Built target filter\n",
236
- "\u001b[35m\u001b[1mScanning dependencies of target count_ngrams\u001b[0m\n",
237
- "[ 83%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o\u001b[0m\n",
238
- "[ 84%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/merge_vocab.cc.o\u001b[0m\n",
239
- "[ 85%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/normalize.cc.o\u001b[0m\n",
240
- "[ 86%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/pipeline.cc.o\u001b[0m\n",
241
- "[ 87%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/count_ngrams\u001b[0m\n",
242
- "[ 87%] Built target count_ngrams\n",
243
- "\u001b[35m\u001b[1mScanning dependencies of target lmplz\u001b[0m\n",
244
- "[ 88%] \u001b[32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o\u001b[0m\n",
245
- "[ 89%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/lmplz\u001b[0m\n",
246
- "[ 89%] Built target lmplz\n",
247
- "[ 90%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/split_worker.cc.o\u001b[0m\n",
248
- "[ 91%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_derivatives.cc.o\u001b[0m\n",
249
- "[ 92%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_instances.cc.o\u001b[0m\n",
250
- "[ 93%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/tune_weights.cc.o\u001b[0m\n",
251
- "[ 94%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/kenlm_interpolate.dir/universal_vocab.cc.o\u001b[0m\n",
252
- "[ 95%] \u001b[32m\u001b[1mLinking CXX static library ../../lib/libkenlm_interpolate.a\u001b[0m\n",
253
- "[ 95%] Built target kenlm_interpolate\n",
254
- "\u001b[35m\u001b[1mScanning dependencies of target streaming_example\u001b[0m\n",
255
- "\u001b[35m\u001b[1mScanning dependencies of target interpolate\u001b[0m\n",
256
- "[ 96%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/streaming_example.dir/streaming_example_main.cc.o\u001b[0m\n",
257
- "[ 97%] \u001b[32mBuilding CXX object lm/interpolate/CMakeFiles/interpolate.dir/interpolate_main.cc.o\u001b[0m\n",
258
- "[ 98%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/interpolate\u001b[0m\n",
259
- "[ 98%] Built target interpolate\n",
260
- "[100%] \u001b[32m\u001b[1mLinking CXX executable ../../bin/streaming_example\u001b[0m\n",
261
- "[100%] Built target streaming_example\n",
262
- "build_binary fragment\t lmplz\t\t\t query\n",
263
- "count_ngrams interpolate phrase_table_vocab\t streaming_example\n",
264
- "filter\t kenlm_benchmark probing_hash_table_benchmark\n"
265
- ]
266
  }
267
- ],
268
- "source": [
269
- "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n",
270
- "!ls kenlm/build/bin"
271
- ]
272
- },
273
- {
274
- "cell_type": "markdown",
275
- "metadata": {
276
- "id": "rUUGXbDy6x7r"
277
- },
278
- "source": [
279
- "Install 🤗 dependencies"
280
- ]
281
  },
282
- {
283
- "cell_type": "code",
284
- "execution_count": 4,
285
- "metadata": {
286
- "colab": {
287
- "base_uri": "https://localhost:8080/"
288
- },
289
- "id": "Gs8LAZKr6wF8",
290
- "outputId": "2a1785bb-f254-487a-ef4c-e496f037145a"
291
- },
292
- "outputs": [
293
  {
294
- "name": "stdout",
295
- "output_type": "stream",
296
- "text": [
297
- "Collecting datasets\n",
298
- " Downloading datasets-1.18.0-py3-none-any.whl (311 kB)\n",
299
- "\u001b[K |████████████████████████████████| 311 kB 5.3 MB/s \n",
300
- "\u001b[?25hCollecting transformers\n",
301
- " Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)\n",
302
- "\u001b[K |████████████████████████████████| 3.4 MB 39.8 MB/s \n",
303
- "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (2.23.0)\n",
304
- "Collecting aiohttp\n",
305
- " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n",
306
- "\u001b[K |████████████████████████████████| 1.1 MB 54.7 MB/s \n",
307
- "\u001b[?25hRequirement already satisfied: pyarrow!=4.0.0,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from datasets) (3.0.0)\n",
308
- "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from datasets) (1.19.5)\n",
309
- "Collecting xxhash\n",
310
- " Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)\n",
311
- "\u001b[K |████████████████████████████████| 243 kB 41.5 MB/s \n",
312
- "\u001b[?25hRequirement already satisfied: dill in /usr/local/lib/python3.7/dist-packages (from datasets) (0.3.4)\n",
313
- "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.7/dist-packages (from datasets) (4.62.3)\n",
314
- "Collecting huggingface-hub<1.0.0,>=0.1.0\n",
315
- " Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)\n",
316
- "\u001b[K |████████████████████████████████| 67 kB 5.1 MB/s \n",
317
- "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from datasets) (21.3)\n",
318
- "Collecting fsspec[http]>=2021.05.0\n",
319
- " Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)\n",
320
- "\u001b[K |████████████████████████████████| 133 kB 50.2 MB/s \n",
321
- "\u001b[?25hRequirement already satisfied: multiprocess in /usr/local/lib/python3.7/dist-packages (from datasets) (0.70.12.2)\n",
322
- "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from datasets) (4.10.0)\n",
323
- "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from datasets) (1.1.5)\n",
324
- "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.4.2)\n",
325
- "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.10.0.2)\n",
326
- "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.13)\n",
327
- "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->datasets) (3.0.6)\n",
328
- "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (3.0.4)\n",
329
- "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (1.24.3)\n",
330
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2021.10.8)\n",
331
- "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests>=2.19.0->datasets) (2.10)\n",
332
- "Collecting pyyaml\n",
333
- " Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)\n",
334
- "\u001b[K |████████████████████████████████| 596 kB 56.2 MB/s \n",
335
- "\u001b[?25hCollecting tokenizers<0.11,>=0.10.1\n",
336
- " Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)\n",
337
- "\u001b[K |████████████████████████████████| 3.3 MB 46.1 MB/s \n",
338
- "\u001b[?25hCollecting sacremoses\n",
339
- " Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)\n",
340
- "\u001b[K |████████████████████████████████| 895 kB 51.6 MB/s \n",
341
- "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
342
- "Collecting frozenlist>=1.1.1\n",
343
- " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n",
344
- "\u001b[K |████████████████████████████████| 144 kB 51.5 MB/s \n",
345
- "\u001b[?25hCollecting yarl<2.0,>=1.0\n",
346
- " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n",
347
- "\u001b[K |████████████████████████████████| 271 kB 56.7 MB/s \n",
348
- "\u001b[?25hCollecting asynctest==0.13.0\n",
349
- " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\n",
350
- "Requirement already satisfied: charset-normalizer<3.0,>=2.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (2.0.10)\n",
351
- "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.7/dist-packages (from aiohttp->datasets) (21.4.0)\n",
352
- "Collecting multidict<7.0,>=4.5\n",
353
- " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n",
354
- "\u001b[K |████████████████████████████████| 94 kB 3.0 MB/s \n",
355
- "\u001b[?25hCollecting aiosignal>=1.1.2\n",
356
- " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\n",
357
- "Collecting async-timeout<5.0,>=4.0.0a3\n",
358
- " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
359
- "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->datasets) (3.7.0)\n",
360
- "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2018.9)\n",
361
- "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->datasets) (2.8.2)\n",
362
- "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->datasets) (1.15.0)\n",
363
- "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
364
- "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.1.0)\n",
365
- "Installing collected packages: multidict, frozenlist, yarl, asynctest, async-timeout, aiosignal, pyyaml, fsspec, aiohttp, xxhash, tokenizers, sacremoses, huggingface-hub, transformers, datasets\n",
366
- " Attempting uninstall: pyyaml\n",
367
- " Found existing installation: PyYAML 3.13\n",
368
- " Uninstalling PyYAML-3.13:\n",
369
- " Successfully uninstalled PyYAML-3.13\n",
370
- "Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.18.0 frozenlist-1.3.0 fsspec-2022.1.0 huggingface-hub-0.4.0 multidict-6.0.2 pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.15.0 xxhash-2.0.2 yarl-1.7.2\n"
371
- ]
372
- }
373
- ],
374
- "source": [
375
- "!pip install datasets transformers"
376
- ]
377
- },
378
- {
379
- "cell_type": "markdown",
380
- "metadata": {
381
- "id": "6RoHBmOz66fz"
382
- },
383
- "source": [
384
- "Load preprocessed dataset from 🤗 and write it to file as required by KenLM"
385
- ]
386
- },
387
- {
388
- "cell_type": "code",
389
- "execution_count": 5,
390
- "metadata": {
391
- "colab": {
392
- "base_uri": "https://localhost:8080/",
393
- "height": 216,
394
- "referenced_widgets": [
395
- "ad5d7b0bc9ad4e228b3bc76bc975cc47",
396
- "5925567ffea2436691c4ed3b7b147c17",
397
- "799acae3451445f0a3616b8932f2e3f3",
398
- "1714ea91694842339756f26b2fa9c725",
399
- "b5d6b069468246abbb3207f3df6f9dde",
400
- "5a9c9d4b60e54a3bb64c576707bd9736",
401
- "789d5845a82e48fe9c629af743b5b1f0",
402
- "e3414cc0456241eca109f4e9e115d16a",
403
- "d3e6acd54d024d6791aab76232557721",
404
- "6b43ea2d93c04965a4539b3ef839893b",
405
- "4958b4c72d0c48af9a77974fc4ed449c",
406
- "d722bbfffeaf4ea7a1060d10dc3a06db",
407
- "e81b0bf92adc4aadaafce4ee7d36421e",
408
- "a4b5b93b88f549e8a4f37f3d48834ca9",
409
- "82692c41501c487fad27c6b19836f46f",
410
- "af8f433ef2f540c9bd70d14421904d83",
411
- "5e469744bf6a4813983ae8ee727c1c5e",
412
- "34c5f87238cb4f13a03b207aa7dc1d18",
413
- "c7955974289a4f448b422d7e4640131a",
414
- "db7ee45589e04749b80376e25ee377bb",
415
- "34d9460b112c419885bbff5211674cb3",
416
- "033cb43d32314d279a7b9e1e86bbccdc",
417
- "6a7e3547dc4141e7b5937f2baff58cbf",
418
- "921a3c1f50a24979838fd560c2cea9e0",
419
- "e33033ecda374ed4966ae5fccf6efe37",
420
- "52a852c0f98c49aa9e5edfdd4f91e4ca",
421
- "d1ff84cb5591449abcc7dd3e37f9a2df",
422
- "f479a9629c414cb495a97b0741b0fe4b",
423
- "a41cf7f5121a4068842bb5c7d2bc4d62",
424
- "31b2d7d8d9054c8fb47bf1b58043aee1",
425
- "23c9da8dd7bf4be9a23357806ebfc036",
426
- "e084d47529ca4131b233ea3514a6344f",
427
- "a6e3c5ce0a3c49ffb3d7cbf92568fe47",
428
- "a1eca879a11f414f8173b0c2c260f4c3",
429
- "75130a60f93b49c8bee0986665121d02",
430
- "328cea1a2aac4fb58bceeaf126b99371",
431
- "662d61fdd89d434785e74a7038427fbc",
432
- "670d4f16a7e44144afc0ac70eea59325",
433
- "7f92331b29fd49a68815b6d7389c1005",
434
- "c710ba94fd65486cbcbe1d402919e27f",
435
- "5951d1bafdd548b6b835b28cf9960533",
436
- "adcffda7f78c4a1c8bdc6010c8704292",
437
- "2c37aaee1f524837b477dc584209733a",
438
- "0bee4735e017471fa8679ad984b88633"
439
- ]
440
  },
441
- "id": "0bDpNg9c6mUu",
442
- "outputId": "677d294f-2e37-48d5-bab0-6e21d1b4fe30"
443
- },
444
- "outputs": [
445
  {
446
- "data": {
447
- "application/vnd.jupyter.widget-view+json": {
448
- "model_id": "ad5d7b0bc9ad4e228b3bc76bc975cc47",
449
- "version_major": 2,
450
- "version_minor": 0
 
451
  },
452
- "text/plain": [
453
- "Downloading: 0%| | 0.00/1.16k [00:00<?, ?B/s]"
454
- ]
455
- },
456
- "metadata": {},
457
- "output_type": "display_data"
458
  },
459
  {
460
- "name": "stderr",
461
- "output_type": "stream",
462
- "text": [
463
- "Using custom data configuration hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852\n"
464
- ]
 
 
 
 
 
 
465
  },
466
  {
467
- "name": "stdout",
468
- "output_type": "stream",
469
- "text": [
470
- "Downloading and preparing dataset europarl_bilingual/en-sv (download: 151.40 MiB, generated: 278.82 MiB, post-processed: Unknown size, total: 430.21 MiB) to /root/.cache/huggingface/datasets/parquet/hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
471
- ]
 
 
 
 
472
  },
473
  {
474
- "data": {
475
- "application/vnd.jupyter.widget-view+json": {
476
- "model_id": "d722bbfffeaf4ea7a1060d10dc3a06db",
477
- "version_major": 2,
478
- "version_minor": 0
 
479
  },
480
- "text/plain": [
481
- " 0%| | 0/1 [00:00<?, ?it/s]"
482
- ]
483
- },
484
- "metadata": {},
485
- "output_type": "display_data"
486
  },
487
  {
488
- "data": {
489
- "application/vnd.jupyter.widget-view+json": {
490
- "model_id": "6a7e3547dc4141e7b5937f2baff58cbf",
491
- "version_major": 2,
492
- "version_minor": 0
 
 
493
  },
494
- "text/plain": [
495
- "Downloading: 0%| | 0.00/159M [00:00<?, ?B/s]"
496
- ]
497
- },
498
- "metadata": {},
499
- "output_type": "display_data"
500
  },
501
  {
502
- "data": {
503
- "application/vnd.jupyter.widget-view+json": {
504
- "model_id": "a1eca879a11f414f8173b0c2c260f4c3",
505
- "version_major": 2,
506
- "version_minor": 0
 
 
 
 
 
 
 
 
 
507
  },
508
- "text/plain": [
509
- " 0%| | 0/1 [00:00<?, ?it/s]"
510
- ]
511
- },
512
- "metadata": {},
513
- "output_type": "display_data"
514
  },
515
  {
516
- "name": "stdout",
517
- "output_type": "stream",
518
- "text": [
519
- "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/hf-test--swedish_corpora_parliament_processed-56ded20e2faa0852/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
520
- ]
521
- }
522
- ],
523
- "source": [
524
- "from datasets import load_dataset\n",
525
- "\n",
526
- "# change to your dataset path\n",
527
- "username = \"hf-test\" \n",
528
- "target_lang = \"sv\"\n",
529
- "\n",
530
- "dataset = load_dataset(f\"{username}/{target_lang}_corpora_parliament_processed\", split=\"train\")\n",
531
- "\n",
532
- "with open(\"text.txt\", \"w\") as file:\n",
533
- " file.write(\" \".join(dataset[\"text\"]))"
534
- ]
535
- },
536
- {
537
- "cell_type": "markdown",
538
- "metadata": {
539
- "id": "z8PqeGC17jD8"
540
- },
541
- "source": [
542
- "Train 5-gram language model"
543
- ]
544
- },
545
- {
546
- "cell_type": "code",
547
- "execution_count": 6,
548
- "metadata": {
549
- "colab": {
550
- "base_uri": "https://localhost:8080/"
551
  },
552
- "id": "_8KoINuj7h-1",
553
- "outputId": "26e0622d-6cb6-4329-e722-91ae9df263c7"
554
- },
555
- "outputs": [
556
  {
557
- "name": "stdout",
558
- "output_type": "stream",
559
- "text": [
560
- "=== 1/5 Counting and sorting n-grams ===\n",
561
- "Reading /content/text.txt\n",
562
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
563
- "tcmalloc: large alloc 1918697472 bytes == 0x5623caa4e000 @ 0x7fe627aa41e7 0x5623c8d517a2 0x5623c8cec51e 0x5623c8ccb2eb 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
564
- "tcmalloc: large alloc 8953896960 bytes == 0x56243d01e000 @ 0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccb308 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
565
- "****************************************************************************************************\n",
566
- "Unigram tokens 42153890 types 360209\n",
567
- "=== 2/5 Calculating and sorting adjusted counts ===\n",
568
- "Chain sizes: 1:4322508 2:1062773568 3:1992700672 4:3188320768 5:4649634816\n",
569
- "tcmalloc: large alloc 4649639936 bytes == 0x5623caa4e000 @ 0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccb8d7 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
570
- "tcmalloc: large alloc 1992704000 bytes == 0x56251f640000 @ 0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccbcdd 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
571
- "tcmalloc: large alloc 3188326400 bytes == 0x5626533e4000 @ 0x7fe627aa41e7 0x5623c8d517a2 0x5623c8d407ca 0x5623c8d41208 0x5623c8ccbcdd 0x5623c8cb7066 0x7fe625c3dbf7 0x5623c8cb8baa\n",
572
- "Statistics:\n",
573
- "1 360208 D1=0.686222 D2=1.01595 D3+=1.33685\n",
574
- "2 5476741 D1=0.761523 D2=1.06735 D3+=1.32559\n",
575
- "3 18177681 D1=0.839918 D2=1.12061 D3+=1.33794\n",
576
- "4 30374983 D1=0.909146 D2=1.20496 D3+=1.37235\n",
577
- "5 37231651 D1=0.944104 D2=1.25164 D3+=1.344\n",
578
- "Memory estimate for binary LM:\n",
579
- "type MB\n",
580
- "probing 1884 assuming -p 1.5\n",
581
- "probing 2195 assuming -r models -p 1.5\n",
582
- "trie 922 without quantization\n",
583
- "trie 518 assuming -q 8 -b 8 quantization \n",
584
- "trie 806 assuming -a 22 array pointer compression\n",
585
- "trie 401 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
586
- "=== 3/5 Calculating and sorting initial probabilities ===\n",
587
- "Chain sizes: 1:4322496 2:87627856 3:363553620 4:728999592 5:1042486228\n",
588
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
589
- "####################################################################################################\n",
590
- "=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
591
- "Chain sizes: 1:4322496 2:87627856 3:363553620 4:728999592 5:1042486228\n",
592
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
593
- "####################################################################################################\n",
594
- "=== 5/5 Writing ARPA model ===\n",
595
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
596
- "****************************************************************************************************\n",
597
- "Name:lmplz\tVmPeak:14181536 kB\tVmRSS:2199072 kB\tRSSMax:4117540 kB\tuser:125.411\tsys:25.1745\tCPU:150.586\treal:290.479\n"
598
- ]
599
- }
600
- ],
601
- "source": [
602
- "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
603
- ]
604
- },
605
- {
606
- "cell_type": "markdown",
607
- "metadata": {
608
- "id": "ZJ5OKh358nwR"
609
- },
610
- "source": [
611
- "Check head of file"
612
- ]
613
- },
614
- {
615
- "cell_type": "code",
616
- "execution_count": 7,
617
- "metadata": {
618
- "colab": {
619
- "base_uri": "https://localhost:8080/"
620
  },
621
- "id": "pv93ZCR68s4m",
622
- "outputId": "9489b8a8-789d-4779-85f4-f4aa4e0b3392"
623
- },
624
- "outputs": [
625
  {
626
- "name": "stdout",
627
- "output_type": "stream",
628
- "text": [
629
- "\\data\\\n",
630
- "ngram 1=360208\n",
631
- "ngram 2=5476741\n",
632
- "ngram 3=18177681\n",
633
- "ngram 4=30374983\n",
634
- "ngram 5=37231651\n",
635
- "\n",
636
- "\\1-grams:\n",
637
- "-6.770219\t<unk>\t0\n",
638
- "0\t<s>\t-0.11831701\n",
639
- "-4.6095004\tåterupptagande\t-1.2174699\n",
640
- "-2.2361007\tav\t-0.79668784\n",
641
- "-4.8163533\tsessionen\t-0.37327805\n",
642
- "-2.2251768\tjag\t-1.4205662\n",
643
- "-4.181505\tförklarar\t-0.56261665\n",
644
- "-3.5790775\teuropaparlamentets\t-0.63611007\n",
645
- "-4.771945\tsession\t-0.3647111\n",
646
- "-5.8043895\tåterupptagen\t-0.3058712\n",
647
- "-2.8580177\tefter\t-0.7557702\n",
648
- "-5.199537\tavbrottet\t-0.43322718\n"
649
- ]
650
- }
651
- ],
652
- "source": [
653
- "!head -20 5gram.arpa"
654
- ]
655
- },
656
- {
657
- "cell_type": "markdown",
658
- "metadata": {
659
- "id": "FEcPijF77mPY"
660
- },
661
- "source": [
662
- "Add end-of-sentence token \"\\</s>\" "
663
- ]
664
- },
665
- {
666
- "cell_type": "code",
667
- "execution_count": 8,
668
- "metadata": {
669
- "id": "Sktd-U5a7yZL"
670
- },
671
- "outputs": [],
672
- "source": [
673
- "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_sv_lm.arpa\", \"w\") as write_file:\n",
674
- " has_added_eos = False\n",
675
- " for line in read_file:\n",
676
- " if not has_added_eos and \"ngram 1=\" in line:\n",
677
- " count=line.strip().split(\"=\")[-1]\n",
678
- " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
679
- " elif not has_added_eos and \"<s>\" in line:\n",
680
- " write_file.write(line)\n",
681
- " write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
682
- " has_added_eos = True\n",
683
- " else:\n",
684
- " write_file.write(line)"
685
- ]
686
- },
687
- {
688
- "cell_type": "markdown",
689
- "metadata": {
690
- "id": "hqXHYY-K760Q"
691
- },
692
- "source": [
693
- "Check head of file"
694
- ]
695
- },
696
- {
697
- "cell_type": "code",
698
- "execution_count": 9,
699
- "metadata": {
700
- "colab": {
701
- "base_uri": "https://localhost:8080/"
702
  },
703
- "id": "0QuHk3AY8Hax",
704
- "outputId": "090d065f-95c7-48e5-bc0c-01069f69c619"
705
- },
706
- "outputs": [
707
  {
708
- "name": "stdout",
709
- "output_type": "stream",
710
- "text": [
711
- "\\data\\\n",
712
- "ngram 1=360209\n",
713
- "ngram 2=5476741\n",
714
- "ngram 3=18177681\n",
715
- "ngram 4=30374983\n",
716
- "ngram 5=37231651\n",
717
- "\n",
718
- "\\1-grams:\n",
719
- "-6.770219\t<unk>\t0\n",
720
- "0\t<s>\t-0.11831701\n",
721
- "0\t</s>\t-0.11831701\n",
722
- "-4.6095004\tåterupptagande\t-1.2174699\n",
723
- "-2.2361007\tav\t-0.79668784\n",
724
- "-4.8163533\tsessionen\t-0.37327805\n",
725
- "-2.2251768\tjag\t-1.4205662\n",
726
- "-4.181505\tförklarar\t-0.56261665\n",
727
- "-3.5790775\teuropaparlamentets\t-0.63611007\n",
728
- "-4.771945\tsession\t-0.3647111\n",
729
- "-5.8043895\tåterupptagen\t-0.3058712\n",
730
- "-2.8580177\tefter\t-0.7557702\n"
731
- ]
732
- }
733
- ],
734
- "source": [
735
- "!head -20 5gram_sv_lm.arpa"
736
- ]
737
- },
738
- {
739
- "cell_type": "markdown",
740
- "metadata": {
741
- "id": "kTvRntrZ9-uq"
742
- },
743
- "source": [
744
- "Compress arpa file by converting it to bin"
745
- ]
746
- },
747
- {
748
- "cell_type": "code",
749
- "execution_count": 11,
750
- "metadata": {
751
- "colab": {
752
- "base_uri": "https://localhost:8080/"
753
  },
754
- "id": "DnmOlNZ5-ClT",
755
- "outputId": "c380c05a-e335-4e9d-98b2-c015645a2d40"
756
- },
757
- "outputs": [
758
  {
759
- "name": "stdout",
760
- "output_type": "stream",
761
- "text": [
762
- "Reading 5gram_sv_lm.arpa\n",
763
- "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
764
- "****************************************************************************************************\n",
765
- "SUCCESS\n"
766
- ]
767
- }
768
- ],
769
- "source": [
770
- "!kenlm/build/bin/build_binary 5gram_sv_lm.arpa 5gram_sv_lm.bin"
771
- ]
772
- },
773
- {
774
- "cell_type": "markdown",
775
- "metadata": {
776
- "id": "Xra-pM-M8MZj"
777
- },
778
- "source": [
779
- "Download file to local machine (use Chrome if it fails on another browser)."
780
- ]
781
- },
782
- {
783
- "cell_type": "code",
784
- "execution_count": 12,
785
- "metadata": {
786
- "colab": {
787
- "base_uri": "https://localhost:8080/",
788
- "height": 34
789
  },
790
- "id": "M7b5x8Hr8Yuo",
791
- "outputId": "5fbedff6-4a41-47c5-903c-2ad3b59983e1"
792
- },
793
- "outputs": [
794
  {
795
- "data": {
796
- "application/javascript": [
797
- "\n",
798
- " async function download(id, filename, size) {\n",
799
- " if (!google.colab.kernel.accessAllowed) {\n",
800
- " return;\n",
801
- " }\n",
802
- " const div = document.createElement('div');\n",
803
- " const label = document.createElement('label');\n",
804
- " label.textContent = `Downloading \"${filename}\": `;\n",
805
- " div.appendChild(label);\n",
806
- " const progress = document.createElement('progress');\n",
807
- " progress.max = size;\n",
808
- " div.appendChild(progress);\n",
809
- " document.body.appendChild(div);\n",
810
- "\n",
811
- " const buffers = [];\n",
812
- " let downloaded = 0;\n",
813
- "\n",
814
- " const channel = await google.colab.kernel.comms.open(id);\n",
815
- " // Send a message to notify the kernel that we're ready.\n",
816
- " channel.send({})\n",
817
- "\n",
818
- " for await (const message of channel.messages) {\n",
819
- " // Send a message to notify the kernel that we're ready.\n",
820
- " channel.send({})\n",
821
- " if (message.buffers) {\n",
822
- " for (const buffer of message.buffers) {\n",
823
- " buffers.push(buffer);\n",
824
- " downloaded += buffer.byteLength;\n",
825
- " progress.value = downloaded;\n",
826
- " }\n",
827
- " }\n",
828
- " }\n",
829
- " const blob = new Blob(buffers, {type: 'application/binary'});\n",
830
- " const a = document.createElement('a');\n",
831
- " a.href = window.URL.createObjectURL(blob);\n",
832
- " a.download = filename;\n",
833
- " div.appendChild(a);\n",
834
- " a.click();\n",
835
- " div.remove();\n",
836
- " }\n",
837
- " "
838
  ],
839
- "text/plain": [
840
- "<IPython.core.display.Javascript object>"
841
- ]
842
- },
843
- "metadata": {},
844
- "output_type": "display_data"
845
  },
846
  {
847
- "data": {
848
- "application/javascript": [
849
- "download(\"download_82154b0d-c2b7-4fbd-8f04-e987f3406e7e\", \"5gram_sv_lm.bin\", 1981380707)"
 
 
 
 
 
850
  ],
851
- "text/plain": [
852
- "<IPython.core.display.Javascript object>"
853
- ]
854
- },
855
- "metadata": {},
856
- "output_type": "display_data"
857
- }
858
- ],
859
- "source": [
860
- "from google.colab import files\n",
861
- "files.download(\"5gram_sv_lm.bin\") "
862
- ]
863
- }
864
- ],
865
- "metadata": {
866
- "colab": {
867
- "name": "train_n-gram_lm_with_KenLM",
868
- "provenance": []
869
- },
870
- "kernelspec": {
871
- "display_name": "Python 3 (ipykernel)",
872
- "language": "python",
873
- "name": "python3"
874
- },
875
- "language_info": {
876
- "codemirror_mode": {
877
- "name": "ipython",
878
- "version": 3
879
- },
880
- "file_extension": ".py",
881
- "mimetype": "text/x-python",
882
- "name": "python",
883
- "nbconvert_exporter": "python",
884
- "pygments_lexer": "ipython3",
885
- "version": "3.8.6"
886
- },
887
- "widgets": {
888
- "application/vnd.jupyter.widget-state+json": {
889
- "033cb43d32314d279a7b9e1e86bbccdc": {
890
- "model_module": "@jupyter-widgets/base",
891
- "model_module_version": "1.2.0",
892
- "model_name": "LayoutModel",
893
- "state": {
894
- "_model_module": "@jupyter-widgets/base",
895
- "_model_module_version": "1.2.0",
896
- "_model_name": "LayoutModel",
897
- "_view_count": null,
898
- "_view_module": "@jupyter-widgets/base",
899
- "_view_module_version": "1.2.0",
900
- "_view_name": "LayoutView",
901
- "align_content": null,
902
- "align_items": null,
903
- "align_self": null,
904
- "border": null,
905
- "bottom": null,
906
- "display": null,
907
- "flex": null,
908
- "flex_flow": null,
909
- "grid_area": null,
910
- "grid_auto_columns": null,
911
- "grid_auto_flow": null,
912
- "grid_auto_rows": null,
913
- "grid_column": null,
914
- "grid_gap": null,
915
- "grid_row": null,
916
- "grid_template_areas": null,
917
- "grid_template_columns": null,
918
- "grid_template_rows": null,
919
- "height": null,
920
- "justify_content": null,
921
- "justify_items": null,
922
- "left": null,
923
- "margin": null,
924
- "max_height": null,
925
- "max_width": null,
926
- "min_height": null,
927
- "min_width": null,
928
- "object_fit": null,
929
- "object_position": null,
930
- "order": null,
931
- "overflow": null,
932
- "overflow_x": null,
933
- "overflow_y": null,
934
- "padding": null,
935
- "right": null,
936
- "top": null,
937
- "visibility": null,
938
- "width": null
939
- }
940
- },
941
- "0bee4735e017471fa8679ad984b88633": {
942
- "model_module": "@jupyter-widgets/base",
943
- "model_module_version": "1.2.0",
944
- "model_name": "LayoutModel",
945
- "state": {
946
- "_model_module": "@jupyter-widgets/base",
947
- "_model_module_version": "1.2.0",
948
- "_model_name": "LayoutModel",
949
- "_view_count": null,
950
- "_view_module": "@jupyter-widgets/base",
951
- "_view_module_version": "1.2.0",
952
- "_view_name": "LayoutView",
953
- "align_content": null,
954
- "align_items": null,
955
- "align_self": null,
956
- "border": null,
957
- "bottom": null,
958
- "display": null,
959
- "flex": null,
960
- "flex_flow": null,
961
- "grid_area": null,
962
- "grid_auto_columns": null,
963
- "grid_auto_flow": null,
964
- "grid_auto_rows": null,
965
- "grid_column": null,
966
- "grid_gap": null,
967
- "grid_row": null,
968
- "grid_template_areas": null,
969
- "grid_template_columns": null,
970
- "grid_template_rows": null,
971
- "height": null,
972
- "justify_content": null,
973
- "justify_items": null,
974
- "left": null,
975
- "margin": null,
976
- "max_height": null,
977
- "max_width": null,
978
- "min_height": null,
979
- "min_width": null,
980
- "object_fit": null,
981
- "object_position": null,
982
- "order": null,
983
- "overflow": null,
984
- "overflow_x": null,
985
- "overflow_y": null,
986
- "padding": null,
987
- "right": null,
988
- "top": null,
989
- "visibility": null,
990
- "width": null
991
- }
992
- },
993
- "1714ea91694842339756f26b2fa9c725": {
994
- "model_module": "@jupyter-widgets/controls",
995
- "model_module_version": "1.5.0",
996
- "model_name": "FloatProgressModel",
997
- "state": {
998
- "_dom_classes": [],
999
- "_model_module": "@jupyter-widgets/controls",
1000
- "_model_module_version": "1.5.0",
1001
- "_model_name": "FloatProgressModel",
1002
- "_view_count": null,
1003
- "_view_module": "@jupyter-widgets/controls",
1004
- "_view_module_version": "1.5.0",
1005
- "_view_name": "ProgressView",
1006
- "bar_style": "success",
1007
- "description": "",
1008
- "description_tooltip": null,
1009
- "layout": "IPY_MODEL_d3e6acd54d024d6791aab76232557721",
1010
- "max": 1157,
1011
- "min": 0,
1012
- "orientation": "horizontal",
1013
- "style": "IPY_MODEL_e3414cc0456241eca109f4e9e115d16a",
1014
- "value": 1157
1015
- }
1016
- },
1017
- "23c9da8dd7bf4be9a23357806ebfc036": {
1018
- "model_module": "@jupyter-widgets/base",
1019
- "model_module_version": "1.2.0",
1020
- "model_name": "LayoutModel",
1021
- "state": {
1022
- "_model_module": "@jupyter-widgets/base",
1023
- "_model_module_version": "1.2.0",
1024
- "_model_name": "LayoutModel",
1025
- "_view_count": null,
1026
- "_view_module": "@jupyter-widgets/base",
1027
- "_view_module_version": "1.2.0",
1028
- "_view_name": "LayoutView",
1029
- "align_content": null,
1030
- "align_items": null,
1031
- "align_self": null,
1032
- "border": null,
1033
- "bottom": null,
1034
- "display": null,
1035
- "flex": null,
1036
- "flex_flow": null,
1037
- "grid_area": null,
1038
- "grid_auto_columns": null,
1039
- "grid_auto_flow": null,
1040
- "grid_auto_rows": null,
1041
- "grid_column": null,
1042
- "grid_gap": null,
1043
- "grid_row": null,
1044
- "grid_template_areas": null,
1045
- "grid_template_columns": null,
1046
- "grid_template_rows": null,
1047
- "height": null,
1048
- "justify_content": null,
1049
- "justify_items": null,
1050
- "left": null,
1051
- "margin": null,
1052
- "max_height": null,
1053
- "max_width": null,
1054
- "min_height": null,
1055
- "min_width": null,
1056
- "object_fit": null,
1057
- "object_position": null,
1058
- "order": null,
1059
- "overflow": null,
1060
- "overflow_x": null,
1061
- "overflow_y": null,
1062
- "padding": null,
1063
- "right": null,
1064
- "top": null,
1065
- "visibility": null,
1066
- "width": null
1067
- }
1068
- },
1069
- "2c37aaee1f524837b477dc584209733a": {
1070
- "model_module": "@jupyter-widgets/controls",
1071
- "model_module_version": "1.5.0",
1072
- "model_name": "DescriptionStyleModel",
1073
- "state": {
1074
- "_model_module": "@jupyter-widgets/controls",
1075
- "_model_module_version": "1.5.0",
1076
- "_model_name": "DescriptionStyleModel",
1077
- "_view_count": null,
1078
- "_view_module": "@jupyter-widgets/base",
1079
- "_view_module_version": "1.2.0",
1080
- "_view_name": "StyleView",
1081
- "description_width": ""
1082
- }
1083
- },
1084
- "31b2d7d8d9054c8fb47bf1b58043aee1": {
1085
- "model_module": "@jupyter-widgets/controls",
1086
- "model_module_version": "1.5.0",
1087
- "model_name": "ProgressStyleModel",
1088
- "state": {
1089
- "_model_module": "@jupyter-widgets/controls",
1090
- "_model_module_version": "1.5.0",
1091
- "_model_name": "ProgressStyleModel",
1092
- "_view_count": null,
1093
- "_view_module": "@jupyter-widgets/base",
1094
- "_view_module_version": "1.2.0",
1095
- "_view_name": "StyleView",
1096
- "bar_color": null,
1097
- "description_width": ""
1098
- }
1099
- },
1100
- "328cea1a2aac4fb58bceeaf126b99371": {
1101
- "model_module": "@jupyter-widgets/controls",
1102
- "model_module_version": "1.5.0",
1103
- "model_name": "HTMLModel",
1104
- "state": {
1105
- "_dom_classes": [],
1106
- "_model_module": "@jupyter-widgets/controls",
1107
- "_model_module_version": "1.5.0",
1108
- "_model_name": "HTMLModel",
1109
- "_view_count": null,
1110
- "_view_module": "@jupyter-widgets/controls",
1111
- "_view_module_version": "1.5.0",
1112
- "_view_name": "HTMLView",
1113
- "description": "",
1114
- "description_tooltip": null,
1115
- "layout": "IPY_MODEL_c710ba94fd65486cbcbe1d402919e27f",
1116
- "placeholder": "​",
1117
- "style": "IPY_MODEL_7f92331b29fd49a68815b6d7389c1005",
1118
- "value": "100%"
1119
- }
1120
- },
1121
- "34c5f87238cb4f13a03b207aa7dc1d18": {
1122
- "model_module": "@jupyter-widgets/base",
1123
- "model_module_version": "1.2.0",
1124
- "model_name": "LayoutModel",
1125
- "state": {
1126
- "_model_module": "@jupyter-widgets/base",
1127
- "_model_module_version": "1.2.0",
1128
- "_model_name": "LayoutModel",
1129
- "_view_count": null,
1130
- "_view_module": "@jupyter-widgets/base",
1131
- "_view_module_version": "1.2.0",
1132
- "_view_name": "LayoutView",
1133
- "align_content": null,
1134
- "align_items": null,
1135
- "align_self": null,
1136
- "border": null,
1137
- "bottom": null,
1138
- "display": null,
1139
- "flex": null,
1140
- "flex_flow": null,
1141
- "grid_area": null,
1142
- "grid_auto_columns": null,
1143
- "grid_auto_flow": null,
1144
- "grid_auto_rows": null,
1145
- "grid_column": null,
1146
- "grid_gap": null,
1147
- "grid_row": null,
1148
- "grid_template_areas": null,
1149
- "grid_template_columns": null,
1150
- "grid_template_rows": null,
1151
- "height": null,
1152
- "justify_content": null,
1153
- "justify_items": null,
1154
- "left": null,
1155
- "margin": null,
1156
- "max_height": null,
1157
- "max_width": null,
1158
- "min_height": null,
1159
- "min_width": null,
1160
- "object_fit": null,
1161
- "object_position": null,
1162
- "order": null,
1163
- "overflow": null,
1164
- "overflow_x": null,
1165
- "overflow_y": null,
1166
- "padding": null,
1167
- "right": null,
1168
- "top": null,
1169
- "visibility": null,
1170
- "width": null
1171
- }
1172
- },
1173
- "34d9460b112c419885bbff5211674cb3": {
1174
- "model_module": "@jupyter-widgets/controls",
1175
- "model_module_version": "1.5.0",
1176
- "model_name": "DescriptionStyleModel",
1177
- "state": {
1178
- "_model_module": "@jupyter-widgets/controls",
1179
- "_model_module_version": "1.5.0",
1180
- "_model_name": "DescriptionStyleModel",
1181
- "_view_count": null,
1182
- "_view_module": "@jupyter-widgets/base",
1183
- "_view_module_version": "1.2.0",
1184
- "_view_name": "StyleView",
1185
- "description_width": ""
1186
- }
1187
- },
1188
- "4958b4c72d0c48af9a77974fc4ed449c": {
1189
- "model_module": "@jupyter-widgets/base",
1190
- "model_module_version": "1.2.0",
1191
- "model_name": "LayoutModel",
1192
- "state": {
1193
- "_model_module": "@jupyter-widgets/base",
1194
- "_model_module_version": "1.2.0",
1195
- "_model_name": "LayoutModel",
1196
- "_view_count": null,
1197
- "_view_module": "@jupyter-widgets/base",
1198
- "_view_module_version": "1.2.0",
1199
- "_view_name": "LayoutView",
1200
- "align_content": null,
1201
- "align_items": null,
1202
- "align_self": null,
1203
- "border": null,
1204
- "bottom": null,
1205
- "display": null,
1206
- "flex": null,
1207
- "flex_flow": null,
1208
- "grid_area": null,
1209
- "grid_auto_columns": null,
1210
- "grid_auto_flow": null,
1211
- "grid_auto_rows": null,
1212
- "grid_column": null,
1213
- "grid_gap": null,
1214
- "grid_row": null,
1215
- "grid_template_areas": null,
1216
- "grid_template_columns": null,
1217
- "grid_template_rows": null,
1218
- "height": null,
1219
- "justify_content": null,
1220
- "justify_items": null,
1221
- "left": null,
1222
- "margin": null,
1223
- "max_height": null,
1224
- "max_width": null,
1225
- "min_height": null,
1226
- "min_width": null,
1227
- "object_fit": null,
1228
- "object_position": null,
1229
- "order": null,
1230
- "overflow": null,
1231
- "overflow_x": null,
1232
- "overflow_y": null,
1233
- "padding": null,
1234
- "right": null,
1235
- "top": null,
1236
- "visibility": null,
1237
- "width": null
1238
- }
1239
- },
1240
- "52a852c0f98c49aa9e5edfdd4f91e4ca": {
1241
- "model_module": "@jupyter-widgets/controls",
1242
- "model_module_version": "1.5.0",
1243
- "model_name": "FloatProgressModel",
1244
- "state": {
1245
- "_dom_classes": [],
1246
- "_model_module": "@jupyter-widgets/controls",
1247
- "_model_module_version": "1.5.0",
1248
- "_model_name": "FloatProgressModel",
1249
- "_view_count": null,
1250
- "_view_module": "@jupyter-widgets/controls",
1251
- "_view_module_version": "1.5.0",
1252
- "_view_name": "ProgressView",
1253
- "bar_style": "success",
1254
- "description": "",
1255
- "description_tooltip": null,
1256
- "layout": "IPY_MODEL_23c9da8dd7bf4be9a23357806ebfc036",
1257
- "max": 158752204,
1258
- "min": 0,
1259
- "orientation": "horizontal",
1260
- "style": "IPY_MODEL_31b2d7d8d9054c8fb47bf1b58043aee1",
1261
- "value": 158752204
1262
- }
1263
- },
1264
- "5925567ffea2436691c4ed3b7b147c17": {
1265
- "model_module": "@jupyter-widgets/base",
1266
- "model_module_version": "1.2.0",
1267
- "model_name": "LayoutModel",
1268
- "state": {
1269
- "_model_module": "@jupyter-widgets/base",
1270
- "_model_module_version": "1.2.0",
1271
- "_model_name": "LayoutModel",
1272
- "_view_count": null,
1273
- "_view_module": "@jupyter-widgets/base",
1274
- "_view_module_version": "1.2.0",
1275
- "_view_name": "LayoutView",
1276
- "align_content": null,
1277
- "align_items": null,
1278
- "align_self": null,
1279
- "border": null,
1280
- "bottom": null,
1281
- "display": null,
1282
- "flex": null,
1283
- "flex_flow": null,
1284
- "grid_area": null,
1285
- "grid_auto_columns": null,
1286
- "grid_auto_flow": null,
1287
- "grid_auto_rows": null,
1288
- "grid_column": null,
1289
- "grid_gap": null,
1290
- "grid_row": null,
1291
- "grid_template_areas": null,
1292
- "grid_template_columns": null,
1293
- "grid_template_rows": null,
1294
- "height": null,
1295
- "justify_content": null,
1296
- "justify_items": null,
1297
- "left": null,
1298
- "margin": null,
1299
- "max_height": null,
1300
- "max_width": null,
1301
- "min_height": null,
1302
- "min_width": null,
1303
- "object_fit": null,
1304
- "object_position": null,
1305
- "order": null,
1306
- "overflow": null,
1307
- "overflow_x": null,
1308
- "overflow_y": null,
1309
- "padding": null,
1310
- "right": null,
1311
- "top": null,
1312
- "visibility": null,
1313
- "width": null
1314
- }
1315
- },
1316
- "5951d1bafdd548b6b835b28cf9960533": {
1317
- "model_module": "@jupyter-widgets/controls",
1318
- "model_module_version": "1.5.0",
1319
- "model_name": "ProgressStyleModel",
1320
- "state": {
1321
- "_model_module": "@jupyter-widgets/controls",
1322
- "_model_module_version": "1.5.0",
1323
- "_model_name": "ProgressStyleModel",
1324
- "_view_count": null,
1325
- "_view_module": "@jupyter-widgets/base",
1326
- "_view_module_version": "1.2.0",
1327
- "_view_name": "StyleView",
1328
- "bar_color": null,
1329
- "description_width": ""
1330
- }
1331
- },
1332
- "5a9c9d4b60e54a3bb64c576707bd9736": {
1333
- "model_module": "@jupyter-widgets/controls",
1334
- "model_module_version": "1.5.0",
1335
- "model_name": "DescriptionStyleModel",
1336
- "state": {
1337
- "_model_module": "@jupyter-widgets/controls",
1338
- "_model_module_version": "1.5.0",
1339
- "_model_name": "DescriptionStyleModel",
1340
- "_view_count": null,
1341
- "_view_module": "@jupyter-widgets/base",
1342
- "_view_module_version": "1.2.0",
1343
- "_view_name": "StyleView",
1344
- "description_width": ""
1345
- }
1346
- },
1347
- "5e469744bf6a4813983ae8ee727c1c5e": {
1348
- "model_module": "@jupyter-widgets/controls",
1349
- "model_module_version": "1.5.0",
1350
- "model_name": "DescriptionStyleModel",
1351
- "state": {
1352
- "_model_module": "@jupyter-widgets/controls",
1353
- "_model_module_version": "1.5.0",
1354
- "_model_name": "DescriptionStyleModel",
1355
- "_view_count": null,
1356
- "_view_module": "@jupyter-widgets/base",
1357
- "_view_module_version": "1.2.0",
1358
- "_view_name": "StyleView",
1359
- "description_width": ""
1360
- }
1361
- },
1362
- "662d61fdd89d434785e74a7038427fbc": {
1363
- "model_module": "@jupyter-widgets/controls",
1364
- "model_module_version": "1.5.0",
1365
- "model_name": "FloatProgressModel",
1366
- "state": {
1367
- "_dom_classes": [],
1368
- "_model_module": "@jupyter-widgets/controls",
1369
- "_model_module_version": "1.5.0",
1370
- "_model_name": "FloatProgressModel",
1371
- "_view_count": null,
1372
- "_view_module": "@jupyter-widgets/controls",
1373
- "_view_module_version": "1.5.0",
1374
- "_view_name": "ProgressView",
1375
- "bar_style": "success",
1376
- "description": "",
1377
- "description_tooltip": null,
1378
- "layout": "IPY_MODEL_adcffda7f78c4a1c8bdc6010c8704292",
1379
- "max": 1,
1380
- "min": 0,
1381
- "orientation": "horizontal",
1382
- "style": "IPY_MODEL_5951d1bafdd548b6b835b28cf9960533",
1383
- "value": 1
1384
- }
1385
- },
1386
- "670d4f16a7e44144afc0ac70eea59325": {
1387
- "model_module": "@jupyter-widgets/controls",
1388
- "model_module_version": "1.5.0",
1389
- "model_name": "HTMLModel",
1390
- "state": {
1391
- "_dom_classes": [],
1392
- "_model_module": "@jupyter-widgets/controls",
1393
- "_model_module_version": "1.5.0",
1394
- "_model_name": "HTMLModel",
1395
- "_view_count": null,
1396
- "_view_module": "@jupyter-widgets/controls",
1397
- "_view_module_version": "1.5.0",
1398
- "_view_name": "HTMLView",
1399
- "description": "",
1400
- "description_tooltip": null,
1401
- "layout": "IPY_MODEL_0bee4735e017471fa8679ad984b88633",
1402
- "placeholder": "​",
1403
- "style": "IPY_MODEL_2c37aaee1f524837b477dc584209733a",
1404
- "value": " 1/1 [00:00&lt;00:00, 21.15it/s]"
1405
- }
1406
  },
1407
- "6a7e3547dc4141e7b5937f2baff58cbf": {
1408
- "model_module": "@jupyter-widgets/controls",
1409
- "model_module_version": "1.5.0",
1410
- "model_name": "HBoxModel",
1411
- "state": {
1412
- "_dom_classes": [],
1413
- "_model_module": "@jupyter-widgets/controls",
1414
- "_model_module_version": "1.5.0",
1415
- "_model_name": "HBoxModel",
1416
- "_view_count": null,
1417
- "_view_module": "@jupyter-widgets/controls",
1418
- "_view_module_version": "1.5.0",
1419
- "_view_name": "HBoxView",
1420
- "box_style": "",
1421
- "children": [
1422
- "IPY_MODEL_e33033ecda374ed4966ae5fccf6efe37",
1423
- "IPY_MODEL_52a852c0f98c49aa9e5edfdd4f91e4ca",
1424
- "IPY_MODEL_d1ff84cb5591449abcc7dd3e37f9a2df"
1425
  ],
1426
- "layout": "IPY_MODEL_921a3c1f50a24979838fd560c2cea9e0"
1427
- }
1428
- },
1429
- "6b43ea2d93c04965a4539b3ef839893b": {
1430
- "model_module": "@jupyter-widgets/controls",
1431
- "model_module_version": "1.5.0",
1432
- "model_name": "DescriptionStyleModel",
1433
- "state": {
1434
- "_model_module": "@jupyter-widgets/controls",
1435
- "_model_module_version": "1.5.0",
1436
- "_model_name": "DescriptionStyleModel",
1437
- "_view_count": null,
1438
- "_view_module": "@jupyter-widgets/base",
1439
- "_view_module_version": "1.2.0",
1440
- "_view_name": "StyleView",
1441
- "description_width": ""
1442
- }
1443
- },
1444
- "75130a60f93b49c8bee0986665121d02": {
1445
- "model_module": "@jupyter-widgets/base",
1446
- "model_module_version": "1.2.0",
1447
- "model_name": "LayoutModel",
1448
- "state": {
1449
- "_model_module": "@jupyter-widgets/base",
1450
- "_model_module_version": "1.2.0",
1451
- "_model_name": "LayoutModel",
1452
- "_view_count": null,
1453
- "_view_module": "@jupyter-widgets/base",
1454
- "_view_module_version": "1.2.0",
1455
- "_view_name": "LayoutView",
1456
- "align_content": null,
1457
- "align_items": null,
1458
- "align_self": null,
1459
- "border": null,
1460
- "bottom": null,
1461
- "display": null,
1462
- "flex": null,
1463
- "flex_flow": null,
1464
- "grid_area": null,
1465
- "grid_auto_columns": null,
1466
- "grid_auto_flow": null,
1467
- "grid_auto_rows": null,
1468
- "grid_column": null,
1469
- "grid_gap": null,
1470
- "grid_row": null,
1471
- "grid_template_areas": null,
1472
- "grid_template_columns": null,
1473
- "grid_template_rows": null,
1474
- "height": null,
1475
- "justify_content": null,
1476
- "justify_items": null,
1477
- "left": null,
1478
- "margin": null,
1479
- "max_height": null,
1480
- "max_width": null,
1481
- "min_height": null,
1482
- "min_width": null,
1483
- "object_fit": null,
1484
- "object_position": null,
1485
- "order": null,
1486
- "overflow": null,
1487
- "overflow_x": null,
1488
- "overflow_y": null,
1489
- "padding": null,
1490
- "right": null,
1491
- "top": null,
1492
- "visibility": null,
1493
- "width": null
1494
- }
1495
- },
1496
- "789d5845a82e48fe9c629af743b5b1f0": {
1497
- "model_module": "@jupyter-widgets/base",
1498
- "model_module_version": "1.2.0",
1499
- "model_name": "LayoutModel",
1500
- "state": {
1501
- "_model_module": "@jupyter-widgets/base",
1502
- "_model_module_version": "1.2.0",
1503
- "_model_name": "LayoutModel",
1504
- "_view_count": null,
1505
- "_view_module": "@jupyter-widgets/base",
1506
- "_view_module_version": "1.2.0",
1507
- "_view_name": "LayoutView",
1508
- "align_content": null,
1509
- "align_items": null,
1510
- "align_self": null,
1511
- "border": null,
1512
- "bottom": null,
1513
- "display": null,
1514
- "flex": null,
1515
- "flex_flow": null,
1516
- "grid_area": null,
1517
- "grid_auto_columns": null,
1518
- "grid_auto_flow": null,
1519
- "grid_auto_rows": null,
1520
- "grid_column": null,
1521
- "grid_gap": null,
1522
- "grid_row": null,
1523
- "grid_template_areas": null,
1524
- "grid_template_columns": null,
1525
- "grid_template_rows": null,
1526
- "height": null,
1527
- "justify_content": null,
1528
- "justify_items": null,
1529
- "left": null,
1530
- "margin": null,
1531
- "max_height": null,
1532
- "max_width": null,
1533
- "min_height": null,
1534
- "min_width": null,
1535
- "object_fit": null,
1536
- "object_position": null,
1537
- "order": null,
1538
- "overflow": null,
1539
- "overflow_x": null,
1540
- "overflow_y": null,
1541
- "padding": null,
1542
- "right": null,
1543
- "top": null,
1544
- "visibility": null,
1545
- "width": null
1546
- }
1547
- },
1548
- "799acae3451445f0a3616b8932f2e3f3": {
1549
- "model_module": "@jupyter-widgets/controls",
1550
- "model_module_version": "1.5.0",
1551
- "model_name": "HTMLModel",
1552
- "state": {
1553
- "_dom_classes": [],
1554
- "_model_module": "@jupyter-widgets/controls",
1555
- "_model_module_version": "1.5.0",
1556
- "_model_name": "HTMLModel",
1557
- "_view_count": null,
1558
- "_view_module": "@jupyter-widgets/controls",
1559
- "_view_module_version": "1.5.0",
1560
- "_view_name": "HTMLView",
1561
- "description": "",
1562
- "description_tooltip": null,
1563
- "layout": "IPY_MODEL_789d5845a82e48fe9c629af743b5b1f0",
1564
- "placeholder": "​",
1565
- "style": "IPY_MODEL_5a9c9d4b60e54a3bb64c576707bd9736",
1566
- "value": "Downloading: 100%"
1567
- }
1568
- },
1569
- "7f92331b29fd49a68815b6d7389c1005": {
1570
- "model_module": "@jupyter-widgets/controls",
1571
- "model_module_version": "1.5.0",
1572
- "model_name": "DescriptionStyleModel",
1573
- "state": {
1574
- "_model_module": "@jupyter-widgets/controls",
1575
- "_model_module_version": "1.5.0",
1576
- "_model_name": "DescriptionStyleModel",
1577
- "_view_count": null,
1578
- "_view_module": "@jupyter-widgets/base",
1579
- "_view_module_version": "1.2.0",
1580
- "_view_name": "StyleView",
1581
- "description_width": ""
1582
- }
1583
- },
1584
- "82692c41501c487fad27c6b19836f46f": {
1585
- "model_module": "@jupyter-widgets/controls",
1586
- "model_module_version": "1.5.0",
1587
- "model_name": "FloatProgressModel",
1588
- "state": {
1589
- "_dom_classes": [],
1590
- "_model_module": "@jupyter-widgets/controls",
1591
- "_model_module_version": "1.5.0",
1592
- "_model_name": "FloatProgressModel",
1593
- "_view_count": null,
1594
- "_view_module": "@jupyter-widgets/controls",
1595
- "_view_module_version": "1.5.0",
1596
- "_view_name": "ProgressView",
1597
- "bar_style": "success",
1598
- "description": "",
1599
- "description_tooltip": null,
1600
- "layout": "IPY_MODEL_db7ee45589e04749b80376e25ee377bb",
1601
- "max": 1,
1602
- "min": 0,
1603
- "orientation": "horizontal",
1604
- "style": "IPY_MODEL_c7955974289a4f448b422d7e4640131a",
1605
- "value": 1
1606
- }
1607
- },
1608
- "921a3c1f50a24979838fd560c2cea9e0": {
1609
- "model_module": "@jupyter-widgets/base",
1610
- "model_module_version": "1.2.0",
1611
- "model_name": "LayoutModel",
1612
- "state": {
1613
- "_model_module": "@jupyter-widgets/base",
1614
- "_model_module_version": "1.2.0",
1615
- "_model_name": "LayoutModel",
1616
- "_view_count": null,
1617
- "_view_module": "@jupyter-widgets/base",
1618
- "_view_module_version": "1.2.0",
1619
- "_view_name": "LayoutView",
1620
- "align_content": null,
1621
- "align_items": null,
1622
- "align_self": null,
1623
- "border": null,
1624
- "bottom": null,
1625
- "display": null,
1626
- "flex": null,
1627
- "flex_flow": null,
1628
- "grid_area": null,
1629
- "grid_auto_columns": null,
1630
- "grid_auto_flow": null,
1631
- "grid_auto_rows": null,
1632
- "grid_column": null,
1633
- "grid_gap": null,
1634
- "grid_row": null,
1635
- "grid_template_areas": null,
1636
- "grid_template_columns": null,
1637
- "grid_template_rows": null,
1638
- "height": null,
1639
- "justify_content": null,
1640
- "justify_items": null,
1641
- "left": null,
1642
- "margin": null,
1643
- "max_height": null,
1644
- "max_width": null,
1645
- "min_height": null,
1646
- "min_width": null,
1647
- "object_fit": null,
1648
- "object_position": null,
1649
- "order": null,
1650
- "overflow": null,
1651
- "overflow_x": null,
1652
- "overflow_y": null,
1653
- "padding": null,
1654
- "right": null,
1655
- "top": null,
1656
- "visibility": null,
1657
- "width": null
1658
- }
1659
  },
1660
- "a1eca879a11f414f8173b0c2c260f4c3": {
1661
- "model_module": "@jupyter-widgets/controls",
1662
- "model_module_version": "1.5.0",
1663
- "model_name": "HBoxModel",
1664
- "state": {
1665
- "_dom_classes": [],
1666
- "_model_module": "@jupyter-widgets/controls",
1667
- "_model_module_version": "1.5.0",
1668
- "_model_name": "HBoxModel",
1669
- "_view_count": null,
1670
- "_view_module": "@jupyter-widgets/controls",
1671
- "_view_module_version": "1.5.0",
1672
- "_view_name": "HBoxView",
1673
- "box_style": "",
1674
- "children": [
1675
- "IPY_MODEL_328cea1a2aac4fb58bceeaf126b99371",
1676
- "IPY_MODEL_662d61fdd89d434785e74a7038427fbc",
1677
- "IPY_MODEL_670d4f16a7e44144afc0ac70eea59325"
1678
  ],
1679
- "layout": "IPY_MODEL_75130a60f93b49c8bee0986665121d02"
1680
- }
1681
- },
1682
- "a41cf7f5121a4068842bb5c7d2bc4d62": {
1683
- "model_module": "@jupyter-widgets/base",
1684
- "model_module_version": "1.2.0",
1685
- "model_name": "LayoutModel",
1686
- "state": {
1687
- "_model_module": "@jupyter-widgets/base",
1688
- "_model_module_version": "1.2.0",
1689
- "_model_name": "LayoutModel",
1690
- "_view_count": null,
1691
- "_view_module": "@jupyter-widgets/base",
1692
- "_view_module_version": "1.2.0",
1693
- "_view_name": "LayoutView",
1694
- "align_content": null,
1695
- "align_items": null,
1696
- "align_self": null,
1697
- "border": null,
1698
- "bottom": null,
1699
- "display": null,
1700
- "flex": null,
1701
- "flex_flow": null,
1702
- "grid_area": null,
1703
- "grid_auto_columns": null,
1704
- "grid_auto_flow": null,
1705
- "grid_auto_rows": null,
1706
- "grid_column": null,
1707
- "grid_gap": null,
1708
- "grid_row": null,
1709
- "grid_template_areas": null,
1710
- "grid_template_columns": null,
1711
- "grid_template_rows": null,
1712
- "height": null,
1713
- "justify_content": null,
1714
- "justify_items": null,
1715
- "left": null,
1716
- "margin": null,
1717
- "max_height": null,
1718
- "max_width": null,
1719
- "min_height": null,
1720
- "min_width": null,
1721
- "object_fit": null,
1722
- "object_position": null,
1723
- "order": null,
1724
- "overflow": null,
1725
- "overflow_x": null,
1726
- "overflow_y": null,
1727
- "padding": null,
1728
- "right": null,
1729
- "top": null,
1730
- "visibility": null,
1731
- "width": null
1732
- }
1733
- },
1734
- "a4b5b93b88f549e8a4f37f3d48834ca9": {
1735
- "model_module": "@jupyter-widgets/controls",
1736
- "model_module_version": "1.5.0",
1737
- "model_name": "HTMLModel",
1738
- "state": {
1739
- "_dom_classes": [],
1740
- "_model_module": "@jupyter-widgets/controls",
1741
- "_model_module_version": "1.5.0",
1742
- "_model_name": "HTMLModel",
1743
- "_view_count": null,
1744
- "_view_module": "@jupyter-widgets/controls",
1745
- "_view_module_version": "1.5.0",
1746
- "_view_name": "HTMLView",
1747
- "description": "",
1748
- "description_tooltip": null,
1749
- "layout": "IPY_MODEL_34c5f87238cb4f13a03b207aa7dc1d18",
1750
- "placeholder": "​",
1751
- "style": "IPY_MODEL_5e469744bf6a4813983ae8ee727c1c5e",
1752
- "value": "100%"
1753
- }
1754
- },
1755
- "a6e3c5ce0a3c49ffb3d7cbf92568fe47": {
1756
- "model_module": "@jupyter-widgets/base",
1757
- "model_module_version": "1.2.0",
1758
- "model_name": "LayoutModel",
1759
- "state": {
1760
- "_model_module": "@jupyter-widgets/base",
1761
- "_model_module_version": "1.2.0",
1762
- "_model_name": "LayoutModel",
1763
- "_view_count": null,
1764
- "_view_module": "@jupyter-widgets/base",
1765
- "_view_module_version": "1.2.0",
1766
- "_view_name": "LayoutView",
1767
- "align_content": null,
1768
- "align_items": null,
1769
- "align_self": null,
1770
- "border": null,
1771
- "bottom": null,
1772
- "display": null,
1773
- "flex": null,
1774
- "flex_flow": null,
1775
- "grid_area": null,
1776
- "grid_auto_columns": null,
1777
- "grid_auto_flow": null,
1778
- "grid_auto_rows": null,
1779
- "grid_column": null,
1780
- "grid_gap": null,
1781
- "grid_row": null,
1782
- "grid_template_areas": null,
1783
- "grid_template_columns": null,
1784
- "grid_template_rows": null,
1785
- "height": null,
1786
- "justify_content": null,
1787
- "justify_items": null,
1788
- "left": null,
1789
- "margin": null,
1790
- "max_height": null,
1791
- "max_width": null,
1792
- "min_height": null,
1793
- "min_width": null,
1794
- "object_fit": null,
1795
- "object_position": null,
1796
- "order": null,
1797
- "overflow": null,
1798
- "overflow_x": null,
1799
- "overflow_y": null,
1800
- "padding": null,
1801
- "right": null,
1802
- "top": null,
1803
- "visibility": null,
1804
- "width": null
1805
- }
1806
  },
1807
- "ad5d7b0bc9ad4e228b3bc76bc975cc47": {
1808
- "model_module": "@jupyter-widgets/controls",
1809
- "model_module_version": "1.5.0",
1810
- "model_name": "HBoxModel",
1811
- "state": {
1812
- "_dom_classes": [],
1813
- "_model_module": "@jupyter-widgets/controls",
1814
- "_model_module_version": "1.5.0",
1815
- "_model_name": "HBoxModel",
1816
- "_view_count": null,
1817
- "_view_module": "@jupyter-widgets/controls",
1818
- "_view_module_version": "1.5.0",
1819
- "_view_name": "HBoxView",
1820
- "box_style": "",
1821
- "children": [
1822
- "IPY_MODEL_799acae3451445f0a3616b8932f2e3f3",
1823
- "IPY_MODEL_1714ea91694842339756f26b2fa9c725",
1824
- "IPY_MODEL_b5d6b069468246abbb3207f3df6f9dde"
1825
  ],
1826
- "layout": "IPY_MODEL_5925567ffea2436691c4ed3b7b147c17"
1827
- }
1828
- },
1829
- "adcffda7f78c4a1c8bdc6010c8704292": {
1830
- "model_module": "@jupyter-widgets/base",
1831
- "model_module_version": "1.2.0",
1832
- "model_name": "LayoutModel",
1833
- "state": {
1834
- "_model_module": "@jupyter-widgets/base",
1835
- "_model_module_version": "1.2.0",
1836
- "_model_name": "LayoutModel",
1837
- "_view_count": null,
1838
- "_view_module": "@jupyter-widgets/base",
1839
- "_view_module_version": "1.2.0",
1840
- "_view_name": "LayoutView",
1841
- "align_content": null,
1842
- "align_items": null,
1843
- "align_self": null,
1844
- "border": null,
1845
- "bottom": null,
1846
- "display": null,
1847
- "flex": null,
1848
- "flex_flow": null,
1849
- "grid_area": null,
1850
- "grid_auto_columns": null,
1851
- "grid_auto_flow": null,
1852
- "grid_auto_rows": null,
1853
- "grid_column": null,
1854
- "grid_gap": null,
1855
- "grid_row": null,
1856
- "grid_template_areas": null,
1857
- "grid_template_columns": null,
1858
- "grid_template_rows": null,
1859
- "height": null,
1860
- "justify_content": null,
1861
- "justify_items": null,
1862
- "left": null,
1863
- "margin": null,
1864
- "max_height": null,
1865
- "max_width": null,
1866
- "min_height": null,
1867
- "min_width": null,
1868
- "object_fit": null,
1869
- "object_position": null,
1870
- "order": null,
1871
- "overflow": null,
1872
- "overflow_x": null,
1873
- "overflow_y": null,
1874
- "padding": null,
1875
- "right": null,
1876
- "top": null,
1877
- "visibility": null,
1878
- "width": null
1879
- }
1880
- },
1881
- "af8f433ef2f540c9bd70d14421904d83": {
1882
- "model_module": "@jupyter-widgets/controls",
1883
- "model_module_version": "1.5.0",
1884
- "model_name": "HTMLModel",
1885
- "state": {
1886
- "_dom_classes": [],
1887
- "_model_module": "@jupyter-widgets/controls",
1888
- "_model_module_version": "1.5.0",
1889
- "_model_name": "HTMLModel",
1890
- "_view_count": null,
1891
- "_view_module": "@jupyter-widgets/controls",
1892
- "_view_module_version": "1.5.0",
1893
- "_view_name": "HTMLView",
1894
- "description": "",
1895
- "description_tooltip": null,
1896
- "layout": "IPY_MODEL_033cb43d32314d279a7b9e1e86bbccdc",
1897
- "placeholder": "​",
1898
- "style": "IPY_MODEL_34d9460b112c419885bbff5211674cb3",
1899
- "value": " 1/1 [00:05&lt;00:00, 5.67s/it]"
1900
- }
1901
- },
1902
- "b5d6b069468246abbb3207f3df6f9dde": {
1903
- "model_module": "@jupyter-widgets/controls",
1904
- "model_module_version": "1.5.0",
1905
- "model_name": "HTMLModel",
1906
- "state": {
1907
- "_dom_classes": [],
1908
- "_model_module": "@jupyter-widgets/controls",
1909
- "_model_module_version": "1.5.0",
1910
- "_model_name": "HTMLModel",
1911
- "_view_count": null,
1912
- "_view_module": "@jupyter-widgets/controls",
1913
- "_view_module_version": "1.5.0",
1914
- "_view_name": "HTMLView",
1915
- "description": "",
1916
- "description_tooltip": null,
1917
- "layout": "IPY_MODEL_4958b4c72d0c48af9a77974fc4ed449c",
1918
- "placeholder": "​",
1919
- "style": "IPY_MODEL_6b43ea2d93c04965a4539b3ef839893b",
1920
- "value": " 1.16k/1.16k [00:00&lt;00:00, 24.4kB/s]"
1921
- }
1922
- },
1923
- "c710ba94fd65486cbcbe1d402919e27f": {
1924
- "model_module": "@jupyter-widgets/base",
1925
- "model_module_version": "1.2.0",
1926
- "model_name": "LayoutModel",
1927
- "state": {
1928
- "_model_module": "@jupyter-widgets/base",
1929
- "_model_module_version": "1.2.0",
1930
- "_model_name": "LayoutModel",
1931
- "_view_count": null,
1932
- "_view_module": "@jupyter-widgets/base",
1933
- "_view_module_version": "1.2.0",
1934
- "_view_name": "LayoutView",
1935
- "align_content": null,
1936
- "align_items": null,
1937
- "align_self": null,
1938
- "border": null,
1939
- "bottom": null,
1940
- "display": null,
1941
- "flex": null,
1942
- "flex_flow": null,
1943
- "grid_area": null,
1944
- "grid_auto_columns": null,
1945
- "grid_auto_flow": null,
1946
- "grid_auto_rows": null,
1947
- "grid_column": null,
1948
- "grid_gap": null,
1949
- "grid_row": null,
1950
- "grid_template_areas": null,
1951
- "grid_template_columns": null,
1952
- "grid_template_rows": null,
1953
- "height": null,
1954
- "justify_content": null,
1955
- "justify_items": null,
1956
- "left": null,
1957
- "margin": null,
1958
- "max_height": null,
1959
- "max_width": null,
1960
- "min_height": null,
1961
- "min_width": null,
1962
- "object_fit": null,
1963
- "object_position": null,
1964
- "order": null,
1965
- "overflow": null,
1966
- "overflow_x": null,
1967
- "overflow_y": null,
1968
- "padding": null,
1969
- "right": null,
1970
- "top": null,
1971
- "visibility": null,
1972
- "width": null
1973
- }
1974
- },
1975
- "c7955974289a4f448b422d7e4640131a": {
1976
- "model_module": "@jupyter-widgets/controls",
1977
- "model_module_version": "1.5.0",
1978
- "model_name": "ProgressStyleModel",
1979
- "state": {
1980
- "_model_module": "@jupyter-widgets/controls",
1981
- "_model_module_version": "1.5.0",
1982
- "_model_name": "ProgressStyleModel",
1983
- "_view_count": null,
1984
- "_view_module": "@jupyter-widgets/base",
1985
- "_view_module_version": "1.2.0",
1986
- "_view_name": "StyleView",
1987
- "bar_color": null,
1988
- "description_width": ""
1989
- }
1990
- },
1991
- "d1ff84cb5591449abcc7dd3e37f9a2df": {
1992
- "model_module": "@jupyter-widgets/controls",
1993
- "model_module_version": "1.5.0",
1994
- "model_name": "HTMLModel",
1995
- "state": {
1996
- "_dom_classes": [],
1997
- "_model_module": "@jupyter-widgets/controls",
1998
- "_model_module_version": "1.5.0",
1999
- "_model_name": "HTMLModel",
2000
- "_view_count": null,
2001
- "_view_module": "@jupyter-widgets/controls",
2002
- "_view_module_version": "1.5.0",
2003
- "_view_name": "HTMLView",
2004
- "description": "",
2005
- "description_tooltip": null,
2006
- "layout": "IPY_MODEL_a6e3c5ce0a3c49ffb3d7cbf92568fe47",
2007
- "placeholder": "​",
2008
- "style": "IPY_MODEL_e084d47529ca4131b233ea3514a6344f",
2009
- "value": " 159M/159M [00:04&lt;00:00, 26.9MB/s]"
2010
- }
2011
- },
2012
- "d3e6acd54d024d6791aab76232557721": {
2013
- "model_module": "@jupyter-widgets/base",
2014
- "model_module_version": "1.2.0",
2015
- "model_name": "LayoutModel",
2016
- "state": {
2017
- "_model_module": "@jupyter-widgets/base",
2018
- "_model_module_version": "1.2.0",
2019
- "_model_name": "LayoutModel",
2020
- "_view_count": null,
2021
- "_view_module": "@jupyter-widgets/base",
2022
- "_view_module_version": "1.2.0",
2023
- "_view_name": "LayoutView",
2024
- "align_content": null,
2025
- "align_items": null,
2026
- "align_self": null,
2027
- "border": null,
2028
- "bottom": null,
2029
- "display": null,
2030
- "flex": null,
2031
- "flex_flow": null,
2032
- "grid_area": null,
2033
- "grid_auto_columns": null,
2034
- "grid_auto_flow": null,
2035
- "grid_auto_rows": null,
2036
- "grid_column": null,
2037
- "grid_gap": null,
2038
- "grid_row": null,
2039
- "grid_template_areas": null,
2040
- "grid_template_columns": null,
2041
- "grid_template_rows": null,
2042
- "height": null,
2043
- "justify_content": null,
2044
- "justify_items": null,
2045
- "left": null,
2046
- "margin": null,
2047
- "max_height": null,
2048
- "max_width": null,
2049
- "min_height": null,
2050
- "min_width": null,
2051
- "object_fit": null,
2052
- "object_position": null,
2053
- "order": null,
2054
- "overflow": null,
2055
- "overflow_x": null,
2056
- "overflow_y": null,
2057
- "padding": null,
2058
- "right": null,
2059
- "top": null,
2060
- "visibility": null,
2061
- "width": null
2062
- }
2063
  },
2064
- "d722bbfffeaf4ea7a1060d10dc3a06db": {
2065
- "model_module": "@jupyter-widgets/controls",
2066
- "model_module_version": "1.5.0",
2067
- "model_name": "HBoxModel",
2068
- "state": {
2069
- "_dom_classes": [],
2070
- "_model_module": "@jupyter-widgets/controls",
2071
- "_model_module_version": "1.5.0",
2072
- "_model_name": "HBoxModel",
2073
- "_view_count": null,
2074
- "_view_module": "@jupyter-widgets/controls",
2075
- "_view_module_version": "1.5.0",
2076
- "_view_name": "HBoxView",
2077
- "box_style": "",
2078
- "children": [
2079
- "IPY_MODEL_a4b5b93b88f549e8a4f37f3d48834ca9",
2080
- "IPY_MODEL_82692c41501c487fad27c6b19836f46f",
2081
- "IPY_MODEL_af8f433ef2f540c9bd70d14421904d83"
2082
  ],
2083
- "layout": "IPY_MODEL_e81b0bf92adc4aadaafce4ee7d36421e"
2084
- }
2085
- },
2086
- "db7ee45589e04749b80376e25ee377bb": {
2087
- "model_module": "@jupyter-widgets/base",
2088
- "model_module_version": "1.2.0",
2089
- "model_name": "LayoutModel",
2090
- "state": {
2091
- "_model_module": "@jupyter-widgets/base",
2092
- "_model_module_version": "1.2.0",
2093
- "_model_name": "LayoutModel",
2094
- "_view_count": null,
2095
- "_view_module": "@jupyter-widgets/base",
2096
- "_view_module_version": "1.2.0",
2097
- "_view_name": "LayoutView",
2098
- "align_content": null,
2099
- "align_items": null,
2100
- "align_self": null,
2101
- "border": null,
2102
- "bottom": null,
2103
- "display": null,
2104
- "flex": null,
2105
- "flex_flow": null,
2106
- "grid_area": null,
2107
- "grid_auto_columns": null,
2108
- "grid_auto_flow": null,
2109
- "grid_auto_rows": null,
2110
- "grid_column": null,
2111
- "grid_gap": null,
2112
- "grid_row": null,
2113
- "grid_template_areas": null,
2114
- "grid_template_columns": null,
2115
- "grid_template_rows": null,
2116
- "height": null,
2117
- "justify_content": null,
2118
- "justify_items": null,
2119
- "left": null,
2120
- "margin": null,
2121
- "max_height": null,
2122
- "max_width": null,
2123
- "min_height": null,
2124
- "min_width": null,
2125
- "object_fit": null,
2126
- "object_position": null,
2127
- "order": null,
2128
- "overflow": null,
2129
- "overflow_x": null,
2130
- "overflow_y": null,
2131
- "padding": null,
2132
- "right": null,
2133
- "top": null,
2134
- "visibility": null,
2135
- "width": null
2136
- }
2137
- },
2138
- "e084d47529ca4131b233ea3514a6344f": {
2139
- "model_module": "@jupyter-widgets/controls",
2140
- "model_module_version": "1.5.0",
2141
- "model_name": "DescriptionStyleModel",
2142
- "state": {
2143
- "_model_module": "@jupyter-widgets/controls",
2144
- "_model_module_version": "1.5.0",
2145
- "_model_name": "DescriptionStyleModel",
2146
- "_view_count": null,
2147
- "_view_module": "@jupyter-widgets/base",
2148
- "_view_module_version": "1.2.0",
2149
- "_view_name": "StyleView",
2150
- "description_width": ""
2151
- }
2152
  },
2153
- "e33033ecda374ed4966ae5fccf6efe37": {
2154
- "model_module": "@jupyter-widgets/controls",
2155
- "model_module_version": "1.5.0",
2156
- "model_name": "HTMLModel",
2157
- "state": {
2158
- "_dom_classes": [],
2159
- "_model_module": "@jupyter-widgets/controls",
2160
- "_model_module_version": "1.5.0",
2161
- "_model_name": "HTMLModel",
2162
- "_view_count": null,
2163
- "_view_module": "@jupyter-widgets/controls",
2164
- "_view_module_version": "1.5.0",
2165
- "_view_name": "HTMLView",
2166
- "description": "",
2167
- "description_tooltip": null,
2168
- "layout": "IPY_MODEL_a41cf7f5121a4068842bb5c7d2bc4d62",
2169
- "placeholder": "​",
2170
- "style": "IPY_MODEL_f479a9629c414cb495a97b0741b0fe4b",
2171
- "value": "Downloading: 100%"
2172
- }
2173
  },
2174
- "e3414cc0456241eca109f4e9e115d16a": {
2175
- "model_module": "@jupyter-widgets/controls",
2176
- "model_module_version": "1.5.0",
2177
- "model_name": "ProgressStyleModel",
2178
- "state": {
2179
- "_model_module": "@jupyter-widgets/controls",
2180
- "_model_module_version": "1.5.0",
2181
- "_model_name": "ProgressStyleModel",
2182
- "_view_count": null,
2183
- "_view_module": "@jupyter-widgets/base",
2184
- "_view_module_version": "1.2.0",
2185
- "_view_name": "StyleView",
2186
- "bar_color": null,
2187
- "description_width": ""
2188
- }
2189
  },
2190
- "e81b0bf92adc4aadaafce4ee7d36421e": {
2191
- "model_module": "@jupyter-widgets/base",
2192
- "model_module_version": "1.2.0",
2193
- "model_name": "LayoutModel",
2194
- "state": {
2195
- "_model_module": "@jupyter-widgets/base",
2196
- "_model_module_version": "1.2.0",
2197
- "_model_name": "LayoutModel",
2198
- "_view_count": null,
2199
- "_view_module": "@jupyter-widgets/base",
2200
- "_view_module_version": "1.2.0",
2201
- "_view_name": "LayoutView",
2202
- "align_content": null,
2203
- "align_items": null,
2204
- "align_self": null,
2205
- "border": null,
2206
- "bottom": null,
2207
- "display": null,
2208
- "flex": null,
2209
- "flex_flow": null,
2210
- "grid_area": null,
2211
- "grid_auto_columns": null,
2212
- "grid_auto_flow": null,
2213
- "grid_auto_rows": null,
2214
- "grid_column": null,
2215
- "grid_gap": null,
2216
- "grid_row": null,
2217
- "grid_template_areas": null,
2218
- "grid_template_columns": null,
2219
- "grid_template_rows": null,
2220
- "height": null,
2221
- "justify_content": null,
2222
- "justify_items": null,
2223
- "left": null,
2224
- "margin": null,
2225
- "max_height": null,
2226
- "max_width": null,
2227
- "min_height": null,
2228
- "min_width": null,
2229
- "object_fit": null,
2230
- "object_position": null,
2231
- "order": null,
2232
- "overflow": null,
2233
- "overflow_x": null,
2234
- "overflow_y": null,
2235
- "padding": null,
2236
- "right": null,
2237
- "top": null,
2238
- "visibility": null,
2239
- "width": null
2240
- }
2241
  },
2242
- "f479a9629c414cb495a97b0741b0fe4b": {
2243
- "model_module": "@jupyter-widgets/controls",
2244
- "model_module_version": "1.5.0",
2245
- "model_name": "DescriptionStyleModel",
2246
- "state": {
2247
- "_model_module": "@jupyter-widgets/controls",
2248
- "_model_module_version": "1.5.0",
2249
- "_model_name": "DescriptionStyleModel",
2250
- "_view_count": null,
2251
- "_view_module": "@jupyter-widgets/base",
2252
- "_view_module_version": "1.2.0",
2253
- "_view_name": "StyleView",
2254
- "description_width": ""
2255
- }
2256
  }
2257
- }
2258
- }
2259
- },
2260
- "nbformat": 4,
2261
- "nbformat_minor": 1
2262
- }
 
1
  {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "colab": {
6
+ "name": "Boosting_Wav2Vec2_with_n_grams_in_🤗_Transformers.ipynb",
7
+ "provenance": [],
8
+ "collapsed_sections": []
9
  },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
 
 
 
 
 
 
 
 
 
 
 
13
  },
14
+ "language_info": {
15
+ "name": "python"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
+ "cells": [
 
 
 
 
 
 
 
 
 
 
19
  {
20
+ "cell_type": "code",
21
+ "source": [
22
+ "!pip install datasets transformers"
23
+ ],
24
+ "metadata": {
25
+ "id": "OWGc_zfyq5_T"
26
+ },
27
+ "execution_count": null,
28
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  },
 
 
 
 
30
  {
31
+ "cell_type": "code",
32
+ "source": [
33
+ "!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode"
34
+ ],
35
+ "metadata": {
36
+ "id": "TvDJ7CYpzSJQ"
37
  },
38
+ "execution_count": null,
39
+ "outputs": []
 
 
 
 
40
  },
41
  {
42
+ "cell_type": "code",
43
+ "source": [
44
+ "from huggingface_hub import notebook_login\n",
45
+ "\n",
46
+ "notebook_login()"
47
+ ],
48
+ "metadata": {
49
+ "id": "JHTeonOGXiGq"
50
+ },
51
+ "execution_count": null,
52
+ "outputs": []
53
  },
54
  {
55
+ "cell_type": "code",
56
+ "source": [
57
+ "!sudo apt install build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev"
58
+ ],
59
+ "metadata": {
60
+ "id": "FKMMWfVQp_gP"
61
+ },
62
+ "execution_count": null,
63
+ "outputs": []
64
  },
65
  {
66
+ "cell_type": "code",
67
+ "source": [
68
+ "!wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz"
69
+ ],
70
+ "metadata": {
71
+ "id": "J8mm4ExzqIaZ"
72
  },
73
+ "execution_count": null,
74
+ "outputs": []
 
 
 
 
75
  },
76
  {
77
+ "cell_type": "code",
78
+ "source": [
79
+ "!mkdir kenlm/build && cd kenlm/build && cmake .. && make -j2\n",
80
+ "!ls kenlm/build/bin"
81
+ ],
82
+ "metadata": {
83
+ "id": "MS4mqMyZqVAI"
84
  },
85
+ "execution_count": null,
86
+ "outputs": []
 
 
 
 
87
  },
88
  {
89
+ "cell_type": "code",
90
+ "source": [
91
+ "from datasets import load_dataset\n",
92
+ "\n",
93
+ "username = \"hf-test\" # change to your username\n",
94
+ "target_lang = \"sv\"\n",
95
+ "\n",
96
+ "dataset = load_dataset(f\"{username}/{target_lang}_corpora_parliament_processed\", split=\"train\")\n",
97
+ "\n",
98
+ "with open(\"text.txt\", \"w\") as file:\n",
99
+ " file.write(\" \".join(dataset[\"text\"]))"
100
+ ],
101
+ "metadata": {
102
+ "id": "VIgErMqApENm"
103
  },
104
+ "execution_count": null,
105
+ "outputs": []
 
 
 
 
106
  },
107
  {
108
+ "cell_type": "code",
109
+ "source": [
110
+ "\n",
111
+ "!kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
112
+ ],
113
+ "metadata": {
114
+ "id": "_MdDNBlZrPOm"
115
+ },
116
+ "execution_count": null,
117
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  },
 
 
 
 
119
  {
120
+ "cell_type": "code",
121
+ "source": [
122
+ "!head -20 5gram.arpa"
123
+ ],
124
+ "metadata": {
125
+ "id": "TRnV8Miusl--"
126
+ },
127
+ "execution_count": null,
128
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  },
 
 
 
 
130
  {
131
+ "cell_type": "code",
132
+ "source": [
133
+ "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
134
+ " has_added_eos = False\n",
135
+ " for line in read_file:\n",
136
+ " if not has_added_eos and \"ngram 1=\" in line:\n",
137
+ " count=line.strip().split(\"=\")[-1]\n",
138
+ " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n",
139
+ " elif not has_added_eos and \"<s>\" in line:\n",
140
+ " write_file.write(line)\n",
141
+ " write_file.write(line.replace(\"<s>\", \"</s>\"))\n",
142
+ " has_added_eos = True\n",
143
+ " else:\n",
144
+ " write_file.write(line)"
145
+ ],
146
+ "metadata": {
147
+ "id": "_7u7dVPkvyRZ"
148
+ },
149
+ "execution_count": null,
150
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  },
 
 
 
 
152
  {
153
+ "cell_type": "code",
154
+ "source": [
155
+ "!head -20 5gram_correct.arpa"
156
+ ],
157
+ "metadata": {
158
+ "id": "YF1RSm-Pxst5"
159
+ },
160
+ "execution_count": null,
161
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  },
 
 
 
 
163
  {
164
+ "cell_type": "code",
165
+ "source": [
166
+ "from transformers import AutoProcessor\n",
167
+ "\n",
168
+ "processor = AutoProcessor.from_pretrained(\"marinone94/xls-r-300m-sv-robust\")"
169
+ ],
170
+ "metadata": {
171
+ "id": "paV71gdAtkDC"
172
+ },
173
+ "execution_count": null,
174
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  },
 
 
 
 
176
  {
177
+ "cell_type": "code",
178
+ "source": [
179
+ "vocab_dict = processor.tokenizer.get_vocab()\n",
180
+ "sorted_vocab_dict = {k.lower(): v for k, v in sorted(vocab_dict.items(), key=lambda item: item[1])}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  ],
182
+ "metadata": {
183
+ "id": "ZKwKxMoitoGS"
184
+ },
185
+ "execution_count": null,
186
+ "outputs": []
 
187
  },
188
  {
189
+ "cell_type": "code",
190
+ "source": [
191
+ "from pyctcdecode import build_ctcdecoder\n",
192
+ "\n",
193
+ "decoder = build_ctcdecoder(\n",
194
+ " labels=list(sorted_vocab_dict.keys()),\n",
195
+ " kenlm_model_path=\"5gram_correct.arpa\",\n",
196
+ ")"
197
  ],
198
+ "metadata": {
199
+ "id": "zTLzCLB2tQP7"
200
+ },
201
+ "execution_count": null,
202
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  },
204
+ {
205
+ "cell_type": "code",
206
+ "source": [
207
+ "from transformers import Wav2Vec2ProcessorWithLM\n",
208
+ "\n",
209
+ "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
210
+ " feature_extractor=processor.feature_extractor,\n",
211
+ " tokenizer=processor.tokenizer,\n",
212
+ " decoder=decoder\n",
213
+ ")"
 
 
 
 
 
 
 
 
214
  ],
215
+ "metadata": {
216
+ "id": "VBVf50EzZgAQ"
217
+ },
218
+ "execution_count": null,
219
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  },
221
+ {
222
+ "cell_type": "code",
223
+ "source": [
224
+ "!sudo apt-get install git-lfs tree"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  ],
226
+ "metadata": {
227
+ "id": "BZZm3ECc5TMP"
228
+ },
229
+ "execution_count": null,
230
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  },
232
+ {
233
+ "cell_type": "code",
234
+ "source": [
235
+ "from huggingface_hub import Repository\n",
236
+ "\n",
237
+ "repo = Repository(local_dir=\"xls-r-300m-sv-robust\", clone_from=\"marinone94/xls-r-300m-sv-robust\")"
 
 
 
 
 
 
 
 
 
 
 
 
238
  ],
239
+ "metadata": {
240
+ "id": "fIfcunhF4YM6"
241
+ },
242
+ "execution_count": null,
243
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  },
245
+ {
246
+ "cell_type": "code",
247
+ "source": [
248
+ "processor_with_lm.save_pretrained(\"xls-r-300m-sv-robust\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  ],
250
+ "metadata": {
251
+ "id": "UZ1sWfPH2oce"
252
+ },
253
+ "execution_count": null,
254
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  },
256
+ {
257
+ "cell_type": "code",
258
+ "source": [
259
+ "!tree -h xls-r-300m-sv-robust/"
260
+ ],
261
+ "metadata": {
262
+ "id": "ClyENOYFcC_C"
263
+ },
264
+ "execution_count": null,
265
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
266
  },
267
+ {
268
+ "cell_type": "code",
269
+ "source": [
270
+ "!kenlm/build/bin/build_binary xls-r-300m-sv-robust/language_model/5gram_correct.arpa xls-r-300m-sv-robust/language_model/5gram.bin"
271
+ ],
272
+ "metadata": {
273
+ "id": "X9qg4FPt2zi8"
274
+ },
275
+ "execution_count": null,
276
+ "outputs": []
 
 
 
 
 
277
  },
278
+ {
279
+ "cell_type": "code",
280
+ "source": [
281
+ "!rm xls-r-300m-sv-robust/language_model/5gram_correct.arpa && tree -h xls-r-300m-sv-robust/"
282
+ ],
283
+ "metadata": {
284
+ "id": "Zn4J-4OZdMPc"
285
+ },
286
+ "execution_count": null,
287
+ "outputs": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  },
289
+ {
290
+ "cell_type": "code",
291
+ "source": [
292
+ "repo.push_to_hub(commit_message=\"Upload 5-gram lm-boosted decoder\")"
293
+ ],
294
+ "metadata": {
295
+ "id": "WEV1sx6ee3aT"
296
+ },
297
+ "execution_count": null,
298
+ "outputs": []
 
 
 
 
299
  }
300
+ ]
301
+ }