LuisAVasquez committed on
Commit
e5aaad7
1 Parent(s): bb6c60b

Uploading notebooks for replication of the training

Browse files
training_notebooks/01_Downloading_and_preprocessing_corpora.ipynb ADDED
@@ -0,0 +1,1100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import cltk"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from cltk.data.fetch import FetchCorpus"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "cltk.utils.get_cltk_data_dir()"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 36,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "corpus_downloader = FetchCorpus(language=\"lat\")"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 37,
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "data": {
46
+ "text/plain": [
47
+ "['lat_text_perseus',\n",
48
+ " 'lat_treebank_perseus',\n",
49
+ " 'lat_text_latin_library',\n",
50
+ " 'phi5',\n",
51
+ " 'phi7',\n",
52
+ " 'latin_proper_names_cltk',\n",
53
+ " 'lat_models_cltk',\n",
54
+ " 'latin_pos_lemmata_cltk',\n",
55
+ " 'latin_treebank_index_thomisticus',\n",
56
+ " 'latin_lexica_perseus',\n",
57
+ " 'latin_training_set_sentence_cltk',\n",
58
+ " 'latin_word2vec_cltk',\n",
59
+ " 'latin_text_antique_digiliblt',\n",
60
+ " 'latin_text_corpus_grammaticorum_latinorum',\n",
61
+ " 'latin_text_poeti_ditalia',\n",
62
+ " 'lat_text_tesserae',\n",
63
+ " 'cltk_lat_lewis_elementary_lexicon']"
64
+ ]
65
+ },
66
+ "execution_count": 37,
67
+ "metadata": {},
68
+ "output_type": "execute_result"
69
+ }
70
+ ],
71
+ "source": [
72
+ "corpus_downloader.list_corpora"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 38,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "corpus_downloader.import_corpus(\"lat_text_perseus\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 39,
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "corpus_downloader.import_corpus(\"lat_text_latin_library\")"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 40,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "corpus_downloader.import_corpus(\"lat_text_tesserae\")"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 41,
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "corpus_downloader.import_corpus(\"latin_text_corpus_grammaticorum_latinorum\")"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 13,
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "import os"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 14,
123
+ "metadata": {},
124
+ "outputs": [],
125
+ "source": [
126
+ "cltk_data_dir = cltk.utils.get_cltk_data_dir()\n",
127
+ "\n",
128
+ "latin_library_path = os.path.join(cltk_data_dir, \"lat\", \"text\", \"lat_text_latin_library\")\n",
129
+ "perseus_path = os.path.join(cltk_data_dir, \"lat\", \"text\", \"lat_text_perseus\", \"cltk_json\")\n",
130
+ "tesserae_path = os.path.join(cltk_data_dir, \"lat\", \"text\", \"lat_text_tesserae\", \"texts\")\n",
131
+ "corpus_grammaticorum_path = os.path.join(cltk_data_dir, \"lat\", \"text\", \"latin_text_corpus_grammaticorum_latinorum\") "
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 15,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "my_corpus_path = \"custom_latin_corpus\""
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 16,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "import os\n",
150
+ "import shutil"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 17,
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "if not os.path.exists(my_corpus_path):\n",
160
+ " os.mkdir(my_corpus_path)"
161
+ ]
162
+ },
163
+ {
164
+ "attachments": {},
165
+ "cell_type": "markdown",
166
+ "metadata": {},
167
+ "source": [
168
+ "# Processing Latin Library"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 18,
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "# the Latin Library has files already in .txt format, so we just need to copy them"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 19,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "new_latin_library_path = os.path.join(my_corpus_path, \"latin_library\")"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 20,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "try:\n",
196
+ " shutil.copytree(\n",
197
+ " latin_library_path,\n",
198
+ " new_latin_library_path\n",
199
+ " )\n",
200
+ "except:\n",
201
+ " pass"
202
+ ]
203
+ },
204
+ {
205
+ "attachments": {},
206
+ "cell_type": "markdown",
207
+ "metadata": {},
208
+ "source": [
209
+ "# Processing Perseus Library"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 21,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "new_perseus_path = os.path.join(\n",
219
+ " my_corpus_path, \"perseus_library\"\n",
220
+ ")"
221
+ ]
222
+ },
223
+ {
224
+ "cell_type": "code",
225
+ "execution_count": 22,
226
+ "metadata": {},
227
+ "outputs": [],
228
+ "source": [
229
+ "if not os.path.exists(new_perseus_path):\n",
230
+ " os.mkdir(new_perseus_path)"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 23,
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "import json"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 24,
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "data": {
249
+ "text/plain": [
250
+ "['terence__the-self-tormenter__english.json',\n",
251
+ " 'plautus-titus-maccius__trinummus__latin.json',\n",
252
+ " 'ausonius-decimus-magnus__griphus-ternarii-numeri__latin.json',\n",
253
+ " 'cicero__in-defense-of-aulus-cluentius-habitus__latin.json',\n",
254
+ " 'plautus-titus-maccius__bacchides__latin.json',\n",
255
+ " 'ovid__remedy-of-love__latin.json',\n",
256
+ " 'tertullian-ca-160-ca-230__de-ieiunio-adversus-psychicos__latin.json',\n",
257
+ " 'tertullian-ca-160-ca-230__ad-uxorem__latin.json',\n",
258
+ " 'vopiscus-flavius-fl-34-jh__antoninus-caracalla__latin.json',\n",
259
+ " 'cicero__on-behalf-of-marcus-tullius__english.json',\n",
260
+ " 'ausonius-decimus-magnus__versus-paschales-prosodic__latin.json',\n",
261
+ " 'claudianus-claudius__panegyricus-de-sexto-consulatu-honorii-augusti__latin.json',\n",
262
+ " 'nepos-cornelius__agesilaus__latin.json',\n",
263
+ " 'tertullian-ca-160-ca-230__apologeticum__latin.json',\n",
264
+ " 'paris-julius__facta-et-dicta-memorabilia__latin.json',\n",
265
+ " 'cicero__de-optimo-genere-oratorum__latin.json',\n",
266
+ " 'cicero__paradoxa-stoicorum__latin.json',\n",
267
+ " 'suetonius-ca-69-ca-122__divus-titus__latin.json',\n",
268
+ " 'virgil__eclogues__latin.json',\n",
269
+ " 'martial__epigrammata__latin.json',\n",
270
+ " 'tertullian-ca-160-ca-230__de-carnis-resurrectione__latin.json',\n",
271
+ " 'cicero__for-marcus-fonteius__latin.json',\n",
272
+ " 'cicero__in-the-senate-after-his-return__latin.json',\n",
273
+ " 'terence__the-brothers__latin.json',\n",
274
+ " 'vopiscus-flavius-fl-34-jh__pertinax__latin.json',\n",
275
+ " 'nepos-cornelius__datames__latin.json',\n",
276
+ " 'nepos-cornelius__cato__latin.json',\n",
277
+ " 'seneca-lucius-annaeus__de-constantia__latin.json',\n",
278
+ " 'tibullus__elegiae__english.json',\n",
279
+ " 'celsus-aulus-cornelius__de-medicina__english.json',\n",
280
+ " 'ovid__letters-from-the-black-sea__latin.json',\n",
281
+ " 'curtius-rufus-quintus__historiarum-alexandri-magni__latin.json',\n",
282
+ " 'caesar-julius__gallic-war__english.json',\n",
283
+ " 'ausonius-decimus-magnus__epigrammaton-liber__latin.json',\n",
284
+ " 'cicero__letters-to-his-friends__latin.json',\n",
285
+ " 'claudianus-claudius__in-rufinum__latin.json',\n",
286
+ " 'gellius-aulus__noctes-atticae__latin.json',\n",
287
+ " 'seneca-lucius-annaeus-55-bc-ca-39-ad__fragmenta__latin.json',\n",
288
+ " 'cicero__four-orations-against-lucius-catilina__english.json',\n",
289
+ " 'cicero__de-finibus-bonorum-et-malorum__latin.json',\n",
290
+ " 'suetonius-ca-69-ca-122__caligula__latin.json',\n",
291
+ " 'cicero__letters-to-his-brother-quintus__latin.json',\n",
292
+ " 'nepos-cornelius__pausanias__latin.json',\n",
293
+ " 'vopiscus-flavius-fl-34-jh__carus-et-carinus-et-numerianus__latin.json',\n",
294
+ " 'cicero__for-sulla__latin.json',\n",
295
+ " 'plautus-titus-maccius__miles-gloriosus__latin.json',\n",
296
+ " 'seneca-lucius-annaeus-plays__oedipus__latin.json',\n",
297
+ " 'horace__satires__english.json',\n",
298
+ " 'seneca-lucius-annaeus-55-bc-ca-39-ad__suasoriae__latin.json',\n",
299
+ " 'cicero__for-archias__latin.json',\n",
300
+ " 'columella-lucius-junius-moderatus__res-rustica__latin.json',\n",
301
+ " 'plautus-titus-maccius__asinaria__english.json',\n",
302
+ " 'seneca-lucius-annaeus-55-bc-ca-39-ad__controversiae__latin.json',\n",
303
+ " 'plautus-titus-maccius__captivi__english.json',\n",
304
+ " 'cicero__for-publius-quinctius__english.json',\n",
305
+ " 'cicero__for-marcus-fonteius__english.json',\n",
306
+ " 'vopiscus-flavius-fl-34-jh__firmus-saturninus-proculus-et-bonosus__latin.json',\n",
307
+ " 'cicero__in-defense-of-l-murena__latin.json',\n",
308
+ " 'ausonius-decimus-magnus__epitaphia__latin.json',\n",
309
+ " 'plautus-titus-maccius__amphitruo__english.json',\n",
310
+ " 'cicero__for-marcus-caelius__latin.json',\n",
311
+ " 'claudianus-claudius__panegyricus-de-quarto-consulatu-honorii-augusti__latin.json',\n",
312
+ " 'nepos-cornelius__timotheus__latin.json',\n",
313
+ " 'claudianus-claudius__panegyricus-de-tertio-consulatu-honorii-augusti__latin.json',\n",
314
+ " 'plautus-titus-maccius__persa__english.json',\n",
315
+ " 'plautus-titus-maccius__truculentus__latin.json',\n",
316
+ " 'nepos-cornelius__alcibiades__latin.json',\n",
317
+ " 'lucan__civil-war__latin.json',\n",
318
+ " 'minucius-felix-marcus__octavius__latin.json',\n",
319
+ " 'ausonius-decimus-magnus__ephemeris__latin.json',\n",
320
+ " 'ovid__metamorphoses__latin.json',\n",
321
+ " 'seneca-lucius-annaeus-plays__apocolocyntosis__latin.json',\n",
322
+ " 'seneca-lucius-annaeus__de-brevitate-vitae__latin.json',\n",
323
+ " 'claudianus-claudius__de-bello-gothico__latin.json',\n",
324
+ " 'cicero__orations-against-verres__latin.json',\n",
325
+ " 'suetonius-ca-69-ca-122__divus-vespasianus__english.json',\n",
326
+ " 'plautus-titus-maccius__epidicus__latin.json',\n",
327
+ " 'seneca-lucius-annaeus-plays__agamemnon__latin.json',\n",
328
+ " 'tertullian-ca-160-ca-230__adversus-marcionem__latin.json',\n",
329
+ " 'claudianus-claudius__epithalamium-de-nuptiis-honorii-augusti__latin.json',\n",
330
+ " 'cicero__letters-to-brutus__latin.json',\n",
331
+ " 'suetonius-ca-69-ca-122__tiberius__latin.json',\n",
332
+ " 'vopiscus-flavius-fl-34-jh__antoninus-geta__latin.json',\n",
333
+ " 'cicero__in-defense-of-lucius-flaccus__english.json',\n",
334
+ " 'horace__satires__latin.json',\n",
335
+ " 'ovid__ibis__latin.json',\n",
336
+ " 'vopiscus-flavius-fl-34-jh__verus__latin.json',\n",
337
+ " 'tertullian-ca-160-ca-230__de-idolatria__latin.json',\n",
338
+ " 'apuleius__apologia__latin.json',\n",
339
+ " 'tertullian-ca-160-ca-230__de-testimionio-animae__latin.json',\n",
340
+ " 'tertullian-ca-160-ca-230__de-praescriptionibus-hereticorum__latin.json',\n",
341
+ " 'ovid__epistulae__english.json',\n",
342
+ " 'seneca-lucius-annaeus__de-tranquilitate-animi__latin.json',\n",
343
+ " 'cicero__in-defense-of-aulus-cluentius-habitus__english.json',\n",
344
+ " 'seneca-lucius-annaeus__de-consolatione-ad-helviam__latin.json',\n",
345
+ " 'vopiscus-flavius-fl-34-jh__maximus-et-balbinus__latin.json',\n",
346
+ " 'tacitus-cornelius__agricola__latin.json',\n",
347
+ " 'horace__odes__latin.json',\n",
348
+ " 'boethius-d-524__quomodo-trinitas-unus-deus-ac-non-tres-dii-de-trinitate__latin.json',\n",
349
+ " 'suetonius-ca-69-ca-122__domitianus__english.json',\n",
350
+ " 'claudianus-claudius__carminum-minorum-corpusculum__latin.json',\n",
351
+ " 'ovid__sorrows__latin.json',\n",
352
+ " 'nepos-cornelius__phocion__latin.json',\n",
353
+ " 'tacitus-cornelius__germania__latin.json',\n",
354
+ " 'vopiscus-flavius-fl-34-jh__gallieni-duo__latin.json',\n",
355
+ " 'caesar-julius__gallic-war__latin.json',\n",
356
+ " 'cicero__orator__latin.json',\n",
357
+ " 'cicero__de-republica__latin.json',\n",
358
+ " 'ovid__epistulae__latin.json',\n",
359
+ " 'cicero__de-fato__latin.json',\n",
360
+ " 'nepos-cornelius__de-regibus__latin.json',\n",
361
+ " 'sallust__historiae__latin.json',\n",
362
+ " 'vopiscus-flavius-fl-34-jh__tacitus__latin.json',\n",
363
+ " 'plautus-titus-maccius__mostellaria__latin.json',\n",
364
+ " 'vopiscus-flavius-fl-34-jh__valeriani-duo__latin.json',\n",
365
+ " 'tertullian-ca-160-ca-230__de-virginibus-velandis__latin.json',\n",
366
+ " 'ausonius-decimus-magnus__gratiarum-actio__latin.json',\n",
367
+ " 'vopiscus-flavius-fl-34-jh__antoninus-pius__latin.json',\n",
368
+ " 'nepos-cornelius__iphicrates__latin.json',\n",
369
+ " 'ausonius-decimus-magnus__genethliacon-ad-ausonium-nepotem__latin.json',\n",
370
+ " 'plautus-titus-maccius__rudens__english.json',\n",
371
+ " 'nepos-cornelius__pelopidas__latin.json',\n",
372
+ " 'plautus-titus-maccius__aulularia__english.json',\n",
373
+ " 'cicero__to-the-citizens-after-his-return__english.json',\n",
374
+ " 'seneca-lucius-annaeus-plays__hercules-oetaeus__latin.json',\n",
375
+ " 'tertullian-ca-160-ca-230__de-baptismo__latin.json',\n",
376
+ " 'boethius-d-524__liber-de-persona-et-duabus-naturis-contra-eutychen-et-nestorium__latin.json',\n",
377
+ " 'suetonius-ca-69-ca-122__divus-claudius__latin.json',\n",
378
+ " 'nepos-cornelius__aristides__latin.json',\n",
379
+ " 'terence__the-eunuch__english.json',\n",
380
+ " 'cicero__academica__latin.json',\n",
381
+ " 'terence__the-eunuch__latin.json',\n",
382
+ " 'plautus-titus-maccius__bacchides__english.json',\n",
383
+ " 'celsus-aulus-cornelius__de-medicina__latin.json',\n",
384
+ " 'cicero__tusculanae-disputationes__latin.json',\n",
385
+ " 'cicero__timaeus__latin.json',\n",
386
+ " 'nepos-cornelius__chabrias__latin.json',\n",
387
+ " 'sallust__bellum-iugurthinum__latin.json',\n",
388
+ " 'boethius-d-524__de-fide-catholica__latin.json',\n",
389
+ " 'pliny-the-elder__naturalis-historia__latin.json',\n",
390
+ " 'nepos-cornelius__eumenes__latin.json',\n",
391
+ " 'vopiscus-flavius-fl-34-jh__diadumenus-antoninus__latin.json',\n",
392
+ " 'cicero__divinatio-against-q-caecilius__latin.json',\n",
393
+ " 'plautus-titus-maccius__pseudolus__english.json',\n",
394
+ " 'seneca-lucius-annaeus-plays__medea__latin.json',\n",
395
+ " 'plautus-titus-maccius__menaechmi__latin.json',\n",
396
+ " 'suetonius-ca-69-ca-122__caligula__english.json',\n",
397
+ " 'juvenal__satires__latin.json',\n",
398
+ " 'vopiscus-flavius-fl-34-jh__divus-claudius__latin.json',\n",
399
+ " 'lucan__civil-war__english.json',\n",
400
+ " 'cicero__de-natura-deorum__latin.json',\n",
401
+ " 'vopiscus-flavius-fl-34-jh__helius__latin.json',\n",
402
+ " 'tertullian-ca-160-ca-230__de-spectaculis__latin.json',\n",
403
+ " 'plautus-titus-maccius__cistellaria__english.json',\n",
404
+ " 'cicero__against-publius-servilius-rullus__english.json',\n",
405
+ " 'cicero__brutus__latin.json',\n",
406
+ " 'claudianus-claudius__panegyricus-dictus-probino-et-olybrio-consulibus__latin.json',\n",
407
+ " 'vopiscus-flavius-fl-34-jh__marcus-antoninus-philosophus__latin.json',\n",
408
+ " 'terence__phormio__english.json',\n",
409
+ " 'ausonius-decimus-magnus__praefatiunculae__latin.json',\n",
410
+ " 'cicero__de-amicitia__latin.json',\n",
411
+ " 'tertullian-ca-160-ca-230__scorpiace__latin.json',\n",
412
+ " 'horace__ars-poetica__english.json',\n",
413
+ " 'propertius-sextus__elegies__latin.json',\n",
414
+ " 'tibullus__elegiae__latin.json',\n",
415
+ " 'tertullian-ca-160-ca-230__de-cultu-feminarum__latin.json',\n",
416
+ " 'terence__the-mother-in-law__english.json',\n",
417
+ " 'vopiscus-flavius-fl-34-jh__maximini-duo__latin.json',\n",
418
+ " 'seneca-lucius-annaeus__de-consolatione-ad-polybium__latin.json',\n",
419
+ " 'statius-p-papinius-publius-papinius__silvae__latin.json',\n",
420
+ " 'tertullian-ca-160-ca-230__ad-martyras__latin.json',\n",
421
+ " 'plautus-titus-maccius__asinaria__latin.json',\n",
422
+ " 'virgil__aeneid__english.json',\n",
423
+ " 'seneca-lucius-annaeus-plays__de-clementia__latin.json',\n",
424
+ " 'plautus-titus-maccius__curculio__english.json',\n",
425
+ " 'plautus-titus-maccius__aulularia__latin.json',\n",
426
+ " 'cicero__de-officiis__latin.json',\n",
427
+ " 'ovid__book-of-days__latin.json',\n",
428
+ " 'vopiscus-flavius-fl-34-jh__avidius-casius__latin.json',\n",
429
+ " 'vitruvius-pollio__on-architecture__latin.json',\n",
430
+ " 'terence__phormio__latin.json',\n",
431
+ " 'vopiscus-flavius-fl-34-jh__de-vita-hadriani__latin.json',\n",
432
+ " 'virgil__eclogues__english.json',\n",
433
+ " 'vopiscus-flavius-fl-34-jh__opilius-macrinus__latin.json',\n",
434
+ " 'cicero__on-pompeys-command__latin.json',\n",
435
+ " 'claudianus-claudius__panegyricus-dictus-manlio-theodoro-consuli__latin.json',\n",
436
+ " 'ausonius-decimus-magnus__precationes__latin.json',\n",
437
+ " 'ovid__metamorphoses__english.json',\n",
438
+ " 'ammianus-marcellinus__rerum-gestarum__english.json',\n",
439
+ " 'cicero__orations-against-verres__english.json',\n",
440
+ " 'vopiscus-flavius-fl-34-jh__divus-aurelianus__latin.json',\n",
441
+ " 'sallust__catilinae-coniuratio__english.json',\n",
442
+ " 'cicero__in-defense-of-caius-rabirius__latin.json',\n",
443
+ " 'cicero__partitiones-oratoriae__latin.json',\n",
444
+ " 'plautus-titus-maccius__curculio__latin.json',\n",
445
+ " 'ovid__amores__english.json',\n",
446
+ " 'titus-livius-livy__the-history-of-rome__english.json',\n",
447
+ " 'terence__andria__latin.json',\n",
448
+ " 'cicero__for-archias__english.json',\n",
449
+ " 'seneca-lucius-annaeus-plays__hercules-furens__latin.json',\n",
450
+ " 'ausonius-decimus-magnus__mosella__latin.json',\n",
451
+ " 'cicero__for-quintus-roscius-the-actor__english.json',\n",
452
+ " 'valerius-flaccus-gaius__argonautica__latin.json',\n",
453
+ " 'prudentius-b-348__contra-orationem-symmachia__latin.json',\n",
454
+ " 'vitruvius-pollio__on-architecture__english.json',\n",
455
+ " 'cicero__letters-to-atticus__latin.json',\n",
456
+ " 'vopiscus-flavius-fl-34-jh__probus__latin.json',\n",
457
+ " 'plautus-titus-maccius__rudens__latin.json',\n",
458
+ " 'horace__odes__english.json',\n",
459
+ " 'catullus__carmina__english.json',\n",
460
+ " 'cicero__for-sextus-roscius-of-ameria__latin.json',\n",
461
+ " 'ausonius-decimus-magnus__epicedion-in-patrem__latin.json',\n",
462
+ " 'nepos-cornelius__dion__latin.json',\n",
463
+ " 'seneca-lucius-annaeus-plays__phaedra__latin.json',\n",
464
+ " 'suetonius-ca-69-ca-122__divus-claudius__english.json',\n",
465
+ " 'prudentius-b-348__dittochaeon__latin.json',\n",
466
+ " 'ovid__amores__latin.json',\n",
467
+ " 'seneca-lucius-annaeus-plays__octavia__latin.json',\n",
468
+ " 'cicero__for-quintus-roscius-the-actor__latin.json',\n",
469
+ " 'apuleius__metamorphoses__latin.json',\n",
470
+ " 'ausonius-decimus-magnus__oratio-versibus-rhopalicis__latin.json',\n",
471
+ " 'suetonius-ca-69-ca-122__divus-julius__english.json',\n",
472
+ " 'plautus-titus-maccius__persa__latin.json',\n",
473
+ " 'cicero__four-orations-against-lucius-catilina__latin.json',\n",
474
+ " 'cicero__on-behalf-of-marcus-tullius__latin.json',\n",
475
+ " 'ausonius-decimus-magnus__technopaegnion__latin.json',\n",
476
+ " 'plautus-titus-maccius__mercator__latin.json',\n",
477
+ " 'seneca-lucius-annaeus__de-consolatione-ad-marciam__latin.json',\n",
478
+ " 'quintus-tullius-cicero__commentariolum-petitionis__latin.json',\n",
479
+ " 'ovid__art-of-love__english.json',\n",
480
+ " 'nepos-cornelius__themistocles__latin.json',\n",
481
+ " 'cicero__de-divinatione__latin.json',\n",
482
+ " 'claudianus-claudius__de-bello-gildonico__latin.json',\n",
483
+ " 'nepos-cornelius__atticus__latin.json',\n",
484
+ " 'terence__the-brothers__english.json',\n",
485
+ " 'tertullian-ca-160-ca-230__de-pallio__latin.json',\n",
486
+ " 'plautus-titus-maccius__captivi__latin.json',\n",
487
+ " 'ausonius-decimus-magnus__eclogarum-liber__latin.json',\n",
488
+ " 'ausonius-decimus-magnus__liber-protrepticus-ad-nepotem__latin.json',\n",
489
+ " 'tertullian-ca-160-ca-230__de-oratione__latin.json',\n",
490
+ " 'ausonius-decimus-magnus__caesares__latin.json',\n",
491
+ " 'sallust__bellum-iugurthinum__english.json',\n",
492
+ " 'plautus-titus-maccius__epidicus__english.json',\n",
493
+ " 'vopiscus-flavius-fl-34-jh__antoninus-heliogobalus__latin.json',\n",
494
+ " 'cicero__for-publius-quinctius__latin.json',\n",
495
+ " 'boethius-d-524__quomodo-substantiae-in-eo-quod-sint-bonae-sint-cum-non-sint-substanialia-bona__latin.json',\n",
496
+ " 'nepos-cornelius__hannibal__latin.json',\n",
497
+ " 'cicero__in-the-senate-after-his-return__english.json',\n",
498
+ " 'cicero__on-behalf-of-aulus-caecina__english.json',\n",
499
+ " 'ausonius-decimus-magnus__bissula__latin.json',\n",
500
+ " 'terence__the-mother-in-law__latin.json',\n",
501
+ " 'tertullian-ca-160-ca-230__de-monogamia__latin.json',\n",
502
+ " 'prudentius-b-348__hamartigenia__latin.json',\n",
503
+ " 'plautus-titus-maccius__poenulus__english.json',\n",
504
+ " 'horace__epistulae__latin.json',\n",
505
+ " 'plautus-titus-maccius__menaechmi__english.json',\n",
506
+ " 'vopiscus-flavius-fl-34-jh__alexander-severus__latin.json',\n",
507
+ " 'tertullian-ca-160-ca-230__adversus-hermogenem__latin.json',\n",
508
+ " 'seneca-lucius-annaeus__de-vita-beata__latin.json',\n",
509
+ " 'gellius-aulus__noctes-atticae__english.json',\n",
510
+ " 'seneca-lucius-annaeus-plays__apocolocyntosis__english.json',\n",
511
+ " 'tertullian-ca-160-ca-230__ad-nationes-libri-duo__latin.json',\n",
512
+ " 'tertullian-ca-160-ca-230__de-patientia__latin.json',\n",
513
+ " 'claudianus-claudius__fescinnina-de-nuptiis-honorii-augusti__latin.json',\n",
514
+ " 'cicero__on-oratory__latin.json',\n",
515
+ " 'cicero__de-senectute__latin.json',\n",
516
+ " 'nepos-cornelius__conon__latin.json',\n",
517
+ " 'horace__ars-poetica__latin.json',\n",
518
+ " 'tertullian-ca-160-ca-230__de-carne-christi__latin.json',\n",
519
+ " 'seneca-lucius-annaeus-plays__thyestes__latin.json',\n",
520
+ " 'suetonius-ca-69-ca-122__vitellius__english.json',\n",
521
+ " 'prudentius-b-348__liber-peristephanon__latin.json',\n",
522
+ " 'virgil__aeneid__latin.json',\n",
523
+ " 'suetonius-ca-69-ca-122__nero__latin.json',\n",
524
+ " 'suetonius-ca-69-ca-122__tiberius__english.json',\n",
525
+ " 'vopiscus-flavius-fl-34-jh__didius-julianus__latin.json',\n",
526
+ " 'ovid__remedy-of-love__english.json',\n",
527
+ " 'lucretius__de-rerum-natura__english.json',\n",
528
+ " 'plautus-titus-maccius__trinummus__english.json',\n",
529
+ " 'plautus-titus-maccius__mostellaria__english.json',\n",
530
+ " 'suetonius-ca-69-ca-122__divus-augustus__english.json',\n",
531
+ " 'ovid__art-of-love__latin.json',\n",
532
+ " 'seneca-lucius-annaeus-plays__troades-furens__latin.json',\n",
533
+ " 'horace__carmen-saeculare__latin.json',\n",
534
+ " 'virgil__georgics__latin.json',\n",
535
+ " 'ovid__art-of-beauty__latin.json',\n",
536
+ " 'claudianus-claudius__de-raptu-proserpinae__latin.json',\n",
537
+ " 'vopiscus-flavius-fl-34-jh__severus__latin.json',\n",
538
+ " 'prudentius-b-348__apotheosis__latin.json',\n",
539
+ " 'tertullian-ca-160-ca-230__de-pudicitia__latin.json',\n",
540
+ " 'plautus-titus-maccius__casina__latin.json',\n",
541
+ " 'nepos-cornelius__lysander__latin.json',\n",
542
+ " 'prudentius-b-348__praefetio__latin.json',\n",
543
+ " 'cicero__de-inventione__latin.json',\n",
544
+ " 'nepos-cornelius__thrasybulus__latin.json',\n",
545
+ " 'suetonius-ca-69-ca-122__galba__latin.json',\n",
546
+ " 'cicero__for-sulla__english.json',\n",
547
+ " 'ausonius-decimus-magnus__ordo-urbium-nobilium__latin.json',\n",
548
+ " 'cicero__on-pompeys-command__english.json',\n",
549
+ " 'silius-italicus-tiberius-catius__punica__latin.json',\n",
550
+ " 'tertullian-ca-160-ca-230__de-corona__latin.json',\n",
551
+ " 'sallust__catilinae-coniuratio__latin.json',\n",
552
+ " 'tertullian-ca-160-ca-230__adversus-valentinianos__latin.json',\n",
553
+ " 'vopiscus-flavius-fl-34-jh__commodus-antoninus__latin.json',\n",
554
+ " 'ausonius-decimus-magnus__epistulae__latin.json',\n",
555
+ " 'nepos-cornelius__hamilcar__latin.json',\n",
556
+ " 'plautus-titus-maccius__mercator__english.json',\n",
557
+ " 'plautus-titus-maccius__miles-gloriosus__english.json',\n",
558
+ " 'prudentius-b-348__epilogus__latin.json',\n",
559
+ " 'apuleius__florida__latin.json',\n",
560
+ " 'lucretius__de-rerum-natura__latin.json',\n",
561
+ " 'ausonius-decimus-magnus__de-herediolo__latin.json',\n",
562
+ " 'suetonius-ca-69-ca-122__divus-julius__latin.json',\n",
563
+ " 'horace__epodi__latin.json',\n",
564
+ " 'seneca-lucius-annaeus-55-bc-ca-39-ad__excerpta-controversiae__latin.json',\n",
565
+ " 'vopiscus-flavius-fl-34-jh__tyranni-triginta__latin.json',\n",
566
+ " 'cicero__to-the-citizens-after-his-return__latin.json',\n",
567
+ " 'seneca-lucius-annaeus__de-ira__latin.json',\n",
568
+ " 'plautus-titus-maccius__casina__english.json',\n",
569
+ " 'suetonius-ca-69-ca-122__divus-titus__english.json',\n",
570
+ " 'prudentius-b-348__psychomachia__latin.json',\n",
571
+ " 'seneca-lucius-annaeus-plays__phoenissae__latin.json',\n",
572
+ " 'cicero__in-defense-of-caius-rabirius__english.json',\n",
573
+ " 'plautus-titus-maccius__truculentus__english.json',\n",
574
+ " 'suetonius-ca-69-ca-122__otho__latin.json',\n",
575
+ " 'cicero__topica__latin.json',\n",
576
+ " 'seneca-lucius-annaeus__de-providentia__latin.json',\n",
577
+ " 'terence__the-self-tormenter__latin.json',\n",
578
+ " 'suetonius-ca-69-ca-122__nero__english.json',\n",
579
+ " 'ausonius-decimus-magnus__parentalia__latin.json',\n",
580
+ " 'claudianus-claudius__de-consulatu-stilichonis__latin.json',\n",
581
+ " 'suetonius-ca-69-ca-122__vitellius__latin.json',\n",
582
+ " 'tertullian-ca-160-ca-230__de-exhortatione-castitatis-liber__latin.json',\n",
583
+ " 'boethius-d-524__utrum-pater-et-filius-ac-spiritus-sanctus-de-divinitate-substantialiter-praedicentur-liber__latin.json',\n",
584
+ " 'prudentius-b-348__cathemerina__latin.json',\n",
585
+ " 'plautus-titus-maccius__stichus__english.json',\n",
586
+ " 'suetonius-ca-69-ca-122__divus-vespasianus__latin.json',\n",
587
+ " 'suetonius-ca-69-ca-122__otho__english.json',\n",
588
+ " 'cicero__in-defense-of-lucius-flaccus__latin.json',\n",
589
+ " 'vopiscus-flavius-fl-34-jh__clodinus-albinus__latin.json',\n",
590
+ " 'cicero__on-behalf-of-aulus-caecina__latin.json',\n",
591
+ " 'vopiscus-flavius-fl-34-jh__pescennius-niger__latin.json',\n",
592
+ " 'suetonius-ca-69-ca-122__domitianus__latin.json',\n",
593
+ " 'ammianus-marcellinus__rerum-gestarum__latin.json',\n",
594
+ " 'ausonius-decimus-magnus__commemoratio-professorum-burdigalensium__latin.json',\n",
595
+ " 'tertullian-ca-160-ca-230__ad-scapulam__latin.json',\n",
596
+ " 'statius-p-papinius-publius-papinius__achilleis__latin.json',\n",
597
+ " 'tertullian-ca-160-ca-230__de-fuga-in-persecutione__latin.json',\n",
598
+ " 'terence__andria__english.json',\n",
599
+ " 'cicero__for-sextus-roscius-of-ameria__english.json',\n",
600
+ " 'ovid__art-of-beauty__english.json',\n",
601
+ " 'cicero__against-publius-servilius-rullus__latin.json',\n",
602
+ " 'tertullian-ca-160-ca-230__de-anima__latin.json',\n",
603
+ " 'seneca-lucius-annaeus__de-otio-sapientis__latin.json',\n",
604
+ " 'nepos-cornelius__epaminondas__latin.json',\n",
605
+ " 'nepos-cornelius__cimon__latin.json',\n",
606
+ " 'nepos-cornelius__timoleon__latin.json',\n",
607
+ " 'suetonius-ca-69-ca-122__galba__english.json',\n",
608
+ " 'cicero__lucullus__latin.json',\n",
609
+ " 'plautus-titus-maccius__poenulus__latin.json',\n",
610
+ " 'pliny-the-younger__epistulae__latin.json',\n",
611
+ " 'cicero__divinatio-against-q-caecilius__english.json',\n",
612
+ " 'tertullian-ca-160-ca-230__de-paenitentia__latin.json',\n",
613
+ " 'nepos-cornelius__miltiades__latin.json',\n",
614
+ " 'plautus-titus-maccius__stichus__latin.json',\n",
615
+ " 'jerome-saint-d-419-or-20__epistolae__latin.json',\n",
616
+ " 'vopiscus-flavius-fl-34-jh__goridani-tres__latin.json',\n",
617
+ " 'suetonius-ca-69-ca-122__divus-augustus__latin.json',\n",
618
+ " 'virgil__georgics__english.json',\n",
619
+ " 'florus-lucius-annaeus__epitome-rerum-romanorum__latin.json',\n",
620
+ " 'cicero__philippics__english.json',\n",
621
+ " 'boethius-d-524__de-consolatione-philosophiae__latin.json',\n",
622
+ " 'tertullian-ca-160-ca-230__adversus-judaeos-liber__latin.json',\n",
623
+ " 'claudianus-claudius__in-eutropium__latin.json',\n",
624
+ " 'cicero__in-defense-of-l-murena__english.json',\n",
625
+ " 'tertullian-ca-160-ca-230__adversus-praxean__latin.json']"
626
+ ]
627
+ },
628
+ "execution_count": 24,
629
+ "metadata": {},
630
+ "output_type": "execute_result"
631
+ }
632
+ ],
633
+ "source": [
634
+ "os.listdir(perseus_path)"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 25,
640
+ "metadata": {},
641
+ "outputs": [
642
+ {
643
+ "data": {
644
+ "text/plain": [
645
+ "'\\n{\\n \"meta\":\"section\",\\n \"author\":\"cicero\",\\n \"text\":{\\n \"0\":\"\\n credo ego vos, iudices, mirari quid sit quod, cum tot summi oratores \\nhominesque nobilissimi sedeant, ego potissimum surrexerim , is qui \\nneque aetate neque ingenio neque auctoritate sim cum his qui \\nsedeant comparandus. omnes hi quos videtis adesse in hac causa \\niniuriam novo scelere conflatam putant oportere defendi, defendere \\nipsi propter iniquitatem temporum non audent. ita fit ut adsint propterea \\nquod officium sequuntur, taceant autem idcirco quia \\npericulum vitant.\",\\n \"1\":\"\\n quid ergo? audacissimus ego ex omnibus? minime . \\n an \\n tanto officiosior quam ceteri? ne istius quidem laudis ita sum \\ncupidus ut aliis eam praereptam velim. quae me igitur res praeter \\nceteros impulit ut causam Sex. Rosci reciperem? quia , si qui istorum \\ndixisset quos videtis adesse, in quibus summa auctoritas est atque \\namplitudo, si verbum de re publica fecisset, id quod in hac causa fieri \\nnecesse est, multo plura dixisse quam dixisset putaretur.\",\\n \"2\":\"\\n ego autem \\nsi omnia quae dicenda sunt libere dixero, nequaquam tamen similiter \\noratio mea exire atque in volgus emanare poterit. deinde quod \\nceterorum neque dictum obscurum potest esse propter nobilitatem et \\namplitudinem neque temere dicto concedi propter aetatem et\\nprudentiam. ego si quid liberius dixero, vel occultum esse propterea \\nquod nondum ad rem publicam accessi, vel ignosci adulescentiae \\nmeae poterit; tametsi non modo ignoscendi ratio verum etiam \\n cognoscendi consuetudo iam de civitate sublata est.\",\\n \"3\":\"\\n accedit illa \\nquoque causa quod a ceteris forsitan ita petitum sit ut dicerent, ut \\nutrumvis salvo officio se facere posse arbitrarentur; a me autem ei \\ncontenderunt qui apud me et amicitia et beneficiis et dignitate \\nplurimum possunt, quorum ego nec benevolentiam erga me ignorare \\nnec auctoritatem aspernari nec voluntatem neglegere debebam . 
\\n\\n \\n \",\\n \"4\":\"\\n\\n his \\nde causis ego huic causae patronus exstiti, non electus unus qui \\nmaximo ingenio sed relictus ex omnibus qui minimo periculo possem \\ndicere, neque uti satis firmo praesidio defensus Sex. Roscius verum \\nuti ne omnino desertus esset.\\n\\n\\n forsitan quaeratis qui iste terror sit et quae tanta formido quae tot \\nac talis viros impediat quo minus pro capite et fortunis alterius quem \\nad modum consuerunt causam velint dicere. quod adhuc vos \\nignorare non mirum est, propterea quod consulto ab accusatoribus \\neius rei quae conflavit hoc iudicium mentio facta non est.\",\\n \"5\":\"\\n\\n'"
646
+ ]
647
+ },
648
+ "execution_count": 25,
649
+ "metadata": {},
650
+ "output_type": "execute_result"
651
+ }
652
+ ],
653
+ "source": [
654
+ "# there are many json files with the following structure:\n",
655
+ "\n",
656
+ "\"\"\"\n",
657
+ "{\n",
658
+ " \"meta\":\"book-chapter-section\",\n",
659
+ " \"author\":\"ammianus marcellinus.\",\n",
660
+ " \"text\":{\n",
661
+ " \"0\":{\n",
662
+ " \"0\":{\n",
663
+ " \"0\":\" After the survival of the events of an\\n unendurable campaign, when the spirits of both parties,\\n broken by the variety of their dangers and hardships, were still drooping,\\n before the blare of the trumpets had ceased or the soldiers been assigned to\\n their winter quarters, the gusts of raging Fortune brought new storms upon the\\n commonwealth through the misdeeds, many and notorious, of Gallus Caesar.\\n \\n He had been raised, at the very beginning of mature manhood,\\n by an unexpected promotion from the utmost depths of wretchedness to princely\\n heights, and overstepping the bounds of the authority conferred upon him, by\\n excess of violence was causing trouble everywhere. For by his relationship to\\n the imperial stock, and the affinity which he even then had with the name of\\n Constantius, he was\\n raised to such a height of presumption that, if he had been more powerful, he\\n would have ventured (it seemed) upon a course hostile to the author of his good\\n fortune. \",\n",
664
+ " \"1\":\" To his cruelty his wife was besides\\n a serious incentive, a woman beyond measure presumptuous because of her kinship\\n to the emperor, and previously joined in marriage by her father Constantine\\n with his brother's son, King Hanniballianus. She, a Megaera\\n in mortal guise, constantly aroused the\\n savagery of Gallus, being as insatiable as he in her thirst for human blood.\\n The pair in process of time gradually became more expert in doing harm, and\\n through underhand and crafty eavesdroppers, who had the evil habit of lightly\\n adding to their information and wanting to learn only what was false and\\n agreeable to them, they fastened upon innocent victims false charges of\\n aspiring to royal power or of practising magic. \",\n",
665
+ " \"2\":\" There stood out among their lesser atrocities, when their unbridled power\\n had already surpassed the limits of unimportant delinquencies, the sudden and\\n awful death of one Clematius, a nobleman of Alexandria. This man's\\n mother-in-law, it was said, had a violent passion for her son-in-law, but was unable to seduce him; whereupon, gaining entrance to the\\n palace by a back door, she presented the queen with a valuable necklace, and\\n thus secured the dispatch of his death-warrant to Honoratus, at that time Count\\n of the East; and so Clematius, a man contaminated by no guilt, was put to\\n death without being allowed to protest or even to open his lips.\\n \\n \",\n",
666
+ " \"3\":\" After the perpetration of this impious deed,\\n which now began to arouse the fears of others also, as if cruelty were given\\n free rein, some persons were adjudged guilty on the mere shadow of suspicion\\n and condemned. Of these some were put to death, others punished by the\\n confiscation of their property and driven from their homes into exile, where,\\n having nothing left save tears and complaints, they lived on the doles of\\n charity; and since constitutional and just rule had given place to cruel\\n caprice, wealthy and famous houses were being closed. \",\n",
667
+ " \"4\":\" And no words of an accuser, even though bribed, were required amid\\n these accumulations of evils, in order that these crimes might be committed, at\\n least ostensibly, under the forms of law, as has sometimes been done by cruel\\n emperors; but whatever the implacable Caesar had resolved upon was rushed to\\n fulfilment, as if it had been carefully weighed and determined to be right and\\n lawful. \",\n",
668
+ " \"5\":\" It was further devised that sundry\\n low-born men, whose very insignificance made them little to be feared, should\\n be appointed to gather gossip in all quarters of Antioch and\\n report what they had heard. These, as if travellers, and in disguise, attended\\n the gatherings of distinguished citizens, and gained entrance to the houses of\\n the wealthy in the guise of needy clients; then, being secretly admitted to the\\n palace by a back door, they reported whatever they had been able to hear or\\n learn, with one accord making it a rule to add inventions of their own and make\\n doubly worse what they had learned, but suppressing the praise of Caesar which\\n the fear of impending evils extorted from some against their will. \",\n",
669
+ " \"6\":\" And sometimes it happened that if the head of a\\n household, in the seclusion of his private apartments, with no confidential\\n servant present, had whispered something in the ear of his wife, the emperor\\n learned it on the following day, as if it were reported by Amphiaraus or\\n Marcius, those famous seers of old. And so even the walls, the only sharers of secrets, were\\n feared. \",\n",
670
+ " \"7\":\" Moreover, his fixed purpose of\\n ferreting out these and many similar things increased, spurred on by the queen,\\n who pushed her husband's fortunes headlong to sheer ruin, when she ought\\n rather, with womanly gentleness, to have recalled him by helpful counsel to the\\n path of truth and mercy, after the manner of the wife of that savage emperor Maximinus, as we have related in our\\n account of the acts of the Gordians.\\n \\n \\n \",\n",
671
+ " \"8\":\" Finally, following an unprecedented and\\n destructive course, Gallus also ventured to commit the atrocious crime which,\\n to his utter disgrace, Gallienus is said to have once hazarded at Rome. Taking\\n with him a few attendants with concealed weapons, he used to roam at evening\\n about the inns and street-corners, inquiring of every one in Greek, of which he\\n had remarkable command, what he thought of the Caesar. And this he did boldly\\n in a city where the brightness of\\n the lights at night commonly equals the resplendence of day. At last, being\\n often recognized, and reflecting that if he continued that course he would be\\n conspicuous, he appeared only in broad daylight, to attend to matters which he\\n considered important. And all this conduct of his caused very deep sorrow to\\n many.\\n \\n \",\n",
672
+ " \"9\":\" Now at that time Thalassius was the\\n Praetorian Prefect at court, a man who was\\n himself of an imperious character. He, perceiving that Gallus' temper was\\n rising, to the peril of many, did not try to soothe it by ripe counsel, as\\n sometimes high officials have moderated the ire of princes; but rather roused\\n the Caesar to fury by opposing and reproving him at unseasonable times; very\\n frequently he informed the emperor of Gallus' doings, exaggerating them and\\n taking pains—whatever his motive may have been—to do it openly. Through this\\n conduct the Caesar was soon still more violently enraged, and\\n as if raising higher, as it were, the standard of his obstinacy, with no regard\\n for his own life or that of others, he rushed on with uncontrollable\\n impetuosity, like a swift torrent, to overthrow whatever opposed him.\\n \"\n",
673
+ " },\n",
674
+ "\n",
675
+ "\"\"\"\n",
676
+ "\n",
677
+ "# and also, some documents don't have the same depth:\n",
678
+ "\n",
679
+ "\"\"\"\n",
680
+ "{\n",
681
+ " \"meta\":\"section\",\n",
682
+ " \"author\":\"cicero\",\n",
683
+ " \"text\":{\n",
684
+ " \"0\":\"\\n credo ego vos, iudices, mirari quid sit quod, cum tot summi oratores \\nhominesque nobilissimi sedeant, ego potissimum surrexerim , is qui \\nneque aetate neque ingenio neque auctoritate sim cum his qui \\nsedeant comparandus. omnes hi quos videtis adesse in hac causa \\niniuriam novo scelere conflatam putant oportere defendi, defendere \\nipsi propter iniquitatem temporum non audent. ita fit ut adsint propterea \\nquod officium sequuntur, taceant autem idcirco quia \\npericulum vitant.\",\n",
685
+ " \"1\":\"\\n quid ergo? audacissimus ego ex omnibus? minime . \\n an \\n tanto officiosior quam ceteri? ne istius quidem laudis ita sum \\ncupidus ut aliis eam praereptam velim. quae me igitur res praeter \\nceteros impulit ut causam Sex. Rosci reciperem? quia , si qui istorum \\ndixisset quos videtis adesse, in quibus summa auctoritas est atque \\namplitudo, si verbum de re publica fecisset, id quod in hac causa fieri \\nnecesse est, multo plura dixisse quam dixisset putaretur.\",\n",
686
+ " \"2\":\"\\n ego autem \\nsi omnia quae dicenda sunt libere dixero, nequaquam tamen similiter \\noratio mea exire atque in volgus emanare poterit. deinde quod \\nceterorum neque dictum obscurum potest esse propter nobilitatem et \\namplitudinem neque temere dicto concedi propter aetatem et\\nprudentiam. ego si quid liberius dixero, vel occultum esse propterea \\nquod nondum ad rem publicam accessi, vel ignosci adulescentiae \\nmeae poterit; tametsi non modo ignoscendi ratio verum etiam \\n cognoscendi consuetudo iam de civitate sublata est.\",\n",
687
+ " \"3\":\"\\n accedit illa \\nquoque causa quod a ceteris forsitan ita petitum sit ut dicerent, ut \\nutrumvis salvo officio se facere posse arbitrarentur; a me autem ei \\ncontenderunt qui apud me et amicitia et beneficiis et dignitate \\nplurimum possunt, quorum ego nec benevolentiam erga me ignorare \\nnec auctoritatem aspernari nec voluntatem neglegere debebam . \\n\\n \\n \",\n",
688
+ " \"4\":\"\\n\\n his \\nde causis ego huic causae patronus exstiti, non electus unus qui \\nmaximo ingenio sed relictus ex omnibus qui minimo periculo possem \\ndicere, neque uti satis firmo praesidio defensus Sex. Roscius verum \\nuti ne omnino desertus esset.\\n\\n\\n forsitan quaeratis qui iste terror sit et quae tanta formido quae tot \\nac talis viros impediat quo minus pro capite et fortunis alterius quem \\nad modum consuerunt causam velint dicere. quod adhuc vos \\nignorare non mirum est, propterea quod consulto ab accusatoribus \\neius rei quae conflavit hoc iudicium mentio facta non est.\",\n",
689
+ " \"5\":\"\n",
690
+ "\n",
691
+ "\"\"\""
692
+ ]
693
+ },
694
+ {
695
+ "cell_type": "code",
696
+ "execution_count": 26,
697
+ "metadata": {},
698
+ "outputs": [],
699
+ "source": [
700
+ "import re"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": 27,
706
+ "metadata": {},
707
+ "outputs": [],
708
+ "source": [
def get_texts_from_dict(my_dict):
    """Return the values of ``my_dict`` as single-line, whitespace-normalised strings.

    Every run of whitespace (spaces, tabs, newlines) inside a value is
    collapsed to one space, and leading/trailing whitespace is removed so
    that no entry starts or ends with a stray blank.

    Args:
        my_dict: Mapping whose values are all strings.

    Returns:
        A list with one cleaned string per value, in the mapping's
        iteration order. A value made only of whitespace becomes ``""``.
    """
    return [re.sub(r"\s+", " ", text).strip() for text in my_dict.values()]
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": 28,
719
+ "metadata": {},
720
+ "outputs": [],
721
+ "source": [
def recursive_get_texts_from_dict(my_dict, depth):
    """Flatten a dictionary nested ``depth`` levels deep into a list of texts.

    Args:
        my_dict: Dictionary whose values are strings when ``depth`` is 1,
            and dictionaries of the same kind (one level shallower) otherwise.
        depth: Number of dictionary levels between ``my_dict`` and the strings.

    Returns:
        The texts of every innermost dictionary, as produced by
        ``get_texts_from_dict``, concatenated in iteration order.
    """
    # Innermost level: the values are the texts themselves.
    if depth == 1:
        return get_texts_from_dict(my_dict)

    # Otherwise descend into every child and concatenate what comes back.
    collected = []
    for child in my_dict.values():
        collected += recursive_get_texts_from_dict(child, depth=depth - 1)
    return collected
735
+ ]
736
+ },
737
+ {
738
+ "cell_type": "code",
739
+ "execution_count": 29,
740
+ "metadata": {},
741
+ "outputs": [],
742
+ "source": [
def extract_text_from_perseus_dict(
    perseus_dict: dict,
) -> list[str]:
    """Extract every text passage from a parsed Perseus JSON document.

    The passages are stored under ``perseus_dict["text"]`` in dictionaries
    whose nesting depth differs between documents. The depth is measured by
    following the key ``"0"`` downwards for as long as the current value is a
    dictionary that contains it; the result is then handed to
    ``recursive_get_texts_from_dict``.

    Args:
        perseus_dict: Parsed JSON document with a ``"text"`` entry.

    Returns:
        Whatever ``recursive_get_texts_from_dict`` returns for the measured depth.

    Raises:
        KeyError: If ``perseus_dict`` has no ``"text"`` entry.
    """
    perseus_text = perseus_dict["text"]

    # Measure the nesting depth. An explicit type/key test replaces the
    # former bare ``except:``, which also swallowed KeyboardInterrupt and
    # any unrelated error raised inside the loop.
    depth = 0
    node = perseus_text
    while isinstance(node, dict) and "0" in node:
        node = node["0"]
        depth += 1

    return recursive_get_texts_from_dict(perseus_text, depth)
761
+ ]
762
+ },
763
+ {
764
+ "cell_type": "code",
765
+ "execution_count": 30,
766
+ "metadata": {},
767
+ "outputs": [],
768
+ "source": [
769
+ "from tqdm import tqdm"
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "code",
774
+ "execution_count": 31,
775
+ "metadata": {},
776
+ "outputs": [
777
+ {
778
+ "name": "stderr",
779
+ "output_type": "stream",
780
+ "text": [
781
+ "100%|██████████| 376/376 [00:01<00:00, 195.83it/s]\n"
782
+ ]
783
+ }
784
+ ],
785
+ "source": [
# Convert every non-English Perseus JSON file into a plain-text file with one
# passage per line.
for json_path in tqdm(os.listdir(perseus_path)):
    # ``endswith`` (rather than a substring test) keeps this filter in step
    # with ``removesuffix(".json")`` below; files whose name contains
    # "english" are skipped.
    if not json_path.endswith(".json") or "english" in json_path:
        continue

    # load data (explicit encoding: do not depend on the platform default)
    with open(os.path.join(perseus_path, json_path), "r", encoding="utf-8") as f:
        perseus_dict = json.load(f)

    # extract text
    perseus_text = extract_text_from_perseus_dict(perseus_dict)

    # write to a txt file
    new_txt_path = json_path.removesuffix(".json") + ".txt"
    new_txt_path = os.path.join(new_perseus_path, new_txt_path)

    with open(new_txt_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in perseus_text)
803
+ ]
804
+ },
805
+ {
806
+ "attachments": {},
807
+ "cell_type": "markdown",
808
+ "metadata": {},
809
+ "source": [
810
+ "# Processing Tesserae"
811
+ ]
812
+ },
813
+ {
814
+ "cell_type": "code",
815
+ "execution_count": 32,
816
+ "metadata": {},
817
+ "outputs": [
818
+ {
819
+ "data": {
820
+ "text/plain": [
821
+ "'custom_latin_corpus/tesserae'"
822
+ ]
823
+ },
824
+ "execution_count": 32,
825
+ "metadata": {},
826
+ "output_type": "execute_result"
827
+ }
828
+ ],
829
+ "source": [
# Output directory for the cleaned Tesserae texts.
new_tesserae_path = os.path.join(my_corpus_path, "tesserae")
# ``makedirs(exist_ok=True)`` replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(new_tesserae_path, exist_ok=True)
new_tesserae_path
836
+ ]
837
+ },
838
+ {
839
+ "cell_type": "code",
840
+ "execution_count": 33,
841
+ "metadata": {},
842
+ "outputs": [
843
+ {
844
+ "data": {
845
+ "text/plain": [
846
+ "'\\n<amm. 18.2.7> Quae dum diligenti maturantur effectu, Hariobaudes exploratis omnibus rediit, docuitque comperta. Post cuius adventum incitatis viribus omnes venere Mogontiacum, ubi Florentio et Lupicino (Severi successore) destinate certantibus, per pontem illic constitutum transiri debere, renitebatur firmissime Caesar, asserens pacatorum terras non debere calcari, ne (ut saepe contigit) per incivilitatem militis occurrentia vastitantis, abrupte foedera frangerentur.\\n<amm. 18.2.8> Alamanni tamen omnes quos petebat exercitus, confine periculum cogitantes, Suomarium regem amicum nobis ex pactione praeterita monuerunt minaciter, ut a transitu Romanos arceret. Eius enim pagi Rheni ripis ulterioribus adhaerebant. Quo testante resistere solum non posse, in unum coacta barbara multitude venit prope Mogontiacum, prohibitura viribus magnis exercitum, ne transmitteret flumen.\\n<amm. 18.2.9> Gemina itaque ratione visum est habile quod suaserat Caesar, ne pacatorum terrae corrumperentur, neve renitente pugnacissima plebe, pons cum multorum discrimine iungeretur iri in locum ad compaginandum pontem aptissimum.\\n'"
847
+ ]
848
+ },
849
+ "execution_count": 33,
850
+ "metadata": {},
851
+ "output_type": "execute_result"
852
+ }
853
+ ],
854
+ "source": [
855
+ "# files (extension .tess) are of the format:\n",
856
+ "\"\"\"\n",
857
+ "<amm. 18.2.7> Quae dum diligenti maturantur effectu, Hariobaudes exploratis omnibus rediit, docuitque comperta. Post cuius adventum incitatis viribus omnes venere Mogontiacum, ubi Florentio et Lupicino (Severi successore) destinate certantibus, per pontem illic constitutum transiri debere, renitebatur firmissime Caesar, asserens pacatorum terras non debere calcari, ne (ut saepe contigit) per incivilitatem militis occurrentia vastitantis, abrupte foedera frangerentur.\n",
858
+ "<amm. 18.2.8> Alamanni tamen omnes quos petebat exercitus, confine periculum cogitantes, Suomarium regem amicum nobis ex pactione praeterita monuerunt minaciter, ut a transitu Romanos arceret. Eius enim pagi Rheni ripis ulterioribus adhaerebant. Quo testante resistere solum non posse, in unum coacta barbara multitude venit prope Mogontiacum, prohibitura viribus magnis exercitum, ne transmitteret flumen.\n",
859
+ "<amm. 18.2.9> Gemina itaque ratione visum est habile quod suaserat Caesar, ne pacatorum terrae corrumperentur, neve renitente pugnacissima plebe, pons cum multorum discrimine iungeretur iri in locum ad compaginandum pontem aptissimum.\n",
860
+ "\"\"\""
861
+ ]
862
+ },
863
+ {
864
+ "cell_type": "code",
865
+ "execution_count": 34,
866
+ "metadata": {},
867
+ "outputs": [
868
+ {
869
+ "name": "stderr",
870
+ "output_type": "stream",
871
+ "text": [
872
+ "100%|██████████| 750/750 [00:02<00:00, 345.63it/s]\n"
873
+ ]
874
+ }
875
+ ],
876
+ "source": [
# Convert every .tess file into a plain-text file with one passage per line.
for tess_path in tqdm(os.listdir(tesserae_path)):
    # ``endswith`` keeps the filter in step with ``removesuffix(".tess")`` below.
    if not tess_path.endswith(".tess"):
        continue

    cleaned_lines = []
    # NOTE(review): the corpus is assumed to be UTF-8 — confirm. The explicit
    # encoding only removes the dependency on the platform default.
    with open(os.path.join(tesserae_path, tess_path), "r", encoding="utf-8") as f:
        for raw_line in f:
            # Remove the <...> tags, then the surrounding whitespace. This
            # includes the trailing newline, which used to be kept and then
            # doubled by the writer below (a blank line after every passage).
            cleaned = re.sub(r"<(.)*?>", "", raw_line).strip()
            if cleaned:
                cleaned_lines.append(cleaned)

    new_txt_path = tess_path.removesuffix(".tess") + ".txt"
    new_txt_path = os.path.join(new_tesserae_path, new_txt_path)

    with open(new_txt_path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in cleaned_lines)
891
+ ]
892
+ },
893
+ {
894
+ "attachments": {},
895
+ "cell_type": "markdown",
896
+ "metadata": {},
897
+ "source": [
898
+ "# Processing Corpus grammaticorum"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": 35,
904
+ "metadata": {},
905
+ "outputs": [],
906
+ "source": [
# Output directory for the cleaned Corpus grammaticorum texts.
new_corpus_grammaticorum_path = os.path.join(my_corpus_path, "corpus_grammaticorum")
# ``makedirs(exist_ok=True)`` replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(new_corpus_grammaticorum_path, exist_ok=True)
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": 36,
915
+ "metadata": {},
916
+ "outputs": [
917
+ {
918
+ "data": {
919
+ "text/plain": [
920
+ "'\\n{\\n \"author\":\"Alcuinus\",\\n \"title\":\"Alcuinus de orthographia\",\\n \"edition_new\":\"S. Bruni 1997\",\\n \"edition_keil\":\"GL 7,295-312\",\\n \"text\":\" me legat antiquas cupiat qui scire loquelas / me spernens loquitur mox sine lege patrum. | <littera a> aeternus, aetas, aeuum per duo u, aequitas, aequus id est iustus, haec omnia per ae diptongon scribenda sunt; equus, si animal significat per \\n\\\\nsimplicem e. |accusso per duo c et per duo s scribi debet, accedo per duo c. saepe ad, euphoniae causa, in sequentem mutabitur consonantem, ut afficio \\n\\\\naffluo a\\n\\n'"
921
+ ]
922
+ },
923
+ "execution_count": 36,
924
+ "metadata": {},
925
+ "output_type": "execute_result"
926
+ }
927
+ ],
928
+ "source": [
929
+ "# JSON files with the following format\n",
930
+ "\"\"\"\n",
931
+ "{\n",
932
+ " \"author\":\"Alcuinus\",\n",
933
+ " \"title\":\"Alcuinus de orthographia\",\n",
934
+ " \"edition_new\":\"S. Bruni 1997\",\n",
935
+ " \"edition_keil\":\"GL 7,295-312\",\n",
936
+ " \"text\":\" me legat antiquas cupiat qui scire loquelas / me spernens loquitur mox sine lege patrum. | <littera a> aeternus, aetas, aeuum per duo u, aequitas, aequus id est iustus, haec omnia per ae diptongon scribenda sunt; equus, si animal significat per \n",
937
+ "\\\\nsimplicem e. |accusso per duo c et per duo s scribi debet, accedo per duo c. saepe ad, euphoniae causa, in sequentem mutabitur consonantem, ut afficio \n",
938
+ "\\\\naffluo a\n",
939
+ "\n",
940
+ "\"\"\""
941
+ ]
942
+ },
943
+ {
944
+ "cell_type": "code",
945
+ "execution_count": 37,
946
+ "metadata": {},
947
+ "outputs": [],
948
+ "source": [
# The source files contain double-escaped newlines (backslash, backslash, n)
# that have to be repaired before the text can be decoded as intended.
class LazyDecoder(json.JSONDecoder):
    """JSON decoder that repairs double-escaped newlines before decoding.

    Usage: ``json.load(f, cls=LazyDecoder)`` or ``json.loads(s, cls=LazyDecoder)``.
    """

    # (compiled pattern, replacement template) pairs, applied in order.
    _REPLACEMENTS = (
        # Turn backslash-backslash-n into the JSON escape backslash-n. The
        # template must be r"\\n": ``re.sub`` processes escapes in the
        # template, so the former r"\n" inserted a raw newline character,
        # which the strict JSON decoder rejects as an invalid control character.
        (re.compile(r"\\\\n"), r"\\n"),
        # Disabled experiments, kept for reference:
        # (re.compile(r'([^\\])\\([^\\])'), r'\1\\\\\2'),
        # (re.compile(r',(\s*])'), r'\1'),
    )

    def decode(self, s, **kwargs):
        """Apply the textual repairs to ``s``, then decode it as JSON.

        Args:
            s: The JSON document as a string.
            **kwargs: Forwarded unchanged to ``json.JSONDecoder.decode``.

        Returns:
            The decoded Python object.
        """
        for regex, replacement in self._REPLACEMENTS:
            s = regex.sub(replacement, s)
        return super().decode(s, **kwargs)
960
+ ]
961
+ },
962
+ {
963
+ "cell_type": "code",
964
+ "execution_count": 40,
965
+ "metadata": {},
966
+ "outputs": [
967
+ {
968
+ "name": "stderr",
969
+ "output_type": "stream",
970
+ "text": [
971
+ "100%|██████████| 44/44 [00:00<00:00, 78.65it/s] \n"
972
+ ]
973
+ },
974
+ {
975
+ "data": {
976
+ "text/plain": [
977
+ "0"
978
+ ]
979
+ },
980
+ "execution_count": 40,
981
+ "metadata": {},
982
+ "output_type": "execute_result"
983
+ }
984
+ ],
985
+ "source": [
# Regex clean-up applied, in this order, to the raw file contents before they
# are parsed as JSON.
cleanup_rules = [
    (r"\\\\n", ""),             # double-escaped newlines (backslash, backslash, n)
    (r"\\.", ""),               # any remaining backslash escape
    # Numbered separators such as |12|. This rule used to run after the bare
    # pipes had already been replaced, so it could never match. It inserts a
    # space so that the neighbouring words are not glued together.
    (r"\|\d+\|", " "),
    (r"\|", " "),
    (r"/", " "),
    (r"[<>\[\]\(\)»«]", ""),
    (r"\d+", ""),               # page numbers
    # Collapsed last so that the deletions above leave no double spaces, and
    # so that the multi-line "text" value fits the single-line regex below.
    (r"\s+", " "),
]


def replacement(match):
    """Rebuild the ``"text"`` entry with every inner quotation mark removed."""
    return '"text": "' + match.group(1).replace('"', '') + '"\n}'


# Convert every JSON file of every author sub-directory into a plain-text
# file; ``errors`` counts the files that still fail to parse after clean-up.
errors = 0
for autor_path in tqdm(os.listdir(corpus_grammaticorum_path)):
    sub_dir = os.path.join(corpus_grammaticorum_path, autor_path)
    if not os.path.isdir(sub_dir):
        continue

    for json_path in os.listdir(sub_dir):
        # ``endswith`` keeps the filter in step with ``removesuffix`` below.
        if not json_path.endswith(".json"):
            continue

        full_path = os.path.join(sub_dir, json_path)
        # Read with the same encoding the output is written with.
        with open(full_path, encoding="utf-8") as f:
            content_raw = f.read()

        for pattern, substitute in cleanup_rules:
            content_raw = re.sub(pattern, substitute, content_raw)

        # delete some extra quotation marks inside the text value
        content_raw = re.sub(r'"text": "(.*?)"\s}', replacement, content_raw)

        try:
            content_dict = json.loads(content_raw)
        except json.JSONDecodeError as e:
            # Report the offending file and carry on with the rest.
            print(str(e))
            print(json_path)
            print(full_path)
            print(content_raw)
            errors += 1
            continue

        text = content_dict["text"]

        new_txt_path = json_path.removesuffix(".json") + ".txt"
        new_txt_path = os.path.join(new_corpus_grammaticorum_path, new_txt_path)

        with open(new_txt_path, "w", encoding="utf-8") as f:
            # Newline-terminated, like the other converted corpora, so the
            # file can be concatenated with others without fusing lines.
            f.write(text.strip() + "\n")

errors
1055
+ ]
1056
+ },
1057
+ {
1058
+ "cell_type": "code",
1059
+ "execution_count": 39,
1060
+ "metadata": {},
1061
+ "outputs": [
1062
+ {
1063
+ "data": {
1064
+ "text/plain": [
1065
+ "44"
1066
+ ]
1067
+ },
1068
+ "execution_count": 39,
1069
+ "metadata": {},
1070
+ "output_type": "execute_result"
1071
+ }
1072
+ ],
1073
+ "source": [
1074
+ "len(os.listdir(corpus_grammaticorum_path))"
1075
+ ]
1076
+ }
1077
+ ],
1078
+ "metadata": {
1079
+ "kernelspec": {
1080
+ "display_name": "bertenv",
1081
+ "language": "python",
1082
+ "name": "python3"
1083
+ },
1084
+ "language_info": {
1085
+ "codemirror_mode": {
1086
+ "name": "ipython",
1087
+ "version": 3
1088
+ },
1089
+ "file_extension": ".py",
1090
+ "mimetype": "text/x-python",
1091
+ "name": "python",
1092
+ "nbconvert_exporter": "python",
1093
+ "pygments_lexer": "ipython3",
1094
+ "version": "3.9.2"
1095
+ },
1096
+ "orig_nbformat": 4
1097
+ },
1098
+ "nbformat": 4,
1099
+ "nbformat_minor": 2
1100
+ }
training_notebooks/02_more_preprocessing_and_training_WP_tokenizer.ipynb ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Training a WordPiece tokenizer, following the Hugging Face tokenizer-training notebook\n"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 3,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "# https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 4,
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
# join all txt files into a single txt file
import os
from pathlib import Path
import time

# Sorted because directory listing order is not guaranteed; this keeps the
# document order, and therefore the corpus file, reproducible.
paths = sorted(str(x) for x in Path("./custom_latin_corpus").glob("**/*.txt"))

# One entry per file: the complete contents of that file.
all_text = []
for path in paths:
    # Explicit encoding: do not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        all_text.append(f.read())

# text batch size
batch_size = 100


def batch_iterator():
    """Yield consecutive slices of ``all_text`` holding at most ``batch_size`` documents."""
    for start in range(0, len(all_text), batch_size):
        yield all_text[start : start + batch_size]
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 5,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
# Write every document into the single training file.
with open("03_full_latin_corpus_for_training.txt", "w", encoding="utf-8") as f:
    for document in all_text:
        f.write(document)
        # A document without a trailing newline would otherwise have its
        # last line fused with the first line of the next document.
        if document and not document.endswith("\n"):
            f.write("\n")
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 6,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": [
61
+ "\n",
62
+ "from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 7,
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
# Special tokens handed to the trainer in a later cell.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Untrained WordPiece tokenizer with BERT-style lowercasing normalisation and
# BERT-style pre-tokenisation.
wordpiece_model = models.WordPiece(unk_token="[UNK]")
tokenizer = Tokenizer(wordpiece_model)
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# Quick check of the pre-tokenizer on a sample sentence (result not kept).
tokenizer.pre_tokenizer.pre_tokenize_str("This is an example!")
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 8,
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "\n",
# Settings for training the WordPiece vocabulary.
trainer_options = {
    "vocab_size": 25000,
    "special_tokens": special_tokens,
    "min_frequency": 2,
    "limit_alphabet": 50,
}
trainer = trainers.WordPieceTrainer(**trainer_options)
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 9,
96
+ "metadata": {},
97
+ "outputs": [
98
+ {
99
+ "name": "stdout",
100
+ "output_type": "stream",
101
+ "text": [
102
+ "\n",
103
+ "\n",
104
+ "\n"
105
+ ]
106
+ }
107
+ ],
108
+ "source": [
109
+ "\n",
110
+ "\n",
111
+ "\n",
112
+ "tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)\n",
113
+ "\n"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 10,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "name": "stdout",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "2 3\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
# now, define the post processor
# Look up the ids the trained vocabulary assigned to [CLS] and [SEP].
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

# Templates for a single sequence and for a sequence pair; the number after
# each colon is the type id given to that part.
single_template = "[CLS]:0 $A:0 [SEP]:0"
pair_template = "[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1"

tokenizer.post_processor = processors.TemplateProcessing(
    single=single_template,
    pair=pair_template,
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 11,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
# Attach the WordPiece decoder ("##" is the continuation prefix).
tokenizer.decoder = decoders.WordPiece(prefix="##")

# test an example: encode a sentence pair and show its tokens. The token
# list is now the last expression of the cell so the notebook displays it;
# before, it was evaluated in the middle of the cell and silently discarded.
example_encoding = tokenizer.encode("Roma in Italia est.", "Italia in Europa est.")
example_encoding.tokens
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 12,
159
+ "metadata": {},
160
+ "outputs": [
161
+ {
162
+ "data": {
163
+ "text/plain": [
164
+ "('latin_WP_tokenizer/tokenizer_config.json',\n",
165
+ " 'latin_WP_tokenizer/special_tokens_map.json',\n",
166
+ " 'latin_WP_tokenizer/vocab.txt',\n",
167
+ " 'latin_WP_tokenizer/added_tokens.json',\n",
168
+ " 'latin_WP_tokenizer/tokenizer.json')"
169
+ ]
170
+ },
171
+ "execution_count": 12,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
# wrap it inside transformers object

from transformers import BertTokenizerFast

# Directory the wrapped tokenizer is saved to.
tokenizer_dir = "latin_WP_tokenizer"

new_wp_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)
# Last expression of the cell, so the saved file paths are displayed.
new_wp_tokenizer.save_pretrained(tokenizer_dir)
185
+ ]
186
+ }
187
+ ],
188
+ "metadata": {
189
+ "kernelspec": {
190
+ "display_name": "bertenv",
191
+ "language": "python",
192
+ "name": "python3"
193
+ },
194
+ "language_info": {
195
+ "codemirror_mode": {
196
+ "name": "ipython",
197
+ "version": 3
198
+ },
199
+ "file_extension": ".py",
200
+ "mimetype": "text/x-python",
201
+ "name": "python",
202
+ "nbconvert_exporter": "python",
203
+ "pygments_lexer": "ipython3",
204
+ "version": "3.9.2"
205
+ },
206
+ "orig_nbformat": 4
207
+ },
208
+ "nbformat": 4,
209
+ "nbformat_minor": 2
210
+ }
training_notebooks/04_train_mlm_script.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
# Pre-trains a BERT masked language model from scratch on the Latin corpus.
# Adapted from:
# https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb#scrollTo=VNZZs-r6iKAV

from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import LineByLineTextDataset
from transformers import BertForMaskedLM
from transformers import BertTokenizerFast, BertTokenizer
from transformers import BertConfig
import torch
import time

# Inputs and outputs of the run.
TOKENIZER_DIR = "latin_WP_tokenizer"
CORPUS_FILE = "03_full_latin_corpus_for_training.txt"
CHECKPOINT_DIR = "./Latin_BERT_training_2"
MODEL_DIR = "./latin_BERT_2"

# Size of the model's embedding table; it must cover every token id the
# tokenizer can produce.
VOCAB_SIZE = 25000


def main():
    """Train the masked language model once and save it to ``MODEL_DIR``.

    Raises:
        ValueError: If the tokenizer holds more tokens than ``VOCAB_SIZE``.
    """
    print(torch.cuda.is_available())

    # Start the timer
    start_time = time.time()

    tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_DIR)
    # Fail early with a clear message instead of an index error deep inside
    # the embedding lookup during training.
    if len(tokenizer) > VOCAB_SIZE:
        raise ValueError(
            f"Tokenizer has {len(tokenizer)} tokens but the model is configured "
            f"for only {VOCAB_SIZE}; token ids would fall outside the embedding table."
        )

    # Define a config
    config = BertConfig(
        vocab_size=VOCAB_SIZE,
        max_position_embeddings=512,
        num_attention_heads=12,
        num_hidden_layers=12,
        type_vocab_size=2,
    )

    # initialize from config to train from scratch
    model = BertForMaskedLM(config=config)
    print(f"There are {model.num_parameters()} parameters")

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=CORPUS_FILE,
        block_size=128,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # initialize trainer
    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        overwrite_output_dir=True,
        # num_train_epochs=3,  # like the original BERT
        num_train_epochs=1,  # just one epoch
        per_device_train_batch_size=64,
        save_steps=10000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    # now train
    trainer.train()
    trainer.save_model(MODEL_DIR)

    # Report the elapsed wall-clock time
    elapsed_time = time.time() - start_time
    print(f"Elapsed time: {elapsed_time} seconds")


# Guarded so that importing this module does not start a training run.
if __name__ == "__main__":
    main()
training_notebooks/05_loading_and_saving_pipeline.ipynb ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import pipeline"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "fill_mask = pipeline(\n",
19
+ " \"fill-mask\",\n",
20
+ " model=\"latin_BERT_2\",\n",
21
+ " tokenizer=\"latin_WP_tokenizer\"\n",
22
+ ")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "fill_mask.save_pretrained(\"latin_BERT_final\")"
32
+ ]
33
+ }
34
+ ],
35
+ "metadata": {
36
+ "kernelspec": {
37
+ "display_name": "dhvenv",
38
+ "language": "python",
39
+ "name": "python3"
40
+ },
41
+ "language_info": {
42
+ "codemirror_mode": {
43
+ "name": "ipython",
44
+ "version": 3
45
+ },
46
+ "file_extension": ".py",
47
+ "mimetype": "text/x-python",
48
+ "name": "python",
49
+ "nbconvert_exporter": "python",
50
+ "pygments_lexer": "ipython3",
51
+ "version": "3.10.6"
52
+ },
53
+ "orig_nbformat": 4
54
+ },
55
+ "nbformat": 4,
56
+ "nbformat_minor": 2
57
+ }
training_notebooks/05_testing_latin_bert.ipynb ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import pipeline\n",
10
+ "\n",
11
+ "fill_mask = pipeline(\n",
12
+ " \"fill-mask\",\n",
13
+ " model=\"latin_BERT_final\",\n",
14
+ " tokenizer=\"latin_WP_tokenizer\"\n",
15
+ ")"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 5,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "False False False False False\n"
28
+ ]
29
+ },
30
+ {
31
+ "data": {
32
+ "text/plain": [
33
+ "[{'score': 0.05949964374303818,\n",
34
+ " 'token': 18,\n",
35
+ " 'token_str': '.',\n",
36
+ " 'sequence': 'roma in. est.'},\n",
37
+ " {'score': 0.05125246196985245,\n",
38
+ " 'token': 16,\n",
39
+ " 'token_str': ',',\n",
40
+ " 'sequence': 'roma in, est.'},\n",
41
+ " {'score': 0.014972569420933723,\n",
42
+ " 'token': 870,\n",
43
+ " 'token_str': 'et',\n",
44
+ " 'sequence': 'roma in et est.'},\n",
45
+ " {'score': 0.009671307168900967,\n",
46
+ " 'token': 879,\n",
47
+ " 'token_str': '##que',\n",
48
+ " 'sequence': 'roma inque est.'},\n",
49
+ " {'score': 0.007990601472556591,\n",
50
+ " 'token': 30,\n",
51
+ " 'token_str': ':',\n",
52
+ " 'sequence': 'roma in : est.'}]"
53
+ ]
54
+ },
55
+ "execution_count": 5,
56
+ "metadata": {},
57
+ "output_type": "execute_result"
58
+ }
59
+ ],
60
+ "source": [
61
+ "fill_mask(\"Roma in [MASK] est.\")"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 6,
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "name": "stdout",
71
+ "output_type": "stream",
72
+ "text": [
73
+ "False False False False False\n"
74
+ ]
75
+ },
76
+ {
77
+ "data": {
78
+ "text/plain": [
79
+ "[{'score': 0.05949964374303818,\n",
80
+ " 'token': 18,\n",
81
+ " 'token_str': '.',\n",
82
+ " 'sequence': 'ubi est.?.'},\n",
83
+ " {'score': 0.05125246196985245,\n",
84
+ " 'token': 16,\n",
85
+ " 'token_str': ',',\n",
86
+ " 'sequence': 'ubi est,?.'},\n",
87
+ " {'score': 0.014972569420933723,\n",
88
+ " 'token': 870,\n",
89
+ " 'token_str': 'et',\n",
90
+ " 'sequence': 'ubi est et?.'},\n",
91
+ " {'score': 0.009671302512288094,\n",
92
+ " 'token': 879,\n",
93
+ " 'token_str': '##que',\n",
94
+ " 'sequence': 'ubi estque?.'},\n",
95
+ " {'score': 0.007990599609911442,\n",
96
+ " 'token': 30,\n",
97
+ " 'token_str': ':',\n",
98
+ " 'sequence': 'ubi est :?.'}]"
99
+ ]
100
+ },
101
+ "execution_count": 6,
102
+ "metadata": {},
103
+ "output_type": "execute_result"
104
+ }
105
+ ],
106
+ "source": [
107
+ "fill_mask(\"Ubi est [MASK] ?.\")"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 7,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "False False False False False\n"
120
+ ]
121
+ },
122
+ {
123
+ "data": {
124
+ "text/plain": [
125
+ "[{'score': 0.05949964374303818,\n",
126
+ " 'token': 18,\n",
127
+ " 'token_str': '.',\n",
128
+ " 'sequence': 'de honoratorum.'},\n",
129
+ " {'score': 0.05125246196985245,\n",
130
+ " 'token': 16,\n",
131
+ " 'token_str': ',',\n",
132
+ " 'sequence': 'de honoratorum,'},\n",
133
+ " {'score': 0.014972569420933723,\n",
134
+ " 'token': 870,\n",
135
+ " 'token_str': 'et',\n",
136
+ " 'sequence': 'de honoratorum et'},\n",
137
+ " {'score': 0.009671302512288094,\n",
138
+ " 'token': 879,\n",
139
+ " 'token_str': '##que',\n",
140
+ " 'sequence': 'de honoratorumque'},\n",
141
+ " {'score': 0.007990601472556591,\n",
142
+ " 'token': 30,\n",
143
+ " 'token_str': ':',\n",
144
+ " 'sequence': 'de honoratorum :'}]"
145
+ ]
146
+ },
147
+ "execution_count": 7,
148
+ "metadata": {},
149
+ "output_type": "execute_result"
150
+ }
151
+ ],
152
+ "source": [
153
+ "# De honoratorum vehiculis\n",
154
+ "fill_mask(\"De honoratorum [MASK]\")"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": []
163
+ }
164
+ ],
165
+ "metadata": {
166
+ "kernelspec": {
167
+ "display_name": "bertenv",
168
+ "language": "python",
169
+ "name": "python3"
170
+ },
171
+ "language_info": {
172
+ "codemirror_mode": {
173
+ "name": "ipython",
174
+ "version": 3
175
+ },
176
+ "file_extension": ".py",
177
+ "mimetype": "text/x-python",
178
+ "name": "python",
179
+ "nbconvert_exporter": "python",
180
+ "pygments_lexer": "ipython3",
181
+ "version": "3.9.2"
182
+ },
183
+ "orig_nbformat": 4
184
+ },
185
+ "nbformat": 4,
186
+ "nbformat_minor": 2
187
+ }
training_notebooks/06_testing_latin_bert.ipynb ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import pipeline\n",
10
+ "\n",
11
+ "fill_mask = pipeline(\n",
12
+ " \"fill-mask\",\n",
13
+ " \"latin_BERT_final\"\n",
14
+ ")"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "data": {
24
+ "text/plain": [
25
+ "[{'score': 0.010115200653672218,\n",
26
+ " 'token': 265,\n",
27
+ " 'token_str': 'hoc',\n",
28
+ " 'sequence': 'roma in hoc est.'},\n",
29
+ " {'score': 0.004335678182542324,\n",
30
+ " 'token': 1156,\n",
31
+ " 'token_str': 'deo',\n",
32
+ " 'sequence': 'roma in deo est.'},\n",
33
+ " {'score': 0.003660168731585145,\n",
34
+ " 'token': 146,\n",
35
+ " 'token_str': 'non',\n",
36
+ " 'sequence': 'roma in non est.'},\n",
37
+ " {'score': 0.0034285704605281353,\n",
38
+ " 'token': 745,\n",
39
+ " 'token_str': 'nobis',\n",
40
+ " 'sequence': 'roma in nobis est.'},\n",
41
+ " {'score': 0.0032894855830818415,\n",
42
+ " 'token': 971,\n",
43
+ " 'token_str': 'rebus',\n",
44
+ " 'sequence': 'roma in rebus est.'}]"
45
+ ]
46
+ },
47
+ "execution_count": 2,
48
+ "metadata": {},
49
+ "output_type": "execute_result"
50
+ }
51
+ ],
52
+ "source": [
53
+ "fill_mask(\"Roma in [MASK] est.\")"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 3,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/plain": [
64
+ "[{'score': 0.04990068078041077,\n",
65
+ " 'token': 145,\n",
66
+ " 'token_str': 'est',\n",
67
+ " 'sequence': 'ubi est est?.'},\n",
68
+ " {'score': 0.01739734411239624,\n",
69
+ " 'token': 215,\n",
70
+ " 'token_str': 'quid',\n",
71
+ " 'sequence': 'ubi est quid?.'},\n",
72
+ " {'score': 0.008733403868973255,\n",
73
+ " 'token': 391,\n",
74
+ " 'token_str': 'mihi',\n",
75
+ " 'sequence': 'ubi est mihi?.'},\n",
76
+ " {'score': 0.007146364543586969,\n",
77
+ " 'token': 368,\n",
78
+ " 'token_str': 'sum',\n",
79
+ " 'sequence': 'ubi est sum?.'},\n",
80
+ " {'score': 0.006486538797616959,\n",
81
+ " 'token': 425,\n",
82
+ " 'token_str': 'tibi',\n",
83
+ " 'sequence': 'ubi est tibi?.'}]"
84
+ ]
85
+ },
86
+ "execution_count": 3,
87
+ "metadata": {},
88
+ "output_type": "execute_result"
89
+ }
90
+ ],
91
+ "source": [
92
+ "fill_mask(\"Ubi est [MASK] ?.\")"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 4,
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "data": {
102
+ "text/plain": [
103
+ "[{'score': 0.7615262269973755,\n",
104
+ " 'token': 12,\n",
105
+ " 'token_str': '.',\n",
106
+ " 'sequence': 'de honoratorum.'},\n",
107
+ " {'score': 0.03385818004608154,\n",
108
+ " 'token': 23,\n",
109
+ " 'token_str': ':',\n",
110
+ " 'sequence': 'de honoratorum :'},\n",
111
+ " {'score': 0.02129465527832508,\n",
112
+ " 'token': 10,\n",
113
+ " 'token_str': ',',\n",
114
+ " 'sequence': 'de honoratorum,'},\n",
115
+ " {'score': 0.014383483678102493,\n",
116
+ " 'token': 25,\n",
117
+ " 'token_str': '?',\n",
118
+ " 'sequence': 'de honoratorum?'},\n",
119
+ " {'score': 0.008870471268892288,\n",
120
+ " 'token': 109,\n",
121
+ " 'token_str': 'et',\n",
122
+ " 'sequence': 'de honoratorum et'}]"
123
+ ]
124
+ },
125
+ "execution_count": 4,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "# De honoratorum vehiculis\n",
132
+ "fill_mask(\"De honoratorum [MASK]\")"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 5,
138
+ "metadata": {},
139
+ "outputs": [
140
+ {
141
+ "data": {
142
+ "text/plain": [
143
+ "[{'score': 0.00837777741253376,\n",
144
+ " 'token': 1838,\n",
145
+ " 'token_str': 'urbe',\n",
146
+ " 'sequence': 'gallia est omnis divisa in urbe tres'},\n",
147
+ " {'score': 0.007593569345772266,\n",
148
+ " 'token': 1628,\n",
149
+ " 'token_str': 'corpore',\n",
150
+ " 'sequence': 'gallia est omnis divisa in corpore tres'},\n",
151
+ " {'score': 0.007336211856454611,\n",
152
+ " 'token': 2035,\n",
153
+ " 'token_str': 'medio',\n",
154
+ " 'sequence': 'gallia est omnis divisa in medio tres'},\n",
155
+ " {'score': 0.006218622904270887,\n",
156
+ " 'token': 983,\n",
157
+ " 'token_str': 'parte',\n",
158
+ " 'sequence': 'gallia est omnis divisa in parte tres'},\n",
159
+ " {'score': 0.0054352362640202045,\n",
160
+ " 'token': 238,\n",
161
+ " 'token_str': 'quo',\n",
162
+ " 'sequence': 'gallia est omnis divisa in quo tres'}]"
163
+ ]
164
+ },
165
+ "execution_count": 5,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "fill_mask(\"Gallia est omnis divisa in [MASK] tres\")"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 7,
177
+ "metadata": {},
178
+ "outputs": [
179
+ {
180
+ "data": {
181
+ "text/plain": [
182
+ "[{'score': 0.008044794201850891,\n",
183
+ " 'token': 1628,\n",
184
+ " 'token_str': 'corpore',\n",
185
+ " 'sequence': 'gallia est omnis divisa in corpore tres.'},\n",
186
+ " {'score': 0.00732386251911521,\n",
187
+ " 'token': 1838,\n",
188
+ " 'token_str': 'urbe',\n",
189
+ " 'sequence': 'gallia est omnis divisa in urbe tres.'},\n",
190
+ " {'score': 0.0072334203869104385,\n",
191
+ " 'token': 983,\n",
192
+ " 'token_str': 'parte',\n",
193
+ " 'sequence': 'gallia est omnis divisa in parte tres.'},\n",
194
+ " {'score': 0.006316048558801413,\n",
195
+ " 'token': 2035,\n",
196
+ " 'token_str': 'medio',\n",
197
+ " 'sequence': 'gallia est omnis divisa in medio tres.'},\n",
198
+ " {'score': 0.004988126456737518,\n",
199
+ " 'token': 1177,\n",
200
+ " 'token_str': 'terra',\n",
201
+ " 'sequence': 'gallia est omnis divisa in terra tres.'}]"
202
+ ]
203
+ },
204
+ "execution_count": 7,
205
+ "metadata": {},
206
+ "output_type": "execute_result"
207
+ }
208
+ ],
209
+ "source": [
210
+ "# Adding a period changes the predictions!\n",
211
+ "fill_mask(\"Gallia est omnis divisa in [MASK] tres.\")"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": []
220
+ }
221
+ ],
222
+ "metadata": {
223
+ "kernelspec": {
224
+ "display_name": "bertenv",
225
+ "language": "python",
226
+ "name": "python3"
227
+ },
228
+ "language_info": {
229
+ "codemirror_mode": {
230
+ "name": "ipython",
231
+ "version": 3
232
+ },
233
+ "file_extension": ".py",
234
+ "mimetype": "text/x-python",
235
+ "name": "python",
236
+ "nbconvert_exporter": "python",
237
+ "pygments_lexer": "ipython3",
238
+ "version": "3.10.6"
239
+ },
240
+ "orig_nbformat": 4
241
+ },
242
+ "nbformat": 4,
243
+ "nbformat_minor": 2
244
+ }