Colin Leong committed on
Commit
09baee5
·
1 Parent(s): f44afa7

CDL: initial form

Browse files
Files changed (1) hide show
  1. app.py +408 -0
app.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from uuid import uuid4
3
+ import langcodes
4
+ import itertools
5
+
6
# English names of example languages, used as the default contents of the
# "Language names" text area. Order is preserved from the original listing.
example_languages_from_labse = [
    "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian",
    "Assamese", "Azerbaijani", "Basque", "Belarusian", "Bengali",
    "Bosnian", "Bulgarian", "Burmese", "Catalan", "Cebuano",
    "Chinese", "Corsican", "Croatian", "Czech", "Danish",
    "Dutch", "English", "Esperanto", "Estonian", "Finnish",
    "French", "Western Frisian", "Galician", "Georgian", "German",
    "Greek", "Gujarati", "Haitian", "Hausa", "Hawaiian",
    "Hebrew", "Hindi", "Hmong", "Hungarian", "Icelandic",
    "Igbo", "Indonesian", "Irish", "Italian", "Japanese",
    "Javanese", "Kannada", "Kazakh", "Khmer", "Kinyarwanda",
    "Korean", "Kurdish", "Kyrgyz", "Lao", "Latin",
    "Latvian", "Lithuanian", "Luxembourgish", "Macedonian", "Malagasy",
    "Malay", "Malayalam", "Maltese", "Māori", "Marathi",
    "Mongolian", "Nepali", "Norwegian", "Chichewa", "Oriya",
    "Persian", "Polish", "Portuguese", "Panjabi", "Romanian",
    "Russian", "Samoan", "Scottish Gaelic", "Serbian", "Southern Sotho",
    "Shona", "Sinhala", "Slovak", "Slovenian", "Somali",
    "Spanish", "Sundanese", "Swahili", "Swedish", "Tagalog",
    "Tajik", "Tamil", "Tatar", "Telugu", "Thai",
    "Tibetan", "Turkish", "Turkmen", "Uyghur", "Ukrainian",
    "Urdu", "Uzbek", "Vietnamese", "Welsh", "Wolof",
    "Xhosa", "Yiddish", "Yoruba", "Zulu",
]
115
+
116
+ # example_language_tag_string_from_labse = """af
117
+ # sq
118
+ # am
119
+ # ar
120
+ # hy
121
+ # as
122
+ # az
123
+ # eu
124
+ # be
125
+ # bn
126
+ # bs
127
+ # bg
128
+ # my
129
+ # ca
130
+ # ceb
131
+ # zh
132
+ # co
133
+ # hr
134
+ # cs
135
+ # da
136
+ # nl
137
+ # en
138
+ # eo
139
+ # et
140
+ # fi
141
+ # fr
142
+ # fy
143
+ # gl
144
+ # ka
145
+ # de
146
+ # el
147
+ # gu
148
+ # ht
149
+ # ha
150
+ # haw
151
+ # he
152
+ # hi
153
+ # hmn
154
+ # hu
155
+ # is
156
+ # ig
157
+ # id
158
+ # ga
159
+ # it
160
+ # ja
161
+ # jv
162
+ # kn
163
+ # kk
164
+ # km
165
+ # rw
166
+ # ko
167
+ # ku
168
+ # ky
169
+ # lo
170
+ # la
171
+ # lv
172
+ # lt
173
+ # lb
174
+ # mk
175
+ # mg
176
+ # ms
177
+ # ml
178
+ # mt
179
+ # mi
180
+ # mr
181
+ # mn
182
+ # ne
183
+ # no
184
+ # ny
185
+ # or
186
+ # fa
187
+ # pl
188
+ # pt
189
+ # pa
190
+ # ro
191
+ # ru
192
+ # sm
193
+ # gd
194
+ # sr
195
+ # st
196
+ # sn
197
+ # si
198
+ # sk
199
+ # sl
200
+ # so
201
+ # es
202
+ # su
203
+ # sw
204
+ # sv
205
+ # tl
206
+ # tg
207
+ # ta
208
+ # tt
209
+ # te
210
+ # th
211
+ # bo
212
+ # tr
213
+ # tk
214
+ # ug
215
+ # uk
216
+ # ur
217
+ # uz
218
+ # vi
219
+ # cy
220
+ # wo
221
+ # xh
222
+ # yi
223
+ # yo
224
+ # zu"""
225
+
226
# Raw "- <tag>" listing, one language tag per line. The shape looks like a YAML
# list (presumably copied from the LaBSE model card on Hugging Face - confirm).
_labse_tags_listing = """- af
- sq
- am
- ar
- hy
- as
- az
- eu
- be
- bn
- bs
- bg
- my
- ca
- ceb
- zh
- co
- hr
- cs
- da
- nl
- en
- eo
- et
- fi
- fr
- fy
- gl
- ka
- de
- el
- gu
- ht
- ha
- haw
- he
- hi
- hmn
- hu
- is
- ig
- id
- ga
- it
- ja
- jv
- kn
- kk
- km
- rw
- ko
- ku
- ky
- lo
- la
- lv
- lt
- lb
- mk
- mg
- ms
- ml
- mt
- mi
- mr
- mn
- ne
- no
- ny
- or
- fa
- pl
- pt
- pa
- ro
- ru
- sm
- gd
- sr
- st
- sn
- si
- sk
- sl
- so
- es
- su
- sw
- sv
- tl
- tg
- ta
- tt
- te
- th
- bo
- tr
- tk
- ug
- uk
- ur
- uz
- vi
- cy
- wo
- xh
- yi
- yo
- zu"""

# Drop empty lines, trim whitespace, and keep only the last whitespace-separated
# token of each line, i.e. the bare tag without its leading "- " bullet.
labse_huggingface_tags = [
    line.strip().split()[-1]
    for line in _labse_tags_listing.splitlines()
    if line
]
337
+
338
def match_based_on_tag_distance(
    model_languages,
    data_languages,
    model_name,
    data_name="eBible",
    dedupe=False,
    threshold=9,
):
    """Pair model languages with data languages that are close by tag distance.

    Every (model language, data language) combination is scored with
    ``langcodes.tag_distance``; combinations scoring at or below ``threshold``
    are reported as matches.

    Args:
        model_languages: Sized collection of languages supported by the model,
            in any form ``langcodes.tag_distance`` accepts.
        data_languages: Sized collection of languages present in the data, in
            any form ``langcodes.tag_distance`` accepts.
        model_name: Currently unused; kept so existing callers keep working.
        data_name: Currently unused; kept so existing callers keep working.
        dedupe: When true, duplicate entries are removed from both inputs
            (first occurrence wins, original order is kept) before matching.
        threshold: Largest tag distance that still counts as a match.

    Returns:
        A 5-tuple ``(matches, model_unmatched, data_unmatched,
        model_languages, data_languages)`` where ``matches`` is a list of
        ``(model_lang, data_lang, tag_distance)`` tuples, the two
        ``*_unmatched`` lists hold the languages that appear in no match, and
        the last two items are the (possibly de-duplicated) inputs.
    """
    print(f"Model language count: {len(model_languages)}")
    print(f"Data language count: {len(data_languages)}")

    if dedupe:
        print("Filtering for duplicates...")
        # dict.fromkeys de-duplicates like set() but keeps a deterministic,
        # first-seen order, so results are reproducible between runs.
        model_languages = list(dict.fromkeys(model_languages))
        data_languages = list(dict.fromkeys(data_languages))
        print(f"Model languages remaining: {len(model_languages)}")
        print(f"Data language remaining: {len(data_languages)}")

    combination_count = len(model_languages) * len(data_languages)
    print(f"checking {len(model_languages)} model languages against {len(data_languages)} data languages, giving {combination_count} combinations")

    # Match based on tag distances. The product is consumed lazily; there is no
    # need to hold every combination in memory just to count and iterate them.
    tag_distance_matches = []
    for model_lang, data_lang in itertools.product(model_languages, data_languages):
        tag_distance = langcodes.tag_distance(model_lang, data_lang)
        if tag_distance <= threshold:
            tag_distance_matches.append((model_lang, data_lang, tag_distance))

    # Build the lookup sets once instead of re-scanning the match list for
    # every language.
    matched_model_languages = {match[0] for match in tag_distance_matches}
    matched_data_languages = {match[1] for match in tag_distance_matches}
    model_unmatched = [lang for lang in model_languages if lang not in matched_model_languages]
    data_unmatched = [lang for lang in data_languages if lang not in matched_data_languages]

    print(f"Found {len(tag_distance_matches)} matches, {len(model_unmatched)} model languages not matched")
    return tag_distance_matches, model_unmatched, data_unmatched, model_languages, data_languages
369
+
370
+
371
+
372
def _parse_comma_separated(raw_text, parse, expected_errors):
    """Run ``parse`` on each comma-separated entry; return (parsed, not_parsed)."""
    parsed = []
    not_parsed = []
    for entry in raw_text.split(","):
        entry = entry.strip()
        if not entry:
            # Stray or trailing commas produce empty entries; there is nothing
            # to look up for them.
            continue
        try:
            parsed.append(parse(entry))
        except expected_errors as e:
            print(e)
            not_parsed.append(entry)
    return parsed, not_parsed


def parse_language_list(key_prefix=None):
    """Render a language-list input form and return the parsed languages.

    Shows a select box for the input format and a text area pre-filled with an
    example list. Language names are resolved with ``langcodes.find`` and
    language tags with ``langcodes.get``. Entries that cannot be parsed are
    reported with a warning and left out of the result.

    Args:
        key_prefix: Stable, unique string used to build the Streamlit widget
            keys for this form. Pass a distinct value for each form on the
            page so the widgets keep their state between reruns. When omitted,
            a random prefix is generated; this avoids duplicate-widget errors
            but gives the widgets a new identity on every rerun, so user input
            is not retained.

    Returns:
        List of the values returned by ``langcodes.find`` / ``langcodes.get``
        for the entries that parsed successfully.
    """
    if key_prefix is None:
        key_prefix = uuid4().hex

    # A third option, "huggingface model/dataset name", is not implemented yet.
    language_list_options = ["Language names", "Language Tags/Codes"]
    language_list_type = st.selectbox(
        "What format is your language list?",
        language_list_options,
        key=f"{key_prefix}_format",
    )

    language_list = []
    not_parsed = []
    if language_list_type == language_list_options[0]:
        # Plain str.join: reusing double quotes inside an f-string expression
        # is a syntax error before Python 3.12.
        languages_input = st.text_area(
            "Language names, comma-separated",
            ",".join(example_languages_from_labse),
            key=f"{key_prefix}_names",
        )
        language_list, not_parsed = _parse_comma_separated(
            languages_input, langcodes.find, LookupError
        )
    elif language_list_type == language_list_options[1]:
        languages_input = st.text_area(
            "Language tags, comma-separated",
            ",".join(labse_huggingface_tags),
            key=f"{key_prefix}_tags",
        )
        language_list, not_parsed = _parse_comma_separated(
            languages_input, langcodes.get, langcodes.tag_parser.LanguageTagError
        )

    st.write(f"Langcodes list: {language_list}")
    if not_parsed:
        st.warning(f"Langcodes could not parse {not_parsed}")
    return language_list


# Distinct, stable prefixes keep the two forms independent and let each one
# remember the user's input across Streamlit reruns.
first_lang_list = parse_language_list(key_prefix="first_lang_list")
second_lang_list = parse_language_list(key_prefix="second_lang_list")
407
+
408
+