"""Streamlit tool for matching the languages a model supports against the
languages present in a dataset, using ``langcodes`` tag distance.

The bundled example data is the language list from the LaBSE sentence
encoder's Hugging Face model card (names and ISO tags).
"""

import itertools
from uuid import uuid4  # NOTE(review): no longer used for widget keys; kept in case other chunks of this file use it

import langcodes
import streamlit as st

# BUGFIX: the original wrapped the matching loop in tqdm() without ever
# importing it, which raised NameError at runtime.  Use tqdm when installed,
# otherwise fall back to a no-op pass-through so the loop still runs.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        return iterable

# Language *names* supported by LaBSE, one per line (per its model card).
example_languages_from_labse = """Afrikaans
Albanian
Amharic
Arabic
Armenian
Assamese
Azerbaijani
Basque
Belarusian
Bengali
Bosnian
Bulgarian
Burmese
Catalan
Cebuano
Chinese
Corsican
Croatian
Czech
Danish
Dutch
English
Esperanto
Estonian
Finnish
French
Western Frisian
Galician
Georgian
German
Greek
Gujarati
Haitian
Hausa
Hawaiian
Hebrew
Hindi
Hmong
Hungarian
Icelandic
Igbo
Indonesian
Irish
Italian
Japanese
Javanese
Kannada
Kazakh
Khmer
Kinyarwanda
Korean
Kurdish
Kyrgyz
Lao
Latin
Latvian
Lithuanian
Luxembourgish
Macedonian
Malagasy
Malay
Malayalam
Maltese
Māori
Marathi
Mongolian
Nepali
Norwegian
Chichewa
Oriya
Persian
Polish
Portuguese
Panjabi
Romanian
Russian
Samoan
Scottish Gaelic
Serbian
Southern Sotho
Shona
Sinhala
Slovak
Slovenian
Somali
Spanish
Sundanese
Swahili
Swedish
Tagalog
Tajik
Tamil
Tatar
Telugu
Thai
Tibetan
Turkish
Turkmen
Uyghur
Ukrainian
Urdu
Uzbek
Vietnamese
Welsh
Wolof
Xhosa
Yiddish
Yoruba
Zulu""".splitlines()

# Language *tags* supported by LaBSE, in the same order as the names above.
# (The original built this from a YAML-style "- xx" block via
# splitlines/strip/split; a plain .split() on a whitespace-separated string
# produces the identical list of tag strings.)
labse_huggingface_tags = """af sq am ar hy as az eu be bn bs bg my ca ceb zh
co hr cs da nl en eo et fi fr fy gl ka de el gu ht ha haw he hi hmn hu is ig
id ga it ja jv kn kk km rw ko ku ky lo la lv lt lb mk mg ms ml mt mi mr mn
ne no ny or fa pl pt pa ro ru sm gd sr st sn si sk sl so es su sw sv tl tg
ta tt te th bo tr tk ug uk ur uz vi cy wo xh yi yo zu""".split()

# Streamlit reruns the whole script on every user interaction.  The original
# code passed key=uuid4() to every widget, which generated a *new* key on each
# rerun and therefore reset every widget's state each time the user touched
# anything.  A counter that restarts at 0 on each rerun yields keys that are
# unique per call site yet stable across reruns (assuming stable call order).
_widget_key_counter = itertools.count()


def match_based_on_tag_distance(model_languages, data_languages, model_name,
                                data_name="eBible", dedupe=False, threshold=9):
    """Pair model languages with data languages by langcodes tag distance.

    Args:
        model_languages: languages the model supports (language tags or
            ``langcodes.Language`` objects — anything ``tag_distance`` accepts).
        data_languages: languages present in the dataset.
        model_name: name of the model (informational; currently unused).
        data_name: name of the dataset (informational; currently unused).
        dedupe: if True, drop duplicates from both lists before matching.
        threshold: maximum ``langcodes.tag_distance`` for a pair to count as
            a match (smaller distance = closer languages).

    Returns:
        Tuple ``(matches, model_unmatched, data_unmatched, model_languages,
        data_languages)`` where ``matches`` is a list of
        ``(model_lang, data_lang, distance)`` triples.
    """
    print(f"Model language count: {len(model_languages)}")
    print(f"Data language count: {len(data_languages)}")
    if dedupe:
        print("Filtering for duplicates...")
        model_languages = list(set(model_languages))
        data_languages = list(set(data_languages))
        print(f"Model languages remaining: {len(model_languages)}")
        print(f"Data language remaining: {len(data_languages)}")

    # Compare every model language against every data language.
    tag_distance_matches = []
    product_of_lists = list(itertools.product(model_languages, data_languages))
    print(f"checking {len(model_languages)} model languages against {len(data_languages)} data languages, giving {len(product_of_lists)} combinations")
    for model_lang, data_lang in tqdm(product_of_lists):
        tag_distance = langcodes.tag_distance(model_lang, data_lang)
        if tag_distance <= threshold:
            tag_distance_matches.append((model_lang, data_lang, tag_distance))

    # PERF: build each matched-language projection once, instead of rebuilding
    # the list comprehension inside every membership test as before.
    matched_model_langs = [match[0] for match in tag_distance_matches]
    matched_data_langs = [match[1] for match in tag_distance_matches]
    model_unmatched = [lang for lang in model_languages if lang not in matched_model_langs]
    data_unmatched = [lang for lang in data_languages if lang not in matched_data_langs]
    print(f"Found {len(tag_distance_matches)} matches, {len(model_unmatched)} model languages not matched")
    return tag_distance_matches, model_unmatched, data_unmatched, model_languages, data_languages


def parse_language_list():
    """Render widgets to collect a language list and parse it with langcodes.

    Shows a format selector plus a comma-separated text area (pre-filled with
    the LaBSE example data), then parses each entry with ``langcodes.find``
    (names) or ``langcodes.get`` (tags).

    Returns:
        List of ``langcodes.Language`` objects; entries that fail to parse
        are skipped (collected locally in ``not_parsed``).
    """
    # Stable-but-unique widget key prefix; see _widget_key_counter above.
    widget_key = f"parse_language_list_{next(_widget_key_counter)}"
    language_list_options = [
        "Language names",
        "Language Tags/Codes",
        # "huggingface model/dataset name"
    ]
    language_list_type = st.selectbox(
        "What format is your language list?",
        language_list_options,
        key=f"{widget_key}_format",
    )
    language_list = []
    not_parsed = []
    if language_list_type == language_list_options[0]:
        languages_input = st.text_area(
            "Language names, comma-separated",
            f"{','.join(example_languages_from_labse)}",
            key=f"{widget_key}_names",
        )
        for lang in languages_input.split(","):
            try:
                # langcodes.find raises LookupError for unrecognized names.
                language_list.append(langcodes.find(lang.strip()))
            except LookupError:
                not_parsed.append(lang)
    elif language_list_type == language_list_options[1]:
        languages_input = st.text_area(
            "Language tags, comma-separated",
            f"{','.join(labse_huggingface_tags)}",
            key=f"{widget_key}_tags",
        )
        for lang in languages_input.split(","):
            try:
                language_list.append(langcodes.get(lang.strip()))
            except langcodes.tag_parser.LanguageTagError as e:
                print(e)
                not_parsed.append(lang)
    st.write(f"Langcodes list: {language_list}")
    # st.write(f"Langcodes could not parse {not_parsed}")
    return language_list


first_lang_list = parse_language_list()
# second_lang_list = parse_language_list()