File size: 8,153 Bytes
68a8c29
6570b48
25a3c87
65485b5
bdc0541
65485b5
1e9b3ba
 
 
d7bf3e7
1e9b3ba
 
1dcc788
 
6998de9
1c14c96
0cf0819
5a1315d
9989672
1dcc788
d262183
4955959
39897d9
94e5414
293b978
a0fdec6
abed01c
293b978
39897d9
293b978
39897d9
 
e3d850e
 
ff7c666
67daf03
293b978
e3ca56d
726336c
293b978
e3d850e
21247cf
606d796
a397155
0fef655
 
 
 
293b978
0fef655
3661574
293b978
65485b5
 
0fef655
2f590b1
16fc4ca
92a84ae
 
 
9989672
25a3c87
5d305df
 
 
 
25a3c87
5d305df
 
92a84ae
dedac74
7a00e93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdc0541
 
 
 
 
 
 
 
8329262
b65ecd9
878ffe0
 
55f8482
 
 
bdc0541
6ee2f5a
7a00e93
bdc0541
 
051e947
e14dc11
1392687
f023ff1
 
878ffe0
 
6ee2f5a
fa7755f
 
 
55f8482
fa7755f
 
1232c23
 
04021ee
1232c23
 
7a00e93
f78a30e
0ba0d66
 
 
d85245e
fa7755f
1232c23
55f8482
918f014
 
10bed1d
0ba0d66
 
 
 
 
1232c23
10bed1d
0ba0d66
 
 
9c98688
0ba0d66
ff2969f
0ba0d66
bdc0541
 
 
 
88c8646
bdc0541
0ba0d66
3792961
537dd73
92a84ae
ca54e5d
fa7755f
 
07e3fb8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import streamlit as st
import langcodes
from requests_html import HTMLSession
import urllib
import requests

# FEATURE: get wikipedia codes, e.g. from https://en.wikipedia.org/wiki/List_of_Wikipedias or https://meta.wikimedia.org/wiki/List_of_Wikipedias, some of which are nonstandard. Then output f"{code}.wikipedia.org"
# FEATURE: add programming languages easter egg
# TODO: fix 'knh', it has an empty ISO section. Turns out some languages only have 639-3
# TODO: add in some nice things from https://docs.streamlit.io/library/cheatsheet like error codes and status messages. 
# TODO: add in vachan search even if lang not found

# Page header and intro copy.
st.write("# Language code/tag search")
st.write("Fed up with language tag confusion? Here's your one-stop shop!")
st.write("Try typing your language below! Accepts either codes like `eng`/`en`, or full names like `English` (or even some in other languages, try '法语' or 'français'), and we will use the [langcodes](https://github.com/rspeer/langcodes) library to figure out the correct modern BCP-47 code according to [official W3 Guidelines](https://www.w3.org/International/questions/qa-choosing-language-tags)")
st.write(f"**Feedback:** Provide feedback at https://twitter.com/cleong110, or via slack: https://masakhane-nlp.slack.com/archives/D01DU3MHP7A")

# https://huggingface.co/blog/streamlit-spaces
# https://github.com/psf/requests-html
# https://docs.streamlit.io/library/api-reference/write-magic/st.write

# Sample inputs shown in the text box's help tooltip.
example_valid_lookups = ["zh-CN", "Chinese", "zh-Latn-pinyin", "en-Latn-US", "en", "English", "fr-CA", "French (Canada)", "français", "法语"]
langtext = st.text_input("Language Code/Tag Lookup using langcodes", "Swahili", help=f"Try language codes or language names! Examples: {example_valid_lookups}").strip()

# TODO: st.code() for these "lookup in progress" outputs. 
st.write("* Checking whether the tag is valid. That is, the language, script, territory, and variants (if present) are all tags that have meanings assigned by IANA.")

# Report IANA validity of the raw input as a tag.
validity_message = (
  f"* ...True! '{langtext}' parses meaningfully as a language tag according to IANA."
  if langcodes.tag_is_valid(langtext)
  else f"* ...False! '{langtext}' doesn't parse meaningfully as a language tag according to IANA, some of its subcomponents may be invalid or it might be a natural language description."
)
st.write(validity_message)

# Stage 1: try to interpret the input directly as a BCP-47 tag.
lang = None
try:
  candidate = langcodes.Language.get(langtext)
except langcodes.LanguageTagError:
  st.write(f"* Could not lookup code directly, attempting to search for it as a natural language string.")
else:
  # A syntactically-valid but unrecognized tag displays as "Unknown language ...".
  if "unknown" in candidate.display_name().lower():
    st.write(f"* Attempting to lookup the code directly gives us '{candidate.display_name()}', attempting to search for it as a natural language string.")
  else:
    lang = candidate


# Stage 2: fall back to treating the input as a natural-language name.
if lang is None:
  try:
    lang = langcodes.find(langtext)
    st.write(f"* Natural language search found the following BCP-47 tag: {lang}")
  except LookupError:
    st.write("## Result: failure!")
    st.write(f"Unable to look up language code. But all hope is not lost...")
    st.write(f"* You can also try https://r12a.github.io/app-subtags/")
    st.write(f"* Or possibly https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}")
    lang = None


def pull_obsolete_codes(iso_code):
  """Scrape https://iso639-3.sil.org/code/<iso_code> for older ISO 639 codes.

  Args:
    iso_code: an ISO 639-3 alpha-3 code, e.g. "deu".

  Returns:
    dict mapping code-family label ("639-1", "639-2/B", "639-2/T", "639-3")
    to the code string found on the page; empty if nothing matched.
  """
  obsolete_codes = {}
  session = HTMLSession()
  try:
    r = session.get(f"https://iso639-3.sil.org/code/{iso_code}")
    # CSS selector reference: https://www.w3schools.com/cssref/css_selectors.asp
    for found_element in r.html.find(".views-field-nothing", clean=True):
      for line in found_element.text.splitlines():
        for obsolete_code_name in ["639-1", "639-2/B", "639-2/T", "639-3"]:
          # Lines look like "639-2/B: ger"; the code is the last whitespace token.
          if obsolete_code_name in line and ":" in line:
            obsolete_codes[obsolete_code_name] = line.split()[-1]
  finally:
    # Fix: the HTMLSession was previously never closed, leaking connections
    # on every lookup.
    session.close()
  return obsolete_codes


def try_retrieving_glottolog_id(langtext):
  """Search Glottolog for *langtext* and return a languoid ID if found.

  Glottolog redirects a unique search hit straight to the languoid page
  (.../languoid/id/<id>); in that case the ID is the last URL path segment.

  Args:
    langtext: the user's language name/code query (unescaped).

  Returns:
    The languoid ID string, or "" when the search did not redirect to a
    languoid page.
  """
  languoid_id = ""
  session = HTMLSession()
  try:
    query_url = f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)}"
    returned_url = session.get(query_url).html.url
    if "languoid" in returned_url:
      languoid_id = returned_url.split("/")[-1]
  finally:
    # Fix: the HTMLSession was previously never closed, leaking connections.
    session.close()
  return languoid_id

def try_searching_vachan_engine(langtext):
  """Query the Vachan Engine language-search API for *langtext*.

  Args:
    langtext: the user's language name/code query (unescaped).

  Returns:
    The decoded JSON list of matches, or [] on any failure (non-200 status,
    timeout, or connection error) — callers treat an empty list as "no results".
  """
  results_list = []
  query_url = f"https://api.vachanengine.org/v2/languages?search_word={urllib.parse.quote(langtext)}"
  try:
    # Fix: previously no timeout, so a slow/unreachable API hung the whole app.
    vachan_r = requests.get(query_url, timeout=10)
  except requests.RequestException:
    return results_list
  if vachan_r.status_code == 200:
    results_list = vachan_r.json()
  return results_list

#st.write(f"langcodes found the following tag: {type(found)}") # a Language object
if lang is not None: 
  # Gather everything to display up front. (Fix: removed the unused
  # `display` local that was computed and never read.)
  b_variant = lang.to_alpha3(variant='B')
  t_variant = lang.to_alpha3(variant='T')
  broader_tags = lang.broader_tags()
  results_from_vachan = try_searching_vachan_engine(langtext)
  standardized_tag = langcodes.standardize_tag(lang)
  languoid_id = try_retrieving_glottolog_id(langtext)


  st.write(f"## Results: probably use '{standardized_tag}'")
  # TODO: make a results dictionary so it's easy to copy-paste?
  st.write(f"Best-match BCP-47 tag for '{langtext}', according to the langcodes library: {lang}")  
  st.write(f"Breakdown of tag components:")  
  st.write(lang.describe())
  st.write(f"Display name for {lang}: {lang.display_name()}")
  st.write(f"Autonym for {lang}: {lang.autonym()}")
  st.write(f"**Correct, standardized, BCP-47 tag for {langtext}, according to the langcodes library:** `{standardized_tag}`")


  st.write("## Further Information:")

  st.write(f"Broader tags for this language, if any:")
  st.write(broader_tags)

  st.write(f"### Language Subtag Search Tool")
  st.write(f"Try also: https://r12a.github.io/app-subtags/?lookup={lang}, which will likely have links to Ethnologue, Wikipedia, and Character usage. You can also try searching for '{langtext}' there!")

  st.write(f"### Glottolog")
  if languoid_id:
    st.write(f"**Glottolog Languoid ID:** Searching for '{langtext}' on Glottolog returns the following 'languoid ID': [{languoid_id}](https://glottolog.org/resource/languoid/id/{languoid_id})")
  st.write(f"https://glottolog.org/glottolog?search={t_variant} may be also of interest, with links to various resources including WALS, Wikidata, Odin, and OLAC. ")
  if t_variant != b_variant:
    st.write(f"If that doesn't work, try https://glottolog.org/glottolog?search={b_variant}, or put in a [custom search query](https://glottolog.org/glottolog)") 
  st.write(f"https://glottolog.org/glottolog?search={urllib.parse.quote(langtext)} may pull up something as well.")

  st.write("### Older / Related Codes")

  st.write(f"ISO 639-3 'alpha3' code, 'terminology' or 'T' variant (deprecated): {t_variant}")
  st.write(f"ISO 639-3 'alpha3' code, 'bibliographic' or 'B' variant (deprecated): {b_variant}")

  # ethnologue prefers T for german (deu), and T for French
  st.write(f"If it exists, the ISO 639 Code Tables entry for the T variant would be at https://iso639-3.sil.org/code/{t_variant}. That is also the code variant that typically has a working link to Ethnologue.")
  if t_variant != b_variant:
    st.write(f"If it exists, the ISO 639 Code Tables entry for the B variant would be at https://iso639-3.sil.org/code/{b_variant}")  

  st.write("#### Codes scraped from iso639-3.sil.org")
  #TODO: Cleanup this bit
  t_obsolete_codes = pull_obsolete_codes(t_variant)
  # Fix: most languages have identical B and T variants; previously the same
  # page was scraped twice. Only hit the B-variant URL when it differs.
  b_obsolete_codes = t_obsolete_codes if t_variant == b_variant else pull_obsolete_codes(b_variant)
  if t_obsolete_codes:
    st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{t_variant}:")
    st.write(t_obsolete_codes)
  elif b_obsolete_codes:
    st.write(f"Obsolete codes from previous ISO-639 iterations, pulled from https://iso639-3.sil.org/code/{b_variant}:")
    st.write(b_obsolete_codes)


  if results_from_vachan:
    st.write("### Other potential matches, from [Vachan Engine](https://github.com/Bridgeconn/vachan-api/tree/version-2) (experimental)")
    st.write(results_from_vachan)