dsfsi-language-identification-spaces

Runtime error

App Files Files Community

kargaranamir committed on Oct 20, 2023

Commit

715fd06

1 Parent(s): f462b08

Update GlotLID

Browse files

Files changed (5) hide show

README.md +4 -4
app.py +118 -43
assets/GlotLID_logo.svg +0 -0
assets/language_names.json +0 -0
constants.py +1 -1

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: GlotLID
-emoji: ☕
-colorFrom: indigo
-colorTo: purple
 sdk: streamlit
 sdk_version: 1.27.2
 app_file: app.py

 ---
+title: GlotLID Space
+emoji: 📐
+colorFrom: yellow
+colorTo: red
 sdk: streamlit
 sdk_version: 1.27.2
 app_file: app.py

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ import fasttext
 import altair as alt
 from altair import X, Y, Scale
 import base64
 @st.cache_resource
 def load_sp():
@@ -28,16 +28,39 @@ def load_sp():
 sp = load_sp()
 def get_script(text):
-    """Get the writing system of given text.
     Args:
         text: The text to be preprocessed.
     Returns:
-        The writing system of text.
     """
-    return sp(text)[0]
 @st.cache_data
 def render_svg(svg):
@@ -55,17 +78,45 @@ def convert_df(df):
 @st.cache_resource
-def load_model(model_name):
-    model_path = hf_hub_download(repo_id=model_name, filename="model.bin")
     model = fasttext.load_model(model_path)
     return model
-model = load_model(constants.MODEL_NAME)
-def compute(sentences):
-    """Computes the language labels for the given sentences.
     Args:
         sentences: A list of sentences.
@@ -74,81 +125,105 @@ def compute(sentences):
         A list of language probabilities and labels for the given sentences.
     """
     progress_text = "Computing Language..."
     my_bar = st.progress(0, text=progress_text)
-    BATCH_SIZE = 1
     probs = []
     labels = []
-    preprocessed_sentences = sentences
-    for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
-        outputs = model.predict(preprocessed_sentences[first_index : first_index + BATCH_SIZE])
-        # BATCH_SIZE = 1
-        outputs_labels  = outputs[0][0]
-        outputs_probs = outputs[1][0]
-        probs = probs + [max(min(o, 1), 0) for o in outputs_probs]
-        labels = labels + outputs_labels
         my_bar.progress(
-            min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
             text=progress_text,
         )
     my_bar.empty()
     return probs, labels
 render_svg(open("assets/GlotLID_logo.svg").read())
 tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
 with tab1:
     sent = st.text_input(
         "Sentence:", placeholder="Enter a sentence.", on_change=None
     )
     # TODO: Check if this is needed!
     clicked = st.button("Submit")
     if sent:
-        probs, labels = compute([sent])
         prob = probs[0]
         label = labels[0]
-        ORANGE_COLOR = "#FF8000"
-        fig, ax = plt.subplots(figsize=(8, 1))
-        fig.patch.set_facecolor("none")
-        ax.set_facecolor("none")
-        ax.spines["left"].set_color(ORANGE_COLOR)
-        ax.spines["bottom"].set_color(ORANGE_COLOR)
-        ax.tick_params(axis="x", colors=ORANGE_COLOR)
-        ax.spines[["right", "top"]].set_visible(False)
-        ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
-        ax.set_xlim(0, 1)
-        ax.set_ylim(-1, 1)
-        ax.set_title(f"Langauge is: {label}", color=ORANGE_COLOR)
-        ax.get_yaxis().set_visible(False)
-        ax.set_xlabel("Confidence", color=ORANGE_COLOR)
-        st.pyplot(fig)
         print(sent)
         with open("logs.txt", "a") as f:
             f.write(sent + "\n")
 with tab2:
     file = st.file_uploader("Upload a file", type=["txt"])
     if file is not None:
-        df = pd.read_csv(file, sep="\t", header=None)
         df.columns = ["Sentence"]
         df.reset_index(drop=True, inplace=True)
         # TODO: Run the model
-        df['Probs'], df["Language"] = compute(df["Sentence"].tolist())
         # A horizontal rule
         st.markdown("""---""")
@@ -158,7 +233,7 @@ with tab2:
             .mark_area(color="darkorange", opacity=0.5)
             .encode(
                 x=X(field="index", title="Sentence Index"),
-                y=Y("Probs", scale=Scale(domain=[0, 1])),
             )
         )
         st.altair_chart(chart.interactive(), use_container_width=True)

 import altair as alt
 from altair import X, Y, Scale
 import base64
+import json
 @st.cache_resource
 def load_sp():
 sp = load_sp()
 def get_script(text):
+    """Get the writing systems of given text.
     Args:
         text: The text to be preprocessed.
     Returns:
+        The main script and list of all scripts.
     """
+    res = sp(text)
+    main_script = res[0] if res[0] else 'Zyyy'
+    all_scripts_dict = res[2]['details']
+    if all_scripts_dict:
+        all_scripts = list(all_scripts_dict.keys())
+    else:
+        all_scripts = 'Zyyy'
+    return main_script, all_scripts
+@st.cache_data
+def language_names(json_path):
+    with open(json_path, 'r') as json_file:
+        data = json.load(json_file)
+    return data
+label2name = language_names("assets/language_names.json")
+def get_name(label):
+    """Get the name of language from label"""
+    iso_3 = label.split('_')[0]
+    name = label2name[iso_3]
+    return name
 @st.cache_data
 def render_svg(svg):
 @st.cache_resource
+def load_GlotLID_v1(model_name, file_name):
+    model_path = hf_hub_download(repo_id=model_name, filename=file_name)
     model = fasttext.load_model(model_path)
     return model
+@st.cache_resource
+def load_GlotLID_v2(model_name, file_name):
+    model_path = hf_hub_download(repo_id=model_name, filename=file_name)
+    model = fasttext.load_model(model_path)
+    return model
+model_1 = load_GlotLID_v1(constants.MODEL_NAME, "model_v1.bin")
+model_2 = load_GlotLID_v2(constants.MODEL_NAME, "model_v2.bin")
+@st.cache_resource
+def plot(label, prob):
+    ORANGE_COLOR = "#FF8000"
+    fig, ax = plt.subplots(figsize=(8, 1))
+    fig.patch.set_facecolor("none")
+    ax.set_facecolor("none")
+    ax.spines["left"].set_color(ORANGE_COLOR)
+    ax.spines["bottom"].set_color(ORANGE_COLOR)
+    ax.tick_params(axis="x", colors=ORANGE_COLOR)
+    ax.spines[["right", "top"]].set_visible(False)
+    ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
+    ax.set_xlim(0, 1)
+    ax.set_ylim(-1, 1)
+    ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=ORANGE_COLOR)
+    ax.get_yaxis().set_visible(False)
+    ax.set_xlabel("Confidence", color=ORANGE_COLOR)
+    st.pyplot(fig)
+def compute(sentences, version = 'v2'):
+    """Computes the language probabilities and labels for the given sentences.
     Args:
         sentences: A list of sentences.
         A list of language probabilities and labels for the given sentences.
     """
     progress_text = "Computing Language..."
+    model_choice = model_2 if version == 'v2' else model_1
     my_bar = st.progress(0, text=progress_text)
     probs = []
     labels = []
+    for index, sent in enumerate(sentences):
+        output = model_choice.predict(sent)
+        output_label  = output[0][0].split('__')[-1]
+        output_prob = max(min(output[1][0], 1), 0)
+        output_label_language = output_label.split('_')[0]
+        # script control
+        if version in ['v2'] and output_label_language!= 'zxx':
+            main_script, all_scripts = get_script(sent)
+            output_label_script = output_label.split('_')[1]
+            if output_label_script not in all_scripts:
+                output_label_script = main_script
+                output_label = f"und_{output_label_script}"
+                output_prob = 0
+        labels = labels + [output_label]
+        probs = probs + [output_prob]
         my_bar.progress(
+            min((index) / len(sentences), 1),
             text=progress_text,
         )
     my_bar.empty()
     return probs, labels
+st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")
 render_svg(open("assets/GlotLID_logo.svg").read())
 tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
 with tab1:
+    # choice = st.radio(
+    #     "Set granularity level",
+    #     ["default", "merge", "individual"],
+    #     captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual languages"],
+    # )
+    version = st.radio(
+        "Choose model",
+        ["v1", "v2"],
+        captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
+        index = 1,
+        key = 'version_tab1',
+        horizontal = True
+    )
     sent = st.text_input(
         "Sentence:", placeholder="Enter a sentence.", on_change=None
     )
     # TODO: Check if this is needed!
     clicked = st.button("Submit")
     if sent:
+        sent = sent.replace('\n', '')
+        probs, labels = compute([sent], version=version)
         prob = probs[0]
         label = labels[0]
+        # plot
+        plot(label, prob)
         print(sent)
         with open("logs.txt", "a") as f:
             f.write(sent + "\n")
 with tab2:
+    version = st.radio(
+        "Choose model",
+        ["v1", "v2"],
+        captions=["GlotLID version 1", "GlotLID version 2 (more data and languages)"],
+        index = 1,
+        key = 'version_tab2',
+        horizontal = True
+    )
     file = st.file_uploader("Upload a file", type=["txt"])
     if file is not None:
+        df = pd.read_csv(file, sep="¦\t¦", header=None)
         df.columns = ["Sentence"]
         df.reset_index(drop=True, inplace=True)
         # TODO: Run the model
+        df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version= version)
+        df['Language'] = df["Label"].apply(get_name)
         # A horizontal rule
         st.markdown("""---""")
             .mark_area(color="darkorange", opacity=0.5)
             .encode(
                 x=X(field="index", title="Sentence Index"),
+                y=Y("Prob", scale=Scale(domain=[0, 1])),
             )
         )
         st.altair_chart(chart.interactive(), use_container_width=True)

assets/GlotLID_logo.svg CHANGED Viewed

assets/language_names.json ADDED Viewed

The diff for this file is too large to render. See raw diff

constants.py CHANGED Viewed

@@ -1,4 +1,4 @@
 CHOICE_TEXT = "Input Text"
 CHOICE_FILE = "Upload File"
 TITLE = "GlotLID: Language Identification for Around 2000 Languages"
-MODEL_NAME = "cis-lmu/GlotLID"

 CHOICE_TEXT = "Input Text"
 CHOICE_FILE = "Upload File"
 TITLE = "GlotLID: Language Identification for Around 2000 Languages"
+MODEL_NAME = "cis-lmu/glotlid"