nurasaki committed
Commit • 7360456
Parent(s): 704dc9c

gradio_nlp_berta_masked_example: first commit

Files changed:
- .gitignore +1 -0
- README.md +20 -11
- app.py +95 -37
- flagged/log.csv +6 -0
.gitignore
CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
 .DS_Store
+private.md
README.md
CHANGED
@@ -10,20 +10,29 @@ pinned: false
 ---
 
 
-
-
-
-git status
-git add .
-git commit -am "gradio_nlp_berta_masked_example: first commit"
-git push
-
-git push gh_repo main
-```
+# Masked Language Modeling Example
+
+by [nurasaki](https://huggingface.co/spaces/nurasaki)
+
+* Space: [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
+* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
+* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
+
+<br>
+
+## Model description
+
+The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
+
+It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-sized corpus collected from publicly available corpora and crawlers.
+
+<br>
+
+## Usage
+
+The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
+
+Choose one of the provided examples or enter your own masked text.
+
+<br>
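The top-k mask filling that the README describes can be tried outside the Space as well. A minimal sketch, assuming `transformers` and `torch` are installed; it uses the library's fill-mask pipeline instead of the manual tokenizer/model calls the app makes (see the app.py diff below):

```python
# Minimal sketch of the Usage section: ask the Catalan RoBERTa model for the
# k most probable fillers of the <mask> slot via the fill-mask pipeline.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="projecte-aina/roberta-base-ca-v2")

# top_k controls how many candidate words are returned for the mask position
for candidate in fill_mask("La meva mare es diu <mask>.", top_k=5):
    print(f"{candidate['token_str']!r}: {candidate['score']:.4f}")
```

Each candidate carries a `token_str` and a probability `score`, the same {token: probability} mapping the Space renders in its Label output.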
app.py
CHANGED
@@ -1,12 +1,23 @@
 import gradio as gr
 import os
 
+
+
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import logging
+from torch.nn.functional import softmax
+import pandas as pd
+
+
+
 # save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
 auth_token = os.getenv("auth_token")
 
 
 
 
+
 print("========================================================================")
 print("Starting ... gradio_demo_nlp_autocomplete/app.py")
 print("AUTH TOKEN:", auth_token)
@@ -14,58 +25,105 @@ print("AUTH TOKEN:", auth_token)
 
 # load a model from https://hf.co/models as an interface, then use it as an api
 # you can remove the api_key parameter if you don't care about rate limiting.
-api = gr.Interface.load(
-
-print("type(api):", type(api))
-print("Api:", api, "\n")
-
-print("text:")
-print(text)
-print("------------------------------------------------------------------------")
-print("text[:-50]:")
-print(text[:-50])
-print("------------------------------------------------------------------------")
-print("api(text):")
-print(api(text))
-print("------------------------------------------------------------------------")
-print("text[-50:]:")
-print(text[-50:])
-print("------------------------------------------------------------------------")
-print("api(text[-50:]")
-print(api(text[-50:]))
-print("------------------------------------------------------------------------")
-
-print("------------------------------------------------------------------------")
-print("with gr.Blocks")
-
-textbox = gr.Textbox(placeholder="Type here...", lines=4)
-btn = gr.Button("Autocomplete")
-
-print("textbox:", textbox)
+# api = gr.Interface.load(, api_key=auth_token,)
+
+model_ref = "projecte-aina/roberta-base-ca-v2"
+tokenizer = AutoTokenizer.from_pretrained(model_ref)
+model = AutoModelForMaskedLM.from_pretrained(model_ref)
+
+
+def get_topk(text, tokenizer, model, k):
+
+    print("Get top K,", text)
+
+    # Tokenize
+    # ==========================================================================================
+    tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
+    inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
+    input_ids = inputs.input_ids
+
+    # Get model outputs and probabilities
+    # ==========================================================================================
+    # logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
+    logits = model.to("cpu")(**inputs).logits
+    probs = softmax(logits, dim=2)
+
+    # Index of <mask> (careful: this only works when there is exactly one <mask>)
+    # ==========================================================================================
+    row_idx, mask_idx = torch.where(input_ids.to("cpu") == tokenizer.mask_token_id)
+
+    return probs[row_idx, mask_idx].topk(k), mask_idx
+
+
+def generate_output(text, k):
+
+    # lines = print_topk(text, tokenizer, model, k=10)
+    (values, indices), input_idx = get_topk(text, tokenizer, model, int(k))
+
+    for mask_vals, mask_indices, input_idx in zip(values, indices, input_idx):
+        labels = {tokenizer.decode(ind): val.item()
+                  for val, ind in zip(mask_vals, mask_indices)}
+
+    return labels
+
+
+md_text = """
+# Masked Language Modeling Example
+
+by [nurasaki](https://huggingface.co/spaces/nurasaki)
+
+* Space: [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
+* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
+* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
+
+<br>
+
+## Model description
+
+The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
+
+It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-sized corpus collected from publicly available corpora and crawlers.
+
+<br>
+
+## Usage
+
+The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
+
+Choose one of the provided examples or enter your own masked text.
+
+<br>
+"""
+
+examples = [
+    "La meva mare es diu <mask>.",
+    "La meva mare treballa de <mask>.",
+    "El meu fill es diu <mask>.",
+    "El teu pare treballa de <mask>.",
+]
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(md_text)
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
+            k = gr.Number(value=10, label="Num. results")
+            btn = gr.Button("Generate")
+
+        with gr.Column():
+            out_label = gr.Label(label="Results")
+
+    btn.click(generate_output, inputs=[text, k], outputs=[out_label])
+    gr.Examples(examples, inputs=[text])
+
+# if __name__ == "__main__":
+demo.launch(favicon_path="favicon.png")
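One caveat in the new app.py: the comment in `get_topk` notes that the `torch.where` lookup assumes exactly one `<mask>` in the input, and with several masks the loop in `generate_output` would keep only the labels of the last one. A hedged sketch of how the same lookup could handle any number of masks (`get_topk_multi` is a hypothetical helper, not part of the commit; it assumes the `tokenizer` and `model` loaded above from "projecte-aina/roberta-base-ca-v2"):

```python
# Sketch: extending the app's single-mask lookup to any number of <mask> tokens.
# Hypothetical helper, not part of app.py.
import torch
from torch.nn.functional import softmax

def get_topk_multi(text, tokenizer, model, k):
    inputs = tokenizer(text, return_tensors="pt")
    logits = model(**inputs).logits
    probs = softmax(logits, dim=2)

    # torch.where yields one (row, position) pair per <mask> occurrence,
    # so iterating over the pairs covers every mask in the sentence.
    row_idx, mask_idx = torch.where(inputs.input_ids == tokenizer.mask_token_id)

    results = []
    for row, pos in zip(row_idx, mask_idx):
        values, indices = probs[row, pos].topk(k)
        results.append({tokenizer.decode(i): v.item()
                        for v, i in zip(values, indices)})
    return results  # one {token: probability} dict per mask
```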
flagged/log.csv
ADDED
@@ -0,0 +1,6 @@
+Input Text,output,flag,username,timestamp
+"The tower is 324 metres (1,063 ft) tall,",,,,2023-04-03 16:23:19.212953
+,"<p>Start typing below and then click <strong>Run</strong> to see the output.</p>
+",,,2023-04-03 16:28:32.735416
+El teu pare treballa de <maks>.,"<p>Masked Text: xxx</p>
+",,,,2023-04-03 17:34:10.400919