nurasaki committed
Commit • 7360456
Parent(s): 704dc9c

gradio_nlp_berta_masked_example: first commit

Files changed:
- .gitignore +1 -0
- README.md +20 -11
- app.py +95 -37
- flagged/log.csv +6 -0
.gitignore
CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
 .DS_Store
+private.md
README.md
CHANGED
@@ -10,20 +10,29 @@ pinned: false
 ---
 
 
-
-
-
-git status
-git add .
-git commit -am "gradio_nlp_berta_masked_example: first commit"
-git push
-
-git push gh_repo main
-```
+# Masked Language Modeling Example
+
+by [nurasaki](https://huggingface.co/spaces/nurasaki)
+
+* Space: [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
+* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
+* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
+
+<br>
+
+## Model description
+
+The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
+
+It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-sized corpus collected from publicly available corpora and crawlers.
+
+<br>
+
+## Usage
+
+The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
+
+Choose one of the provided examples or enter your own masked text.
+
+<br>
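The top-k mask filling that the README describes can be tried outside the Space as well. A minimal sketch, assuming `transformers` and `torch` are installed; it uses the library's fill-mask pipeline instead of the manual tokenizer/model calls the app makes (see the app.py diff below):

```python
# Minimal sketch of the Usage section: ask the Catalan RoBERTa model for the
# k most probable fillers of the <mask> slot via the fill-mask pipeline.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="projecte-aina/roberta-base-ca-v2")

# top_k controls how many candidate words are returned for the mask position
for candidate in fill_mask("La meva mare es diu <mask>.", top_k=5):
    print(f"{candidate['token_str']!r}: {candidate['score']:.4f}")
```

Each candidate carries a `token_str` and a probability `score`, the same {token: probability} mapping the Space renders in its Label output.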
app.py
CHANGED
@@ -1,12 +1,23 @@
 import gradio as gr
 import os
 
+
+
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+import logging
+from torch.nn.functional import softmax
+import pandas as pd
+
+
+
 # save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
 auth_token = os.getenv("auth_token")
 
 
 
 
+
 print("========================================================================")
 print("Starting ... gradio_demo_nlp_autocomplete/app.py")
 print("AUTH TOKEN:", auth_token)
@@ -14,58 +25,105 @@ print("AUTH TOKEN:", auth_token)
 
 # load a model from https://hf.co/models as an interface, then use it as an api
 # you can remove the api_key parameter if you don't care about rate limiting.
-api = gr.Interface.load(
-
-print("type(api):", type(api))
-print("Api:", api, "\n")
-
-print("text:")
-print(text)
-print("------------------------------------------------------------------------")
-print("text[:-50]:")
-print(text[:-50])
-print("------------------------------------------------------------------------")
-print("api(text):")
-print(api(text))
-print("------------------------------------------------------------------------")
-print("text[-50:]:")
-print(text[-50:])
-print("------------------------------------------------------------------------")
-print("api(text[-50:]")
-print(api(text[-50:]))
-print("------------------------------------------------------------------------")
-
-print("------------------------------------------------------------------------")
-print("with gr.Blocks")
-
-textbox = gr.Textbox(placeholder="Type here...", lines=4)
-btn = gr.Button("Autocomplete")
-
-print("textbox:", textbox)
+# api = gr.Interface.load(, api_key=auth_token,)
+
+model_ref = "projecte-aina/roberta-base-ca-v2"
+tokenizer = AutoTokenizer.from_pretrained(model_ref)
+model = AutoModelForMaskedLM.from_pretrained(model_ref)
+
+
+def get_topk(text, tokenizer, model, k):
+
+    print("Get top K,", text)
+
+    # Tokenize
+    # ==========================================================================================
+    tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
+    inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
+    input_ids = inputs.input_ids
+
+    # Get model outputs and probabilities
+    # ==========================================================================================
+    # logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
+    logits = model.to("cpu")(**inputs).logits
+    probs = softmax(logits, dim=2)
+
+    # Index of <mask> (careful: this only works when there is exactly one <mask>)
+    # ==========================================================================================
+    row_idx, mask_idx = torch.where(input_ids.to("cpu") == tokenizer.mask_token_id)
+
+    return probs[row_idx, mask_idx].topk(k), mask_idx
+
+
+def generate_output(text, k):
+
+    # lines = print_topk(text, tokenizer, model, k=10)
+    (values, indices), input_idx = get_topk(text, tokenizer, model, int(k))
+
+    for mask_vals, mask_indices, input_idx in zip(values, indices, input_idx):
+        labels = {tokenizer.decode(ind): val.item()
+                  for val, ind in zip(mask_vals, mask_indices)}
+
+    return labels
+
+
+md_text = """
+# Masked Language Modeling Example
+
+by [nurasaki](https://huggingface.co/spaces/nurasaki)
+
+* Space: [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
+* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
+* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
+
+<br>
+
+## Model description
+
+The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
+
+It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-sized corpus collected from publicly available corpora and crawlers.
+
+<br>
+
+## Usage
+
+The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
+
+Choose one of the provided examples or enter your own masked text.
+
+<br>
+"""
+
+examples = [
+    "La meva mare es diu <mask>.",
+    "La meva mare treballa de <mask>.",
+    "El meu fill es diu <mask>.",
+    "El teu pare treballa de <mask>.",
+]
+
+
+with gr.Blocks() as demo:
+    gr.Markdown(md_text)
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
+            k = gr.Number(value=10, label="Num. results")
+            btn = gr.Button("Generate")
+
+        with gr.Column():
+            out_label = gr.Label(label="Results")
+
+    btn.click(generate_output, inputs=[text, k], outputs=[out_label])
+    gr.Examples(examples, inputs=[text])
+
+# if __name__ == "__main__":
+demo.launch(favicon_path="favicon.png")
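One caveat in the new app.py: the comment in `get_topk` notes that the `torch.where` lookup assumes exactly one `<mask>` in the input, and with several masks the loop in `generate_output` would keep only the labels of the last one. A hedged sketch of how the same lookup could handle any number of masks (`get_topk_multi` is a hypothetical helper, not part of the commit; it assumes the `tokenizer` and `model` loaded above from "projecte-aina/roberta-base-ca-v2"):

```python
# Sketch: extending the app's single-mask lookup to any number of <mask> tokens.
# Hypothetical helper, not part of app.py.
import torch
from torch.nn.functional import softmax

def get_topk_multi(text, tokenizer, model, k):
    inputs = tokenizer(text, return_tensors="pt")
    logits = model(**inputs).logits
    probs = softmax(logits, dim=2)

    # torch.where yields one (row, position) pair per <mask> occurrence,
    # so iterating over the pairs covers every mask in the sentence.
    row_idx, mask_idx = torch.where(inputs.input_ids == tokenizer.mask_token_id)

    results = []
    for row, pos in zip(row_idx, mask_idx):
        values, indices = probs[row, pos].topk(k)
        results.append({tokenizer.decode(i): v.item()
                        for v, i in zip(values, indices)})
    return results  # one {token: probability} dict per mask
```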
flagged/log.csv
ADDED
@@ -0,0 +1,6 @@
+Input Text,output,flag,username,timestamp
+"The tower is 324 metres (1,063 ft) tall,",,,,2023-04-03 16:23:19.212953
+,"<p>Start typing below and then click <strong>Run</strong> to see the output.</p>
+",,,2023-04-03 16:28:32.735416
+El teu pare treballa de <maks>.,"<p>Masked Text: xxx</p>
+",,,,2023-04-03 17:34:10.400919