import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from torch.nn.functional import softmax

# GPU options for Spaces: https://huggingface.co/docs/hub/spaces-gpus


# Save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting.
auth_token = os.getenv("auth_token")





print("========================================================================")
print("Starting ... gradio_demo_nlp_autocomplete/app.py")
print("AUTH TOKEN:", auth_token)


# Alternative: load a model from https://hf.co/models as a Gradio interface and use it as an API;
# the api_key parameter can be dropped if you don't care about rate limiting.
# api = gr.Interface.load(..., api_key=auth_token)
# Here, the model and tokenizer are loaded locally with transformers instead.


model_ref = "projecte-aina/roberta-base-ca-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ref)
model = AutoModelForMaskedLM.from_pretrained(model_ref)
model.to("cpu").eval()  # inference mode: disables dropout; keep everything on CPU
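
# RoBERTa-style tokenizers use "<mask>" as the mask token; if unsure, the exact
# string and its id can be checked via tokenizer.mask_token / tokenizer.mask_token_id.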

def get_topk(text, tokenizer, model, k):
    """Return the top-k (probabilities, token ids) at the <mask> position, plus the mask index."""

    print("Get top K,", text)

    # Tokenize
    # ==========================================================================================
    tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
    inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
    input_ids = inputs.input_ids

    
    # Get model outputs and probabilities
    # ==========================================================================================
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(**inputs).logits
    probs = softmax(logits, dim=-1)
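    # probs has shape (batch, seq_len, vocab_size); for a single input string, batch == 1.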
    
    
    # Index of <mask> (note: this only works as intended when there is exactly one <mask>)
    # ==========================================================================================
    row_idx, mask_idx = torch.where(input_ids == tokenizer.mask_token_id)

    return probs[row_idx, mask_idx].topk(k), mask_idx
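
# Illustrative usage (hypothetical values, assuming a single <mask> in the input):
#   (values, indices), mask_idx = get_topk("La meva mare es diu <mask>.", tokenizer, model, 3)
#   values   -> top-3 probabilities at the mask position, shape (1, 3)
#   indices  -> the corresponding vocabulary ids, shape (1, 3)
#   tokenizer.decode(indices[0][0]) -> the most probable filler token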


def generate_output(text, k):
    """Build the label -> probability dict that gr.Label expects."""

    (values, indices), _ = get_topk(text, tokenizer, model, int(k))

    # One row per <mask>; with a single mask, this loop runs exactly once.
    labels = {}
    for mask_vals, mask_indices in zip(values, indices):
        labels = {tokenizer.decode(ind): val.item()
                  for val, ind in zip(mask_vals, mask_indices)}

    return labels
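
# gr.Label renders a dict mapping label -> confidence, e.g. (illustrative values only):
#   {"Maria": 0.21, "Anna": 0.13, "Rosa": 0.08}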


md_text ="""
# Masked Language Modeling Example

by [nurasaki](https://huggingface.co/spaces/nurasaki)  

* Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)

<br>

## Model description

The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language. 

It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-sized corpus collected from publicly available corpora and crawlers.

<br>

## Usage

The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.  

Choose one of the provided examples or enter your own masked text.

<br>
"""

examples = [
    "La meva mare es diu <mask>.",
    "La meva mare treballa de <mask>.",
    "El meu fill es diu <mask>.",
    "El teu pare treballa de <mask>.",
]



with gr.Blocks() as demo:
    gr.Markdown(md_text)
    with gr.Row():
        with gr.Column():
            text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
            k = gr.Number(value=10, label="Num. results")
            btn = gr.Button("Generate")
            
        with gr.Column():
            out_label = gr.Label(label="Results")
    
    
    btn.click(generate_output, inputs=[text, k], outputs=[out_label])
    gr.Examples(examples, inputs=[text])

if __name__ == "__main__":
    demo.launch(favicon_path="favicon.png")