File size: 5,451 Bytes
b3b8331
 
 
 
 
 
c627b59
b3b8331
 
 
 
 
99c48ed
b3b8331
 
 
 
 
 
 
 
 
 
 
 
c627b59
 
 
 
 
 
 
 
 
 
 
b3b8331
 
 
 
 
 
 
 
 
 
67037c0
bae53d6
9e95735
b3b8331
 
 
 
 
 
 
 
 
 
 
 
 
3aaa4ea
99c48ed
b3b8331
 
 
 
 
 
 
 
 
 
 
 
c627b59
 
 
 
 
 
 
 
 
b3b8331
 
 
 
 
 
 
 
 
 
 
 
9e95735
bae53d6
67037c0
b3b8331
 
 
 
 
 
 
9e95735
b3b8331
 
9e95735
 
 
b3b8331
9e95735
b3b8331
9e95735
b3b8331
9e95735
b3b8331
9e95735
b3b8331
389d4d9
9407769
 
725ec24
9407769
9e95735
b3b8331
 
 
9e95735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

def get_result_text_es_pt(list_entity, text, lang):
    """Rebuild ``text`` as HTML with predicted casing and punctuation applied.

    Args:
        list_entity: Sequence of dicts, each providing the keys ``"entity"``
            (the predicted tag), ``"word"`` (the token text) and
            ``"start"``/``"end"`` (character offsets into ``text``).
            Presumably the output of a token-classification pipeline --
            confirm against the caller.
        text: The original input string the offsets refer to.
        lang: ``"es"`` enables the opening marks ``¿`` and ``¡``; any other
            value uses the reduced punctuation set.

    Returns:
        The words joined by single spaces. Every word whose tag is not
        exactly ``"l"`` is wrapped in a bold, green ``<span>``. The text
        taken from ``text`` is HTML-escaped so it cannot inject markup.
    """
    # Local import keeps this function self-contained.
    import html

    span_open = '<span style="font-weight:bold; color:rgb(142, 208, 129);">'
    span_close = '</span>'
    opening_marks = ("¿", "¡")

    if lang == "es":
        punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
    else:
        punc_tags = ['?', '!', ',', '.', ':']

    result_words = []
    tmp_word = ""  # accumulates the surface form of a word split into pieces

    for idx, entity in enumerate(list_entity):
        tag = entity["entity"]
        start = entity["start"]
        end = entity["end"]

        # First punctuation mark (in punc_tags order) mentioned by the tag.
        punc_in = next((p for p in punc_tags if p in tag), "")

        # A token whose text starts with "#" continues the previous word.
        # The very first token has no previous word, so it always starts a
        # new one; otherwise idx - 1 would wrap around to the last entity
        # and result_words[-1] would fail on the still-empty list.
        subword = idx > 0 and entity["word"].startswith("#")

        if subword:
            if tmp_word == "":
                previous = list_entity[idx - 1]
                tmp_word = text[previous["start"]:previous["end"]]
            tmp_word += text[start:end]
            word = tmp_word
        else:
            tmp_word = ""
            word = text[start:end]

        # The last character of the tag carries the casing decision
        # ("l" keeps the word as is, "u" capitalizes it). Tags ending in
        # anything else leave the word untouched. For the bare tags "l" and
        # "u" punc_in is empty, so the concatenation below is a no-op.
        case_flag = tag[-1:]
        if case_flag in ("l", "u"):
            if case_flag == "u":
                word = word.capitalize()
            if punc_in in opening_marks:
                word = punc_in + word
            else:
                word = word + punc_in

        # Escape only the characters that could open markup; quotes are safe
        # inside element content and stay readable.
        word = html.escape(word, quote=False)

        if tag != "l":
            word = span_open + word + span_close

        if subword:
            # Replace the partial word emitted for the previous piece(s).
            result_words[-1] = word
        else:
            result_words.append(word)

    return " ".join(result_words)
            


def get_result_text_ca(list_entity, text):
    """Rebuild ``text`` as HTML with predicted casing and punctuation applied.

    Variant for tokens that mark the start of a word with a leading ``Ġ``:
    every token without that marker is glued onto the previous word.

    Args:
        list_entity: Sequence of dicts, each providing the keys ``"entity"``
            (the predicted tag), ``"word"`` (the token text) and
            ``"start"``/``"end"`` (character offsets into ``text``).
            Presumably the output of a token-classification pipeline --
            confirm against the caller.
        text: The original input string the offsets refer to.

    Returns:
        The words joined by single spaces. Every word whose tag is not
        exactly ``"l"`` is wrapped in a bold, green ``<span>``. The text
        taken from ``text`` is HTML-escaped so it cannot inject markup.
    """
    # Local import keeps this function self-contained.
    import html

    span_open = '<span style="font-weight:bold; color:rgb(142, 208, 129);">'
    span_close = '</span>'
    punc_tags = ['?', '!', ',', '.', ':']

    result_words = []
    tmp_word = ""  # accumulates the surface form of a word split into pieces

    for idx, entity in enumerate(list_entity):
        tag = entity["entity"]
        start = entity["start"]
        end = entity["end"]

        # First punctuation mark (in punc_tags order) mentioned by the tag.
        punc_in = next((p for p in punc_tags if p in tag), "")

        # Tokens lacking the "Ġ" marker continue the previous word. The very
        # first token has no previous word, so it always starts a new one;
        # otherwise idx - 1 would wrap around to the last entity and
        # result_words[-1] would fail on the still-empty list.
        subword = idx > 0 and not entity["word"].startswith("Ġ")

        if subword:
            if tmp_word == "":
                previous = list_entity[idx - 1]
                tmp_word = text[previous["start"]:previous["end"]]
            tmp_word += text[start:end]
            word = tmp_word
        else:
            tmp_word = ""
            word = text[start:end]

        # The last character of the tag carries the casing decision
        # ("l" keeps the word as is, "u" capitalizes it). Tags ending in
        # anything else leave the word untouched. None of the punctuation
        # marks used here is an opening mark, so punctuation is always
        # appended; for the bare tags "l" and "u" punc_in is empty.
        case_flag = tag[-1:]
        if case_flag in ("l", "u"):
            if case_flag == "u":
                word = word.capitalize()
            word = word + punc_in

        # Escape only the characters that could open markup; quotes (frequent
        # in elided forms) are safe inside element content and stay readable.
        word = html.escape(word, quote=False)

        if tag != "l":
            word = span_open + word + span_close

        if subword:
            # Replace the partial word emitted for the previous piece(s).
            result_words[-1] = word
        else:
            result_words.append(word)

    return " ".join(result_words)


if __name__ == "__main__":

    # Per language: Hugging Face model id, HTML renderer, and the extra
    # positional arguments that renderer takes after (entities, text).
    # Insertion order is the order shown in the select box.
    languages = {
        "Spanish": (
            "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert",
            get_result_text_es_pt,
            ("es",),
        ),
        "Portuguese": (
            "VOCALINLP/portuguese_capitalization_punctuation_restoration_sanivert",
            get_result_text_es_pt,
            ("pt",),
        ),
        "Catalan": (
            "VOCALINLP/catalan_capitalization_punctuation_restoration_sanivert",
            get_result_text_ca,
            (),
        ),
    }

    def load_pipeline(model_name):
        """Build the token-classification pipeline for one model id."""
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return pipeline("token-classification", model=model, tokenizer=tokenizer)

    # Streamlit re-executes the script on every widget interaction, so keep
    # each loaded pipeline in Streamlit's resource cache when the installed
    # version provides it. Older versions fall back to loading on demand.
    if hasattr(st, "cache_resource"):
        load_pipeline = st.cache_resource(load_pipeline)

    st.title('Sanivert Punctuation And Capitalization Restoration')

    st.sidebar.image("vocali_logo.jpg")
    st.sidebar.subheader("Parque Científico de Murcia, Carretera de Madrid km 388. Complejo de Espinardo, 30100 Murcia")

    language = st.selectbox(
        label="Choose an language",
        options=list(languages),
    )

    st.subheader("Enter the text to be analyzed.")
    text = st.text_input('Enter text')

    # Only load a model and run inference once there is something to analyze,
    # and only for the language that was actually selected.
    if text.strip():
        model_name, render, extra_args = languages[language]
        result_pipe = load_pipeline(model_name)(text)
        out = render(result_pipe, text, *extra_args)
        st.markdown(out, unsafe_allow_html=True)