Create app.py
app.py
ADDED
@@ -0,0 +1,338 @@
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import pipeline
from nltk.corpus import stopwords
from collections import Counter
import regex as re
import pandas as pd
import gradio as gr
import nltk

# punkt is required by word_tokenize/sent_tokenize and the stopwords corpus
# by the stopword filters used below
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

def run(the_method, text, compression_ratio, use_golden=False, golden=None):
    # summarize once and reuse the result for the evaluation report
    text = _clean_text(text)
    if the_method.startswith("Sumy"):
        summary = run_sumy(the_method, text, compression_ratio, golden)
    elif the_method.startswith("Transformers-"):
        summary = run_transformers(the_method, text, compression_ratio, golden)
    else:
        raise ValueError(f"Unknown summarization method: {the_method}")
    return summary, run_eval(use_golden, text, summary, golden)

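# A minimal round trip through run(), assuming a hypothetical raw_text
# variable (illustrative only, not part of the app):
#
#   summary, report = run("SumyLuhn", raw_text, 30, use_golden=False)
#   # -> the summary string plus the Gensim/SKLearn report from run_eval()


def run_csv(the_method, csv_input, compression_ratio=1 / 8, use_golden=False):
    # not implemented yet; the CSV and DataFrame tabs below are wired to this stub
    pass
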
def _clean_text(content):
    if not isinstance(content, str):
        content = str(content)
    # ensure a space after periods (fixes sentences glued across line breaks)
    content = re.sub(r"\.", ". ", content)
    # strip URLs
    content = re.sub(r"http\S+", "", content)
    # literal escaped line breaks
    content = re.sub(r"\\r\\n", " ", content)
    # real line and paragraph separators
    content = re.sub(r"\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]", " ", content)
    # unify the various Unicode space characters
    content = re.sub(r"[\u00A0\u1680\u180e\u2000-\u2009\u200a\u200b\u202f\u205f\u3000]", " ", content)
    # collapse runs of spaces
    content = re.sub(r" +", " ", content)
    # normalize hyphens and dashes
    content = re.sub(r"\p{Pd}+", "-", content)
    # normalize single quotation marks
    content = re.sub(r"[\u02BB\u02BC\u066C\u2018-\u201A\u275B\u275C]", "'", content)
    # normalize double quotation marks
    content = re.sub(r"[\u201C-\u201E\u2033\u275D\u275E\u301D\u301E]", '"', content)
    # normalize apostrophes
    content = re.sub(r"[\u0027\u02B9\u02BB\u02BC\u02BE\u02C8\u02EE\u0301\u0313\u0315\u055A\u05F3\u07F4\u07F5\u1FBF\u2018\u2019\u2032\uA78C\uFF07]", "'", content)

    content = " ".join(content.split())
    return content

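# Illustrative check of _clean_text (hypothetical input, traced by hand):
#
#   _clean_text("Hello\u2019s   world.Next")
#   # -> "Hello's world. Next"
#   # (curly apostrophe normalized, spaces collapsed, space added after ".")
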
def run_sumy(method, text, compression_ratio, golden):
    from sumy.summarizers.random import RandomSummarizer
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer
    from sumy.summarizers.reduction import ReductionSummarizer
    from sumy.summarizers.edmundson import EdmundsonSummarizer

    def word_frequency(golden, text, n=20):
        # content-word frequencies in the golden summary, descending
        sum_tokens = [t.lower() for t in word_tokenize(golden) if t not in stopwords.words("english") and t.isalpha()]
        sum_word_freq_descending = pd.DataFrame(Counter(sum_tokens).items(), columns=["word", "frequency sum"]).sort_values(by="frequency sum", ascending=False)

        # content-word frequencies in the source text, descending
        texts_tokens = [t.lower() for t in word_tokenize(text) if t not in stopwords.words("english") and t.isalpha()]
        texts_word_freq_descending = pd.DataFrame(Counter(texts_tokens).items(), columns=["word", "frequency text"]).sort_values(by="frequency text", ascending=False)

        # stigma words: frequent in the text relative to the golden summary
        stigma_words = pd.merge(sum_word_freq_descending, texts_word_freq_descending, on="word")
        stigma_words["frequency"] = stigma_words["frequency text"] / stigma_words["frequency sum"]
        stigma_words = stigma_words.sort_values(by="frequency", ascending=False)

        stigma_words = stigma_words["word"].tolist()[:n]
        bonus_words = sum_word_freq_descending["word"].tolist()[:n]
        return bonus_words, stigma_words

    the_method = method.replace("Sumy", "")
    # the summarizer classes imported above are local names, so locals()
    # maps e.g. "Luhn" -> LuhnSummarizer
    summarizer = locals()[the_method + "Summarizer"]()
    sentence_count = int(len(sent_tokenize(text)) * compression_ratio / 100)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    if the_method != "Edmundson":
        summary = summarizer(parser.document, sentence_count)
    else:
        bonus_words, stigma_words = word_frequency(golden, text, 10)
        summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1, title_weight=0, location_weight=0)
        summarizer.bonus_words = bonus_words
        summarizer.stigma_words = stigma_words
        summarizer.null_words = stopwords.words("english")
        summary = summarizer(parser.document, sentence_count)

    text_summary = ""
    for s in summary:
        text_summary += str(s) + " "
    return text_summary

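# Illustrative call, assuming a hypothetical long_text variable; every Sumy
# method except Edmundson ignores the golden argument:
#
#   run_sumy("SumyLexRank", long_text, compression_ratio=30, golden=None)
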
def run_transformers(method, text, compression_ratio, golden):
    the_method = method.replace("Transformers-", "")
    summarizer = pipeline("summarization", model=the_method)

    # truncate the input until it fits within ~450 tokens of model context
    length = 3000
    while len(word_tokenize(text[0:length])) > 450:
        length -= 100
    # target summary length derived from the compression ratio (in %)
    token_count = len(word_tokenize(text[0:length])) * compression_ratio / 100
    aux_summary = summarizer(text[0:length], min_length=int(token_count - 5), max_length=int(token_count + 5))
    summary = aux_summary[0]["summary_text"]
    return summary

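# Illustrative call with one of the models offered in the dropdown below
# (hypothetical long_text variable; the model is downloaded on first use):
#
#   run_transformers("Transformers-facebook/bart-large-cnn", long_text, 30, None)
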
def run_eval(use_golden, text, summary, golden):
    if use_golden:
        rouge_scores = run_rouge_eval(summary, golden)
        nltk_scores = run_nltk_eval(summary, golden)
        gensim_scores = run_gensim_eval(summary, golden)
        sklearn_scores = run_sklearn_eval(summary, golden)
        return rouge_scores + nltk_scores + gensim_scores + sklearn_scores
    else:
        # without a golden summary, compare the summary against the source text
        gensim_scores = run_gensim_eval(summary, text)
        sklearn_scores = run_sklearn_eval(summary, text)
        return gensim_scores + sklearn_scores

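# Each evaluator below returns one or more lines of a plain-text report, so
# the concatenated result looks roughly like this (illustrative shape only,
# not actual output):
#
#   ROUGE-1:  P: 41.00   R: 38.00   F1: 39.44
#   ...
#   NLTK:     P: 35.00   R: 40.00   F1: 37.33
#   Gensim:   H:  0.42   J:  0.61   KLD:  0.37
#   SKLearn:  C:  0.55
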
def run_rouge_eval(text, golden):
    from rouge_metric import PyRouge

    def print_results(m, p, r, f):
        return "{}:\t\t{}: {:5.2f} \t{}: {:5.2f} \t{}: {:5.2f}\n".format(str(m).upper(), "P", 100.0 * p, "R", 100.0 * r, "F1", 100.0 * f)

    evaluator_su = PyRouge(
        rouge_n=(1, 2, 3, 4),
        rouge_l=True,
        rouge_w=True,
        rouge_w_weight=1.2,
        rouge_su=True,
        skip_gap=4,
    )

    scores = evaluator_su.evaluate([text], [[golden]])

    rouge_strings = ""
    for m, results in sorted(scores.items()):
        rouge_strings += print_results(m, results["p"], results["r"], results["f"])
    return rouge_strings

def run_nltk_eval(text, golden):
    from nltk.metrics.scores import precision, recall, f_measure

    def print_results(p, r, f):
        return f"NLTK:\t\t\t\tP: {100*p:5.2f} \tR: {100*r:5.2f} \tF1: {100*f:5.2f}\n"

    # set-based scores over the word overlap between summary and golden
    reference = set(golden.split())
    hypothesis = set(text.split())

    p = precision(reference, hypothesis)
    r = recall(reference, hypothesis)
    f = f_measure(reference, hypothesis, alpha=0.5)

    return print_results(p, r, f)

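# Hand-checked example of the set-based scores above (hypothetical strings):
#
#   run_nltk_eval("the cat ran", "the cat sat")
#   # reference = {"the", "cat", "sat"}, hypothesis = {"the", "cat", "ran"}
#   # overlap of 2 gives P = 2/3, R = 2/3 and therefore F1 = 2/3 (~66.67)
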
def run_gensim_eval(text, golden):
    from gensim.matutils import kullback_leibler, hellinger, jaccard
    from gensim.corpora import HashDictionary

    def print_results(h, j, kld):
        return f"Gensim:\t\t\tH: {h:5.2f} \tJ: {j:5.2f} \tKLD: {kld:5.2f}\n"

    def generate_freqdist(text, golden):
        # shared vocabulary over summary + golden (word-level tokens)
        ref_hyp = text.split() + golden.split()
        ref_hyp_dict = HashDictionary([ref_hyp])
        ref_hyp_bow = ref_hyp_dict.doc2bow(ref_hyp)
        ref_hyp_bow = [(i[0], 0) for i in ref_hyp_bow]
        ref_bow_base = ref_hyp_dict.doc2bow(golden.split())
        hyp_bow_base = ref_hyp_dict.doc2bow(text.split())
        ref_bow, hyp_bow = [], []
        ref_list = [i[0] for i in ref_bow_base]
        hyp_list = [i[0] for i in hyp_bow_base]

        # add-one smoothing over the shared vocabulary keeps the KL
        # divergence finite when a word appears in only one document
        for base in ref_hyp_bow:
            if base[0] not in ref_list:
                ref_bow.append((base[0], base[1] + 1))
            else:
                for ref in ref_bow_base:
                    if ref[0] == base[0]:
                        ref_bow.append((ref[0], ref[1] + 1))

        for base in ref_hyp_bow:
            if base[0] not in hyp_list:
                hyp_bow.append((base[0], base[1] + 1))
            else:
                for hyp in hyp_bow_base:
                    if hyp[0] == base[0]:
                        hyp_bow.append((hyp[0], hyp[1] + 1))

        # normalize the smoothed counts into probability vectors
        sum_ref = sum(i[1] for i in ref_bow)
        sum_hyp = sum(i[1] for i in hyp_bow)
        vec_ref = [i[1] / sum_ref for i in ref_bow]
        vec_hyp = [i[1] / sum_hyp for i in hyp_bow]

        return vec_ref, vec_hyp, ref_bow_base, hyp_bow_base

    ref_bow_norm, hyp_bow_norm, ref_bow, hyp_bow = generate_freqdist(text, golden)

    h = hellinger(hyp_bow_norm, ref_bow_norm)
    kld = kullback_leibler(hyp_bow_norm, ref_bow_norm)
    j = jaccard(hyp_bow, ref_bow)

    return print_results(h, j, kld)

def run_sklearn_eval(text, golden):
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.feature_extraction.text import TfidfVectorizer

    def print_results(cosim_avg):
        return f"SKLearn:\t\t\tC: {cosim_avg:5.2f}\n"

    # TF-IDF cosine similarity between summary and reference
    tfidf_vect = TfidfVectorizer()
    vector_matrix = tfidf_vect.fit_transform([text, golden])
    cosine_similarity_matrix = cosine_similarity(vector_matrix)
    cosim = cosine_similarity_matrix[0, 1]

    return print_results(cosim)

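# Sanity check (hypothetical strings): identical inputs produce identical
# TF-IDF vectors, so the cosine similarity is exactly 1.
#
#   run_sklearn_eval("the cat sat", "the cat sat")
#   # -> "SKLearn:\t\t\tC:  1.00\n"
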
if __name__ == "__main__":

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### Automatic Text Summarization + Summary Evaluation\n Data Science research project applied to the Financial Products Portfolio - PPF-MCTI")
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                dropdown = gr.Dropdown(
                    label="Summarization method",
                    choices=[
                        "SumyRandom",
                        "SumyLuhn",
                        "SumyLsa",
                        "SumyLexRank",
                        # "SumyEdmundson",
                        "SumyTextRank",
                        "SumySumBasic",
                        "SumyKL",
                        "SumyReduction",
                        "Transformers-google/pegasus-xsum",
                        "Transformers-facebook/bart-large-cnn",
                        "Transformers-csebuetnlp/mT5_multilingual_XLSum",
                    ],
                    value="SumyLuhn",
                )
            with gr.Column(scale=1, min_width=300):
                compression_ratio = gr.Slider(
                    label="Compression ratio (% of the original length)",
                    value=30,
                    minimum=1,
                    maximum=100,
                )
                use_golden = gr.Checkbox(label="Evaluate against a golden summary?")
        with gr.Tab("Text"):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    text = gr.Textbox(
                        label="Text",
                        placeholder="Enter your text here",
                    )
                    golden = gr.Textbox(
                        label="Golden summary",
                        placeholder="Enter the ideal summary of the text here (optional)",
                    )
                with gr.Column(scale=1, min_width=300):
                    generated_summary = gr.Textbox(label="Automatically generated summary")
                    evaluators = gr.Textbox(label="Summary evaluation")
            text_button = gr.Button("Run")
        with gr.Tab("CSV"):
            with gr.Column(scale=1, min_width=300):
                # disabled checkbox used only to display the instructions
                gr.Checkbox(
                    label="Upload a CSV file below with one column of texts to be summarized. If you choose to evaluate against golden summaries, they must be present in a second column.",
                    value=False,
                    interactive=False,
                )
            with gr.Row():
                csv_input = gr.File(label=".csv file of texts")
                csv_output = gr.Files(label=".csv files of summaries and evaluation")
            csv_button = gr.Button("Run")
        with gr.Tab("DataFrame"):
            with gr.Column(scale=1, min_width=300):
                # disabled checkbox used only to display the instructions
                gr.Checkbox(
                    label="Fill the DataFrame below with texts to be summarized. If you choose to evaluate against golden summaries, they must be present in the second column.",
                    value=False,
                    interactive=False,
                )
            with gr.Row():
                df_input = gr.DataFrame(headers=["Text", "Golden Summary"], row_count=(4, "dynamic"), col_count=(2, "fixed"))
                df_output = gr.Files(label=".csv files of summaries and evaluation")
            df_button = gr.Button("Run")

        text_button.click(run, inputs=[dropdown, text, compression_ratio, use_golden, golden], outputs=[generated_summary, evaluators])
        csv_button.click(run_csv, inputs=[dropdown, csv_input, compression_ratio, use_golden], outputs=[csv_output])
        df_button.click(run_csv, inputs=[dropdown, df_input, compression_ratio, use_golden], outputs=[df_output])

    demo.launch()