r2nery committed
Commit c6f8b96 · 1 Parent(s): 1d12cd4

Create app.py

Files changed (1)
app.py +338 -0
app.py ADDED
@@ -0,0 +1,338 @@
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import pipeline
from nltk.corpus import stopwords
from collections import Counter
import regex as re
import pandas as pd
import gradio as gr
import nltk

# "punkt" and "stopwords" are required by the tokenizers and stopword lists used below.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

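# Dispatches on the dropdown value: names starting with "Sumy" select one of
# sumy's extractive summarizers; names starting with "Transformers-" select a
# Hugging Face abstractive checkpoint. Illustrative call with the UI defaults:
#   summary, report = run("SumyLuhn", raw_text, 30, use_golden=False, golden=None)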
def run(the_method, text, compression_ratio, use_golden=False, golden=None):
    cleaned = _clean_text(text)
    if the_method.startswith("Sumy"):
        summary = run_sumy(the_method, cleaned, compression_ratio, golden)
    elif the_method.startswith("Transformers-"):
        summary = run_transformers(the_method, cleaned, compression_ratio, golden)
    else:
        raise ValueError(f"Unknown method: {the_method}")
    # Summarize once and reuse the result for both outputs.
    return summary, run_eval(use_golden, cleaned, summary, golden)

def run_csv(the_method, csv_input, compression_ratio=1 / 8, use_golden=False):
    # Batch mode for the CSV/DataFrame tabs; not implemented in this commit.
    # Note: the UI slider passes a percentage, unlike this fractional default.
    pass

def _clean_text(content):
    if not isinstance(content, str):
        content = str(content)
    # URLs (removed before dot expansion, so "http://x.y" is still one token)
    content = re.sub(r"http\S+", "", content)
    # strange jump lines: guarantee a space after each period
    content = re.sub(r"\.", ". ", content)
    # troublesome escaped line breaks
    content = re.sub(r"\\r\\n", " ", content)
    # clean jump lines
    content = re.sub(r"\u000D\u000A|[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]", " ", content)
    # replace the various Unicode space characters (as a character class)
    content = re.sub(r"[\u00A0\u1680\u180e\u2000-\u200b\u202f\u205f\u3000]", " ", content)
    # collapse multiple spaces
    content = re.sub(r" +", " ", content)
    # normalize hyphens
    content = re.sub(r"\p{Pd}+", "-", content)
    # normalize single quotation marks
    content = re.sub(r"[\u02BB\u02BC\u066C\u2018-\u201A\u275B\u275C]", "'", content)
    # normalize double quotation marks
    content = re.sub(r"[\u201C-\u201E\u2033\u275D\u275E\u301D\u301E]", '"', content)
    # normalize apostrophes
    content = re.sub(r"[\u0027\u02B9\u02BB\u02BC\u02BE\u02C8\u02EE\u0301\u0313\u0315\u055A\u05F3\u07F4\u07F5\u1FBF\u2018\u2019\u2032\uA78C\uFF07]", "'", content)

    content = " ".join(content.split())
    return content

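# Quick sanity check of the normalization above (illustrative):
#   _clean_text("Dash\u2013test\u2019s  text")  ->  "Dash-test's text"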
def run_sumy(method, text, compression_ratio, golden):
    from sumy.summarizers.random import RandomSummarizer
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer
    from sumy.summarizers.reduction import ReductionSummarizer
    from sumy.summarizers.edmundson import EdmundsonSummarizer

    def word_frequency(golden, text, n=20):
        # Rank words by frequency in the golden summary (bonus words) and by how
        # much more frequent they are in the source text than in the summary
        # (stigma words).
        stop_words = set(stopwords.words("english"))
        sum_tokens = [t.lower() for t in word_tokenize(golden) if t.lower() not in stop_words and t.isalpha()]
        sum_word_freq_descending = pd.DataFrame(Counter(sum_tokens).items(), columns=["word", "frequency sum"]).sort_values(by="frequency sum", ascending=False)

        texts_tokens = [t.lower() for t in word_tokenize(text) if t.lower() not in stop_words and t.isalpha()]
        texts_word_freq_descending = pd.DataFrame(Counter(texts_tokens).items(), columns=["word", "frequency text"]).sort_values(by="frequency text", ascending=False)

        stigma_words = pd.merge(sum_word_freq_descending, texts_word_freq_descending, on="word")
        stigma_words["frequency"] = stigma_words["frequency text"] / stigma_words["frequency sum"]
        stigma_words = stigma_words.sort_values(by="frequency", ascending=False)

        stigma_words = stigma_words["word"].tolist()[:n]
        bonus_words = sum_word_freq_descending["word"].tolist()[:n]
        return bonus_words, stigma_words

    the_method = method.replace("Sumy", "")
    # Explicit mapping instead of a locals() lookup, which is fragile.
    summarizers = {
        "Random": RandomSummarizer,
        "Luhn": LuhnSummarizer,
        "Lsa": LsaSummarizer,
        "LexRank": LexRankSummarizer,
        "TextRank": TextRankSummarizer,
        "SumBasic": SumBasicSummarizer,
        "KL": KLSummarizer,
        "Reduction": ReductionSummarizer,
        "Edmundson": EdmundsonSummarizer,
    }
    summarizer = summarizers[the_method]()
    # compression_ratio is a percentage (1-100); keep at least one sentence.
    sentence_count = max(1, int(len(sent_tokenize(text)) * compression_ratio / 100))
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    if the_method != "Edmundson":
        summary = summarizer(parser.document, sentence_count)
    else:
        # Edmundson needs cue/key word lists, derived here from the golden summary.
        bonus_words, stigma_words = word_frequency(golden, text, 10)
        summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1, title_weight=0, location_weight=0)
        summarizer.bonus_words = bonus_words
        summarizer.stigma_words = stigma_words
        summarizer.null_words = stopwords.words("english")
        summary = summarizer(parser.document, sentence_count)

    return " ".join(str(s) for s in summary)

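# Illustrative: run_sumy("SumyLexRank", cleaned_text, 30, None) keeps roughly
# 30% of the sentences, chosen by LexRank's graph-based sentence ranking.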
def run_transformers(method, text, compression_ratio, golden):
    the_method = method.replace("Transformers-", "")
    summarizer = pipeline("summarization", model=the_method)

    # Shrink the character window until it holds at most 450 word tokens,
    # keeping the input below typical 512-token model limits.
    length = 3000
    while len(word_tokenize(text[0:length])) > 450:
        length -= 100
    token_count = len(word_tokenize(text[0:length])) * compression_ratio / 100
    aux_summary = summarizer(text[0:length], min_length=max(0, int(token_count - 5)), max_length=int(token_count + 5))
    return aux_summary[0]["summary_text"]

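# Note: only the truncated window (at most ~450 word tokens) is summarized, so
# the compression ratio is applied to that window rather than to the whole text.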
def run_eval(use_golden, text, summary, golden):
    # With a golden summary, report reference-based metrics; otherwise compare
    # the summary against the source text itself.
    if use_golden:
        rouge_scores = run_rouge_eval(summary, golden)
        nltk_scores = run_nltk_eval(summary, golden)
        gensim_scores = run_gensim_eval(summary, golden)
        sklearn_scores = run_sklearn_eval(summary, golden)
        return rouge_scores + nltk_scores + gensim_scores + sklearn_scores
    else:
        gensim_scores = run_gensim_eval(summary, text)
        sklearn_scores = run_sklearn_eval(summary, text)
        return gensim_scores + sklearn_scores

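# run_eval returns one formatted line per metric family: ROUGE variants, NLTK
# precision/recall/F1, Gensim distribution distances, and SKLearn cosine
# similarity, concatenated into a single report string.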
def run_rouge_eval(text, golden):
    from rouge_metric import PyRouge

    def print_results(m, p, r, f):
        return "{}:\t\t{}: {:5.2f} \t{}: {:5.2f} \t{}: {:5.2f}\n".format(str(m).upper(), "P", 100.0 * p, "R", 100.0 * r, "F1", 100.0 * f)

    # ROUGE-1..4, ROUGE-L, ROUGE-W and ROUGE-SU4 (skip-bigram with unigrams).
    evaluator_su = PyRouge(
        rouge_n=(1, 2, 3, 4),
        rouge_l=True,
        rouge_w=True,
        rouge_w_weight=1.2,
        rouge_su=True,
        skip_gap=4,
    )

    scores = evaluator_su.evaluate([text], [[golden]])

    rouge_strings = ""
    for m, results in sorted(scores.items()):
        rouge_strings += print_results(m, results["p"], results["r"], results["f"])
    return rouge_strings

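# Each ROUGE line reports precision (P), recall (R) and F1 computed over
# n-gram (and skip-bigram, for SU4) overlap with the golden summary.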
def run_nltk_eval(text, golden):
    from nltk.metrics.scores import precision, recall, f_measure

    def print_results(p, r, f):
        return f"NLTK:\t\t\t\tP: {100*p:5.2f} \tR: {100*r:5.2f} \tF1: {100*f:5.2f}\n"

    # Set-based precision/recall/F1 over the unique words of each side.
    reference = set(golden.split())
    hypothesis = set(text.split())

    p = precision(reference, hypothesis)
    r = recall(reference, hypothesis)
    f = f_measure(reference, hypothesis, alpha=0.5)

    return print_results(p, r, f)

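# With alpha=0.5, f_measure reduces to the balanced harmonic mean of the
# set-based precision and recall: F1 = 2PR / (P + R).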
def run_gensim_eval(text, golden):
    from gensim.matutils import kullback_leibler, hellinger, jaccard
    from gensim.corpora import HashDictionary

    def print_results(h, j, kld):
        return f"Gensim:\t\t\tH: {h:5.2f} \tJ: {j:5.2f} \tKLD: {kld:5.2f}\n"

    def generate_freqdist(text, golden):
        # Build add-one-smoothed frequency distributions for the summary and the
        # reference over a shared hashed vocabulary. Inputs are tokenized first:
        # doc2bow expects lists of tokens, not raw strings.
        text_tokens = text.split()
        golden_tokens = golden.split()
        ref_hyp = text_tokens + golden_tokens
        ref_hyp_dict = HashDictionary([ref_hyp])
        ref_hyp_bow = ref_hyp_dict.doc2bow(ref_hyp)
        ref_hyp_bow = [(i[0], 0) for i in ref_hyp_bow]
        ref_bow_base = ref_hyp_dict.doc2bow(golden_tokens)
        hyp_bow_base = ref_hyp_dict.doc2bow(text_tokens)
        ref_bow, hyp_bow = [], []
        ref_list = [i[0] for i in ref_bow_base]
        hyp_list = [i[0] for i in hyp_bow_base]

        for base in ref_hyp_bow:
            if base[0] not in ref_list:
                ref_bow.append((base[0], base[1] + 1))
            else:
                for ref in ref_bow_base:
                    if ref[0] == base[0]:
                        ref_bow.append((ref[0], ref[1] + 1))

        for base in ref_hyp_bow:
            if base[0] not in hyp_list:
                hyp_bow.append((base[0], base[1] + 1))
            else:
                for hyp in hyp_bow_base:
                    if hyp[0] == base[0]:
                        hyp_bow.append((hyp[0], hyp[1] + 1))

        sum_ref = sum(i[1] for i in ref_bow)
        sum_hyp = sum(i[1] for i in hyp_bow)  # was summing ref_bow twice
        vec_ref = [i[1] / sum_ref for i in ref_bow]
        vec_hyp = [i[1] / sum_hyp for i in hyp_bow]

        return vec_ref, vec_hyp, ref_bow_base, hyp_bow_base

    ref_bow_norm, hyp_bow_norm, ref_bow, hyp_bow = generate_freqdist(text, golden)

    h = hellinger(hyp_bow_norm, ref_bow_norm)
    kld = kullback_leibler(hyp_bow_norm, ref_bow_norm)
    j = jaccard(hyp_bow, ref_bow)

    return print_results(h, j, kld)

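# H (Hellinger) and KLD (Kullback-Leibler) compare the two smoothed word
# distributions; J is gensim's Jaccard distance over the bags of words. Lower
# values indicate a summary closer to the reference.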
def run_sklearn_eval(text, golden):
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.feature_extraction.text import TfidfVectorizer

    def print_results(cosim):
        return f"SKLearn:\t\t\tC: {cosim:5.2f}\n"

    # Cosine similarity between the TF-IDF vectors of summary and reference.
    tfidf_vect = TfidfVectorizer()
    vector_matrix = tfidf_vect.fit_transform([text, golden])
    cosim = cosine_similarity(vector_matrix)[0, 1]

    return print_results(cosim)

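# Cosine similarity ranges from 0 (no shared weighted terms) to 1 (identical
# TF-IDF vectors), so higher C means more lexical overlap with the reference.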
if __name__ == "__main__":

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                gr.Markdown("### Automatic Text Summarization + Summary Evaluation\n Data Science research project applied to the Financial Products Portfolio - PPF-MCTI")
        with gr.Row():
            with gr.Column(scale=1, min_width=300):
                dropdown = gr.Dropdown(
                    label="Summarization method",
                    choices=[
                        "SumyRandom",
                        "SumyLuhn",
                        "SumyLsa",
                        "SumyLexRank",
                        # "SumyEdmundson",
                        "SumyTextRank",
                        "SumySumBasic",
                        "SumyKL",
                        "SumyReduction",
                        "Transformers-google/pegasus-xsum",
                        "Transformers-facebook/bart-large-cnn",
                        "Transformers-csebuetnlp/mT5_multilingual_XLSum",
                    ],
                    value="SumyLuhn",
                )
            with gr.Column(scale=1, min_width=300):
                compression_ratio = gr.Slider(
                    label="Compression ratio (% of the original size)",
                    value=30,
                    minimum=1,
                    maximum=100,
                )
                use_golden = gr.Checkbox(label="Evaluate using a golden summary?")
        with gr.Tab("Text"):
            with gr.Row():
                with gr.Column(scale=1, min_width=300):
                    text = gr.Textbox(
                        label="Text",
                        placeholder="Enter your text here",
                    )
                    golden = gr.Textbox(
                        label="Golden summary",
                        placeholder="Enter the ideal summary of the text here (optional)",
                    )
                with gr.Column(scale=1, min_width=300):
                    generated_summary = gr.Textbox(label="Automatically generated summary")
                    evaluators = gr.Textbox(label="Summary evaluation")
            text_button = gr.Button("Run")
        with gr.Tab("CSV"):
            with gr.Column(scale=1, min_width=300):
                # A non-interactive checkbox doubles as a static instruction label.
                gr.Checkbox(
                    label="Upload a CSV file below with one column of texts to be summarized. If you choose to evaluate using golden summaries, they must be present in a second column.",
                    value=False,
                    interactive=False,
                )
            with gr.Row():
                csv_input = gr.File(label=".csv file with the texts")
                csv_output = gr.Files(label=".csv files with summaries and evaluation")
            csv_button = gr.Button("Run")
        with gr.Tab("DataFrame"):
            with gr.Column(scale=1, min_width=300):
                gr.Checkbox(
                    label="Fill the DataFrame below with texts to be summarized. If you choose to evaluate using golden summaries, they must be present in the second column.",
                    value=False,
                    interactive=False,
                )
            with gr.Row():
                df_input = gr.DataFrame(headers=["Text", "Golden Summary"], row_count=(4, "dynamic"), col_count=(2, "fixed"))
                df_output = gr.Files(label=".csv files with summaries and evaluation")
            df_button = gr.Button("Run")

        text_button.click(run, inputs=[dropdown, text, compression_ratio, use_golden, golden], outputs=[generated_summary, evaluators])
        csv_button.click(run_csv, inputs=[dropdown, csv_input, compression_ratio, use_golden], outputs=[csv_output])
        df_button.click(run_csv, inputs=[dropdown, df_input, compression_ratio, use_golden], outputs=[df_output])

    demo.launch()
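# `python app.py` serves the interface locally (Gradio defaults to
# http://127.0.0.1:7860 unless configured otherwise).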