r2nery committed on
Commit
cd507e6
·
1 Parent(s): 1416c31

Added CSV and Dataframe inputs (evaluation still WIP)

Browse files
Files changed (1) hide show
  1. app.py +111 -41
app.py CHANGED
@@ -16,14 +16,46 @@ nltk.download("punkt")
16
 
17
  def run(the_method, text, compression_ratio, use_golden=False, golden=None):
18
  if the_method[0:4] == "Sumy":
19
- return run_sumy(the_method, _clean_text(text), compression_ratio, golden), run_eval(use_golden, _clean_text(text), run_sumy(the_method, _clean_text(text), compression_ratio, golden), golden)
20
  elif the_method[0:13] == "Transformers-":
21
- return run_transformers(the_method, _clean_text(text), compression_ratio, golden), run_eval(use_golden, _clean_text(text), run_transformers(the_method, _clean_text(text), compression_ratio, golden), golden)
22
 
 
 
 
 
23
 
24
- def run_csv(the_method, csv_input, compression_ratio=1 / 8, use_golden=False):
25
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def _clean_text(content):
29
  if isinstance(content, str):
@@ -55,7 +87,7 @@ def _clean_text(content):
55
  return content
56
 
57
 
58
- def run_sumy(method, text, compression_ratio, golden):
59
  from sumy.summarizers.random import RandomSummarizer
60
  from sumy.summarizers.luhn import LuhnSummarizer
61
  from sumy.summarizers.lsa import LsaSummarizer
@@ -66,40 +98,14 @@ def run_sumy(method, text, compression_ratio, golden):
66
  from sumy.summarizers.reduction import ReductionSummarizer
67
  from sumy.summarizers.edmundson import EdmundsonSummarizer
68
 
69
- def word_frequency(golden, text, n=20):
70
-
71
- sum_tokens = [t.lower() for t in word_tokenize(golden) if t not in stopwords.words("english") and t.isalpha()]
72
- print(sum_tokens)
73
- sum_word_freq_descending = pd.DataFrame(Counter(sum_tokens).items(), columns=["word", "frequency sum"]).sort_values(by="frequency sum", ascending=False)
74
-
75
- texts_tokens = [t.lower() for t in word_tokenize(text) if t not in stopwords.words("english") and t.isalpha()]
76
- print(texts_tokens)
77
- texts_word_freq_descending = pd.DataFrame(Counter(texts_tokens).items(), columns=["word", "frequency text"]).sort_values(by="frequency text", ascending=False)
78
-
79
- stigma_words = pd.merge(sum_word_freq_descending, texts_word_freq_descending, on="word")
80
- stigma_words["frequency"] = stigma_words["frequency text"] / stigma_words["frequency sum"]
81
- stigma_words = stigma_words.sort_values(by="frequency", ascending=False)
82
-
83
- stigma_words = stigma_words["word"].tolist()[:n]
84
- bonus_words = sum_word_freq_descending["word"].tolist()[:n]
85
- return bonus_words, stigma_words
86
-
87
  the_method = method.replace("Sumy", "")
88
  summarizer = locals()[the_method + "Summarizer"]()
89
  sentence_count = int(len(sent_tokenize(text)) * compression_ratio / 100)
 
 
90
  parser = PlaintextParser.from_string(text, Tokenizer("english"))
91
 
92
- if the_method != "Edmundson":
93
- summary = summarizer(parser.document, sentence_count)
94
- else:
95
- bonus_words, stigma_words = word_frequency(golden, text, 10)
96
- summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1, title_weight=0, location_weight=0)
97
- summarizer.bonus_words = bonus_words
98
- summarizer.stigma_words = stigma_words
99
- summarizer.null_words = stopwords.words("english")
100
- print(bonus_words)
101
- print(stigma_words)
102
- summary = summarizer(parser.document, sentence_count)
103
 
104
  text_summary = ""
105
  for s in summary:
@@ -107,7 +113,7 @@ def run_sumy(method, text, compression_ratio, golden):
107
  return text_summary
108
 
109
 
110
- def run_transformers(method, text, compression_ratio, golden):
111
 
112
  the_method = method.replace("Transformers-", "")
113
  summarizer = pipeline("summarization", model=the_method)
@@ -121,6 +127,68 @@ def run_transformers(method, text, compression_ratio, golden):
121
  return summary
122
 
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  def run_eval(use_golden, text, summary, golden):
125
  if use_golden:
126
  rouge = run_rouge_eval(summary, golden)
@@ -289,7 +357,7 @@ if __name__ == "__main__":
289
  with gr.Column(scale=1, min_width=300):
290
  compression_ratio = gr.Slider(
291
  label="Taxa de Compressão (% do tamanho original)",
292
- value=30,
293
  minimum=1,
294
  maximum=100,
295
  )
@@ -312,12 +380,14 @@ if __name__ == "__main__":
312
  with gr.Tab("CSV"):
313
  with gr.Column(scale=1, min_width=300):
314
  gr.Checkbox(
315
- label="Insira abaixo um arquivo CSV com uma coluna de textos a serem sumarizados. Caso opte por avaliar usando golden summaries, estes deverão estar presentes em uma segunda coluna.",
316
  value=False,
317
  interactive=False,
318
  )
319
  with gr.Row():
320
- csv_input = gr.File(label="Arquivo .csv de textos")
 
 
321
  csv_output = gr.Files(label="Arquivos .csv de resumos e avaliação")
322
  csv_button = gr.Button("Executar")
323
  with gr.Tab("DataFrame"):
@@ -328,12 +398,12 @@ if __name__ == "__main__":
328
  interactive=False,
329
  )
330
  with gr.Row():
331
- df_input = gr.DataFrame(headers=["Texto","Golden Summary"],row_count=(4,"dynamic"),col_count=(2,"fixed"))
332
  df_output = gr.Files(label="Arquivos .csv de resumos e avaliação")
333
  df_button = gr.Button("Executar")
334
 
335
  text_button.click(run, inputs=[dropdown, text, compression_ratio, use_golden, golden], outputs=[generated_summary, evaluators])
336
- csv_button.click(run_csv, inputs=[dropdown, csv_input, compression_ratio, use_golden], outputs=[csv_output])
337
- df_button.click(run_csv, inputs=[dropdown, df_input, compression_ratio, use_golden], outputs=[df_output])
338
 
339
  demo.launch()
 
16
 
17
  def run(the_method, text, compression_ratio, use_golden=False, golden=None):
18
  if the_method[0:4] == "Sumy":
19
+ return run_sumy(the_method, _clean_text(text), compression_ratio), run_eval(use_golden, _clean_text(text), run_sumy(the_method, _clean_text(text), compression_ratio), golden)
20
  elif the_method[0:13] == "Transformers-":
21
+ return run_transformers(the_method, _clean_text(text), compression_ratio), run_eval(use_golden, _clean_text(text), run_transformers(the_method, _clean_text(text), compression_ratio), golden)
22
 
23
+ def run_csv(the_method, csv_input, text_column, compression_ratio=1 / 8, use_golden=False):
24
+ df_original = pd.read_csv(csv_input.name)
25
+ text_series = df_original[text_column]
26
+ text_series = text_series.apply(lambda x: _clean_text(x))
27
 
28
+ if the_method[0:4] == "Sumy":
29
+ result = run_sumy_df(the_method, text_series, compression_ratio)
30
+ the_method_dir = the_method[4:]
31
+ #run_eval(use_golden, df, run_sumy(the_method, df, compression_ratio))
32
+ elif the_method[0:13] == "Transformers-":
33
+ the_method_dir = re.sub(r"[\/]","-",the_method[13:])
34
+ result = run_transformers_df(the_method, text_series, compression_ratio)
35
+ #run_eval(use_golden, df, run_sumy(the_method, df, compression_ratio))
36
+
37
+ column_name = "summary_"+the_method_dir
38
+ df_original[column_name] = result["summary"]
39
+ df_original.to_csv(the_method_dir+"_results.csv", index=False)
40
+ return str(the_method_dir+"_results.csv")
41
+
42
+
43
+ def run_df(the_method, df, compression_ratio=1 / 8, use_golden=False):
44
+
45
+ text_series = df.iloc[:,0].apply(lambda x: _clean_text(x))
46
+ print(text_series)
47
 
48
+ if the_method[0:4] == "Sumy":
49
+ result = run_sumy_df(the_method, text_series, compression_ratio)
50
+ the_method_dir = the_method[4:]
51
+ #run_eval(use_golden, df, run_sumy(the_method, df, compression_ratio))
52
+ elif the_method[0:13] == "Transformers-":
53
+ the_method_dir = re.sub(r"[\/]","-",the_method[13:])
54
+ result = run_transformers_df(the_method, text_series, compression_ratio)
55
+ #run_eval(use_golden, df, run_sumy(the_method, df, compression_ratio))
56
+
57
+ result.to_csv(the_method_dir+"_results.csv", index=False)
58
+ return str(the_method_dir+"_results.csv")
59
 
60
  def _clean_text(content):
61
  if isinstance(content, str):
 
87
  return content
88
 
89
 
90
+ def run_sumy(method, text, compression_ratio):
91
  from sumy.summarizers.random import RandomSummarizer
92
  from sumy.summarizers.luhn import LuhnSummarizer
93
  from sumy.summarizers.lsa import LsaSummarizer
 
98
  from sumy.summarizers.reduction import ReductionSummarizer
99
  from sumy.summarizers.edmundson import EdmundsonSummarizer
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  the_method = method.replace("Sumy", "")
102
  summarizer = locals()[the_method + "Summarizer"]()
103
  sentence_count = int(len(sent_tokenize(text)) * compression_ratio / 100)
104
+ if sentence_count < 1:
105
+ sentence_count = 1
106
  parser = PlaintextParser.from_string(text, Tokenizer("english"))
107
 
108
+ summary = summarizer(parser.document, sentence_count)
 
 
 
 
 
 
 
 
 
 
109
 
110
  text_summary = ""
111
  for s in summary:
 
113
  return text_summary
114
 
115
 
116
+ def run_transformers(method, text, compression_ratio):
117
 
118
  the_method = method.replace("Transformers-", "")
119
  summarizer = pipeline("summarization", model=the_method)
 
127
  return summary
128
 
129
 
130
+ def run_sumy_df(method, texts_series, compression_ratio):
131
+
132
+ from sumy.summarizers.random import RandomSummarizer
133
+ from sumy.summarizers.luhn import LuhnSummarizer
134
+ from sumy.summarizers.lsa import LsaSummarizer
135
+ from sumy.summarizers.lex_rank import LexRankSummarizer
136
+ from sumy.summarizers.text_rank import TextRankSummarizer
137
+ from sumy.summarizers.sum_basic import SumBasicSummarizer
138
+ from sumy.summarizers.kl import KLSummarizer
139
+ from sumy.summarizers.reduction import ReductionSummarizer
140
+ from sumy.summarizers.edmundson import EdmundsonSummarizer
141
+ from sumy.parsers.plaintext import PlaintextParser
142
+ from sumy.nlp.tokenizers import Tokenizer # For Strings
143
+ from sumy.parsers.html import HtmlParser
144
+ from sumy.utils import get_stop_words
145
+ from nltk.tokenize import word_tokenize
146
+ from nltk.corpus import stopwords
147
+ from nltk.stem import WordNetLemmatizer
148
+ from collections import Counter
149
+
150
+ the_method = method.replace("Sumy", "")
151
+ the_summarizer = locals()[the_method + "Summarizer"]()
152
+
153
+ summarizer_output_list = []
154
+ for text in texts_series:
155
+ parser = PlaintextParser.from_string(text, Tokenizer("english"))
156
+ sentence_count = int(len(sent_tokenize(text)) * compression_ratio / 100)
157
+ if sentence_count < 1:
158
+ sentence_count = 1
159
+ summarizer_output_list.append(the_summarizer(parser.document, sentence_count))
160
+
161
+ candidate_summaries = []
162
+ for summarizer_output in summarizer_output_list:
163
+ text_summary = ""
164
+ for sentence in summarizer_output:
165
+ text_summary += str(sentence) + " "
166
+
167
+ candidate_summaries.append(text_summary)
168
+
169
+ results = pd.DataFrame({"text": texts_series, "summary": candidate_summaries})
170
+ return results
171
+
172
+ def run_transformers_df(method, texts_series, compression_ratio):
173
+ from transformers import pipeline
174
+ from nltk.tokenize import word_tokenize
175
+
176
+ the_method = method.replace("Transformers-", "")
177
+ summarizer = pipeline("summarization", model=the_method)
178
+
179
+ aux_summaries_list = []
180
+ for text in texts_series:
181
+ length = 3000
182
+ while len(word_tokenize(text[0:length])) > 450:
183
+ length -= 100
184
+ token_count = len(word_tokenize(text[0:length])) * compression_ratio / 100
185
+ aux_summaries_list.append(summarizer(text[0:length], min_length=int(token_count - 5), max_length=int(token_count + 5)))
186
+
187
+ candidate_summaries = [x[0]["summary_text"] for x in aux_summaries_list]
188
+
189
+ results = pd.DataFrame({"text": texts_series, "summary": candidate_summaries})
190
+ return results
191
+
192
  def run_eval(use_golden, text, summary, golden):
193
  if use_golden:
194
  rouge = run_rouge_eval(summary, golden)
 
357
  with gr.Column(scale=1, min_width=300):
358
  compression_ratio = gr.Slider(
359
  label="Taxa de Compressão (% do tamanho original)",
360
+ value=10,
361
  minimum=1,
362
  maximum=100,
363
  )
 
380
  with gr.Tab("CSV"):
381
  with gr.Column(scale=1, min_width=300):
382
  gr.Checkbox(
383
+ label="Insira abaixo um arquivo CSV com uma coluna de textos a serem sumarizados. Caso opte por avaliar usando golden summaries, estes deverão estar presentes em uma coluna entitulada 'golden'.",
384
  value=False,
385
  interactive=False,
386
  )
387
  with gr.Row():
388
+ with gr.Column(scale=1, min_width=300):
389
+ text_column = gr.Textbox(label="Título da coluna a ser sumarizada", placeholder="text")
390
+ csv_input = gr.File(label="Arquivo .csv de textos")
391
  csv_output = gr.Files(label="Arquivos .csv de resumos e avaliação")
392
  csv_button = gr.Button("Executar")
393
  with gr.Tab("DataFrame"):
 
398
  interactive=False,
399
  )
400
  with gr.Row():
401
+ df_input = gr.DataFrame(headers=["Texto","Golden Summary"],row_count=(1,"dynamic"),col_count=(2,"fixed"))
402
  df_output = gr.Files(label="Arquivos .csv de resumos e avaliação")
403
  df_button = gr.Button("Executar")
404
 
405
  text_button.click(run, inputs=[dropdown, text, compression_ratio, use_golden, golden], outputs=[generated_summary, evaluators])
406
+ csv_button.click(run_csv, inputs=[dropdown, csv_input, text_column, compression_ratio, use_golden], outputs=[csv_output])
407
+ df_button.click(run_df, inputs=[dropdown, df_input, compression_ratio, use_golden], outputs=[df_output])
408
 
409
  demo.launch()