summarize-long-text

Sleeping

App Files Community

Peter committed on May 23, 2022

Commit

66e7228

•

1 Parent(s): e05a3b5

:art: apply black

Browse files

Files changed (2) hide show

app.py +23 -13
summarize.py +39 -37

app.py CHANGED Viewed

@@ -17,6 +17,7 @@ import transformers
 transformers.logging.set_verbosity_error()
 logging.basicConfig()
 def truncate_word_count(text, max_words=512):
     """
     truncate_word_count - a helper function for the gradio module
@@ -38,6 +39,7 @@ def truncate_word_count(text, max_words=512):
         processed["truncated_text"] = text
     return processed
 def proc_submission(
     input_text: str,
     num_beams,
@@ -80,15 +82,15 @@ def proc_submission(
         history["was_truncated"] = False
     _summaries = summarize_via_tokenbatches(
-                    history["input_text"],
-                    model, tokenizer,
-                    batch_length=token_batch_length,
-                    **settings,
-                )
-    sum_text = [s['summary'][0] for s in _summaries]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in _summaries]
     history["Input"] = input_text
     history["Summary Text"] = "\n\t".join(sum_text)
     history["Summary Scores"] = "\n".join(sum_scores)
@@ -104,7 +106,8 @@ def proc_submission(
     return html
-def load_examples(examples_dir='examples'):
     src = _here / examples_dir
     src.mkdir(exist_ok=True)
     examples = [f for f in src.glob("*.txt")]
@@ -113,15 +116,18 @@ def load_examples(examples_dir='examples'):
     for example in examples:
         with open(example, "r") as f:
             text = f.read()
-            text_examples.append([text, 4, 2048, 0.7,3.5,3])
     return text_examples
 if __name__ == "__main__":
-    model, tokenizer = load_model_and_tokenizer('pszemraj/led-large-book-summary')
     title = "Long-form text summarization with LED on the BookSumm dataset"
-    description = "This is a simple example of using the LED model to summarize a long-form text."
     gr.Interface(
         proc_submission,
@@ -130,7 +136,11 @@ if __name__ == "__main__":
             gr.inputs.Slider(
                 minimum=4, maximum=10, label="num_beams", default=4, step=1
             ),
-            gr.Dropdown(choices=[512, 1024, 2048, 4096], label="token_batch_length", default=2048),
             gr.inputs.Slider(
                 minimum=0.5, maximum=1.1, label="length_penalty", default=0.7, step=0.05
             ),
@@ -150,4 +160,4 @@ if __name__ == "__main__":
         title=title,
         description=description,
         examples=load_examples(),
-    ).launch(enable_queue=True, share=True)

 transformers.logging.set_verbosity_error()
 logging.basicConfig()
 def truncate_word_count(text, max_words=512):
     """
     truncate_word_count - a helper function for the gradio module
         processed["truncated_text"] = text
     return processed
 def proc_submission(
     input_text: str,
     num_beams,
         history["was_truncated"] = False
     _summaries = summarize_via_tokenbatches(
+        history["input_text"],
+        model,
+        tokenizer,
+        batch_length=token_batch_length,
+        **settings,
+    )
+    sum_text = [s["summary"][0] for s in _summaries]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in _summaries]
     history["Input"] = input_text
     history["Summary Text"] = "\n\t".join(sum_text)
     history["Summary Scores"] = "\n".join(sum_scores)
     return html
+def load_examples(examples_dir="examples"):
     src = _here / examples_dir
     src.mkdir(exist_ok=True)
     examples = [f for f in src.glob("*.txt")]
     for example in examples:
         with open(example, "r") as f:
             text = f.read()
+            text_examples.append([text, 4, 2048, 0.7, 3.5, 3])
     return text_examples
 if __name__ == "__main__":
+    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
     title = "Long-form text summarization with LED on the BookSumm dataset"
+    description = (
+        "This is a simple example of using the LED model to summarize a long-form text."
+    )
     gr.Interface(
         proc_submission,
             gr.inputs.Slider(
                 minimum=4, maximum=10, label="num_beams", default=4, step=1
             ),
+            gr.Dropdown(
+                choices=[512, 1024, 2048, 4096],
+                label="token_batch_length",
+                default=2048,
+            ),
             gr.inputs.Slider(
                 minimum=0.5, maximum=1.1, label="length_penalty", default=0.7, step=0.05
             ),
         title=title,
         description=description,
         examples=load_examples(),
+    ).launch(enable_queue=True, share=True)

summarize.py CHANGED Viewed

@@ -2,6 +2,7 @@ import torch
 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 def load_model_and_tokenizer(model_name):
     """
     load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
@@ -14,14 +15,15 @@ def load_model_and_tokenizer(model_name):
     """
     model = AutoModelForSeq2SeqLM.from_pretrained(
-    model_name,
-    low_cpu_mem_usage=True,
-    use_cache=False,
-)
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = model.to("cuda") if torch.cuda.is_available() else model
     return model, tokenizer
 def summarize_and_score(ids, mask, model, tokenizer, **kwargs):
     """
     summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary
@@ -36,43 +38,43 @@ def summarize_and_score(ids, mask, model, tokenizer, **kwargs):
         str: the summary of the batch
     """
     ids = ids[None, :]
     mask = mask[None, :]
     input_ids = ids.to("cuda") if torch.cuda.is_available() else ids
     attention_mask = mask.to("cuda") if torch.cuda.is_available() else mask
     attention_mask = mask.to("cuda")
     global_attention_mask = torch.zeros_like(attention_mask)
     # put global attention on <s> token
     global_attention_mask[:, 0] = 1
     summary_pred_ids = model.generate(
-            input_ids,
-            attention_mask=attention_mask,
-            global_attention_mask=global_attention_mask,
-            output_scores=True,
-            return_dict_in_generate=True,
-            **kwargs
-        )
     summary = tokenizer.batch_decode(
-                summary_pred_ids.sequences,
-                skip_special_tokens=True,
-                remove_invalid_values=True,
-            )
     score = round(summary_pred_ids.sequences_scores.cpu().numpy()[0], 4)
     return summary, score
 def summarize_via_tokenbatches(
-        input_text:str,
-        model, tokenizer,
-        batch_length=2048,
-        batch_stride=16,
-        **kwargs,
-    ):
     """
     summarize_via_tokenbatches - a function that takes a string and returns a summary
@@ -88,15 +90,15 @@ def summarize_via_tokenbatches(
     """
     encoded_input = tokenizer(
-                        input_text,
-                        padding='max_length',
-                        truncation=True,
-                        max_length=batch_length,
-                        stride=batch_stride,
-                        return_overflowing_tokens=True,
-                        add_special_tokens =False,
-                        return_tensors='pt',
-                    )
     in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
     gen_summaries = []
@@ -112,11 +114,11 @@ def summarize_via_tokenbatches(
             tokenizer=tokenizer,
             **kwargs,
         )
-        score = round(float(score),4)
         _sum = {
-            "input_tokens":_id,
-            "summary":result,
-            "summary_score":score,
         }
         gen_summaries.append(_sum)
         print(f"\t{result[0]}\nScore:\t{score}")
@@ -124,4 +126,4 @@ def summarize_via_tokenbatches(
     pbar.close()
-    return gen_summaries

 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 def load_model_and_tokenizer(model_name):
     """
     load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
     """
     model = AutoModelForSeq2SeqLM.from_pretrained(
+        model_name,
+        low_cpu_mem_usage=True,
+        use_cache=False,
+    )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = model.to("cuda") if torch.cuda.is_available() else model
     return model, tokenizer
 def summarize_and_score(ids, mask, model, tokenizer, **kwargs):
     """
     summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary
         str: the summary of the batch
     """
     ids = ids[None, :]
     mask = mask[None, :]
     input_ids = ids.to("cuda") if torch.cuda.is_available() else ids
     attention_mask = mask.to("cuda") if torch.cuda.is_available() else mask
     attention_mask = mask.to("cuda")
     global_attention_mask = torch.zeros_like(attention_mask)
     # put global attention on <s> token
     global_attention_mask[:, 0] = 1
     summary_pred_ids = model.generate(
+        input_ids,
+        attention_mask=attention_mask,
+        global_attention_mask=global_attention_mask,
+        output_scores=True,
+        return_dict_in_generate=True,
+        **kwargs,
+    )
     summary = tokenizer.batch_decode(
+        summary_pred_ids.sequences,
+        skip_special_tokens=True,
+        remove_invalid_values=True,
+    )
     score = round(summary_pred_ids.sequences_scores.cpu().numpy()[0], 4)
     return summary, score
 def summarize_via_tokenbatches(
+    input_text: str,
+    model,
+    tokenizer,
+    batch_length=2048,
+    batch_stride=16,
+    **kwargs,
+):
     """
     summarize_via_tokenbatches - a function that takes a string and returns a summary
     """
     encoded_input = tokenizer(
+        input_text,
+        padding="max_length",
+        truncation=True,
+        max_length=batch_length,
+        stride=batch_stride,
+        return_overflowing_tokens=True,
+        add_special_tokens=False,
+        return_tensors="pt",
+    )
     in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
     gen_summaries = []
             tokenizer=tokenizer,
             **kwargs,
         )
+        score = round(float(score), 4)
         _sum = {
+            "input_tokens": _id,
+            "summary": result,
+            "summary_score": score,
         }
         gen_summaries.append(_sum)
         print(f"\t{result[0]}\nScore:\t{score}")
     pbar.close()
+    return gen_summaries