⚡️ 🐛 fix issue of wrong input text, disambiguate vars
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
@@ -45,7 +45,9 @@ from aggregate import BatchAggregator
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
+    contraction_aware_tokenize,
     extract_batches,
+    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
     saves_summary,
@@ -241,10 +243,13 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
-    …
+    logging.info(
+        f"pre-truncation word count: {len(contraction_aware_tokenize(clean_text))}"
+    )
+    truncation_validated = truncate_word_count(clean_text, max_words=max_input_length)

-    if …
-        …
+    if truncation_validated["was_truncated"]:
+        model_input_text = truncation_validated["processed_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -256,7 +261,7 @@ def proc_submission(
         logging.warning(msg)
         history["WARNING"] = msg
     else:
-        …
+        model_input_text = truncation_validated["processed_text"]
         msg = None

     if len(input_text) < 50:
@@ -278,7 +283,7 @@ def proc_submission(
         return msg, "<strong>No summary generated.</strong>", "", []

     _summaries = predict(
-        input_text=…
+        input_text=model_input_text,
         model_name=model_name,
         token_batch_length=token_batch_length,
         **settings,
@@ -410,14 +415,14 @@ def parse_args():
         "--add_beam_option",
         type=int,
         default=None,
-        help=f"Add a beam search option to the …",
+        help=f"Add a beam search option to the demo UI options, default: {pp.pformat(BEAM_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-batch",
         "--token_batch_option",
         type=int,
         default=None,
-        help=f"Add a token batch …",
+        help=f"Add a token batch size to the demo UI options, default: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-level",
@@ -577,7 +582,7 @@ if __name__ == "__main__":
             value="<center><i>Aggregate summary will appear here!</i></center>",
         )
         gr.Markdown(
-            "\n\n_Aggregate summary also appended to the bottom of the `.txt` file._"
+            "\n\n_Aggregate summary is also appended to the bottom of the `.txt` file._"
         )

         gr.Markdown("---")
utils.py
CHANGED
@@ -27,8 +27,8 @@ STOPWORDS = set(
 )


-def custom_tokenize(text: str) -> List[str]:
-    """…"""
+def contraction_aware_tokenize(text: str) -> List[str]:
+    """contraction_aware_tokenize - merges words containing apostrophes as one token."""

     # Tokenize the text using the WhitespaceTokenizer
     tokenizer = WhitespaceTokenizer()
@@ -56,17 +56,21 @@ def custom_tokenize(text: str) -> List[str]:


 def remove_stopwords(
-    text: str, stopwords: List[str] = STOPWORDS, …
+    text: str, stopwords: List[str] = STOPWORDS, contraction_tokenize: bool = True
 ) -> str:
     """
     remove_stopwords - Remove stopwords from text.

     :param str text: input text
     :param List[str] stopwords: list of stopwords, defaults to STOPWORDS
-    :param bool …
+    :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
-    words = …
+    words = (
+        contraction_aware_tokenize(text)
+        if contraction_tokenize
+        else word_tokenize(text)
+    )

     filtered_words = []
     for word in words:
@@ -204,14 +208,14 @@ def truncate_word_count(text: str, max_words=1024) -> dict:
     :param int max_words: the maximum number of words to keep, defaults to 1024
     :return: dict, the processed text
     """
-    words = …
+    words = contraction_aware_tokenize(str(text))
     processed = {}
     if len(words) > max_words:
         processed["was_truncated"] = True
-        processed["…
+        processed["processed_text"] = " ".join(words[:max_words])
     else:
         processed["was_truncated"] = False
-        processed["…
+        processed["processed_text"] = text
     return processed
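contraction_aware_tokenize (renamed from custom_tokenize) matters because NLTK's default word_tokenize splits contractions and possessives into extra tokens, which inflates the word counts used for truncation. The diff shows only the signature and the WhitespaceTokenizer it builds on; a small illustration of the difference (word_tokenize needs the nltk punkt data):

from nltk.tokenize import WhitespaceTokenizer, word_tokenize

text = "Don't touch the user's input."
print(word_tokenize(text))
# ['Do', "n't", 'touch', 'the', 'user', "'s", 'input', '.']  -> 8 "words"
print(WhitespaceTokenizer().tokenize(text))
# ["Don't", 'touch', 'the', "user's", 'input.']  -> 5 words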
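remove_stopwords now takes a contraction_tokenize flag (default True) selecting between the two tokenizers. A usage sketch; the exact output depends on the STOPWORDS set, which the diff does not show:

from utils import remove_stopwords

text = "It's arguably the best of the available options"
merged = remove_stopwords(text)  # "It's" is checked as a single token
split = remove_stopwords(text, contraction_tokenize=False)  # falls back to nltk word_tokenize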
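truncate_word_count now writes the same processed_text key in both branches (the old per-branch keys were cut off by the extraction), so callers like proc_submission can read one key unconditionally. A quick check of that contract, assuming whitespace-separated plain words tokenize one-to-one:

from utils import truncate_word_count

result = truncate_word_count("one two three four five", max_words=3)
assert result["was_truncated"] is True
assert result["processed_text"] == "one two three"

result = truncate_word_count("short text", max_words=3)
assert result["was_truncated"] is False
assert result["processed_text"] == "short text"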