Commit 73feb19 (committed by pszemraj)
1 Parent(s): 77d5469

✨ easily customize app


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1)
  1. app.py +34 -9
app.py CHANGED
@@ -3,6 +3,13 @@ app.py - the main module for the gradio app
 
     Usage:
         python app.py
+
+    Environment Variables:
+        USE_TORCH (str): whether to use torch (1) or not (0)
+        TOKENIZERS_PARALLELISM (str): whether to use parallelism (true) or not (false)
+    Optional Environment Variables:
+        APP_MAX_WORDS (int): the maximum number of words to use for summarization
+        APP_OCR_MAX_PAGES (int): the maximum number of pages to use for OCR
 """
 import contextlib
 import gc
@@ -14,9 +21,7 @@ import time
 from pathlib import Path
 
 os.environ["USE_TORCH"] = "1"
-os.environ[
-    "TOKENIZERS_PARALLELISM"
-] = "false"  # parallelism on tokenizers is buggy with gradio
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 logging.basicConfig(
     level=logging.INFO,
@@ -48,6 +53,10 @@ MODEL_OPTIONS = [
     "pszemraj/pegasus-x-large-book-summary",
 ]  # models users can choose from
 
+# if duplicating the space, uncomment these lines to adjust the limits
+# os.environ["APP_MAX_WORDS"] = str(2048)  # set the max words to 2048
+# os.environ["APP_OCR_MAX_PAGES"] = str(40)  # set the max pages to 40
+
 
 def predict(
     input_text: str,
@@ -105,7 +114,11 @@ def proc_submission(
         length_penalty (float): the length penalty to use
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no repeat ngram size to use
-        max_input_length (int, optional): the maximum input length to use. Defaults to 2048.
+        max_input_length (int, optional): the maximum input length to use. Defaults to 4096.
+
+    Note:
+        the max_input_length is set to 4096 by default, but can be changed by setting the
+        environment variable APP_MAX_WORDS to a different value.
 
     Returns:
         str in HTML format, string of the summary, str of score
@@ -122,6 +135,9 @@ def proc_submission(
         "early_stopping": True,
         "do_sample": False,
    }
+    max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
+    logging.info(f"max_input_length set to: {max_input_length}")
+
    st = time.perf_counter()
    history = {}
    clean_text = clean(input_text, lower=False)
@@ -186,7 +202,7 @@ def proc_submission(
 
    # save to file
    settings["model_name"] = model_name
-    saved_file = saves_summary(_summaries, **settings)
+    saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
 
    return html, sum_text_out, scores_out, saved_file
 
@@ -211,6 +227,8 @@ def load_single_example_text(
        text = clean(raw_text, lower=False)
    elif full_ex_path.suffix == ".pdf":
        logging.info(f"Loading PDF file {full_ex_path}")
+        max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
+        logging.info(f"max_pages set to: {max_pages}")
        conversion_stats = convert_PDF_to_Text(
            full_ex_path,
            ocr_model=ocr_model,
@@ -241,12 +259,14 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
    file_path = Path(file_obj.name)
    try:
        logger.info(f"Loading file:\t{file_path}")
-        if file_path.suffix == ".txt":
+        if file_path.suffix in [".txt", ".md"]:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                raw_text = f.read()
            text = clean(raw_text, lower=lower)
        elif file_path.suffix == ".pdf":
            logger.info(f"loading as PDF file {file_path}")
+            max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
+            logger.info(f"max_pages set to: {max_pages}")
            conversion_stats = convert_PDF_to_Text(
                file_path,
                ocr_model=ocr_model,
@@ -254,8 +274,8 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
            )
            text = conversion_stats["converted_text"]
        else:
-            logger.error(f"Unknown file type {file_path.suffix}")
-            text = "ERROR - check file - unknown file type"
+            logger.error(f"Unknown file type:\t{file_path.suffix}")
+            text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
 
        return text
    except Exception as e:
@@ -276,7 +296,8 @@ if __name__ == "__main__":
    )
    name_to_path = load_example_filenames(_here / "examples")
    logger.info(f"Loaded {len(name_to_path)} examples")
-    demo = gr.Blocks()
+
+    demo = gr.Blocks(title="Document Summarization with Long-Document Transformers")
    _examples = list(name_to_path.keys())
    with demo:
        gr.Markdown("# Document Summarization with Long-Document Transformers")
@@ -318,6 +339,7 @@ if __name__ == "__main__":
        with gr.Row():
            input_text = gr.Textbox(
                lines=4,
+                max_lines=12,
                label="Input Text (for summarization)",
                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
            )
@@ -389,6 +411,9 @@ if __name__ == "__main__":
        gr.Markdown(
            "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
        )
+        gr.Markdown(
+            "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://huggingface.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
+        )
        gr.Markdown("---")
 
        load_examples_button.click(
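
For anyone duplicating the Space or running it locally, here is a minimal sketch of how the new override could be applied, assuming the variable is set before app.py starts so the os.environ.get("APP_MAX_WORDS", ...) lookup introduced in this commit picks it up. The wrapper script name and the value 2048 are illustrative, not part of the commit:

    # launch_with_max_words.py - hypothetical helper, not part of this commit
    import os
    import subprocess

    env = os.environ.copy()
    # proc_submission() reads this via os.environ.get("APP_MAX_WORDS", max_input_length)
    env["APP_MAX_WORDS"] = "2048"

    # start the gradio app with the override visible in its environment
    subprocess.run(["python", "app.py"], env=env, check=True)

On a duplicated Space the same effect comes from setting the variable in the Space settings (as linked in the new gr.Markdown note) rather than using a wrapper script.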