Blaise-g committed
Commit e154b2a · 1 Parent(s): 046f8fb

First quick update

Files changed (1): app.py (+14 -14)
app.py CHANGED
@@ -20,7 +20,7 @@ logging.basicConfig(
 
 def proc_submission(
     input_text: str,
-    model_size: str,
+    model_type: str,
     num_beams,
     token_batch_length,
     length_penalty,
@@ -51,15 +51,15 @@ def proc_submission(
         "no_repeat_ngram_size": int(no_repeat_ngram_size),
         "encoder_no_repeat_ngram_size": 4,
         "num_beams": int(num_beams),
-        "min_length": 4,
+        "min_length": 11,
         "max_length": int(token_batch_length // 4),
         "early_stopping": True,
-        "do_sample": False,
+        # "do_sample": False,
     }
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    max_input_length = 2048 if model_size == "base" else max_input_length
+    # max_input_length = 2048 if model_type == "tldr" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
@@ -73,8 +73,8 @@ def proc_submission(
 
     _summaries = summarize_via_tokenbatches(
         tr_in,
-        model_sm if model_size == "base" else model,
-        tokenizer_sm if model_size == "base" else tokenizer,
+        model_tldr if model_type == "tldr" else model,
+        tokenizer_tldr if model_type == "tldr" else tokenizer,
         batch_length=token_batch_length,
         **settings,
     )
@@ -144,8 +144,8 @@ def load_uploaded_file(file_obj):
 
 if __name__ == "__main__":
 
-    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
-    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
+    model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
+    model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")
 
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
@@ -153,9 +153,9 @@ if __name__ == "__main__":
 
     with demo:
 
-        gr.Markdown("# Long-Form Summarization: LED & BookSum")
+        gr.Markdown("# Automatic summarization of biomedical research papers with neural abstractive methods: a comprehensive synopsis or an extreme TLDR")
         gr.Markdown(
-            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
+            "A simple demo using an ad-hoc fine-tuned LongT5 or LED model to summarize long biomedical articles (or any scientific text in the biomedical domain) into either a detailed synopsis or an extreme TLDR version."
         )
         with gr.Column():
 
@@ -165,7 +165,7 @@ if __name__ == "__main__":
             )
             with gr.Row():
                 model_size = gr.Radio(
-                    choices=["base", "large"], label="Model Variant", value="large"
+                    choices=["tldr", "sumpubmed"], label="Model Variant", value="sumpubmed"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
@@ -173,7 +173,7 @@ if __name__ == "__main__":
                     value=2,
                 )
                 gr.Markdown(
-                    "_The base model is less performant than the large model, but is faster and will accept up to 2048 words per input (large model accepts up to 768)._"
+                    "_The LED model is less performant than the LongT5 model, but it is faster and accepts up to 2048 words per input (the LongT5 model accepts up to 768)._"
                 )
                 with gr.Row():
                     length_penalty = gr.inputs.Slider(
@@ -213,7 +213,7 @@ if __name__ == "__main__":
             input_text = gr.Textbox(
                 lines=6,
                 label="Input Text (for summarization)",
-                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
+                placeholder="Enter any scientific text to be condensed into either a comprehensive synopsis or an extreme TLDR summary; the text will be preprocessed and truncated if necessary to fit the computational constraints. The models were trained on long scientific papers but generalize reasonably well to shorter documents such as abstracts. Long summaries may take a while to generate :)",
             )
             gr.Markdown("Upload your own file:")
             with gr.Row():
@@ -259,7 +259,7 @@ if __name__ == "__main__":
                 "- The two most important parameters, empirically, are `num_beams` and `token_batch_length`. However, increasing these will also increase the time it takes to generate a summary. The `length_penalty` and `repetition_penalty` parameters are also important for generating good summaries."
             )
             gr.Markdown(
-                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial."
+                "- The models can be used with the tags [Blaise-g/longt5_tglobal_large_sumpubmed](https://huggingface.co/Blaise-g/longt5_tglobal_large_sumpubmed) and [Blaise-g/longt5_tglobal_large_scitldr](https://huggingface.co/Blaise-g/longt5_tglobal_large_scitldr). See the model cards for details on usage."
             )
             gr.Markdown("---")
 
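
For context, a minimal sketch of what the new checkpoints and the renamed model_type argument amount to. It assumes load_model_and_tokenizer wraps the standard transformers auto classes; the real helper is defined elsewhere in app.py and is not part of this diff.

# Sketch under the assumption that load_model_and_tokenizer wraps the
# standard transformers loaders (LongT5 is supported by the auto classes).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def load_model_and_tokenizer(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return model, tokenizer

# The two checkpoints swapped in by this commit: a detailed-synopsis model
# fine-tuned on SumPubMed and an extreme-TLDR model fine-tuned on SciTLDR.
model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")

# The renamed model_type argument selects the pair at inference time,
# mirroring the conditional in proc_submission above.
model_type = "tldr"  # value supplied by the Model Variant radio button
active_model = model_tldr if model_type == "tldr" else model
active_tokenizer = tokenizer_tldr if model_type == "tldr" else tokenizer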
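
Continuing that sketch, this is roughly how the adjusted settings dict would drive generation, assuming summarize_via_tokenbatches forwards it as keyword arguments to model.generate for each token batch (that helper is also not shown in this diff; the numeric values below are example UI inputs, not fixed defaults).

# Continuation of the sketch above.
token_batch_length = 2048  # example value; the real choices come from the UI

settings = {
    "no_repeat_ngram_size": 3,           # example radio/slider value
    "encoder_no_repeat_ngram_size": 4,
    "num_beams": 2,
    "min_length": 11,                    # raised from 4 in this commit
    "max_length": int(token_batch_length // 4),  # 512 tokens here
    "early_stopping": True,
    # "do_sample": False was commented out in this commit; generate() already
    # defaults to do_sample=False, so beam search stays deterministic anyway.
}

batch = active_tokenizer(
    "Background: ...",  # one cleaned token batch of the input document
    truncation=True,
    max_length=token_batch_length,
    return_tensors="pt",
)
summary_ids = active_model.generate(**batch, **settings)
print(active_tokenizer.decode(summary_ids[0], skip_special_tokens=True))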