Blaise-g committed
Commit e154b2a · 1 Parent(s): 046f8fb

First quick update

Files changed (1): app.py (+14 -14)
app.py CHANGED
@@ -20,7 +20,7 @@ logging.basicConfig(
 
 def proc_submission(
     input_text: str,
-    model_size: str,
+    model_type: str,
     num_beams,
     token_batch_length,
     length_penalty,
@@ -51,15 +51,15 @@ def proc_submission(
         "no_repeat_ngram_size": int(no_repeat_ngram_size),
         "encoder_no_repeat_ngram_size": 4,
         "num_beams": int(num_beams),
-        "min_length": 4,
+        "min_length": 11,
         "max_length": int(token_batch_length // 4),
         "early_stopping": True,
-        "do_sample": False,
+        # "do_sample": False,
     }
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    max_input_length = 2048 if model_size == "base" else max_input_length
+    # max_input_length = 2048 if model_type == "tldr" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
@@ -73,8 +73,8 @@ def proc_submission(
 
     _summaries = summarize_via_tokenbatches(
         tr_in,
-        model_sm if model_size == "base" else model,
-        tokenizer_sm if model_size == "base" else tokenizer,
+        model_tldr if model_type == "tldr" else model,
+        tokenizer_tldr if model_type == "tldr" else tokenizer,
         batch_length=token_batch_length,
         **settings,
     )
@@ -144,8 +144,8 @@ def load_uploaded_file(file_obj):
 
 if __name__ == "__main__":
 
-    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
-    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
+    model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
+    model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")
 
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
@@ -153,9 +153,9 @@ if __name__ == "__main__":
 
     with demo:
 
-        gr.Markdown("# Long-Form Summarization: LED & BookSum")
+        gr.Markdown("# Automatic summarization of biomedical research papers with neural abstractive methods: a comprehensive synopsis or an extreme TLDR")
         gr.Markdown(
-            "A simple demo using a fine-tuned LED model to summarize long-form text. See [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with GPU inference (much faster) on Colab."
+            "A simple demo using an ad-hoc fine-tuned LongT5 or LED model to summarize long biomedical articles (or any scientific text in the biomedical domain) into either a detailed synopsis or an extreme TLDR version."
         )
         with gr.Column():
 
@@ -165,7 +165,7 @@ if __name__ == "__main__":
             )
             with gr.Row():
                 model_size = gr.Radio(
-                    choices=["base", "large"], label="Model Variant", value="large"
+                    choices=["tldr", "sumpubmed"], label="Model Variant", value="sumpubmed"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
@@ -173,7 +173,7 @@ if __name__ == "__main__":
                     value=2,
                 )
                 gr.Markdown(
-                    "_The base model is less performant than the large model, but is faster and will accept up to 2048 words per input (large model accepts up to 768)._"
+                    "_The LED model is less performant than the LongT5 model, but it is faster and accepts up to 2048 words per input (the LongT5 model accepts up to 768)._"
                 )
                 with gr.Row():
                     length_penalty = gr.inputs.Slider(
@@ -213,7 +213,7 @@ if __name__ == "__main__":
             input_text = gr.Textbox(
                 lines=6,
                 label="Input Text (for summarization)",
-                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
+                placeholder="Enter any scientific text to be condensed into either a comprehensive synopsis or an extreme TLDR summary; the text will be preprocessed and truncated if necessary to fit the computational constraints. The models were trained on long scientific papers but generalize reasonably well to shorter documents such as abstracts. Long summaries may take a while to generate :)",
             )
             gr.Markdown("Upload your own file:")
             with gr.Row():
@@ -259,7 +259,7 @@ if __name__ == "__main__":
                 "- The two most important parameters, empirically, are `num_beams` and `token_batch_length`. However, increasing these will also increase the time it takes to generate a summary. The `length_penalty` and `repetition_penalty` parameters are also important for generating good summaries."
             )
             gr.Markdown(
-                "- The model can be used with tag [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for details on usage & a notebook for a tutorial."
+                "- The models can be used with the tags [Blaise-g/longt5_tglobal_large_sumpubmed](https://huggingface.co/Blaise-g/longt5_tglobal_large_sumpubmed) and [Blaise-g/longt5_tglobal_large_scitldr](https://huggingface.co/Blaise-g/longt5_tglobal_large_scitldr). See the model cards for details on usage."
             )
             gr.Markdown("---")
 
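
For context, a minimal sketch of what the new checkpoints and the renamed model_type argument amount to. It assumes load_model_and_tokenizer wraps the standard transformers auto classes; the real helper is defined elsewhere in app.py and is not part of this diff.

# Sketch under the assumption that load_model_and_tokenizer wraps the
# standard transformers loaders (LongT5 is supported by the auto classes).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def load_model_and_tokenizer(model_name: str):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return model, tokenizer

# The two checkpoints swapped in by this commit: a detailed-synopsis model
# fine-tuned on SumPubMed and an extreme-TLDR model fine-tuned on SciTLDR.
model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")

# The renamed model_type argument selects the pair at inference time,
# mirroring the conditional in proc_submission above.
model_type = "tldr"  # value supplied by the Model Variant radio button
active_model = model_tldr if model_type == "tldr" else model
active_tokenizer = tokenizer_tldr if model_type == "tldr" else tokenizer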
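
Continuing that sketch, this is roughly how the adjusted settings dict would drive generation, assuming summarize_via_tokenbatches forwards it as keyword arguments to model.generate for each token batch (that helper is also not shown in this diff; the numeric values below are example UI inputs, not fixed defaults).

# Continuation of the sketch above.
token_batch_length = 2048  # example value; the real choices come from the UI

settings = {
    "no_repeat_ngram_size": 3,           # example radio/slider value
    "encoder_no_repeat_ngram_size": 4,
    "num_beams": 2,
    "min_length": 11,                    # raised from 4 in this commit
    "max_length": int(token_batch_length // 4),  # 512 tokens here
    "early_stopping": True,
    # "do_sample": False was commented out in this commit; generate() already
    # defaults to do_sample=False, so beam search stays deterministic anyway.
}

batch = active_tokenizer(
    "Background: ...",  # one cleaned token batch of the input document
    truncation=True,
    max_length=token_batch_length,
    return_tensors="pt",
)
summary_ids = active_model.generate(**batch, **settings)
print(active_tokenizer.decode(summary_ids[0], skip_special_tokens=True))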