First quick update
app.py CHANGED
@@ -20,7 +20,7 @@ logging.basicConfig(
 
 def proc_submission(
     input_text: str,
-
+    model_type: str,
     num_beams,
     token_batch_length,
     length_penalty,
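The new `model_type` argument presumably receives the value of the `model_size` radio defined further down in the UI. As a rough sketch of how that wiring typically looks in Gradio (the button and output component names here are assumptions, not taken from this diff):

    # Hypothetical wiring: how the radio value would reach proc_submission.
    # `summarize_button` and `output_text` are assumed names; the remaining
    # slider inputs (repetition penalty, etc.) are omitted for brevity.
    summarize_button.click(
        fn=proc_submission,
        inputs=[input_text, model_size, num_beams, token_batch_length, length_penalty],
        outputs=[output_text],
    )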
@@ -51,15 +51,15 @@ def proc_submission(
         "no_repeat_ngram_size": int(no_repeat_ngram_size),
         "encoder_no_repeat_ngram_size": 4,
         "num_beams": int(num_beams),
-        "min_length":
+        "min_length": 11,
         "max_length": int(token_batch_length // 4),
         "early_stopping": True,
-        "do_sample": False,
+        # "do_sample": False,
     }
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    max_input_length = 2048 if
+    # max_input_length = 2048 if model_type == "tldr" else max_input_length
     processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
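The `settings` dict maps one-to-one onto Hugging Face `generate()` keyword arguments, so commenting out `"do_sample": False` has no behavioral effect (sampling is already off by default), and `max_length` is derived as a quarter of the token batch length. A minimal, self-contained sketch of how such a dict is consumed; the `t5-small` checkpoint is a stand-in, not the model this app loads:

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    # Stand-in checkpoint; the app loads its own fine-tuned LongT5 models.
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

    token_batch_length = 1024
    settings = {
        "no_repeat_ngram_size": 3,
        "encoder_no_repeat_ngram_size": 4,
        "num_beams": 2,
        "min_length": 11,
        "max_length": int(token_batch_length // 4),
        "early_stopping": True,
    }

    # The dict is unpacked directly into generate(), which is why its keys
    # must match generate()'s keyword arguments exactly.
    inputs = tokenizer("summarize: " + "some long text " * 50, return_tensors="pt")
    output_ids = model.generate(inputs.input_ids, **settings)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))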
@@ -73,8 +73,8 @@ def proc_submission(
 
     _summaries = summarize_via_tokenbatches(
         tr_in,
-        model,
-        tokenizer,
+        model_tldr if model_type == "tldr" else model,
+        tokenizer_tldr if model_type == "tldr" else tokenizer,
         batch_length=token_batch_length,
         **settings,
     )
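`summarize_via_tokenbatches` itself is defined elsewhere in the repo; the diff only swaps which model/tokenizer pair it receives, using conditional expressions so the TLDR checkpoint is picked when `model_type == "tldr"`. For intuition, a hypothetical reconstruction of the token-batching idea, a sketch under assumptions rather than the repo's actual implementation:

    def summarize_via_tokenbatches_sketch(text, model, tokenizer, batch_length=1024, **generate_kwargs):
        """Hypothetical sketch: chunk the input into token batches and summarize each."""
        input_ids = tokenizer(text, return_tensors="pt").input_ids[0]
        summaries = []
        for start in range(0, len(input_ids), batch_length):
            # Take one batch_length-sized slice of tokens and add a batch dim.
            chunk = input_ids[start : start + batch_length].unsqueeze(0)
            output_ids = model.generate(chunk, **generate_kwargs)
            summaries.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))
        return summaries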
@@ -144,8 +144,8 @@ def load_uploaded_file(file_obj):
 
 if __name__ == "__main__":
 
-    model, tokenizer = load_model_and_tokenizer("
-
+    model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
+    model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")
 
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
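Both checkpoints are LongT5 fine-tunes (one on SumPubMed for the long synopsis, one on SciTLDR for the extreme summary, going by their names), loaded through the app's own `load_model_and_tokenizer` helper, whose body is not part of this diff. A plausible minimal implementation, assuming a standard seq2seq checkpoint; this is an assumption, not the repo's actual code:

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    def load_model_and_tokenizer(model_name: str):
        # Assumed implementation: fetch a seq2seq checkpoint and its tokenizer,
        # returning them in the (model, tokenizer) order the caller unpacks.
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return model, tokenizer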
@@ -153,9 +153,9 @@ if __name__ == "__main__":
 
     with demo:
 
-        gr.Markdown("#
+        gr.Markdown("# Automatic summarization of biomedical research papers with neural abstractive methods into a long and comprehensive synopsis or extreme TLDR summary version")
         gr.Markdown(
-            "A simple demo using
+            "A rather simple demo using an ad-hoc fine-tuned LongT5 or LED model to summarize long biomedical articles (or any scientific text related to the biomedical domain) into a detailed or extreme TLDR version."
         )
         with gr.Column():
 
@@ -165,7 +165,7 @@ if __name__ == "__main__":
             )
             with gr.Row():
                 model_size = gr.Radio(
-                    choices=["
+                    choices=["tldr", "sumpubmed"], label="Model Variant", value="sumpubmed"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
@@ -173,7 +173,7 @@ if __name__ == "__main__":
|
|
173 |
value=2,
|
174 |
)
|
175 |
gr.Markdown(
|
176 |
-
"_The
|
177 |
)
|
178 |
with gr.Row():
|
179 |
length_penalty = gr.inputs.Slider(
|
@@ -213,7 +213,7 @@ if __name__ == "__main__":
                 input_text = gr.Textbox(
                     lines=6,
                     label="Input Text (for summarization)",
-                    placeholder="Enter text to
+                    placeholder="Enter any scientific text to be condensed into a long and comprehensive digested format or an extreme TLDR summary version. The text will be preprocessed and truncated if necessary to fit within the computational constraints. The models were trained on long scientific papers but generalize reasonably well to shorter documents such as abstracts. It might take a while to produce long summaries :)",
                 )
                 gr.Markdown("Upload your own file:")
                 with gr.Row():
@@ -259,7 +259,7 @@ if __name__ == "__main__":
                 "- The two most important parameters, empirically, are `num_beams` and `token_batch_length`. However, increasing these will also increase the time it takes to generate a summary. The `length_penalty` and `repetition_penalty` parameters are also important for generating good summaries."
             )
             gr.Markdown(
-                "- The model can be
+                "- The model can be "
             )
             gr.Markdown("---")
 