pszemraj committed
Commit f84fce9 • 1 Parent(s): dcce2ac

📝 docs

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1):
  1. app.py +21 -4
app.py CHANGED
@@ -212,6 +212,7 @@ def proc_submission(
         length_penalty (float): the length penalty to use
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no repeat ngram size to use
+        predrop_stopwords (bool): whether to pre-drop stopwords before truncating/summarizing
         max_input_length (int, optional): the maximum input length to use. Defaults to 6144.

     Note:
@@ -219,7 +220,7 @@ def proc_submission(
         environment variable APP_MAX_WORDS to a different value.

     Returns:
-        str in HTML format, string of the summary, str of score
+        tuple (4): a tuple containing the following:
     """

     remove_stagnant_files()  # clean up old files
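
Side note on the new predrop_stopwords argument documented above: the idea is to strip common stopwords from the submission before the word-count truncation so that more content-bearing words fit inside max_input_length. A minimal sketch of that kind of pre-drop, with a made-up stopword list (the app's own stopword handling lives elsewhere in app.py and may differ):

# Illustrative sketch only: a simple stopword pre-drop ahead of word-count truncation.
# The stopword list and function names here are assumptions, not the app's own helpers.
STOPWORDS = {"the", "a", "an", "and", "or", "of", "to", "in", "is", "it", "that", "this"}

def predrop_stopwords_text(text: str) -> str:
    """Remove common stopwords so truncation keeps more content-bearing words."""
    return " ".join(w for w in text.split() if w.lower() not in STOPWORDS)

def truncate_words(text: str, max_words: int = 6144) -> str:
    """Keep at most max_words whitespace-delimited words (mirrors max_input_length)."""
    return " ".join(text.split()[:max_words])
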
@@ -257,7 +258,7 @@ def proc_submission(
         msg = f"""
         <div style="background-color: #FFA500; color: white; padding: 20px;">
         <h3>Warning</h3>
-        <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/len(input_wc):.2f}% of the submission.</p>
+        <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/input_wc:.2f}% of the original text.</p>
         <p>Dropping stopwords is set to {predrop_stopwords}. If this is not what you intended, please validate the advanced settings.</p>
         </div>
         """
@@ -267,6 +268,22 @@ def proc_submission(
         model_input_text = truncation_validated["processed_text"]
         msg = None

+    if predrop_stopwords:
+        # TODO: remove this
+
+        outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
+        outdir.mkdir(parents=True, exist_ok=True)
+        keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
+        keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
+        cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
+        cln_outdir = outdir.parent / "source-text"
+        cln_outdir.mkdir(parents=True, exist_ok=True)
+        with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
+            f.write(cln_text)
+        sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
+        with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
+            f.write(model_input_text)
+        logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
     if len(input_text) < 50:
         # this is essentially a different case from the above
         msg = f"""
@@ -589,8 +606,8 @@ if __name__ == "__main__":
            )
            gr.Markdown(
                f"""Aggregate the above batches into a cohesive summary.
-                - a secondary instruct-tuned LM consolidates info from the batches
-                - current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
+                - A secondary instruct-tuned LM consolidates info
+                - Current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
                 """
            )
        with gr.Column(variant="panel"):
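
A minimal sketch of the aggregation step this Markdown describes: join the per-batch summaries into one instruction prompt and let an instruct-tuned model consolidate them. The prompt wording, pipeline task, and generation settings below are assumptions for illustration; the app's actual aggregation code lives elsewhere in the repo, and the AGGREGATE_MODEL id here is a placeholder.

from transformers import pipeline

AGGREGATE_MODEL = "your-org/your-instruct-model"  # placeholder; the app sets its own AGGREGATE_MODEL

def aggregate_batches(batch_summaries: list[str]) -> str:
    """Consolidate per-batch summaries into one cohesive summary (illustrative sketch)."""
    aggregator = pipeline("text2text-generation", model=AGGREGATE_MODEL)
    prompt = (
        "Combine the following partial summaries into a single cohesive summary:\n\n"
        + "\n\n".join(f"- {s}" for s in batch_summaries)
    )
    return aggregator(prompt, max_new_tokens=512)[0]["generated_text"]
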
 