ksvmuralidhar committed on
Commit
1f114ec
·
verified ·
1 Parent(s): aa42935

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -7
app.py CHANGED
@@ -220,20 +220,35 @@ def summ_inference_tokenize(input_: list, n_tokens: int):
220
  tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
221
  return summ_tokenizer, tokenized_data
222
 
223
def summ_inference(txt: str):
    """Summarize one text with the summarization model.

    The text is preprocessed, tokenized as a single-item batch, passed to
    ``summ_model.generate`` and the first generated sequence is decoded.
    Anything that looks like an angle-bracket tag (e.g. special tokens) is
    stripped from the decoded string before it is returned.
    """
    cleaned = summ_preprocess(txt)
    inference_tokenizer, tokenized_data = summ_inference_tokenize(
        input_=[cleaned],
        n_tokens=SUMM_INPUT_N_TOKENS,
    )
    pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
    decoded = inference_tokenizer.decode(pred[0])
    # Remove "<...>" markers left in the decoded text, then trim whitespace.
    return re.sub("<.*?>", "", decoded).strip()
231
  ############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
232
 
233
  ############## ENTRY POINT START #######################
234
  def main():
235
  st.markdown('''<h3>News Summarizer and NER</h3>
236
- <p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a></p>
 
 
 
 
 
237
  ''', unsafe_allow_html=True)
238
  input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
239
  horizontal=True)
 
220
  tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
221
  return summ_tokenizer, tokenized_data
222
 
223
def clean_summary(summary: str) -> str:
    """Tidy a generated summary so it ends on a complete sentence.

    Steps:
      1. Trim surrounding whitespace.
      2. If the text does not end with a period, drop the trailing
         (presumably truncated) fragment after the last ". " and
         terminate the remainder with a period.
      3. Remove a single leading "-" and trim again.
      4. Treat anything of 5 characters or fewer as no summary at all.

    Args:
        summary: Raw summary text.

    Returns:
        The cleaned summary, or "" when nothing usable remains.
    """
    summary = summary.strip()
    # Guard: indexing summary[-1] on an empty string would raise IndexError.
    if not summary:
        return ""
    if not summary.endswith("."):
        # Keep only the sentences that were completed; the last piece has
        # no terminating period, so it is discarded.
        sents = summary.split(". ")
        summary = ". ".join(sents[:-1])
        summary += "."
    summary = re.sub(r'^-', "", summary)
    summary = summary.strip()
    # Too short to be a meaningful summary (also catches a lone ".").
    if len(summary) <= 5:
        summary = ""
    return summary
234
+
235
def summ_inference(txts: list):
    """Summarize a batch of texts with the summarization model.

    Each text is preprocessed, the whole batch is tokenized and passed to
    ``summ_model.generate``, and every generated sequence is decoded
    (special tokens skipped) and cleaned with ``clean_summary``.

    Args:
        txts: List of raw texts to summarize.

    Returns:
        A list of summaries aligned with ``txts``. An entry is "" when the
        corresponding text is empty after preprocessing, or when the model
        produced no usable text for it.
    """
    txts = [summ_preprocess(txt) for txt in txts]
    # Nothing to summarize: avoid calling the tokenizer/model on an empty batch.
    if not txts:
        return []
    inference_tokenizer, tokenized_data = summ_inference_tokenize(input_=txts, n_tokens=SUMM_INPUT_N_TOKENS)
    pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
    result = []
    for txt, p in zip(txts, pred):
        # Empty inputs still go through generate (to keep the batch aligned)
        # but their output is ignored.
        if txt == "":
            result.append("")
            continue
        decoded = inference_tokenizer.decode(p, skip_special_tokens=True)
        # An empty decode would make clean_summary index into "", so it is
        # mapped to "" here directly.
        result.append(clean_summary(decoded) if decoded.strip() else "")
    return result
241
  ############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
242
 
243
  ############## ENTRY POINT START #######################
244
  def main():
245
  st.markdown('''<h3>News Summarizer and NER</h3>
246
+ <p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a>
247
+ <br>
248
+ The app works best in summarizing <a href="https://edition.cnn.com/">CNN</a> and <a href="https://www.dailymail.co.uk/home/index.html">Daily Mail</a> news articles,
249
+ as the BART model is fine-tuned on them.
250
+ </p>
251
+
252
  ''', unsafe_allow_html=True)
253
  input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
254
  horizontal=True)