Spaces:

ksvmuralidhar
/

news_summarizer_ner

Sleeping

App Files Files Community

ksvmuralidhar committed on Apr 14, 2024

Commit

1f114ec

verified ·

1 Parent(s): aa42935

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -7

app.py CHANGED Viewed

@@ -220,20 +220,35 @@ def summ_inference_tokenize(input_: list, n_tokens: int):
     tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
     return summ_tokenizer, tokenized_data
-def summ_inference(txt: str):
-    txt = summ_preprocess(txt)
-    test_data = [txt]
-    inference_tokenizer, tokenized_data = summ_inference_tokenize(input_=test_data, n_tokens=SUMM_INPUT_N_TOKENS)
     pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
-    result = inference_tokenizer.decode(pred[0])
-    result = re.sub("<.*?>", "", result).strip()
     return result
 ############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
 ############## ENTRY POINT START #######################
 def main():
     st.markdown('''<h3>News Summarizer and NER</h3>
-    <p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a></p>
     ''', unsafe_allow_html=True)
     input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
                       horizontal=True)

     tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
     return summ_tokenizer, tokenized_data
def clean_summary(summary: str) -> str:
    """Tidy a generated summary so it ends on a complete sentence.

    Steps, in order:
      1. Strip surrounding whitespace.
      2. If the text does not end with a period, drop everything after the
         last ". " separator (the trailing, presumably truncated, fragment)
         and terminate the remainder with a period.
      3. Remove a single leading "-" and strip whitespace again.
      4. Treat results of five characters or fewer as unusable.

    Args:
        summary: Raw decoded summary text.

    Returns:
        The cleaned summary, or "" when the input is empty/whitespace-only
        or the cleaned text is five characters or shorter.
    """
    summary = summary.strip()
    # Guard: indexing the last character of an empty string would raise
    # IndexError, so an empty summary is reported as "no summary".
    if not summary:
        return ""
    if not summary.endswith("."):
        # Keep only the complete sentences; the last piece has no closing
        # period. With no ". " separator at all this leaves just ".",
        # which the length check below turns into "".
        sentences = summary.split(". ")
        summary = ". ".join(sentences[:-1]) + "."
    # Drop one leading dash, then any whitespace that followed it.
    summary = re.sub(r'^-', "", summary).strip()
    if len(summary) <= 5:
        return ""
    return summary
def summ_inference(txts: list) -> list:
    """Summarize a batch of texts and return one cleaned summary per text.

    Each text is passed through ``summ_preprocess``, the batch is tokenized
    with ``summ_inference_tokenize``, ``summ_model.generate`` produces up to
    ``SUMM_TARGET_N_TOKENS`` new tokens per item, and every prediction is
    decoded (special tokens skipped) and post-processed by ``clean_summary``.

    Args:
        txts: Iterable of input texts (one string per document). A bare
            string is not a valid batch: it would be processed character
            by character.

    Returns:
        A list of summaries aligned with ``txts``. A text whose
        preprocessed form is the empty string gets "" without decoding its
        prediction. An empty batch yields an empty list.
    """
    txts = [summ_preprocess(txt) for txt in txts]
    # Nothing to summarize: skip the tokenizer and model entirely instead
    # of calling them with an empty batch.
    if not txts:
        return []
    inference_tokenizer, tokenized_data = summ_inference_tokenize(input_=txts, n_tokens=SUMM_INPUT_N_TOKENS)
    pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
    result = []
    for txt, token_ids in zip(txts, pred):
        if txt == "":
            # Empty input after preprocessing: keep the slot but report
            # no summary rather than decoding the model output for it.
            result.append("")
        else:
            decoded = inference_tokenizer.decode(token_ids, skip_special_tokens=True)
            result.append(clean_summary(decoded))
    return result
 ############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
 ############## ENTRY POINT START #######################
 def main():
     st.markdown('''<h3>News Summarizer and NER</h3>
+    <p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a>
+    <br>
+    The app works best in summarizing <a href="https://edition.cnn.com/">CNN</a> and <a href="https://www.dailymail.co.uk/home/index.html">Daily Mail</a> news articles,
+    as the BART model is fine-tuned on them.
+    </p>
     ''', unsafe_allow_html=True)
     input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
                       horizontal=True)