Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -220,20 +220,35 @@ def summ_inference_tokenize(input_: list, n_tokens: int):
|
|
220 |
tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
|
221 |
return summ_tokenizer, tokenized_data
|
222 |
|
223 |
-
def
|
224 |
-
|
225 |
-
|
226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
pred = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
|
228 |
-
result = inference_tokenizer.decode(pred
|
229 |
-
result = re.sub("<.*?>", "", result).strip()
|
230 |
return result
|
231 |
############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
|
232 |
|
233 |
############## ENTRY POINT START #######################
|
234 |
def main():
|
235 |
st.markdown('''<h3>News Summarizer and NER</h3>
|
236 |
-
<p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a
|
|
|
|
|
|
|
|
|
|
|
237 |
''', unsafe_allow_html=True)
|
238 |
input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
|
239 |
horizontal=True)
|
|
|
220 |
tokenized_data = summ_tokenizer(text=input_, max_length=SUMM_TARGET_N_TOKENS, truncation=True, padding="max_length", return_tensors="tf")
|
221 |
return summ_tokenizer, tokenized_data
|
222 |
|
223 |
+
def clean_summary(summary: str) -> str:
    """Post-process a model-generated summary into a clean, complete sentence block.

    Steps:
      1. Trim surrounding whitespace.
      2. If the text does not end with a period, drop the trailing
         (presumably truncated) sentence and re-terminate with ".".
      3. Strip a leading "-" artifact sometimes emitted by the model.
      4. Discard summaries of 5 characters or fewer as meaningless.

    Parameters
    ----------
    summary : str
        Raw decoded text from the summarization model.

    Returns
    -------
    str
        Cleaned summary, or "" when nothing usable remains.
    """
    summary = summary.strip()
    # Guard: an empty/whitespace-only input would make summary[-1] raise
    # IndexError below — return "" instead.
    if not summary:
        return ""
    if summary[-1] != '.':
        # Generation was cut off mid-sentence: keep only the complete
        # sentences and re-add the final period.
        sents = summary.split(". ")
        summary = ". ".join(sents[:-1])
        summary += "."
    # Remove a leading hyphen artifact, then re-trim.
    summary = re.sub(r'^-', "", summary)
    summary = summary.strip()
    # Too short to be a meaningful summary.
    if len(summary) <= 5:
        summary = ""
    return summary
|
234 |
+
|
235 |
+
def summ_inference(txts: str):
    """Run the summarization model over a batch of texts.

    NOTE(review): despite the ``str`` annotation, callers appear to pass an
    iterable of strings — confirm against call sites.

    Preprocesses each text, tokenizes the batch, generates predictions, and
    decodes each prediction into a cleaned summary. Inputs that preprocess
    to "" yield "" in the output.
    """
    cleaned = [summ_preprocess(t) for t in txts]
    inference_tokenizer, tokenized_data = summ_inference_tokenize(
        input_=cleaned, n_tokens=SUMM_INPUT_N_TOKENS
    )
    preds = summ_model.generate(**tokenized_data, max_new_tokens=SUMM_TARGET_N_TOKENS)
    results = []
    for text, prediction in zip(cleaned, preds):
        if text == "":
            # Nothing to summarize after preprocessing.
            results.append("")
        else:
            decoded = inference_tokenizer.decode(prediction, skip_special_tokens=True)
            results.append(clean_summary(decoded))
    return results
|
241 |
############ SUMMARIZATION MODEL & VARS INITIALIZATION END ####################
|
242 |
|
243 |
############## ENTRY POINT START #######################
|
244 |
def main():
|
245 |
st.markdown('''<h3>News Summarizer and NER</h3>
|
246 |
+
<p><a href="https://huggingface.co/spaces/ksvmuralidhar/news_summarizer_ner/blob/main/README.md#new-summarization-and-ner" target="_blank">README</a>
|
247 |
+
<br>
|
248 |
+
The app works best in summarizing <a href="https://edition.cnn.com/">CNN</a> and <a href="https://www.dailymail.co.uk/home/index.html">Daily Mail</a> news articles,
|
249 |
+
as the BART model is fine-tuned on them.
|
250 |
+
</p>
|
251 |
+
|
252 |
''', unsafe_allow_html=True)
|
253 |
input_type = st.radio('Select an option:', ['Paste news URL', 'Paste news text'],
|
254 |
horizontal=True)
|