⚡️ 🐛 fix issue of wrong input text, disambiguate vars
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
app.py
CHANGED
@@ -45,7 +45,9 @@ from aggregate import BatchAggregator
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
+    contraction_aware_tokenize,
     extract_batches,
+    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
     saves_summary,
@@ -241,10 +243,13 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
-    …
+    logging.info(
+        f"pre-truncation word count: {len(contraction_aware_tokenize(clean_text))}"
+    )
+    truncation_validated = truncate_word_count(clean_text, max_words=max_input_length)

-    if …
-        …
+    if truncation_validated["was_truncated"]:
+        model_input_text = truncation_validated["processed_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -256,7 +261,7 @@ def proc_submission(
         logging.warning(msg)
         history["WARNING"] = msg
     else:
-        …
+        model_input_text = truncation_validated["processed_text"]
         msg = None

     if len(input_text) < 50:
@@ -278,7 +283,7 @@ def proc_submission(
         return msg, "<strong>No summary generated.</strong>", "", []

     _summaries = predict(
-        input_text=…
+        input_text=model_input_text,
         model_name=model_name,
         token_batch_length=token_batch_length,
         **settings,
@@ -410,14 +415,14 @@ def parse_args():
         "--add_beam_option",
         type=int,
         default=None,
-        help=f"Add a beam search option to the …",
+        help=f"Add a beam search option to the demo UI options, default: {pp.pformat(BEAM_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-batch",
         "--token_batch_option",
         type=int,
         default=None,
-        help=f"Add a token batch …",
+        help=f"Add a token batch size to the demo UI options, default: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-level",
@@ -577,7 +582,7 @@ if __name__ == "__main__":
             value="<center><i>Aggregate summary will appear here!</i></center>",
         )
         gr.Markdown(
-            "\n\n_Aggregate summary also appended to the bottom of the `.txt` file._"
+            "\n\n_Aggregate summary is also appended to the bottom of the `.txt` file._"
         )

         gr.Markdown("---")
utils.py
CHANGED
@@ -27,8 +27,8 @@ STOPWORDS = set(
 )


-def custom_tokenize(text: str) -> List[str]:
-    """…"""
+def contraction_aware_tokenize(text: str) -> List[str]:
+    """contraction_aware_tokenize - merges words containing apostrophes as one token."""

     # Tokenize the text using the WhitespaceTokenizer
     tokenizer = WhitespaceTokenizer()
@@ -56,17 +56,21 @@ def custom_tokenize(text: str) -> List[str]:


 def remove_stopwords(
-    text: str, stopwords: List[str] = STOPWORDS, …
+    text: str, stopwords: List[str] = STOPWORDS, contraction_tokenize: bool = True
 ) -> str:
     """
     remove_stopwords - Remove stopwords from text.

     :param str text: input text
     :param List[str] stopwords: list of stopwords, defaults to STOPWORDS
-    :param bool …
+    :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
-    words = …
+    words = (
+        contraction_aware_tokenize(text)
+        if contraction_tokenize
+        else word_tokenize(text)
+    )

     filtered_words = []
     for word in words:
@@ -204,14 +208,14 @@ def truncate_word_count(text: str, max_words=1024) -> dict:
     :param int max_words: the maximum number of words to keep, defaults to 1024
     :return: dict, the processed text
     """
-    words = …
+    words = contraction_aware_tokenize(str(text))
     processed = {}
     if len(words) > max_words:
         processed["was_truncated"] = True
-        processed["…
+        processed["processed_text"] = " ".join(words[:max_words])
     else:
         processed["was_truncated"] = False
-        processed["…
+        processed["processed_text"] = text
     return processed
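contraction_aware_tokenize (renamed from custom_tokenize) matters because NLTK's default word_tokenize splits contractions and possessives into extra tokens, which inflates the word counts used for truncation. The diff shows only the signature and the WhitespaceTokenizer it builds on; a small illustration of the difference (word_tokenize needs the nltk punkt data):

from nltk.tokenize import WhitespaceTokenizer, word_tokenize

text = "Don't touch the user's input."
print(word_tokenize(text))
# ['Do', "n't", 'touch', 'the', 'user', "'s", 'input', '.']  -> 8 "words"
print(WhitespaceTokenizer().tokenize(text))
# ["Don't", 'touch', 'the', "user's", 'input.']  -> 5 words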
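remove_stopwords now takes a contraction_tokenize flag (default True) selecting between the two tokenizers. A usage sketch; the exact output depends on the STOPWORDS set, which the diff does not show:

from utils import remove_stopwords

text = "It's arguably the best of the available options"
merged = remove_stopwords(text)  # "It's" is checked as a single token
split = remove_stopwords(text, contraction_tokenize=False)  # falls back to nltk word_tokenize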
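truncate_word_count now writes the same processed_text key in both branches (the old per-branch keys were cut off by the extraction), so callers like proc_submission can read one key unconditionally. A quick check of that contract, assuming whitespace-separated plain words tokenize one-to-one:

from utils import truncate_word_count

result = truncate_word_count("one two three four five", max_words=3)
assert result["was_truncated"] is True
assert result["processed_text"] == "one two three"

result = truncate_word_count("short text", max_words=3)
assert result["was_truncated"] is False
assert result["processed_text"] == "short text"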