Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- summarize.py +12 -6
- utils.py +24 -21
summarize.py
CHANGED
@@ -1,4 +1,8 @@
+"""
+summarize - a module for summarizing text using a model from the Hugging Face model hub
+"""
 import logging
+import pprint as pp
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
 
@@ -98,6 +102,7 @@ def summarize_via_tokenbatches(
     tokenizer,
     batch_length=2048,
     batch_stride=16,
+    min_batch_length=512,
     **kwargs,
 ) -> list:
     """
@@ -116,14 +121,15 @@ def summarize_via_tokenbatches(
 
     logger = logging.getLogger(__name__)
     # log all input parameters
-    if batch_length < 512:
-        batch_length = 512
+    if batch_length < min_batch_length:
         logger.warning(
-            f"batch_length must be at least 512. Setting batch_length to 512"
+            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
         )
-
-
-    )
+        batch_length = min_batch_length
+
+    logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
+    logger.info(f"batch_length: {batch_length}, batch_stride: {batch_stride}")
+
     encoded_input = tokenizer(
         input_text,
         padding="max_length",
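In short, summarize.py gains a module docstring, a pprint import, and a min_batch_length parameter, so the minimum batch size is configurable instead of hard-coded at 512; values below the floor are clamped with a warning, and the effective parameters are logged. A minimal sketch of the new guard in isolation (the standalone helper clamp_batch_length is illustrative, not part of the diff):

import logging
import pprint as pp

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
logger = logging.getLogger(__name__)


def clamp_batch_length(batch_length: int, min_batch_length: int = 512, **kwargs) -> int:
    # Mirrors the guard added to summarize_via_tokenbatches: raise values
    # below the floor, warn, then log the remaining keyword arguments.
    if batch_length < min_batch_length:
        logger.warning(
            f"batch_length must be at least {min_batch_length}. Setting batch_length to {min_batch_length}"
        )
        batch_length = min_batch_length
    logger.info(f"input parameters:\n{pp.pformat(kwargs)}")
    return batch_length


clamp_batch_length(128)   # warns and returns 512
clamp_batch_length(4096)  # passes through unchanged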
utils.py
CHANGED
@@ -2,26 +2,27 @@
 utils.py - Utility functions for the project.
 """
 
+import logging
 import re
 import subprocess
-from collections import defaultdict
+from collections import defaultdict, deque
 from datetime import datetime
-from itertools import combinations
+from itertools import combinations, islice
 from pathlib import Path
 from typing import List
 
-
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    level=logging.INFO,
+)
 import torch
 from natsort import natsorted
-from nltk.corpus import stopwords
-from nltk.tokenize import sent_tokenize, word_tokenize
 from rapidfuzz import fuzz
 
-
-
-from
-
-from rapidfuzz import fuzz
+# Define stopwords
+STOPWORDS = set(
+    "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
+)
 
 
 def validate_pytorch2(torch_version: str = None):
@@ -32,13 +33,17 @@ def validate_pytorch2(torch_version: str = None):
     return True if re.match(pattern, torch_version) else False
 
 
-def get_timestamp() -> str:
+def get_timestamp(detailed=False) -> str:
     """
     get_timestamp - get a timestamp for the current time
     Returns:
        str, the timestamp
     """
-    return
+    return (
+        datetime.now().strftime("%b%d%Y_%H%M%S%f")
+        if detailed
+        else datetime.now().strftime("%b%d%Y_%H")
+    )
 
 
 def truncate_word_count(text, max_words=512):
@@ -115,16 +120,12 @@ def extract_keywords(
     Returns:
         A list of strings, where each string is a keyword extracted from the input text.
     """
-
-    stop_words = set(
-        "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
-    )
-
+    logger = logging.getLogger(__name__)
     # Remove stopwords and tokenize the text into words
     words = [
         word
         for word in re.findall(r"\b\w{3,}\b", text.lower())
-        if word not in stop_words
+        if word not in STOPWORDS
     ]
 
     # Create a graph of word co-occurrences within a moving window of words
@@ -149,13 +150,13 @@ def extract_keywords(
 
     # Sort the words by score and return the top num_keywords keywords
     keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords]
-
+    logger.debug(f"All keywords: {keywords}")
     # Use fuzzy matching to remove similar keywords
     final_keywords = []
     for keyword in keywords:
         if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords):
             final_keywords.append(keyword)
-
+    logger.info(f"Keywords (final):\t{final_keywords}")
     return final_keywords
 
 
@@ -170,18 +171,20 @@ def saves_summary(
         add_signature: whether to add a signature to the output file
         kwargs: additional keyword arguments to include in the output file
     """
+    logger = logging.getLogger(__name__)
     sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output]
     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
     scores_text = "\n".join(sum_scores)
     full_summary = "\n".join(sum_text)
 
     keywords = "_".join(extract_keywords(full_summary))
+    logger.info(f"kw:\t{keywords}")
     outpath = (
         Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt"
         if outpath is None
         else Path(outpath)
     )
+    logger.info(f"Saving summary to:\t{outpath.name}")
     with open(
         outpath,
         "w",