fix math term detection
Browse files- handler.py +18 -12
handler.py
CHANGED
@@ -265,23 +265,29 @@ def load_math_terms():
|
|
265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
267 |
else:
|
268 |
-
|
269 |
-
|
270 |
return math_terms, math_terms_dict
|
271 |
|
272 |
def run_math_density(transcript):
|
273 |
math_terms, math_terms_dict = load_math_terms()
|
274 |
-
|
275 |
-
|
276 |
text = utt.get_clean_text(remove_punct=False)
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
class EndpointHandler():
|
287 |
def __init__(self, path="."):
|
|
|
265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
267 |
else:
|
268 |
+
math_terms.append(term)
|
269 |
+
math_terms_dict[term] = term
|
270 |
return math_terms, math_terms_dict
|
271 |
|
272 |
def run_math_density(transcript):
    """Annotate each utterance of ``transcript`` with the math terms it contains.

    The regex patterns returned by ``load_math_terms()`` are tried against
    ``utt.get_clean_text(remove_punct=False)`` case-insensitively, longest
    pattern string first.  A match is discarded when its start offset lies
    inside a span already claimed by a previously processed pattern, so a
    term embedded in a longer term is not counted twice.

    Args:
        transcript: Object exposing an iterable ``utterances`` attribute.
            Each utterance must provide ``get_clean_text(remove_punct=...)``
            and accept attribute assignment.

    Returns:
        None.  Results are written onto every utterance:
        ``utt.num_math_terms`` is the number of accepted matches and
        ``utt.math_terms`` is the list of canonical terms (values of the
        dict returned by ``load_math_terms()``), one entry per pattern
        that produced at least one accepted match.
    """
    math_terms, math_terms_dict = load_math_terms()

    # Compile once, outside the utterance loop: the patterns do not depend
    # on the utterance.  Ordering is by length of the pattern string,
    # longest first, so longer terms claim their span before shorter ones.
    compiled_terms = [
        (pattern, re.compile(pattern, re.IGNORECASE))
        for pattern in sorted(math_terms, key=len, reverse=True)
    ]

    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        claimed_spans = set()   # (start, end) of every accepted match so far
        found_terms = []
        total_matches = 0

        for pattern, regex in compiled_terms:
            # Only the START offset of a candidate is tested against the
            # claimed spans.
            # NOTE(review): patterns built with boundary groups consume the
            # neighbouring character, so a term directly following a longer
            # match can start inside that match's span and be dropped --
            # confirm whether that is intended.
            accepted = [
                m for m in regex.finditer(text)
                if not any(lo <= m.start() < hi for lo, hi in claimed_spans)
            ]
            if not accepted:
                continue

            found_terms.append(math_terms_dict[pattern])
            claimed_spans.update(m.span() for m in accepted)
            total_matches += len(accepted)

        utt.num_math_terms = total_matches
        utt.math_terms = found_terms
|
291 |
|
292 |
class EndpointHandler():
|
293 |
def __init__(self, path="."):
|