Omit matched math words from the general word clouds
Browse files- handler.py +31 -30
handler.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from typing import Dict, List, Any
|
2 |
from scipy.special import softmax
|
|
|
3 |
import numpy as np
|
4 |
import weakref
|
5 |
import re
|
@@ -152,31 +153,35 @@ class Transcript:
|
|
152 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
153 |
|
154 |
def get_word_clouds(self):
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
158 |
stop_words = stopwords.words('english')
|
159 |
-
for utt in self.utterances:
|
160 |
-
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
161 |
-
for word in words:
|
162 |
-
if word in stop_words or word in ['inaudible', 'crosstalk']: continue
|
163 |
-
# handle uptake case
|
164 |
-
if utt.role == 'teacher':
|
165 |
-
if utt.uptake == 1:
|
166 |
-
if word not in uptake_teacher_dict:
|
167 |
-
uptake_teacher_dict[word] = 0
|
168 |
-
uptake_teacher_dict[word] += 1
|
169 |
-
# ignore math words so they don't get tagged as general
|
170 |
-
if any(math_word in word for math_word in utt.math_terms): continue
|
171 |
-
if utt.role == 'teacher':
|
172 |
-
if word not in teacher_dict:
|
173 |
-
teacher_dict[word] = 0
|
174 |
-
teacher_dict[word] += 1
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
dict_list = []
|
181 |
uptake_dict_list = []
|
182 |
teacher_dict_list = []
|
@@ -395,12 +400,11 @@ def run_math_density(transcript):
|
|
395 |
text = utt.get_clean_text(remove_punct=True)
|
396 |
num_matches = 0
|
397 |
matched_positions = set()
|
398 |
-
match_list =
|
399 |
for regex in sorted_regexes:
|
400 |
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
401 |
# Filter out matches that share positions with longer terms
|
402 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
403 |
-
# matched_text = [match.group(0) for match in matches]
|
404 |
if len(matches) > 0:
|
405 |
if utt.role == "teacher":
|
406 |
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
@@ -411,14 +415,11 @@ def run_math_density(transcript):
|
|
411 |
student_math_word_cloud[math_terms_dict[regex]] = 0
|
412 |
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
413 |
for match in matches:
|
414 |
-
match_list.
|
415 |
matched_positions.add((match.start(), match.end()))
|
416 |
num_matches += len(matches)
|
417 |
-
# print("match group list: ", [match.group(0) for match in matches])
|
418 |
utt.num_math_terms = num_matches
|
419 |
-
utt.math_terms = match_list
|
420 |
-
# utt.math_match_positions = list(matched_positions)
|
421 |
-
# utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
|
422 |
teacher_dict_list = []
|
423 |
student_dict_list = []
|
424 |
dict_list = []
|
|
|
1 |
from typing import Dict, List, Any
|
2 |
from scipy.special import softmax
|
3 |
+
from collections import Counter
|
4 |
import numpy as np
|
5 |
import weakref
|
6 |
import re
|
|
|
153 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
154 |
|
155 |
def get_word_clouds(self):
|
156 |
+
# Initialize dictionaries
|
157 |
+
teacher_dict = Counter()
|
158 |
+
student_dict = Counter()
|
159 |
+
uptake_teacher_dict = Counter()
|
160 |
stop_words = stopwords.words('english')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
|
162 |
+
# Go through the utterances
|
163 |
+
for utt in self.utterances:
|
164 |
+
# Get clean text
|
165 |
+
clean_text = utt.get_clean_text(remove_punct=True)
|
166 |
+
words = clean_text.split()
|
167 |
+
words = [word for word in words if word not in stop_words and word not in ['inaudible', 'crosstalk']]
|
168 |
+
|
169 |
+
# Handle uptake case
|
170 |
+
if utt.role == 'teacher' and utt.uptake == 1:
|
171 |
+
uptake_teacher_dict.update(words)
|
172 |
+
|
173 |
+
general_text = ' '.join(words)
|
174 |
+
# Replace math terms with empty strings
|
175 |
+
for math_term in utt.math_terms:
|
176 |
+
general_text = general_text.replace(math_term, '')
|
177 |
+
general_text = general_text.replace(' ', ' ')
|
178 |
+
|
179 |
+
general_words = general_text.split()
|
180 |
+
# Update the appropriate dictionary
|
181 |
+
if utt.role == 'teacher':
|
182 |
+
teacher_dict.update(general_words)
|
183 |
+
else:
|
184 |
+
student_dict.update(general_words)
|
185 |
dict_list = []
|
186 |
uptake_dict_list = []
|
187 |
teacher_dict_list = []
|
|
|
400 |
text = utt.get_clean_text(remove_punct=True)
|
401 |
num_matches = 0
|
402 |
matched_positions = set()
|
403 |
+
match_list = set()
|
404 |
for regex in sorted_regexes:
|
405 |
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
406 |
# Filter out matches that share positions with longer terms
|
407 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
|
|
408 |
if len(matches) > 0:
|
409 |
if utt.role == "teacher":
|
410 |
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
|
|
415 |
student_math_word_cloud[math_terms_dict[regex]] = 0
|
416 |
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
417 |
for match in matches:
|
418 |
+
match_list.add(match.group())
|
419 |
matched_positions.add((match.start(), match.end()))
|
420 |
num_matches += len(matches)
|
|
|
421 |
utt.num_math_terms = num_matches
|
422 |
+
utt.math_terms = list(match_list)
|
|
|
|
|
423 |
teacher_dict_list = []
|
424 |
student_dict_list = []
|
425 |
dict_list = []
|