ikarasz committed on
Commit
f0d3fb6
·
1 Parent(s): 0d0084a

omit math words matched from general words

Browse files
Files changed (1) hide show
  1. handler.py +31 -30
handler.py CHANGED
@@ -1,5 +1,6 @@
1
  from typing import Dict, List, Any
2
  from scipy.special import softmax
 
3
  import numpy as np
4
  import weakref
5
  import re
@@ -152,31 +153,35 @@ class Transcript:
152
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
153
 
154
  def get_word_clouds(self):
155
- teacher_dict = {}
156
- student_dict = {}
157
- uptake_teacher_dict = {}
 
158
  stop_words = stopwords.words('english')
159
- for utt in self.utterances:
160
- words = (utt.get_clean_text(remove_punct=True)).split(' ')
161
- for word in words:
162
- if word in stop_words or word in ['inaudible', 'crosstalk']: continue
163
- # handle uptake case
164
- if utt.role == 'teacher':
165
- if utt.uptake == 1:
166
- if word not in uptake_teacher_dict:
167
- uptake_teacher_dict[word] = 0
168
- uptake_teacher_dict[word] += 1
169
- # ignore math words so they don't get tagged as general
170
- if any(math_word in word for math_word in utt.math_terms): continue
171
- if utt.role == 'teacher':
172
- if word not in teacher_dict:
173
- teacher_dict[word] = 0
174
- teacher_dict[word] += 1
175
 
176
- else:
177
- if word not in student_dict:
178
- student_dict[word] = 0
179
- student_dict[word] += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  dict_list = []
181
  uptake_dict_list = []
182
  teacher_dict_list = []
@@ -395,12 +400,11 @@ def run_math_density(transcript):
395
  text = utt.get_clean_text(remove_punct=True)
396
  num_matches = 0
397
  matched_positions = set()
398
- match_list = []
399
  for regex in sorted_regexes:
400
  matches = list(re.finditer(regex, text, re.IGNORECASE))
401
  # Filter out matches that share positions with longer terms
402
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
403
- # matched_text = [match.group(0) for match in matches]
404
  if len(matches) > 0:
405
  if utt.role == "teacher":
406
  if math_terms_dict[regex] not in teacher_math_word_cloud:
@@ -411,14 +415,11 @@ def run_math_density(transcript):
411
  student_math_word_cloud[math_terms_dict[regex]] = 0
412
  student_math_word_cloud[math_terms_dict[regex]] += len(matches)
413
  for match in matches:
414
- match_list.append(match.group())
415
  matched_positions.add((match.start(), match.end()))
416
  num_matches += len(matches)
417
- # print("match group list: ", [match.group(0) for match in matches])
418
  utt.num_math_terms = num_matches
419
- utt.math_terms = match_list
420
- # utt.math_match_positions = list(matched_positions)
421
- # utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
422
  teacher_dict_list = []
423
  student_dict_list = []
424
  dict_list = []
 
1
  from typing import Dict, List, Any
2
  from scipy.special import softmax
3
+ from collections import Counter
4
  import numpy as np
5
  import weakref
6
  import re
 
153
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
154
 
155
  def get_word_clouds(self):
156
+ # Initialize dictionaries
157
+ teacher_dict = Counter()
158
+ student_dict = Counter()
159
+ uptake_teacher_dict = Counter()
160
  stop_words = stopwords.words('english')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
+ # Go through the utterances
163
+ for utt in self.utterances:
164
+ # Get clean text
165
+ clean_text = utt.get_clean_text(remove_punct=True)
166
+ words = clean_text.split()
167
+ words = [word for word in words if word not in stop_words and word not in ['inaudible', 'crosstalk']]
168
+
169
+ # Handle uptake case
170
+ if utt.role == 'teacher' and utt.uptake == 1:
171
+ uptake_teacher_dict.update(words)
172
+
173
+ general_text = ' '.join(words)
174
+ # Replace math terms with empty strings
175
+ for math_term in utt.math_terms:
176
+ general_text = general_text.replace(math_term, '')
177
+ general_text = general_text.replace(' ', ' ')
178
+
179
+ general_words = general_text.split()
180
+ # Update the appropriate dictionary
181
+ if utt.role == 'teacher':
182
+ teacher_dict.update(general_words)
183
+ else:
184
+ student_dict.update(general_words)
185
  dict_list = []
186
  uptake_dict_list = []
187
  teacher_dict_list = []
 
400
  text = utt.get_clean_text(remove_punct=True)
401
  num_matches = 0
402
  matched_positions = set()
403
+ match_list = set()
404
  for regex in sorted_regexes:
405
  matches = list(re.finditer(regex, text, re.IGNORECASE))
406
  # Filter out matches that share positions with longer terms
407
  matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
 
408
  if len(matches) > 0:
409
  if utt.role == "teacher":
410
  if math_terms_dict[regex] not in teacher_math_word_cloud:
 
415
  student_math_word_cloud[math_terms_dict[regex]] = 0
416
  student_math_word_cloud[math_terms_dict[regex]] += len(matches)
417
  for match in matches:
418
+ match_list.add(match.group())
419
  matched_positions.add((match.start(), match.end()))
420
  num_matches += len(matches)
 
421
  utt.num_math_terms = num_matches
422
+ utt.math_terms = list(match_list)
 
 
423
  teacher_dict_list = []
424
  student_dict_list = []
425
  dict_list = []