Files changed (3)
  1. handler.py +68 -64
  2. requirements.txt +1 -0
  3. utils.py +513 -410
handler.py CHANGED
@@ -1,5 +1,6 @@
 from typing import Dict, List, Any
 from scipy.special import softmax
+from collections import Counter
 import numpy as np
 import weakref
 import re
@@ -9,7 +10,7 @@ nltk.download('stopwords')

 from utils import clean_str, clean_str_nopunct
 import torch
-from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES, MATH_WORDS
+from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES, MATH_WORDS, plural_to_singular

 import transformers
 from transformers import BertTokenizer, BertForSequenceClassification
@@ -94,7 +95,6 @@ class Utterance:
                f"text='{self.text}', uid={self.uid}," \
                f"starttime={self.starttime}, endtime={self.endtime}, props={self.props})"

-
 class Transcript:
     def __init__(self, **kwargs):
         self.utterances = []
@@ -152,45 +152,42 @@ class Transcript:
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}

     def get_word_clouds(self):
-        teacher_dict = {}
-        student_dict = {}
-        uptake_teacher_dict = {}
+        # Initialize dictionaries
+        teacher_dict = Counter()
+        student_dict = Counter()
+        uptake_teacher_dict = Counter()
         stop_words = stopwords.words('english')
+
+        # Go through the utterances
         for utt in self.utterances:
-            words = (utt.get_clean_text(remove_punct=True)).split(' ')
-            for word in words:
-                if word in stop_words or word in ['inaudible', 'crosstalk']: continue
-                # handle uptake case
-                if utt.role == 'teacher':
-                    if utt.uptake == 1:
-                        if word not in uptake_teacher_dict:
-                            uptake_teacher_dict[word] = 0
-                        uptake_teacher_dict[word] += 1
-                # ignore math words so they don't get tagged as general
-                if any(math_word in word for math_word in utt.math_terms): continue
-                if utt.role == 'teacher':
-                    if word not in teacher_dict:
-                        teacher_dict[word] = 0
-                    teacher_dict[word] += 1
-                else:
-                    if word not in student_dict:
-                        student_dict[word] = 0
-                    student_dict[word] += 1
-        dict_list = []
-        uptake_dict_list = []
-        teacher_dict_list = []
-        student_dict_list = []
-        for word in uptake_teacher_dict.keys():
-            uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
-        for word in teacher_dict.keys():
-            teacher_dict_list.append(
-                {'text': word, 'value': teacher_dict[word], 'category': 'general'})
-            dict_list.append({'text': word, 'value': teacher_dict[word], 'category': 'general'})
-        for word in student_dict.keys():
-            student_dict_list.append(
-                {'text': word, 'value': student_dict[word], 'category': 'general'})
-            dict_list.append({'text': word, 'value': student_dict[word], 'category': 'general'})
+            # Get clean text
+            clean_text = utt.get_clean_text(remove_punct=True)
+            words = clean_text.split()
+            words = [word for word in words if word not in stop_words and word not in ['inaudible', 'crosstalk']]
+
+            # Handle uptake case
+            if utt.role == 'teacher' and utt.uptake == 1:
+                uptake_teacher_dict.update(words)
+
+            general_text = ' '.join(words)
+            # Replace math terms with empty strings
+            for math_term in utt.math_terms:
+                general_text = general_text.replace(math_term, '')
+            general_text = general_text.replace('  ', ' ')
+
+            general_words = general_text.split()
+            # Update the appropriate dictionary
+            if utt.role == 'teacher':
+                teacher_dict.update(general_words)
+            else:
+                student_dict.update(general_words)
+
+        # Sorting and trimming dictionaries
+        dict_list = dict_to_list(teacher_dict, 'general') + dict_to_list(student_dict, 'general')
+        uptake_dict_list = dict_to_list(uptake_teacher_dict, 'teacher')
+        teacher_dict_list = dict_to_list(teacher_dict, 'general')
+        student_dict_list = dict_to_list(student_dict, 'general')

         sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
         sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
         sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
@@ -219,7 +216,6 @@ class Transcript:
     def __repr__(self):
         return f"Transcript(utterances={self.utterances}, custom_params={self.params})"

-
 class QuestionModel:
     def __init__(self, device, tokenizer, input_builder, max_length=300, path=QUESTION_MODEL):
         print("Loading models...")
@@ -260,7 +256,6 @@ class QuestionModel:
                             return_pooler_output=False)
         return output

-
 class ReasoningModel:
     def __init__(self, device, tokenizer, input_builder, max_length=128, path=REASONING_MODEL):
         print("Loading models...")
@@ -294,7 +289,6 @@ class ReasoningModel:
                             token_type_ids=instance["token_type_ids"])
         return output

-
 class UptakeModel:
     def __init__(self, device, tokenizer, input_builder, max_length=120, path=UPTAKE_MODEL):
         print("Loading models...")
@@ -373,16 +367,24 @@ class FocusingQuestionModel:
                             token_type_ids=instance["token_type_ids"])
         return output

+def dict_to_list(d, category):
+    combined_dict = Counter()
+    for word, count in d.items():
+        singular_word = plural_to_singular(word)
+        combined_dict[singular_word] += count
+    return [{'text': word, 'value': count, 'category': category} for word, count in combined_dict.items()]
+
 def load_math_terms():
     math_regexes = []
     math_terms_dict = {}
+    for term in MATH_PREFIXES:
+        math_terms_dict[rf"\b{term}(s|es|d|ed)?\b"] = term
+        math_regexes.append(rf"\b{term}(s|es|d|ed)?\b")
+
     for term in MATH_WORDS:
-        if term in MATH_PREFIXES:
-            math_terms_dict[rf"\b{term}(s|es|d|ed)?\b"] = term
-            math_regexes.append(rf"\b{term}(s|es|d|ed)?\b")
-        else:
-            math_regexes.append(rf"\b{term}\b")
+        if not term in MATH_PREFIXES:
             math_terms_dict[rf"\b{term}\b"] = term
+            math_regexes.append(rf"\b{term}\b")
     return math_regexes, math_terms_dict

 def run_math_density(transcript):
@@ -390,16 +392,16 @@ def run_math_density(transcript):
     sorted_regexes = sorted(math_regexes, key=len, reverse=True)
     teacher_math_word_cloud = {}
     student_math_word_cloud = {}
+
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=True)
         num_matches = 0
         matched_positions = set()
-        match_list = []
+        match_list = set()
         for regex in sorted_regexes:
             matches = list(re.finditer(regex, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
-            # matched_text = [match.group(0) for match in matches]
             if len(matches) > 0:
                 if utt.role == "teacher":
                     if math_terms_dict[regex] not in teacher_math_word_cloud:
@@ -409,30 +411,32 @@ def run_math_density(transcript):
                     if math_terms_dict[regex] not in student_math_word_cloud:
                         student_math_word_cloud[math_terms_dict[regex]] = 0
                     student_math_word_cloud[math_terms_dict[regex]] += len(matches)
-                match_list.append(math_terms_dict[regex])
-            # Update matched positions
-            matched_positions.update((match.start(), match.end()) for match in matches)
+                for match in matches:
+                    match_list.add(match.group())
+                    matched_positions.add((match.start(), match.end()))
             num_matches += len(matches)
-            # print("match group list: ", [match.group(0) for match in matches])
         utt.num_math_terms = num_matches
-        utt.math_terms = match_list
-        # utt.math_match_positions = list(matched_positions)
-        # utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
+        utt.math_terms = list(match_list)
+
+    # Initialize lists
     teacher_dict_list = []
     student_dict_list = []
     dict_list = []
-    for word in teacher_math_word_cloud.keys():
-        teacher_dict_list.append(
-            {'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
-        dict_list.append({'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
-    for word in student_math_word_cloud.keys():
-        student_dict_list.append(
-            {'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
-        dict_list.append({'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
+
+    # Process teacher_math_word_cloud
+    teacher_dict_list = dict_to_list(teacher_math_word_cloud, 'math')
+    dict_list.extend(teacher_dict_list)
+
+    # Process student_math_word_cloud
+    student_dict_list = dict_to_list(student_math_word_cloud, 'math')
+    dict_list.extend(student_dict_list)
+
+    # Sort the lists
     sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
     sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
     sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
-    # return sorted_dict_list[:50]
+
+    # Return the sorted lists
     return sorted_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]

 class EndpointHandler():
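Reviewer note: the snippet below is a minimal, self-contained sketch of the new `get_word_clouds` / `dict_to_list` counting flow, using plain dicts as stand-in utterances and a local `plural_to_singular` fallback; the real helpers live in utils.py, and the handler strips math terms by substring replacement rather than the whole-word filter used here.

```python
# Sketch only: approximates the Counter-based word cloud counting with stand-in data.
from collections import Counter

try:
    import inflect
    _p = inflect.engine()

    def plural_to_singular(word):
        # singular_noun() returns False when the word is already singular.
        return _p.singular_noun(word) or word
except ImportError:
    def plural_to_singular(word):
        return word  # fallback: leave the word unchanged

def dict_to_list(d, category):
    # Fold plural and singular forms of the same word into a single entry.
    combined = Counter()
    for word, count in d.items():
        combined[plural_to_singular(word)] += count
    return [{'text': w, 'value': c, 'category': category} for w, c in combined.items()]

utterances = [
    {'role': 'teacher', 'text': 'triangles have three angles', 'math_terms': ['triangles', 'angles']},
    {'role': 'student', 'text': 'my triangle looks like a kite', 'math_terms': ['triangle', 'kite']},
]

teacher_dict, student_dict = Counter(), Counter()
for utt in utterances:
    words = utt['text'].split()
    # Drop math terms so they end up in the math word cloud instead of the general one.
    general_words = [w for w in words if w not in utt['math_terms']]
    (teacher_dict if utt['role'] == 'teacher' else student_dict).update(general_words)

print(dict_to_list(teacher_dict, 'general'))
print(dict_to_list(student_dict, 'general'))
```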
requirements.txt CHANGED
@@ -5,3 +5,4 @@ scipy==1.9.2
 torch==2.3.1
 transformers==4.46.1
 nltk==3.9.1
+inflect==7.5.0
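The new inflect pin backs the `singular_to_plural` / `plural_to_singular` helpers added in utils.py. A quick sanity check of the two calls they rely on (assuming the pinned package is installed):

```python
import inflect

p = inflect.engine()
print(p.plural("angle"))           # angles
# singular_noun() returns False for words it already considers singular,
# hence the `or word` fallback used in utils.py.
print(p.singular_noun("angles"))   # angle
print(p.singular_noun("angle"))    # False
```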
utils.py CHANGED
@@ -7,6 +7,7 @@ from cleantext import clean
7
  from num2words import num2words
8
  import re
9
  import string
 
10
 
11
  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
12
  punct_chars.sort()
@@ -34,530 +35,708 @@ MATH_PREFIXES = [
34
  "median",
35
  "ratio",
36
  "area",
37
- ]
38
 
39
- MATH_WORDS = [
40
- "absolute value",
41
- "algebra",
42
- "area",
43
- "average",
44
- "base of",
45
- "box plot",
46
- "categorical",
47
- "coefficient",
48
- "common factor",
49
- "common multiple",
50
- "compose",
51
- "coordinate",
52
- "cubed",
53
- "decompose",
54
- "dependent variable",
55
- "distribution",
56
- "dot plot",
57
- "double number line diagram",
58
- "equivalent",
59
- "equivalent expression",
60
- "ratio",
61
- "exponent",
62
- "frequency",
63
- "greatest common factor",
64
- "gcd",
65
- "height of",
66
- "histogram",
67
- "independent variable",
68
- "integer",
69
- "interquartile range",
70
- "iqr",
71
- "least common multiple",
72
- "long division",
73
- "mean absolute deviation",
74
- "median",
75
- "negative number",
76
- "opposite vertex",
77
- "parallelogram",
78
- "percent",
79
- "polygon",
80
- "polyhedron",
81
- "positive number",
82
- "prism",
83
- "pyramid",
84
- "quadrant",
85
- "quadrilateral",
86
- "quartile",
87
- "rational number",
88
- "reciprocal",
89
- "equality",
90
- "inequality",
91
- "squared",
92
- "statistic",
93
- "surface area",
94
- "identity property",
95
- "addend",
96
- "unit",
97
- "number sentence",
98
- "make ten",
99
- "take from ten",
100
- "number bond",
101
- "total",
102
- "estimate",
103
- "hashmark",
104
- "meter",
105
- "number line",
106
- "ruler",
107
- "centimeter",
108
- "base ten",
109
- "expanded form",
110
- "hundred",
111
- "thousand",
112
- "place value",
113
- "number disk",
114
- "standard form",
115
- "unit form",
116
- "word form",
117
- "tens place",
118
- "algorithm",
119
- "equation",
120
- "simplif",
121
- "addition",
122
- "subtract",
123
- "array",
124
- "even number",
125
- "odd number",
126
- "repeated addition",
127
- "tessellat",
128
- "whole number",
129
- "number path",
130
- "rectangle",
131
- "square",
132
- "bar graph",
133
- "data",
134
- "degree",
135
- "line plot",
136
- "picture graph",
137
- "scale",
138
- "survey",
139
- "thermometer",
140
- "estimat",
141
- "tape diagram",
142
- "value",
143
- "analog",
144
- "angle",
145
- "parallel",
146
- "partition",
147
- "pentagon",
148
- "right angle",
149
- "cube",
150
- "digital",
151
- "quarter of",
152
- "tangram",
153
- "circle",
154
- "hexagon",
155
- "half circle",
156
- "half-circle",
157
- "quarter circle",
158
- "quarter-circle",
159
- "semicircle",
160
- "semi-circle",
161
- "rectang",
162
- "rhombus",
163
- "trapezoid",
164
- "triangle",
165
- "commutative",
166
- "equal group",
167
- "distributive",
168
- "divide",
169
- "division",
170
  "multipl",
171
- "parentheses",
172
- "quotient",
173
- "rotate",
174
- "unknown",
175
- "add",
176
- "capacity",
177
- "continuous",
178
- "endpoint",
179
- "gram",
180
- "interval",
181
- "kilogram",
182
- "volume",
183
- "liter",
184
- "milliliter",
185
- "approximate",
186
- "area model",
187
- "square unit",
188
- "unit square",
189
  "geometr",
190
- "equivalent fraction",
191
- "fraction form",
192
- "fractional unit",
193
- "unit fraction",
194
- "unit interval",
195
- "measur",
196
- "graph",
197
- "scaled graph",
198
- "diagonal",
199
- "perimeter",
200
- "regular polygon",
201
- "tessellate",
202
- "tetromino",
203
- "heptagon",
204
- "octagon",
205
- "digit",
206
- "expression",
207
- "sum",
208
- "kilometer",
209
- "mass",
210
- "mixed unit",
211
- "length",
212
  "measure",
213
- "simplify",
214
- "associative",
215
- "composite",
216
- "divisible",
217
- "divisor",
218
- "partial product",
219
- "prime number",
220
- "remainder",
221
- "acute",
222
- "arc",
223
- "collinear",
224
- "equilateral",
225
- "intersect",
226
- "isosceles",
227
- "symmetry",
228
- "line segment",
229
- "line",
230
- "obtuse",
231
- "perpendicular",
232
- "protractor",
233
- "scalene",
234
- "straight angle",
235
- "supplementary angle",
236
- "vertex",
237
- "common denominator",
238
- "denominator",
239
- "fraction",
240
- "mixed number",
241
- "numerator",
242
- "whole",
243
- "decimal expanded form",
244
- "decimal",
245
- "hundredth",
246
- "tenth",
247
- "customary system of measurement",
248
- "customary unit",
249
- "gallon",
250
- "metric",
251
- "metric unit",
252
- "ounce",
253
- "pint",
254
- "quart",
255
- "convert",
256
- "distance",
257
- "millimeter",
258
- "thousandth",
259
- "hundredths",
260
- "conversion factor",
261
- "decimal fraction",
262
- "multiplier",
263
- "equivalence",
264
- "multiple",
265
- "product",
266
- "benchmark fraction",
267
- "cup",
268
- "pound",
269
- "yard",
270
- "whole unit",
271
- "decimal divisor",
272
- "factors",
273
- "bisect",
274
- "cubic units",
275
- "hierarchy",
276
- "unit cube",
277
- "attribute",
278
- "kite",
279
- "bisector",
280
- "solid figure",
281
- "square units",
282
- "dimension",
283
- "axis",
284
- "ordered pair",
285
- "angle measure",
286
- "horizontal",
287
- "vertical",
288
- "categorical data",
289
- "lcm",
290
- "measure of center",
291
- "meters per second",
292
- "numerical",
293
- "solution",
294
- "unit price",
295
- "unit rate",
296
- "variability",
297
- "variable",
298
  "abundant number",
299
  "accurate",
300
  "acre",
 
 
 
301
  "addition fact",
 
 
 
 
 
302
  "algebraic",
 
 
303
  "altitude",
 
 
 
 
304
  "apex",
305
- "arithmetic facts",
 
 
 
 
 
 
306
  "associative property",
 
307
  "astronomical unit",
 
 
 
 
 
 
 
 
 
 
 
308
  "base",
309
  "baseline",
 
310
  "billion",
 
 
 
 
 
 
 
 
 
311
  "celsius",
312
  "census",
313
  "cent",
314
  "center of a circle",
 
315
  "center of a sphere",
 
 
 
 
 
316
  "chance",
 
317
  "circle graph",
 
 
 
 
 
 
 
318
  "column",
 
319
  "combine",
 
 
320
  "common fraction",
 
 
 
321
  "comparison diagram",
322
  "comparison story",
323
  "compass",
324
  "complement",
 
 
 
325
  "concave polygon",
326
  "concentric circles",
 
 
 
327
  "consecutive",
 
328
  "constant",
329
  "continuous model of area",
330
  "continuous model of volume",
 
331
  "contour",
332
  "conversion fact",
 
 
 
333
  "convex polygon",
 
 
 
 
334
  "counting numbers",
335
  "counting up subtraction",
 
 
336
  "cover-up method",
337
  "cross multiplication",
 
 
 
 
 
 
 
338
  "cubic",
339
  "cubit",
 
340
  "curved surface",
 
 
341
  "cylinder",
 
 
342
  "decagon",
 
 
 
 
 
343
  "decimeter",
 
344
  "deficient number",
 
 
 
345
  "density",
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  "discrete model",
347
  "displacement method",
 
 
 
 
 
348
  "divisibility test",
349
  "divisible by",
 
 
 
350
  "dodecahedron",
 
 
351
  "double stem plot",
352
  "doubles fact",
 
353
  "egyptian multiplication",
354
  "elevation",
355
  "embed figure",
356
  "end point",
 
357
  "enlarge",
 
 
358
  "equal",
359
- "equal groups",
360
- "equal parts",
361
- "equidistant marks",
362
  "equilateral polygon",
363
- "equivalent fractions",
 
 
 
 
 
 
 
 
 
364
  "european subtraction",
 
 
 
 
365
  "expanded notation",
366
  "expected outcome",
367
- "exponential",
368
- "extended facts",
 
 
 
 
 
369
  "fact power",
370
  "fact triangle",
371
  "factor",
372
- "factors of numbers",
 
373
  "fahrenheit",
374
  "false number sentence",
375
- "figurate numbers",
376
  "flowchart",
377
  "fluid ounce",
 
 
 
378
  "fractional part",
 
 
379
  "fulcrum",
380
  "function machine",
 
381
  "furlong",
 
 
382
  "genus",
383
  "geoboard",
 
384
  "geometric solid",
385
  "geometry template",
386
  "girth",
387
  "golden ratio",
388
  "golden rectangle",
 
389
  "graph key",
 
 
 
390
  "grouping symbol",
 
 
 
 
 
 
391
  "hemisphere",
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  "icosahedron",
 
 
 
 
393
  "improper fraction",
394
  "inch",
395
- "index of locations",
 
 
 
 
396
  "indirect measurement",
 
 
397
  "input",
 
398
  "inscribed polygon",
399
  "instance of a pattern",
 
 
 
 
400
  "interior of a figure",
401
  "interpolate",
 
 
 
 
 
 
 
 
402
  "irrational",
403
  "isometry transformation",
404
  "isosceles trapezoid",
 
 
 
 
405
  "juxtapose",
406
  "key sequence",
 
 
 
407
  "label",
408
  "landmark",
409
  "latitude",
410
  "lattice multiplication",
 
 
 
411
  "left to right subtraction",
412
  "leg of a right triangle",
 
 
 
413
  "like terms",
414
  "line graph",
415
  "line of reflection",
416
  "line of symmetry",
 
 
417
  "line symmetry",
 
 
418
  "lines of latitude",
419
  "lines of longitude",
 
 
 
 
 
 
 
 
 
420
  "longitude",
 
421
  "magnitude estimate",
 
422
  "map legend",
423
  "map scale",
 
424
  "maximum",
 
 
 
 
 
425
  "measurement division",
 
426
  "measurement unit",
 
427
  "meridian bar",
 
 
428
  "metric system",
 
 
429
  "midpoint",
430
  "mile",
 
 
431
  "millisecond",
432
  "minimum",
433
  "minuend",
434
  "mirror image",
 
 
435
  "mobius",
436
  "modal",
 
 
 
437
  "multiplication counting principle",
438
  "multiplication diagram",
439
  "multiplication fact",
440
- "multiplication symbols",
441
  "multiplication use class",
442
- "negative rational numbers",
 
 
 
 
 
 
 
 
443
  "nested parentheses",
444
  "net score",
445
  "net weight",
 
446
  "nonagon",
447
  "nonconvex polygon",
 
 
448
  "normal span",
 
 
 
449
  "number grid",
 
 
 
450
  "number sequence",
451
  "numeral",
452
  "numeration",
 
 
 
 
 
 
453
  "octahedron",
 
454
  "open proportion",
455
- "operation",
456
  "operation symbol",
 
457
  "opposite angle",
458
  "opposite change rule",
459
  "opposite of a number",
460
  "opposite side",
 
 
461
  "order of magnitude",
462
  "order of operations",
463
  "order of rotation symmetry",
 
 
 
464
  "ordinal number",
 
 
 
 
465
  "pan balance",
466
  "parabola",
467
  "parallel lines",
468
- "parallel planes",
 
 
 
469
  "part to part ratio",
470
  "part to whole ratio",
471
  "part whole fraction",
472
  "partial differences subtraction",
 
473
  "partial products multiplication",
474
  "partial quotients division",
475
  "partial sums addition",
 
476
  "partitive division",
477
  "parts and total diagram",
 
 
478
  "per capita",
479
  "per unit rate",
 
480
  "percent circle",
 
 
481
  "perfect number",
 
 
 
 
 
482
  "perpetual calendar",
 
 
483
  "pie graph",
484
- "plane",
 
 
485
  "plane figure",
 
486
  "point symmetry",
 
 
 
 
 
487
  "population density",
 
 
 
 
 
488
  "precise",
489
  "predict",
490
  "prediction line",
491
  "preimage",
 
492
  "prime factorization",
493
  "prime meridian",
494
- "probability",
 
495
  "probability meter",
496
  "probability tree diagram",
 
 
497
  "proper factor",
498
  "proper fraction",
499
  "property",
 
 
 
 
 
 
500
  "quadrangle",
 
 
 
 
 
 
 
 
501
  "quick common denominator",
 
502
  "quotitive division",
 
 
 
503
  "random draw",
504
  "random experiment",
505
  "random number",
506
  "random sample",
 
 
507
  "rank",
508
  "rate diagram",
509
  "rate multiplication ",
 
510
  "rate unit",
 
 
 
 
 
 
 
511
  "recall survey",
 
 
 
512
  "rectangular array",
513
  "rectangular coordinate grid",
514
  "rectangular prism",
515
  "rectangular pyramid",
 
516
  "rectilinear figure",
517
  "reflection",
518
  "reflex angle",
 
 
519
  "regular polyhedron",
520
  "regular tessellation",
521
  "relation symbol",
 
 
 
 
 
522
  "revolution",
 
 
523
  "right cone",
524
  "right cylinder",
525
  "right prism",
526
  "right pyramid",
527
  "right triangle",
 
528
  "roman numerals",
 
 
529
  "rotation symmetry",
 
 
 
 
530
  "same change rule for subtraction",
 
 
 
531
  "scale model",
532
  "scale of a map",
533
  "scale of a number line",
 
 
 
 
 
 
 
534
  "sector",
535
  "segment",
 
 
536
  "sequence",
537
- "significant digits",
 
 
 
538
  "similar figures",
 
539
  "simpler form",
 
 
540
  "situtation diagram",
541
- "skew lines",
542
  "slanted",
543
  "slide rule",
 
 
 
544
  "span",
 
 
 
 
 
 
545
  "stacked bar graph",
 
546
  "standard unit",
 
547
  "stem and leaf plot",
548
  "step graph",
 
549
  "straightedge",
 
550
  "substitute",
 
551
  "subtrahend",
 
 
 
 
552
  "surface",
 
553
  "symmetric",
 
 
 
 
 
554
  "tally",
 
555
  "tangent",
556
- "tangent circles",
 
557
  "temperature",
558
  "template",
 
 
 
 
 
 
 
559
  "tetrahedron",
 
560
  "theorem",
 
 
 
561
  "tile",
562
  "tiling",
563
  "time graph",
@@ -565,159 +744,83 @@ MATH_WORDS = [
565
  "top heavy fraction",
566
  "topological",
567
  "topology",
 
 
 
 
568
  "trade first subtraction",
 
 
 
 
569
  "tree diagram",
 
570
  "triangular",
571
  "true number sentence",
572
  "truncate",
573
- "twin primes",
574
- "unlike denominators",
575
- "unlike fractions",
 
 
 
 
 
 
 
 
 
 
 
576
  "vanishing ",
 
 
 
577
  "venn diagram",
578
  "vernal equinox",
 
 
 
 
579
  "weight",
 
 
 
580
  "width",
581
- "base of a prism",
582
- "base of a pyramid",
583
- "face",
584
- "numerical data",
585
- "opposite",
586
- "pace",
587
- "per",
588
- "region",
589
- "sign",
590
- "alternate interior angles",
591
- "base of an exponent",
592
- "cone",
593
- "congruent",
594
- "counterclockwise",
595
- "cube root",
596
- "hypotenuse",
597
- "irrational number",
598
- "linear relationship",
599
- "positive association",
600
- "rate of change",
601
- "translation",
602
- "transversal",
603
- "circumference",
604
- "corresponding",
605
- "expand",
606
- "population",
607
- "proportion",
608
- "radius",
609
- "random",
610
- "repeating decimal",
611
- "representative",
612
- "scaled",
613
  "withdrawal",
614
- "center",
615
- "edge",
616
- "height of a parallelogram or triangle",
617
- "net",
618
- "speed",
619
- "table",
620
- "term",
621
- "adjacent",
622
- "complementary",
623
- "cross-section",
624
- "cross section",
625
- "deposit",
626
- "event",
627
- "measurement error",
628
- "proportional",
629
- "simulation",
630
- "center of a dilation",
631
- "clockwise",
632
- "dilation",
633
- "function",
634
- "negative association",
635
- "pythagorean theorem",
636
- "relative frequency",
637
- "rigid transformation",
638
- "scale factor",
639
- "scatter plot",
640
- "similar",
641
- "sphere",
642
- "two-way table",
643
- "additive identity",
644
- "additive inverse",
645
- "box and whisker plot",
646
- "cartesian coordinates",
647
- "central angle",
648
- "chord",
649
- "combination",
650
- "commutative property",
651
- "coplanar",
652
- "cross product",
653
- "dependent events",
654
- "difference",
655
- "dividend",
656
- "equilateral triangle",
657
- "error of measurement",
658
- "factorial",
659
- "formula",
660
- "identity property of",
661
- "independent events",
662
- "infinity",
663
- "inscribed angle",
664
- "intercept",
665
- "intercepted arc",
666
- "inverse",
667
- "inverse operations",
668
- "isosceles triangle",
669
- "least common denominator",
670
- "like fractions",
671
- "locus",
672
- "logic",
673
- "lowest terms",
674
- "mode",
675
- "multiplicative identity",
676
- "multiplicative inverse",
677
- "mutually exclusive events",
678
- "natural numbers",
679
- "normal",
680
- "permutation",
681
- "pi",
682
- "point",
683
- "power",
684
- "range",
685
- "rate",
686
- "ray",
687
- "real numbers",
688
- "rectangular",
689
- "root",
690
- "rotation",
691
- "scalene triangle",
692
- "scattergram",
693
- "set",
694
- "statistics",
695
- "terminating decimal",
696
- "transformation",
697
  "x intercept",
 
698
  "x-axis",
699
- "x-intercept",
 
700
  "y intercept",
 
701
  "y-axis",
702
  "y-intercept",
703
- "zero",
704
  "zero property of multiplication",
705
- "base of a parallelogram",
706
- "base of a triangle",
707
- "height",
708
- "chance experiment",
709
- "diameter",
710
- "mean",
711
- "percentage",
712
- "sample",
713
- "legs",
714
- "outlier",
715
- "slope",
716
- "square root",
717
- "system of equations",
718
- "tessellation",
719
  ]
720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  def get_num_words(text):
722
  if not isinstance(text, str):
723
  print("%s is not a string" % text)
 
  from num2words import num2words
  import re
  import string
+ import inflect

  punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
  punct_chars.sort()
 
35
  "median",
36
  "ratio",
37
  "area",
 
38
 
39
+ # added
40
  "multipl",
41
+ "divid",
42
+ "subtrac",
43
+ "logarit",
44
+ "algebr",
45
+ "calcul",
46
+ "matri",
47
+ "vect",
 
 
 
 
 
 
 
 
 
 
 
48
  "geometr",
49
+ "statist",
50
+ "probabli",
51
+ "coeffi",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  "measure",
53
+ "simplif"
54
+ ]
55
+
56
+ MATH_WORDS = [
57
+ "absolute deviation",
58
+ "absolute value",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  "abundant number",
60
  "accurate",
61
  "acre",
62
+ "acute",
63
+ "add",
64
+ "addend",
65
  "addition fact",
66
+ "addition",
67
+ "additive identity",
68
+ "additive inverse",
69
+ "adjacent",
70
+ "algebra",
71
  "algebraic",
72
+ "algorithm",
73
+ "alternate interior angle",
74
  "altitude",
75
+ "analog",
76
+ "angle measure",
77
+ "angle",
78
+ "angular",
79
  "apex",
80
+ "approximate",
81
+ "arc",
82
+ "area model",
83
+ "area",
84
+ "arithmetic fact",
85
+ "arithmetic",
86
+ "array",
87
  "associative property",
88
+ "associative",
89
  "astronomical unit",
90
+ "attribute",
91
+ "average",
92
+ "axis",
93
+ "bar graph",
94
+ "base of a parallelogram",
95
+ "base of a prism",
96
+ "base of a pyramid",
97
+ "base of a triangle",
98
+ "base of an exponent",
99
+ "base of",
100
+ "base ten",
101
  "base",
102
  "baseline",
103
+ "benchmark fraction",
104
  "billion",
105
+ "binomial",
106
+ "bisect",
107
+ "bisector",
108
+ "box and whisker plot",
109
+ "box plot",
110
+ "capacity",
111
+ "cartesian coordinate",
112
+ "categorical data",
113
+ "categorical",
114
  "celsius",
115
  "census",
116
  "cent",
117
  "center of a circle",
118
+ "center of a dilation",
119
  "center of a sphere",
120
+ "center",
121
+ "centimeter",
122
+ "central angle",
123
+ "centroid",
124
+ "chance experiment",
125
  "chance",
126
+ "chord",
127
  "circle graph",
128
+ "circle",
129
+ "circular",
130
+ "circumference",
131
+ "clockwise",
132
+ "coefficient",
133
+ "collinear",
134
+ "column matrix"
135
  "column",
136
+ "combination",
137
  "combine",
138
+ "common denominator",
139
+ "common factor",
140
  "common fraction",
141
+ "common multiple",
142
+ "commutative property",
143
+ "commutative",
144
  "comparison diagram",
145
  "comparison story",
146
  "compass",
147
  "complement",
148
+ "complementary",
149
+ "compose",
150
+ "composite",
151
  "concave polygon",
152
  "concentric circles",
153
+ "concentric",
154
+ "cone",
155
+ "congruent",
156
  "consecutive",
157
+ "constant function",
158
  "constant",
159
  "continuous model of area",
160
  "continuous model of volume",
161
+ "continuous",
162
  "contour",
163
  "conversion fact",
164
+ "conversion factor",
165
+ "convert",
166
+ "convex function",
167
  "convex polygon",
168
+ "coordinate",
169
+ "coplanar",
170
+ "corresponding",
171
+ "counterclockwise",
172
  "counting numbers",
173
  "counting up subtraction",
174
+ "covariance",
175
+ "covariate",
176
  "cover-up method",
177
  "cross multiplication",
178
+ "cross product",
179
+ "cross section",
180
+ "cross-section",
181
+ "cube root",
182
+ "cube",
183
+ "cubed",
184
+ "cubic unit",
185
  "cubic",
186
  "cubit",
187
+ "cup",
188
  "curved surface",
189
+ "customary system of measurement",
190
+ "customary unit",
191
  "cylinder",
192
+ "cylindrical",
193
+ "data",
194
  "decagon",
195
+ "decimal divisor",
196
+ "decimal expanded form",
197
+ "decimal fraction",
198
+ "decimal point",
199
+ "decimal",
200
  "decimeter",
201
+ "decompose",
202
  "deficient number",
203
+ "degree",
204
+ "delta",
205
+ "denominator",
206
  "density",
207
+ "dependent event",
208
+ "dependent variable",
209
+ "deposit",
210
+ "derivative",
211
+ "determinant",
212
+ "diagonal",
213
+ "diameter",
214
+ "difference",
215
+ "differential"
216
+ "digit",
217
+ "digital",
218
+ "dilation",
219
+ "dimension",
220
  "discrete model",
221
  "displacement method",
222
+ "distance",
223
+ "distribution",
224
+ "distributive",
225
+ "divide",
226
+ "dividend",
227
  "divisibility test",
228
  "divisible by",
229
+ "divisible",
230
+ "division",
231
+ "divisor",
232
  "dodecahedron",
233
+ "dot plot",
234
+ "double number line diagram",
235
  "double stem plot",
236
  "doubles fact",
237
+ "edge",
238
  "egyptian multiplication",
239
  "elevation",
240
  "embed figure",
241
  "end point",
242
+ "endpoint",
243
  "enlarge",
244
+ "equal group",
245
+ "equal part",
246
  "equal",
247
+ "equality",
248
+ "equation",
249
+ "equidistant mark",
250
  "equilateral polygon",
251
+ "equilateral triangle",
252
+ "equilateral",
253
+ "equivalence",
254
+ "equivalent expression",
255
+ "equivalent fraction",
256
+ "equivalent",
257
+ "error bound",
258
+ "error of measurement",
259
+ "estimat",
260
+ "estimate",
261
  "european subtraction",
262
+ "even number",
263
+ "event",
264
+ "expand",
265
+ "expanded form",
266
  "expanded notation",
267
  "expected outcome",
268
+ "expected value",
269
+ "exponent",
270
+ "exponential function",
271
+ "exponential growth",
272
+ "expression",
273
+ "extended fact",
274
+ "face",
275
  "fact power",
276
  "fact triangle",
277
  "factor",
278
+ "factorial",
279
+ "factors of number",
280
  "fahrenheit",
281
  "false number sentence",
282
+ "figurate number",
283
  "flowchart",
284
  "fluid ounce",
285
+ "formula",
286
+ "fraction form",
287
+ "fraction",
288
  "fractional part",
289
+ "fractional unit",
290
+ "frequency",
291
  "fulcrum",
292
  "function machine",
293
+ "function",
294
  "furlong",
295
+ "gallon",
296
+ "gcd",
297
  "genus",
298
  "geoboard",
299
+ "geometr",
300
  "geometric solid",
301
  "geometry template",
302
  "girth",
303
  "golden ratio",
304
  "golden rectangle",
305
+ "gram",
306
  "graph key",
307
+ "graph",
308
+ "greatest common divisor"
309
+ "greatest common factor",
310
  "grouping symbol",
311
+ "half circle",
312
+ "half-circle",
313
+ "hashmark",
314
+ "height of a parallelogram or triangle",
315
+ "height of",
316
+ "height",
317
  "hemisphere",
318
+ "heptagon",
319
+ "heptagonal",
320
+ "hexagon",
321
+ "hexagonal",
322
+ "hierarchy",
323
+ "histogram",
324
+ "horizontal shift",
325
+ "horizontal stretch",
326
+ "horizontal",
327
+ "hundred",
328
+ "hundredth",
329
+ "hypotenuse",
330
+ "hypothesis",
331
  "icosahedron",
332
+ "identity function",
333
+ "identity matrix",
334
+ "identity property of",
335
+ "identity property",
336
  "improper fraction",
337
  "inch",
338
+ "incircle",
339
+ "indefinite integral",
340
+ "independent event",
341
+ "independent variable",
342
+ "index of location",
343
  "indirect measurement",
344
+ "inequality",
345
+ "infinity",
346
  "input",
347
+ "inscribed angle",
348
  "inscribed polygon",
349
  "instance of a pattern",
350
+ "integer",
351
+ "intercept",
352
+ "intercepted arc",
353
+ "interior angle",
354
  "interior of a figure",
355
  "interpolate",
356
+ "interquartile range",
357
+ "intersect",
358
+ "interval",
359
+ "inverse operation",
360
+ "inverse",
361
+ "iqr",
362
+ "irrational number",
363
+ "irrational root",
364
  "irrational",
365
  "isometry transformation",
366
  "isosceles trapezoid",
367
+ "isosceles triangle",
368
+ "isosceles",
369
+ "joint probability",
370
+ "joint variation",
371
  "juxtapose",
372
  "key sequence",
373
+ "kilogram",
374
+ "kilometer",
375
+ "kite",
376
  "label",
377
  "landmark",
378
  "latitude",
379
  "lattice multiplication",
380
+ "lcm",
381
+ "least common denominator",
382
+ "least common multiple",
383
  "left to right subtraction",
384
  "leg of a right triangle",
385
+ "legs",
386
+ "length",
387
+ "like fraction",
388
  "like terms",
389
  "line graph",
390
  "line of reflection",
391
  "line of symmetry",
392
+ "line plot",
393
+ "line segment",
394
  "line symmetry",
395
+ "line",
396
+ "linear relationship",
397
  "lines of latitude",
398
  "lines of longitude",
399
+ "liter",
400
+ "local maximum",
401
+ "local minimum",
402
+ "locus",
403
+ "logarithm",
404
+ "logarithmic function",
405
+ "logarithmic scale",
406
+ "logic",
407
+ "long division",
408
  "longitude",
409
+ "lowest term",
410
  "magnitude estimate",
411
+ "make ten",
412
  "map legend",
413
  "map scale",
414
+ "mass",
415
  "maximum",
416
+ "mean absolute deviation",
417
+ "mean value",
418
+ "mean",
419
+ "measure of center",
420
+ "measure",
421
  "measurement division",
422
+ "measurement error",
423
  "measurement unit",
424
+ "median",
425
  "meridian bar",
426
+ "meter",
427
+ "meters per second",
428
  "metric system",
429
+ "metric unit",
430
+ "metric",
431
  "midpoint",
432
  "mile",
433
+ "milliliter",
434
+ "millimeter",
435
  "millisecond",
436
  "minimum",
437
  "minuend",
438
  "mirror image",
439
+ "mixed number",
440
+ "mixed unit",
441
  "mobius",
442
  "modal",
443
+ "mode",
444
+ "multipl",
445
+ "multiple",
446
  "multiplication counting principle",
447
  "multiplication diagram",
448
  "multiplication fact",
449
+ "multiplication symbol",
450
  "multiplication use class",
451
+ "multiplicative identity",
452
+ "multiplicative inverse",
453
+ "multiplier",
454
+ "mutually exclusive event",
455
+ "natural number",
456
+ "negative association",
457
+ "negative exponent",
458
+ "negative number",
459
+ "negative rational number",
460
  "nested parentheses",
461
  "net score",
462
  "net weight",
463
+ "net",
464
  "nonagon",
465
  "nonconvex polygon",
466
+ "nonlinear",
467
+ "normal distribution",
468
  "normal span",
469
+ "normal",
470
+ "number bond",
471
+ "number disk",
472
  "number grid",
473
+ "number line",
474
+ "number path",
475
+ "number sentence",
476
  "number sequence",
477
  "numeral",
478
  "numeration",
479
+ "numerator",
480
+ "numerical data",
481
+ "numerical",
482
+ "obtuse",
483
+ "octagon",
484
+ "octagonal",
485
  "octahedron",
486
+ "odd number",
487
  "open proportion",
 
488
  "operation symbol",
489
+ "operational",
490
  "opposite angle",
491
  "opposite change rule",
492
  "opposite of a number",
493
  "opposite side",
494
+ "opposite vertex",
495
+ "opposite",
496
  "order of magnitude",
497
  "order of operations",
498
  "order of rotation symmetry",
499
+ "order of",
500
+ "ordered pair",
501
+ "ordered",
502
  "ordinal number",
503
+ "orthogonal",
504
+ "ounce",
505
+ "outlier",
506
+ "pace",
507
  "pan balance",
508
  "parabola",
509
  "parallel lines",
510
+ "parallel plane",
511
+ "parallel",
512
+ "parallelogram",
513
+ "parentheses",
514
  "part to part ratio",
515
  "part to whole ratio",
516
  "part whole fraction",
517
  "partial differences subtraction",
518
+ "partial product",
519
  "partial products multiplication",
520
  "partial quotients division",
521
  "partial sums addition",
522
+ "partition",
523
  "partitive division",
524
  "parts and total diagram",
525
+ "pentagon",
526
+ "pentagonal",
527
  "per capita",
528
  "per unit rate",
529
+ "per",
530
  "percent circle",
531
+ "percent",
532
+ "percentage",
533
  "perfect number",
534
+ "perfect square",
535
+ "perfect triangle",
536
+ "perimeter",
537
+ "permutation",
538
+ "perpendicular",
539
  "perpetual calendar",
540
+ "pi",
541
+ "picture graph",
542
  "pie graph",
543
+ "pint",
544
+ "pivot",
545
+ "place value",
546
  "plane figure",
547
+ "plane",
548
  "point symmetry",
549
+ "point",
550
+ "polar coordinate",
551
+ "polygon",
552
+ "polyhedron",
553
+ "polynominal"
554
  "population density",
555
+ "population",
556
+ "positive association",
557
+ "positive number",
558
+ "pound",
559
+ "power",
560
  "precise",
561
  "predict",
562
  "prediction line",
563
  "preimage",
564
+ "prime factor",
565
  "prime factorization",
566
  "prime meridian",
567
+ "prime number",
568
+ "prism",
569
  "probability meter",
570
  "probability tree diagram",
571
+ "probability",
572
+ "product",
573
  "proper factor",
574
  "proper fraction",
575
  "property",
576
+ "proportion",
577
+ "proportional",
578
+ "proportionality",
579
+ "protractor",
580
+ "pyramid",
581
+ "pythagorean theorem",
582
  "quadrangle",
583
+ "quadrant",
584
+ "quadratic",
585
+ "quadrilateral",
586
+ "quart",
587
+ "quarter circle",
588
+ "quarter of",
589
+ "quarter-circle",
590
+ "quartile",
591
  "quick common denominator",
592
+ "quotient",
593
  "quotitive division",
594
+ "radian",
595
+ "radius of"
596
+ "radius",
597
  "random draw",
598
  "random experiment",
599
  "random number",
600
  "random sample",
601
+ "random",
602
+ "range",
603
  "rank",
604
  "rate diagram",
605
  "rate multiplication ",
606
+ "rate of change",
607
  "rate unit",
608
+ "rate",
609
+ "ratio of",
610
+ "ratio",
611
+ "rational equation",
612
+ "rational number",
613
+ "ray",
614
+ "real number",
615
  "recall survey",
616
+ "reciprocal",
617
+ "rectang",
618
+ "rectangle",
619
  "rectangular array",
620
  "rectangular coordinate grid",
621
  "rectangular prism",
622
  "rectangular pyramid",
623
+ "rectangular",
624
  "rectilinear figure",
625
  "reflection",
626
  "reflex angle",
627
+ "region",
628
+ "regular polygon",
629
  "regular polyhedron",
630
  "regular tessellation",
631
  "relation symbol",
632
+ "relative frequency",
633
+ "remainder",
634
+ "repeated addition",
635
+ "repeating decimal",
636
+ "representative",
637
  "revolution",
638
+ "rhombus",
639
+ "right angle",
640
  "right cone",
641
  "right cylinder",
642
  "right prism",
643
  "right pyramid",
644
  "right triangle",
645
+ "rigid transformation",
646
  "roman numerals",
647
+ "root",
648
+ "rotate",
649
  "rotation symmetry",
650
+ "rotation",
651
+ "round off",
652
+ "round-off",
653
+ "ruler",
654
  "same change rule for subtraction",
655
+ "sample",
656
+ "scalar",
657
+ "scale factor",
658
  "scale model",
659
  "scale of a map",
660
  "scale of a number line",
661
+ "scale",
662
+ "scaled graph",
663
+ "scaled",
664
+ "scalene triangle",
665
+ "scalene",
666
+ "scatter plot",
667
+ "scattergram",
668
  "sector",
669
  "segment",
670
+ "semi-circle",
671
+ "semicircle",
672
  "sequence",
673
+ "set",
674
+ "sign",
675
+ "significant digit",
676
+ "significant figure",
677
  "similar figures",
678
+ "similar",
679
  "simpler form",
680
+ "simplify",
681
+ "simulation",
682
  "situtation diagram",
683
+ "skew line",
684
  "slanted",
685
  "slide rule",
686
+ "slope",
687
+ "solid figure",
688
+ "solution",
689
  "span",
690
+ "speed",
691
+ "sphere",
692
+ "square root",
693
+ "square unit",
694
+ "square",
695
+ "squared",
696
  "stacked bar graph",
697
+ "standard form",
698
  "standard unit",
699
+ "statistic",
700
  "stem and leaf plot",
701
  "step graph",
702
+ "straight angle",
703
  "straightedge",
704
+ "subset of"
705
  "substitute",
706
+ "subtract",
707
  "subtrahend",
708
+ "sum of",
709
+ "sum",
710
+ "supplementary angle",
711
+ "surface area",
712
  "surface",
713
+ "survey",
714
  "symmetric",
715
+ "symmetry",
716
+ "system of equation",
717
+ "system of",
718
+ "table",
719
+ "take from ten",
720
  "tally",
721
+ "tangent circle",
722
  "tangent",
723
+ "tangram",
724
+ "tape diagram",
725
  "temperature",
726
  "template",
727
+ "tens place",
728
+ "tenth",
729
+ "term",
730
+ "terminating decimal",
731
+ "tessellat",
732
+ "tessellate",
733
+ "tessellation",
734
  "tetrahedron",
735
+ "tetromino",
736
  "theorem",
737
+ "thermometer",
738
+ "thousand",
739
+ "thousandth",
740
  "tile",
741
  "tiling",
742
  "time graph",
 
744
  "top heavy fraction",
745
  "topological",
746
  "topology",
747
+ "total area",
748
+ "total of",
749
+ "total surface",
750
+ "total volume",
751
  "trade first subtraction",
752
+ "transformation",
753
+ "translation",
754
+ "transversal",
755
+ "trapezoid",
756
  "tree diagram",
757
+ "triangle",
758
  "triangular",
759
  "true number sentence",
760
  "truncate",
761
+ "twin prime",
762
+ "two-way table",
763
+ "unit cube",
764
+ "unit form",
765
+ "unit fraction",
766
+ "unit interval",
767
+ "unit price",
768
+ "unit rate",
769
+ "unit square",
770
+ "unit",
771
+ "unknown",
772
+ "unlike denominator",
773
+ "unlike fraction",
774
+ "value",
775
  "vanishing ",
776
+ "variability",
777
+ "variable",
778
+ "velocity",
779
  "venn diagram",
780
  "vernal equinox",
781
+ "vertex",
782
+ "vertical",
783
+ "volume of",
784
+ "volume",
785
  "weight",
786
+ "whole number",
787
+ "whole unit",
788
+ "whole",
789
  "width",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
790
  "withdrawal",
791
+ "word form",
792
+ "x axes",
793
+ "x axis",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  "x intercept",
795
+ "x-axes",
796
  "x-axis",
797
+ "y axes",
798
+ "y axis",
799
  "y intercept",
800
+ "y-axes",
801
  "y-axis",
802
  "y-intercept",
803
+ "yard",
804
  "zero property of multiplication",
805
+ "zero",
 
 
 
 
 
 
 
 
 
 
 
 
 
  ]

+ p = inflect.engine()
+
+ def singular_to_plural(word):
+     """Convert singular words to plural using inflect."""
+     plural = p.plural(word)
+     return plural or word
+
+ def plural_to_singular(word):
+     """Convert plural word to singular using inflect."""
+     return p.singular_noun(word) or word
+
+ plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
+
+ MATH_WORDS += plural_MATH_WORDS
+
  def get_num_words(text):
      if not isinstance(text, str):
          print("%s is not a string" % text)