Spaces:
Running
Running
import time | |
import pandas as pd | |
import sys | |
class DataPreprocessor:
    """Collect the distinct values found in a tab-separated log file.

    The file at ``input_file_path`` is streamed in chunks with pandas. It
    must have a header row containing the columns ``Anon Student Id``,
    ``Level (Workspace Id)``, ``Problem Name`` and ``KC Model(MATHia)``;
    ``Step Name`` is additionally required by
    ``analyze_dataset_by_section``.

    After an analysis call the results are available as sets on the
    instance: ``unique_students``, ``unique_problems``,
    ``unique_prob_hierarchy``, ``unique_kcs`` and (section analysis only)
    ``unique_steps``. All of them are ``None`` until an analysis has run.
    """

    # Number of rows pandas reads per chunk unless the caller overrides it.
    DEFAULT_CHUNKSIZE = 1000000

    _STUDENT_COLUMN = 'Anon Student Id'
    _WORKSPACE_COLUMN = 'Level (Workspace Id)'
    _PROBLEM_COLUMN = 'Problem Name'
    _STEP_COLUMN = 'Step Name'
    _KC_COLUMN = 'KC Model(MATHia)'
    # A single KC cell may list several knowledge components joined by this.
    _KC_SEPARATOR = "~~"

    def __init__(self, input_file_path, chunksize=DEFAULT_CHUNKSIZE):
        """Remember where the data lives; no file access happens here.

        Args:
            input_file_path: Path (or anything ``pandas.read_csv`` accepts)
                of the tab-separated input file.
            chunksize: Number of rows to read per chunk while streaming.
        """
        self.input_file_path = input_file_path
        self.chunksize = chunksize
        self.unique_students = None
        self.unique_problems = None
        self.unique_prob_hierarchy = None
        self.unique_steps = None
        self.unique_kcs = None

    def analyze_dataset(self):
        """Collect unique students, workspaces, problems and KCs of the file.

        The result sets are stored on the instance and their sizes are
        printed together with the elapsed time. ``unique_steps`` is not
        touched by this method.
        """
        elapsed = self._collect_unique_values(None, False)
        print("Time Taken to analyze dataset = ", elapsed)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of Unique Knowledge components ->", len(self.unique_kcs))

    def analyze_dataset_by_section(self, workspace_name):
        """Collect unique values for the rows of a single workspace.

        Only rows whose ``Level (Workspace Id)`` equals ``workspace_name``
        contribute. In addition to the sets filled by ``analyze_dataset``
        this also fills ``unique_steps``. A workspace that never occurs
        yields empty sets.

        Args:
            workspace_name: Value of ``Level (Workspace Id)`` to restrict to.
        """
        elapsed = self._collect_unique_values(workspace_name, True)
        print("Time Taken to analyze dataset = ", elapsed)
        print("Workspace-> ", workspace_name)
        print("Length of unique students->", len(self.unique_students))
        print("Length of unique problems->", len(self.unique_problems))
        print("Length of unique problem hierarchy->", len(self.unique_prob_hierarchy))
        print("Length of unique step names ->", len(self.unique_steps))
        print("Length of unique knowledge components ->", len(self.unique_kcs))

    def load_file_iterator(self):
        """Return a pandas chunk iterator over the tab-separated input file."""
        return pd.read_csv(
            self.input_file_path,
            sep="\t",
            header=0,
            iterator=True,
            chunksize=self.chunksize,
        )

    def _collect_unique_values(self, workspace_name, by_section):
        """Stream the file once, fill the ``unique_*`` sets, return seconds taken.

        Args:
            workspace_name: Workspace to restrict to; ignored unless
                ``by_section`` is true.
            by_section: When true, keep only rows of ``workspace_name`` and
                also collect step names.
        """
        reader = self.load_file_iterator()
        start_time = time.time()

        # Start from genuinely empty sets. Seeding with placeholder strings
        # and removing them afterwards would discard a real value that
        # happens to equal a placeholder (and leaks the placeholder if a
        # removal is forgotten).
        self.unique_students = set()
        self.unique_problems = set()
        self.unique_prob_hierarchy = set()
        self.unique_kcs = set()
        if by_section:
            self.unique_steps = set()

        try:
            for chunk in reader:
                self._absorb_chunk(chunk, workspace_name, by_section)
        finally:
            # Release the underlying file handle even if a chunk fails.
            close = getattr(reader, "close", None)
            if callable(close):
                close()

        return time.time() - start_time

    def _absorb_chunk(self, chunk, workspace_name, by_section):
        """Add the distinct values of one chunk to the ``unique_*`` sets.

        Rows are narrowed step by step (student present, then workspace
        present, then problem present) so that a value is only counted
        when every enclosing key is non-missing. This mirrors a nested
        group-by over student, workspace and problem, which skips missing
        keys.
        """
        rows = chunk.dropna(subset=[self._STUDENT_COLUMN])
        if by_section:
            rows = rows[rows[self._WORKSPACE_COLUMN] == workspace_name]
        self.unique_students.update(rows[self._STUDENT_COLUMN].unique())

        rows = rows.dropna(subset=[self._WORKSPACE_COLUMN])
        self.unique_prob_hierarchy.update(rows[self._WORKSPACE_COLUMN].unique())

        rows = rows.dropna(subset=[self._PROBLEM_COLUMN])
        self.unique_problems.update(rows[self._PROBLEM_COLUMN].unique())

        if by_section:
            self.unique_steps.update(rows[self._STEP_COLUMN].dropna().unique())

        # De-duplicate cells before splitting so each distinct cell is
        # parsed only once per chunk.
        for cell in rows[self._KC_COLUMN].dropna().unique():
            self.unique_kcs.update(str(cell).split(self._KC_SEPARATOR))