"""Parse the alloy dataset into per-element molar-ratio tables.

The script reads the composition CSV, normalizes the molar ratio of every
element of every alloy and writes the resulting matrix under
"parser_result/parser_category". For each mechanical property it then appends
the matching target column and writes two variants of the table that differ
in how missing targets are handled: "parser_result/fill_null" (nulls replaced
by 0) and "parser_result/drop_null" (rows containing nulls removed).
"""

import csv
import os

import numpy as np
import pandas as pd
import xlrd
from sklearn.impute import SimpleImputer

import element
import clear_data

category = ["compressive_strength", "elongation", "hardness", "plasticity", "tensile_strength", "yield_strength"]

# Column index of each mechanical property inside the first sheet of the
# targets workbook.
_TARGET_COLUMNS = {
    "hardness": 4,
    "yield_strength": 5,
    "tensile_strength": 6,
    "elongation": 7,
    "compressive_strength": 8,
    "plasticity": 9,
}


def read_data(category):
    """Convert ``<category>.csv`` into an element-ratio matrix and save it.

    Every record of the input file becomes one row of the matrix; the columns
    follow ``element.elements_list``. The first field of each record is passed
    to ``clear_data.clean_row``; item 0 of its result is used as the element
    names and item 1 is normalized with
    ``clear_data.normalize_molar_ratios``. Elements that do not occur in an
    alloy stay at 0.

    The matrix is written, preceded by a header row with the element names,
    to "parser_result/parser_category/Parser_element.csv".

    Args:
        category: Path of the input CSV file without the ".csv" extension.

    Raises:
        IndexError: If a record of the input file is empty.
        ValueError: If a parsed element is not in ``element.elements_list``.
    """
    # Read the file once inside a context manager so the handle is closed;
    # newline="" is what the csv module requires for its file objects.
    with open(category + ".csv", newline="") as source:
        alloys = list(csv.reader(source))

    ## Build a new array whose elements are all 0.
    result = np.zeros((len(alloys), len(element.elements_list)), dtype=float)

    ## Iterate over every line (alloy) of the csv file.
    for row_index, alloy in enumerate(alloys):
        cleaned = clear_data.clean_row(str(alloy[0]))
        ratios = clear_data.normalize_molar_ratios(cleaned[1])
        ## Add the corresponding ratios at the proper location.
        for symbol, ratio in dict(zip(cleaned[0], ratios)).items():
            result[row_index, element.elements_list.index(symbol)] = float(ratio)

    ## Save the result (array) as 'Parser_element.csv'.
    # NOTE(review): get_mechnical() reads "parser_result/Parser_element.csv",
    # which is not the path written here - confirm which location is intended.
    with open("parser_result/parser_category/Parser_element.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(element.elements_list)
        writer.writerows(result)


def get_mechnical(path, category):
    """Attach one mechanical property to the element table and export it.

    The target values are taken from the first sheet of the workbook at
    ``path``. The first two sheet rows are skipped (presumably header rows -
    TODO confirm) and the property name is used as the column header. The
    values are appended, row by row, to the table read from
    "parser_result/Parser_element.csv", and the combined table is written to
    "parser_result/parser_category/Parser_<category>.csv".

    The combined table is then reloaded with pandas, the ratio of null values
    in its last column is printed, and two variants are saved:
    "parser_result/fill_null/<category>_fill_null.csv" (nulls replaced by 0)
    and "parser_result/drop_null/<category>_drop_null.csv" (rows with any
    null removed).

    Args:
        path: Path of the workbook holding the mechanical targets.
        category: Name of the property to export. For a name that is not a
            key of ``_TARGET_COLUMNS`` the table is copied without an extra
            column.

    Raises:
        IndexError: If the element table has more rows than the workbook has
            target values.
    """
    ## For Mechnical Targets.csv
    m_sheet = xlrd.open_workbook(path).sheets()[0]

    # Get the target data of the machine learning model. Only the requested
    # column is read; the property name becomes the header cell.
    column = _TARGET_COLUMNS.get(category)
    targets = None
    if column is not None:
        targets = [category] + m_sheet.col_values(column)[2:]

    # Save the mechanical properties of alloys.
    # NOTE(review): read_data() writes Parser_element.csv into
    # "parser_result/parser_category/", not into "parser_result/" - confirm
    # that this input path is the intended one.
    with open("parser_result/Parser_element.csv", newline="") as csvFile:
        with open("parser_result/parser_category/Parser_" + category + ".csv", 'w', newline="") as f:
            writer = csv.writer(f)
            for index, row in enumerate(csv.reader(csvFile)):
                if targets is not None:
                    row.append(targets[index])
                writer.writerow(row)

    # Reload only after both files are closed so the output is fully flushed.
    data = pd.read_csv('parser_result/parser_category/Parser_' + category + '.csv')
    null_ratio = data.iloc[:, -1].isnull().mean()
    print("Null ratio in " + category + "dataset is: ", round(null_ratio, 2))

    # Replace null with 0s.
    data.fillna(0).to_csv('parser_result/fill_null/' + category + '_fill_null.csv', index=False)

    # Delete null.
    data.dropna(axis=0, how='any').to_csv('parser_result/drop_null/' + category + '_drop_null.csv', index=False)

    # Disabled experiment, kept for reference.
    # # Split dataset to knn&rf model.
    # data = data.fillna(0)
    # df_test = data.drop(index=data.index)
    # idx = 0
    # idx_exit = int(data.shape[0] * 0.07)
    # for index, row in data.iterrows():
    #     if row.astype(int)[-1] != 0 and idx <= idx_exit:
    #         df_test = df_test.append(row, ignore_index=True)
    #         data = data.drop([index])
    #         idx += 1
    # df_test.to_csv('parser_result/RF_test/'+category+'_RF_test.csv', index=False)
    #
    # # Dealing with rfr_train, split it into knn_train and knn_test.
    # df_train = pd.DataFrame(data=data)
    # # Calculate the average number X of data(not 0).
    # sum_num = 0
    # num = 0
    # for index, row in df_train.iterrows():
    #     if row.astype(int)[-1] != 0:
    #         num += 1
    #         sum_num += row.astype(int)[-1]
    # mean_num = sum_num / num
    # # df_0: which need to be imputed by KNN.
    # df_0 = data.drop(index=data.index)
    # df_pure = data.drop(index=data.index)
    # for index, row in df_train.iterrows():
    #     if row.astype(int)[-1] == 0:
    #         df_0 = df_0.append(row, ignore_index=True)
    #     else:
    #         df_pure = df_pure.append(row, ignore_index=True)
    # df_0.to_csv('parser_result/KNN_test/'+category+'_KNN_test.csv', index=False)
    # df_pure.to_csv('parser_result/KNN_train/' + category + '_KNN_train.csv', index=False)


if __name__ == "__main__":
    read_data("mechanical_composition")
    for c in category:
        get_mechnical('mechanical.xls', c)