Spaces:
Build error
Build error
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[28]: | |
import numpy as np | |
import pandas as pd | |
import datetime | |
import seaborn as sns | |
import matplotlib | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt | |
import missingno as msno | |
import statistics | |
import plotly | |
import plotly.express as px | |
import plotly.graph_objects as go | |
from wordcloud import WordCloud, STOPWORDS | |
import nlplot | |
# #### Reading the Data set ######### | |
# In[2]: | |
def _is_monotonic(series):
    """Return True when *series* is entirely non-decreasing or non-increasing.

    Neighbouring values are compared by position, so the result does not
    depend on the index labels and never reads past the last element.
    Any comparison involving NaN is False, which makes the result False.
    """
    values = series.to_numpy()
    earlier, later = values[:-1], values[1:]
    return bool(np.all(earlier <= later) or np.all(earlier >= later))


def _describe_numeric_column(data, col):
    """Build the statistics/figure dictionary for one numeric column."""
    series = data[col]
    total_count = len(series)

    distinct_val = series.nunique()
    missing_count = series.isnull().sum()
    zeros_in_col = (series == 0).sum()
    mean = series.mean()
    mini = series.min()
    maxi = series.max()
    infinite = np.isinf(series).values.sum()
    q1 = np.percentile(series, 25)
    q3 = np.percentile(series, 75)

    histogram = px.histogram(data, x=col)
    histogram.update_layout(bargap=0.2)

    return {
        'distinct': int(distinct_val),
        'distinct_percent': str((distinct_val / total_count) * 100) + "%",
        'missing': int(missing_count),
        'missing_percent': str(missing_count * 100 / total_count) + "%",
        'zeros': int(zeros_in_col),
        'zero_percent': str((zeros_in_col / total_count) * 100) + "%",
        'mean': float(mean),
        'minimum': str(mini),
        'median': str(series.median()),
        'maximum': str(maxi),
        'infinite': int(infinite),
        'infinite_percent': str(infinite * 100 / total_count) + "%",
        '5th_percentile': str(np.percentile(series, 5)),
        '95th_percentile': str(np.percentile(series, 95)),
        'range': str(maxi - mini),
        'q1': str(q1),
        'q3': str(q3),
        'iqr': str(q3 - q1),
        'standard_deviation': str(statistics.stdev(series)),
        'skewness': str(series.skew()),
        'kurtosis': str(series.kurtosis()),
        'sum': str(series.sum()),
        'variance': str(series.var()),
        # Key spelling is kept as-is: consumers look it up by this name.
        'monotocity': str(_is_monotonic(series)),
        'visual_path': histogram,
        'outlier_img': px.box(data, x=col),
    }


def _describe_text_column(data, col):
    """Build the statistics/figure dictionary for one object-dtype column."""
    series = data[col]
    total_count = len(series)

    distinct_val = series.nunique()
    missing_count = int(series.isnull().sum())
    # Deliberately a column-local name: the dataset-wide missing percentage
    # computed by the caller must not be overwritten by per-column values.
    column_missing_percent = (missing_count / total_count) * 100
    memory_mb = series.memory_usage(deep=True) / 1024 ** 2

    # Lengths are measured on the string form of every non-missing value.
    lengths = np.vectorize(len)(series.dropna().values.astype(str))

    # Only genuine str values contribute characters.
    distinct_chars = set()
    for value in series:
        if isinstance(value, str):
            distinct_chars.update(value)

    return {
        'distinct': int(distinct_val),
        'distinct_percent': str(round((distinct_val / total_count) * 100, 5)) + "%",
        'missing': missing_count,
        'missing_percent': str(round(column_missing_percent, 5)) + "%",
        'memory': str(round(memory_mb, 5)) + " MiB",
        'max_length': int(lengths.max()),
        'median_length': int(np.median(lengths)),
        'mean_length': float(lengths.mean()),
        'min_length': int(lengths.min()),
        'total_character': int(series.str.len().sum()),
        'distinct_character': len(distinct_chars),
        # The entries below are placeholders; nothing computes them yet.
        'distinct_categories': "",
        'distinct_blocks': "??",
        'distinct_scripts': "??",
        'unique': "??",
        'unique_percent': "??",
        'visual_path': px.histogram(data, y=col),
    }


def _correlation_heatmap(num_data, method):
    """Return a plotly heatmap of ``num_data.corr(method=method)``."""
    corr = num_data.corr(method=method)
    return go.Figure(data=[
        go.Heatmap(z=corr, x=corr.columns, y=corr.columns)
    ])


def _missing_value_figures(data):
    """Return the missingno figures keyed by 'count', 'matrix', 'heatmap', 'dendrogram'."""
    figures = {}

    bar_figure = plt.figure()
    msno.bar(data, figsize=(20, 20), color="dodgerblue")
    figures['count'] = bar_figure
    plt.close(bar_figure)

    matrix_axes = msno.matrix(data, color=(0.27, 0.52, 1.0))
    figures['matrix'] = matrix_axes.get_figure()
    plt.close()

    heatmap_axes = msno.heatmap(data)
    figures['heatmap'] = heatmap_axes.get_figure()
    plt.close()

    dendrogram_axes = msno.dendrogram(data)
    figures['dendrogram'] = dendrogram_axes.get_figure()
    plt.close()

    return figures


def get_details(data):
    """Profile a DataFrame and return statistics and figures as a nested dict.

    Args:
        data: the pandas DataFrame to profile.

    Returns:
        A dict with the keys 'overview', 'reproduction',
        'numerical_variable_info', 'categorical_variable_info',
        'scatter_chart_matrix', 'correlation_matrix_info' and
        'missing_values_info', or None when any step raises. The failure
        is printed before None is returned.
    """
    try:
        print("started")
        s_time = datetime.datetime.now()

        # ---- variable types ----
        num_data = data.select_dtypes(include=np.number)
        num_data_col = num_data.columns.tolist()
        cat_data_col = data.select_dtypes(include=['object']).columns.tolist()
        bool_data_col = data.select_dtypes(include=["bool_"]).columns.tolist()

        # ---- dataset size ----
        col_length = len(data.columns)
        row_length = len(data)
        total_cells = col_length * row_length

        # ---- missing cells ----
        no_of_missing_values = len(np.where(pd.isnull(data))[0])
        missing_value_per = (no_of_missing_values / total_cells) * 100

        # ---- duplicate rows ----
        duplicate_rows = len(data[data.duplicated()])
        dup_row_per = (duplicate_rows / row_length) * 100

        # ---- memory usage ----
        # The average is the mean over the entries returned by
        # memory_usage(), not a per-row figure.
        memory_by_entry = data.memory_usage(deep=True)
        memory_usage_MB = memory_by_entry.sum() / 1024 ** 2
        avg_memory_usage_MB = memory_by_entry.mean() / 1024 ** 2

        print("Overview Completed")

        num_variable = {}
        for col in num_data_col:
            num_variable[col] = _describe_numeric_column(data, col)
        print("Numeric Variable Completed")

        cat_variable = {}
        for col in cat_data_col:
            cat_variable[col] = _describe_text_column(data, col)
        print("Categorical Variable Completed")

        sc_fig = px.scatter_matrix(data)

        correlation_matrix_info = {
            'pearsons': _correlation_heatmap(num_data, 'pearson'),
            'spearmans': _correlation_heatmap(num_data, 'spearman'),
            'kendall': _correlation_heatmap(num_data, 'kendall'),
        }

        missing_values_info = _missing_value_figures(data)

        f_time = datetime.datetime.now()
        duration = f_time - s_time

        data_statistics = {
            'number_of_variables': int(col_length),
            'number_of_observations': int(row_length),
            'no_of_missing_cells': int(no_of_missing_values),
            'missing_cell_percent': str(round(missing_value_per, 5)) + "%",
            'duplicate_rows': int(duplicate_rows),
            'duplicate_rows_percent': str(round(dup_row_per, 5)) + "%",
            'total_size_in_memory': str(round(memory_usage_MB, 5)) + "MiB",
            'average_memory_Usage': str(round(avg_memory_usage_MB, 5)) + "MiB",
        }
        variable_type = {
            'numeric_column': len(num_data_col),
            'categorical_column': len(cat_data_col),
            'boolean_column': len(bool_data_col),
        }
        reproduction = {
            'analysis_started': str(s_time),
            'analysis_finished': str(f_time),
            'duration': str(duration),
            'software_version': "??",
            'download_configuration': "??",
        }

        return {
            'overview': {
                'data_statistics': data_statistics,
                'variable_type': variable_type,
            },
            'reproduction': reproduction,
            'numerical_variable_info': {'variable_info': num_variable},
            'categorical_variable_info': {'variable_info': cat_variable},
            'scatter_chart_matrix': sc_fig,
            'correlation_matrix_info': correlation_matrix_info,
            'missing_values_info': missing_values_info,
        }
    except Exception as e:
        # Callers receive None on failure; report the cause so the failure
        # is not completely silent.
        print("get_details failed:", repr(e))
        return None
############### Returns the imbalance ratio of the dataset as a string ##################
def imbalnce_ratio(dataset, target):
    """Describe the class balance of ``dataset[target]`` as a ratio string.

    Each class contributes ``"<share>/<class>"``, where share is the class
    frequency scaled to a total of 10 and rounded to one decimal (whole
    numbers are shown without a decimal point). Classes are listed in
    ``value_counts()`` order and joined with ``" : "``.

    Args:
        dataset: DataFrame containing the target column.
        target: name of the target column.

    Returns:
        The ratio string, or ``""`` when the column has more than 10
        distinct values.
    """
    if dataset[target].nunique() > 10:
        return ""

    total = len(dataset)
    parts = []
    # items() pairs every class label with its own count by position, so
    # integer class labels cannot be mistaken for positions.
    for category, count in dataset[target].value_counts().items():
        share = round(float(count) / total * 10, 1)
        share_text = str(int(share)) if share.is_integer() else str(share)
        parts.append(share_text + "/" + str(category))
    return " : ".join(parts)
################################################################### | |
########### Returns a figure that visualizes a text column as a word cloud ############
def word_cloud(dataset, column):
    """Return a plotly text-scatter word cloud for ``dataset[column]``.

    Every value is converted to a string, split on whitespace and
    lower-cased; the resulting text is laid out with WordCloud, and each
    laid-out word becomes one text marker whose font size is its
    frequency multiplied by 100.

    Args:
        dataset: DataFrame holding the text column.
        column: column name, or the sentinel ``"Select"``.

    Returns:
        A ``go.Figure``, or None when *column* is ``"Select"``.
    """
    if column == "Select":
        return None

    cloud = WordCloud(stopwords=set(STOPWORDS),
                      max_words=200,
                      max_font_size=100)

    # The text starts with a single space and every token is followed by one.
    pieces = [' ']
    for entry in dataset[column]:
        pieces.extend(token.lower() + ' ' for token in str(entry).split())
    cloud.generate(''.join(pieces))

    terms = []
    scaled_freqs = []
    xs = []
    ys = []
    colours = []
    # Font size and orientation from the layout are not used by the plot.
    for (term, freq), _font_size, spot, _orientation, colour in cloud.layout_:
        terms.append(term)
        scaled_freqs.append(freq * 100)
        xs.append(spot[0])
        ys.append(spot[1])
        colours.append(colour)

    hover_labels = ['{0} {1:.2f} %'.format(w, f) for w, f in zip(terms, scaled_freqs)]
    hidden_axis = {'showgrid': False, 'showticklabels': False, 'zeroline': False}

    trace = go.Scatter(x=xs,
                       y=ys,
                       textfont=dict(size=scaled_freqs,
                                     color=colours),
                       hoverinfo='text',
                       hovertext=hover_labels,
                       mode='text',
                       text=terms)
    layout = go.Layout({'xaxis': dict(hidden_axis),
                        'yaxis': dict(hidden_axis)})
    return go.Figure(data=[trace], layout=layout)
############################################################################### | |
########### Returns a figure describing the target feature for NLP text classification ############
def plotly_target(dataset, column):
    """Return a horizontal histogram of ``dataset[column]``.

    Args:
        dataset: DataFrame holding the target column.
        column: column name, or the sentinel ``"Select"``.

    Returns:
        A plotly figure with a bar gap of 0.2, or None when *column* is
        ``"Select"``.
    """
    if column != "Select":
        chart = px.histogram(dataset, y=column)
        chart.update_layout(bargap=0.2)
        return chart
    return None
############################################################################################################ | |
############ Plotting n-gram for text feature in NLP Text Classification ########################### | |
def plot_ngram(dataset, input_col):
    """Return an nlplot bi-gram bar chart for the text column *input_col*.

    The text is lower-cased on a copy of *dataset*, so the caller's
    DataFrame is left unmodified. Missing values stay missing instead of
    raising during lower-casing.
    NOTE(review): confirm that nlplot discards missing rows itself.

    Args:
        dataset: DataFrame holding the text column.
        input_col: column name, or the sentinel ``'Select'``.

    Returns:
        The figure produced by ``NLPlot.bar_ngram``, or None when
        *input_col* is ``'Select'``.
    """
    if input_col == 'Select':
        return None

    train = dataset.copy()
    train[input_col] = train[input_col].str.lower()

    npt = nlplot.NLPlot(train, target_col=input_col)
    stopwords = npt.get_stopword(top_n=30, min_freq=0)
    return npt.bar_ngram(
        title='bi-gram',
        xaxis_label='word_count',
        yaxis_label='word',
        ngram=2,
        top_n=50,
        width=700,
        height=1100,
        stopwords=stopwords,
    )
################################################################################################# |