import pandas as pd | |
import numpy as np | |
from scipy import stats | |
class DataAnalyzer:
    """Derive a dictionary of descriptive insights from a pandas DataFrame."""

    # scipy.stats.normaltest builds on skewtest, which needs at least 8
    # observations; below that it raises (older SciPy) or returns NaN.
    _MIN_NORMALTEST_SAMPLES = 8
    # Significance level above which normality is not rejected.
    _NORMALITY_ALPHA = 0.05

    def analyze(self, data: pd.DataFrame) -> dict:
        """Summarise *data* and return the findings as a plain dictionary.

        The result contains:

        * ``'basic_stats'`` -- ``data.describe()`` as a dict (empty when the
          frame has no columns).
        * ``'correlations'`` -- pairwise correlation of the numeric columns;
          only present when there is more than one numeric column.
        * ``'distribution'`` -- ``{'skewness': ..., 'kurtosis': ...}`` for
          the numeric columns.
        * ``'<column>_distribution'`` -- value counts for every
          ``object``-dtype column.
        * ``'normality_tests'`` -- per numeric column a dict with
          ``'is_normal'`` and ``'p_value'`` from ``scipy.stats.normaltest``.
          Both are ``None`` when the test cannot be evaluated (fewer than
          8 non-null values, or a NaN p-value).

        Args:
            data: The DataFrame to analyse.

        Returns:
            A dictionary with the keys described above.
        """
        insights = {}

        # Basic statistics. describe() raises ValueError on a frame that has
        # no columns at all, so report an empty summary instead.
        insights['basic_stats'] = (
            data.describe().to_dict() if len(data.columns) else {}
        )

        # Correlation analysis only makes sense with at least two columns.
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        numeric_data = data[numeric_columns]
        if len(numeric_columns) > 1:
            insights['correlations'] = numeric_data.corr().to_dict()

        # Skewness and kurtosis
        insights['distribution'] = {
            'skewness': numeric_data.skew().to_dict(),
            'kurtosis': numeric_data.kurtosis().to_dict(),
        }

        # Categorical data analysis
        categorical_columns = data.select_dtypes(include=['object']).columns
        for column in categorical_columns:
            insights[f'{column}_distribution'] = (
                data[column].value_counts().to_dict()
            )

        # Check for normality
        insights['normality_tests'] = {
            column: self._normality_test(data[column])
            for column in numeric_columns
        }

        return insights

    def _normality_test(self, series):
        """Run D'Agostino-Pearson's normality test on one numeric column."""
        untestable = {'is_normal': None, 'p_value': None}

        sample = series.dropna()
        if len(sample) < self._MIN_NORMALTEST_SAMPLES:
            # Too few observations: skip the call rather than crash or
            # report a meaningless verdict.
            return untestable

        _, p_value = stats.normaltest(sample)
        # Convert to a built-in float so the result stays JSON-serialisable.
        p_value = float(p_value)
        if np.isnan(p_value):
            # A NaN p-value carries no verdict; do not report "not normal".
            return untestable

        return {
            'is_normal': p_value > self._NORMALITY_ALPHA,
            'p_value': p_value,
        }