|
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
|
|
|
class DataStoryteller:
    """Turn a pandas DataFrame into a plain-text narrative and summary plots.

    The text methods only look at the numeric columns of the frame (as
    selected by ``select_dtypes(include=[np.number])``); other columns are
    ignored. The class keeps no per-instance state.
    """

    # Absolute correlation above which a column pair is reported.
    CORRELATION_THRESHOLD = 0.5
    # Absolute skewness below which a column is reported as symmetric.
    SKEW_THRESHOLD = 0.5

    def __init__(self):
        """Create a storyteller; there is nothing to configure."""

    @staticmethod
    def _numeric_frame(data):
        """Return a frame holding only the numeric columns of *data*."""
        return data.select_dtypes(include=[np.number])

    @staticmethod
    def _fit_slope(positions, values):
        """Return the least-squares slope of *values* over *positions*.

        Returns ``0.0`` when a line cannot be fitted meaningfully: fewer than
        two points, or all values identical (where a fitted slope would only
        be floating-point noise).
        """
        if len(values) < 2 or values.min() == values.max():
            return 0.0
        return np.polyfit(positions, values, 1)[0]

    @staticmethod
    def _show_placeholder(ax, title, message):
        """Blank out *ax* and show *message* centred under *title*."""
        ax.set_title(title)
        ax.text(0.5, 0.5, message, ha='center', va='center',
                transform=ax.transAxes)
        ax.set_axis_off()

    def generate_story(self, data):
        """Return the full text story for *data*.

        The story is the ``"Data Story:"`` header followed by the basic
        statistics, correlation, trend and distribution sections, in that
        order.
        """
        sections = (
            "Data Story:\n\n",
            self._generate_basic_stats(data),
            self._generate_correlation_analysis(data),
            self._generate_trend_analysis(data),
            self._generate_distribution_analysis(data),
        )
        return "".join(sections)

    def _generate_basic_stats(self, data):
        """Return mean, median, min and max for every numeric column.

        Only the section header is returned when *data* has no numeric
        columns.
        """
        lines = ["Basic Statistics:\n"]
        numeric = self._numeric_frame(data)
        # describe() refuses a frame without columns, and on a mixed frame it
        # may include non-numeric columns that cannot be formatted with :.2f.
        if numeric.shape[1] == 0:
            return "".join(lines)

        stats = numeric.describe()
        for column in stats.columns:
            lines.append(f"\n{column}:\n")
            lines.append(f" Mean: {stats[column]['mean']:.2f}\n")
            lines.append(f" Median: {numeric[column].median():.2f}\n")
            lines.append(f" Min: {stats[column]['min']:.2f}\n")
            lines.append(f" Max: {stats[column]['max']:.2f}\n")
        return "".join(lines)

    def _generate_correlation_analysis(self, data):
        """Return one line per numeric column pair with a strong correlation.

        A pair is reported when the absolute value of its correlation
        coefficient exceeds ``CORRELATION_THRESHOLD``. Pairs whose coefficient
        is NaN are never reported.
        """
        corr_matrix = self._numeric_frame(data).corr()

        lines = ["\nCorrelation Analysis:\n"]
        # combinations() yields each unordered pair once, in column order.
        for col1, col2 in combinations(corr_matrix.columns, 2):
            corr = corr_matrix.loc[col1, col2]
            if abs(corr) > self.CORRELATION_THRESHOLD:
                lines.append(
                    f" Strong correlation between {col1} and {col2}: {corr:.2f}\n"
                )
        return "".join(lines)

    def _generate_trend_analysis(self, data):
        """Return one line per numeric column describing its linear trend.

        The trend is the sign of the slope of a straight line fitted to the
        column's values against their row position. Missing and non-finite
        values are left out of the fit; a column with fewer than two usable
        values, or whose usable values are all equal, is reported as showing
        no significant trend.
        """
        numeric = self._numeric_frame(data)
        positions = np.arange(len(numeric))

        lines = ["\nTrend Analysis:\n"]
        for column in numeric.columns:
            values = numeric[column].to_numpy(dtype=float, na_value=np.nan)
            # Keep each remaining value paired with its original row position.
            usable = np.isfinite(values)
            trend = self._fit_slope(positions[usable], values[usable])
            if trend > 0:
                lines.append(f" {column} shows an increasing trend.\n")
            elif trend < 0:
                lines.append(f" {column} shows a decreasing trend.\n")
            else:
                lines.append(f" {column} shows no significant trend.\n")
        return "".join(lines)

    def _generate_distribution_analysis(self, data):
        """Return one line per numeric column describing its skewness.

        A column is called approximately symmetric when the absolute value of
        its skewness is below ``SKEW_THRESHOLD``, otherwise right- or
        left-skewed by sign. When the skewness is NaN the column is reported
        as having too few values instead of being mislabelled.
        """
        lines = ["\nDistribution Analysis:\n"]
        for column, series in self._numeric_frame(data).items():
            skewness = series.skew()
            if pd.isna(skewness):
                # NaN compares False with everything, so without this guard
                # the column would fall through to the left-skewed branch.
                lines.append(
                    f" {column} has too few values to assess skewness.\n"
                )
            elif abs(skewness) < self.SKEW_THRESHOLD:
                lines.append(
                    f" {column} is approximately symmetrically distributed.\n"
                )
            elif skewness > 0:
                lines.append(f" {column} is right-skewed.\n")
            else:
                lines.append(f" {column} is left-skewed.\n")
        return "".join(lines)

    def generate_word_cloud(self, data, text_column):
        """Draw a word cloud from the values of ``data[text_column]``.

        Missing values are dropped before the remaining values are converted
        to strings and joined with spaces, so they do not show up as the word
        "nan". The cloud is drawn on a new 10x5 matplotlib figure.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current.
        """
        text = " ".join(data[text_column].dropna().astype(str))
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate(text)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud')

        return plt

    def generate_summary_dashboard(self, data):
        """Return a 2x2 figure summarising the numeric columns of *data*.

        Panels: histogram of the first numeric column (top left), scatter plot
        of the first two numeric columns (top right), box plot of the first
        numeric column (bottom left) and a correlation heatmap of all numeric
        columns (bottom right). A panel that cannot be drawn because there are
        too few numeric columns shows a short message instead.
        """
        # Select the numeric columns once instead of once per plot call.
        numeric = self._numeric_frame(data)
        columns = numeric.columns

        fig, axs = plt.subplots(2, 2, figsize=(20, 15))

        if len(columns) >= 1:
            first = columns[0]
            sns.histplot(data=data, x=first, ax=axs[0, 0])
            axs[0, 0].set_title('Distribution of ' + first)

            sns.boxplot(data=data, y=first, ax=axs[1, 0])
            axs[1, 0].set_title('Box Plot')

            sns.heatmap(numeric.corr(), annot=True, cmap='coolwarm',
                        ax=axs[1, 1])
            axs[1, 1].set_title('Correlation Heatmap')
        else:
            message = 'No numeric columns available'
            self._show_placeholder(axs[0, 0], 'Distribution', message)
            self._show_placeholder(axs[1, 0], 'Box Plot', message)
            self._show_placeholder(axs[1, 1], 'Correlation Heatmap', message)

        if len(columns) >= 2:
            sns.scatterplot(data=data, x=columns[0], y=columns[1],
                            ax=axs[0, 1])
            axs[0, 1].set_title('Scatter Plot')
        else:
            self._show_placeholder(
                axs[0, 1], 'Scatter Plot',
                'At least two numeric columns are required',
            )

        plt.tight_layout()
        return fig