# hackathon / data_storytelling.py
# Ashar086's picture
# Create data_storytelling.py
# 260a971 verified
# raw
# history blame
# 4.12 kB
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud


class DataStoryteller:
    """Turn a pandas DataFrame into a plain-text narrative and summary plots.

    The text helpers only look at numeric columns (``np.number`` dtypes);
    every other column is ignored. The class keeps no state between calls.
    """

    # Absolute correlation above which a column pair is reported.
    _STRONG_CORRELATION = 0.5
    # Absolute skewness below which a column is called symmetric.
    _SYMMETRY_SKEW_LIMIT = 0.5
    # A fitted change smaller than this fraction of the column's value range
    # is floating-point noise rather than a trend.
    _TREND_TOLERANCE = 1e-9

    def __init__(self):
        """Create a storyteller; there is nothing to configure."""

    def generate_story(self, data):
        """Return the full text report for ``data``.

        Args:
            data: DataFrame to describe.

        Returns:
            A string starting with ``"Data Story:"`` followed by the basic
            statistics, correlation, trend and distribution sections, in
            that order.
        """
        sections = (
            self._generate_basic_stats(data),
            self._generate_correlation_analysis(data),
            self._generate_trend_analysis(data),
            self._generate_distribution_analysis(data),
        )
        return "Data Story:\n\n" + "".join(sections)

    def _generate_basic_stats(self, data):
        """Return mean, median, min and max for every numeric column.

        Columns are selected by dtype instead of via ``describe()`` so that
        a frame without numeric columns yields just the section header
        rather than failing on a missing ``mean`` row.
        """
        numeric = data.select_dtypes(include=[np.number])
        parts = ["Basic Statistics:\n"]
        # items() yields (label, Series) pairs, so duplicate labels are fine.
        for column, values in numeric.items():
            parts.append(
                f"\n{column}:\n"
                f" Mean: {float(values.mean()):.2f}\n"
                f" Median: {float(values.median()):.2f}\n"
                f" Min: {float(values.min()):.2f}\n"
                f" Max: {float(values.max()):.2f}\n"
            )
        return "".join(parts)

    def _generate_correlation_analysis(self, data):
        """Return one line per numeric column pair with |correlation| > 0.5.

        Pairs whose correlation is NaN (for example a constant column) are
        not reported because the comparison with the threshold is false.
        """
        numeric = data.select_dtypes(include=[np.number])
        parts = ["\nCorrelation Analysis:\n"]
        # A correlation needs at least two columns to compare.
        if numeric.shape[1] >= 2:
            corr_matrix = numeric.corr()
            labels = corr_matrix.columns
            # Positional lookup keeps this unambiguous with duplicate labels.
            for i, j in combinations(range(len(labels)), 2):
                corr = corr_matrix.iat[i, j]
                if abs(corr) > self._STRONG_CORRELATION:
                    parts.append(
                        f" Strong correlation between {labels[i]} and "
                        f"{labels[j]}: {corr:.2f}\n"
                    )
        return "".join(parts)

    def _generate_trend_analysis(self, data):
        """Return the direction of a straight-line fit for each numeric column.

        The fit is made against row position. Missing values are left out of
        the fit but keep their row position, so gaps do not distort the
        slope. Columns with fewer than two usable values get an explicit
        "too few data points" line instead of a fit.
        """
        numeric = data.select_dtypes(include=[np.number])
        parts = ["\nTrend Analysis:\n"]
        for column, values in numeric.items():
            # Row positions of the non-missing observations.
            x = np.flatnonzero(values.notna().to_numpy())
            if x.size < 2:
                parts.append(
                    f" {column} has too few data points for trend analysis.\n"
                )
                continue
            y = values.dropna().to_numpy(dtype=float)
            slope = np.polyfit(x, y, 1)[0]
            spread = y.max() - y.min()
            # Change predicted by the fit from the first to the last point.
            fitted_change = slope * (x[-1] - x[0])
            is_flat = (
                not np.isfinite(slope)
                or spread == 0
                or abs(fitted_change) <= self._TREND_TOLERANCE * spread
            )
            if is_flat:
                parts.append(f" {column} shows no significant trend.\n")
            elif slope > 0:
                parts.append(f" {column} shows an increasing trend.\n")
            else:
                parts.append(f" {column} shows a decreasing trend.\n")
        return "".join(parts)

    def _generate_distribution_analysis(self, data):
        """Classify each numeric column as symmetric, right- or left-skewed.

        A NaN skewness (returned when a column has too few observations) is
        reported as such instead of falling through to "left-skewed".
        """
        numeric = data.select_dtypes(include=[np.number])
        parts = ["\nDistribution Analysis:\n"]
        for column, values in numeric.items():
            skewness = values.skew()
            if pd.isna(skewness):
                parts.append(
                    f" {column} has too few data points for distribution"
                    " analysis.\n"
                )
            elif abs(skewness) < self._SYMMETRY_SKEW_LIMIT:
                parts.append(
                    f" {column} is approximately symmetrically distributed.\n"
                )
            elif skewness > 0:
                parts.append(f" {column} is right-skewed.\n")
            else:
                parts.append(f" {column} is left-skewed.\n")
        return "".join(parts)

    def generate_word_cloud(self, data, text_column):
        """Draw a word cloud from one column on a new matplotlib figure.

        Args:
            data: DataFrame holding the text.
            text_column: Label of the column whose values are joined with
                spaces and passed to ``WordCloud.generate``.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current.
        """
        # Drop missing values first; astype(str) would otherwise turn them
        # into the literal word "nan" in the cloud.
        text = " ".join(data[text_column].dropna().astype(str))
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud')
        return plt

    def generate_summary_dashboard(self, data):
        """Build a 2x2 figure summarising the numeric columns of ``data``.

        Panels: histogram and box plot of the first numeric column, scatter
        plot of the first two numeric columns, and a correlation heatmap of
        all numeric columns. A panel that cannot be drawn because there are
        too few numeric columns shows a placeholder message instead.

        Args:
            data: DataFrame to plot.

        Returns:
            The matplotlib figure holding the four panels.
        """
        # Select the numeric columns once instead of once per panel.
        numeric = data.select_dtypes(include=[np.number])
        columns = list(numeric.columns)
        fig, axs = plt.subplots(2, 2, figsize=(20, 15))
        hist_ax, scatter_ax = axs[0, 0], axs[0, 1]
        box_ax, heatmap_ax = axs[1, 0], axs[1, 1]

        # Histogram
        if columns:
            sns.histplot(data=data, x=columns[0], ax=hist_ax)
            # f-string so non-string column labels do not break the title.
            hist_ax.set_title(f'Distribution of {columns[0]}')
        else:
            self._mark_unavailable(hist_ax, 'No numeric columns')

        # Scatter plot
        if len(columns) >= 2:
            sns.scatterplot(data=data, x=columns[0], y=columns[1], ax=scatter_ax)
        else:
            self._mark_unavailable(
                scatter_ax, 'Needs at least two numeric columns'
            )
        scatter_ax.set_title('Scatter Plot')

        # Box plot
        if columns:
            sns.boxplot(data=data, y=columns[0], ax=box_ax)
        else:
            self._mark_unavailable(box_ax, 'No numeric columns')
        box_ax.set_title('Box Plot')

        # Correlation heatmap
        if columns:
            sns.heatmap(
                numeric.corr(), annot=True, cmap='coolwarm', ax=heatmap_ax
            )
        else:
            self._mark_unavailable(heatmap_ax, 'No numeric columns')
        heatmap_ax.set_title('Correlation Heatmap')

        plt.tight_layout()
        return fig

    @staticmethod
    def _mark_unavailable(ax, message):
        """Replace an axes' content with a centred placeholder message."""
        ax.text(0.5, 0.5, message, ha='center', va='center',
                transform=ax.transAxes)
        ax.set_axis_off()