Ashar086 committed on
Commit
260a971
·
verified ·
1 Parent(s): 6f0b83f

Create data_storytelling.py

Browse files
Files changed (1) hide show
  1. data_storytelling.py +107 -0
data_storytelling.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
5
+
6
class DataStoryteller:
    """Produce plain-text narratives and summary plots for a pandas DataFrame.

    The text helpers only look at numeric columns (as selected by
    ``select_dtypes(include=[np.number])``); other columns are ignored.
    """

    def __init__(self):
        """Create a storyteller; the class keeps no state."""
        pass

    @staticmethod
    def _numeric_columns(data):
        """Return the sub-frame of *data* holding only numeric columns."""
        return data.select_dtypes(include=[np.number])

    def generate_story(self, data):
        """Return the full text story for *data*.

        The story is the concatenation of the basic-statistics, correlation,
        trend and distribution sections, in that order.
        """
        sections = [
            "Data Story:\n\n",
            self._generate_basic_stats(data),
            self._generate_correlation_analysis(data),
            self._generate_trend_analysis(data),
            self._generate_distribution_analysis(data),
        ]
        return "".join(sections)

    def _generate_basic_stats(self, data):
        """Return mean / median / min / max for every numeric column.

        Statistics are computed per column instead of through
        ``DataFrame.describe()``: ``describe()`` falls back to
        count/unique/top/freq when no numeric column exists, which made the
        ``'mean'`` lookup raise ``KeyError``.
        """
        lines = ["Basic Statistics:\n"]
        for column, series in self._numeric_columns(data).items():
            lines.append(f"\n{column}:\n")
            lines.append(f" Mean: {series.mean():.2f}\n")
            lines.append(f" Median: {series.median():.2f}\n")
            lines.append(f" Min: {series.min():.2f}\n")
            lines.append(f" Max: {series.max():.2f}\n")
        return "".join(lines)

    def _generate_correlation_analysis(self, data):
        """Return one line per numeric column pair with ``|corr| > 0.5``."""
        lines = ["\nCorrelation Analysis:\n"]
        numeric_data = self._numeric_columns(data)
        columns = list(numeric_data.columns)
        if len(columns) < 2:
            # No pair to compare; also avoids calling corr() on an empty frame.
            return "".join(lines)

        corr_matrix = numeric_data.corr()
        for i, col1 in enumerate(columns):
            for col2 in columns[i + 1:]:
                corr = corr_matrix.loc[col1, col2]
                # NaN correlations (e.g. constant columns) compare False here.
                if abs(corr) > 0.5:
                    lines.append(
                        f" Strong correlation between {col1} and {col2}: {corr:.2f}\n"
                    )
        return "".join(lines)

    def _generate_trend_analysis(self, data):
        """Return the sign of a least-squares line fitted to each numeric column.

        The x-axis is the row position. Missing / non-finite values are
        excluded from the fit (``np.polyfit`` cannot handle them), and a
        column with fewer than two usable points is reported as having
        insufficient data instead of crashing.
        """
        lines = ["\nTrend Analysis:\n"]
        positions = np.arange(len(data))
        for column, series in self._numeric_columns(data).items():
            values = series.to_numpy(dtype=float, na_value=np.nan)
            usable = np.isfinite(values)
            if usable.sum() < 2:
                lines.append(f" {column} has insufficient data for trend analysis.\n")
                continue

            slope = np.polyfit(positions[usable], values[usable], 1)[0]
            if slope > 0:
                lines.append(f" {column} shows an increasing trend.\n")
            elif slope < 0:
                lines.append(f" {column} shows a decreasing trend.\n")
            else:
                lines.append(f" {column} shows no significant trend.\n")
        return "".join(lines)

    def _generate_distribution_analysis(self, data):
        """Classify each numeric column as symmetric, right- or left-skewed.

        ``Series.skew()`` returns NaN when it cannot be computed (fewer than
        three values); that case is reported explicitly rather than falling
        through to the "left-skewed" branch.
        """
        lines = ["\nDistribution Analysis:\n"]
        for column, series in self._numeric_columns(data).items():
            skewness = series.skew()
            if pd.isna(skewness):
                lines.append(
                    f" {column} has insufficient data for distribution analysis.\n"
                )
            elif abs(skewness) < 0.5:
                lines.append(f" {column} is approximately symmetrically distributed.\n")
            elif skewness > 0:
                lines.append(f" {column} is right-skewed.\n")
            else:
                lines.append(f" {column} is left-skewed.\n")
        return "".join(lines)

    def generate_word_cloud(self, data, text_column):
        """Draw a word cloud of ``data[text_column]`` on a new figure.

        Missing values are dropped first so they do not show up as the
        literal word "nan". Returns the ``matplotlib.pyplot`` module (as the
        original implementation did) with the new figure current.
        """
        text = " ".join(data[text_column].dropna().astype(str))
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud')

        return plt

    def generate_summary_dashboard(self, data):
        """Return a 2x2 figure: histogram, scatter plot, box plot, heatmap.

        The histogram and box plot use the first numeric column; the scatter
        plot uses the first two. With a single numeric column the scatter
        panel shows a placeholder message instead.

        Raises:
            ValueError: if *data* has no numeric column.
        """
        numeric_data = self._numeric_columns(data)
        numeric_cols = numeric_data.columns
        # Validate before creating the figure so a failure leaves nothing open.
        if len(numeric_cols) == 0:
            raise ValueError(
                "generate_summary_dashboard requires at least one numeric column"
            )
        first = numeric_cols[0]

        fig, axs = plt.subplots(2, 2, figsize=(20, 15))

        # Histogram
        sns.histplot(data=data, x=first, ax=axs[0, 0])
        # f-string rather than '+' so non-string column labels also work.
        axs[0, 0].set_title(f'Distribution of {first}')

        # Scatter plot
        if len(numeric_cols) >= 2:
            sns.scatterplot(data=data, x=first, y=numeric_cols[1], ax=axs[0, 1])
        else:
            axs[0, 1].text(
                0.5, 0.5, 'Scatter plot requires two numeric columns',
                ha='center', va='center', transform=axs[0, 1].transAxes,
            )
            axs[0, 1].set_axis_off()
        axs[0, 1].set_title('Scatter Plot')

        # Box plot
        sns.boxplot(data=data, y=first, ax=axs[1, 0])
        axs[1, 0].set_title('Box Plot')

        # Correlation heatmap
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', ax=axs[1, 1])
        axs[1, 1].set_title('Correlation Heatmap')

        plt.tight_layout()
        return fig