Ashar086 committed on
Commit
568dff8
·
verified ·
1 Parent(s): b22d66b

Rename text_analyzer.py to data_analysis.py

Browse files
Files changed (2) hide show
  1. data_analysis.py +41 -0
  2. text_analyzer.py +0 -71
data_analysis.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+
5
class DataAnalyzer:
    """Produce a dictionary of descriptive insights for a pandas DataFrame."""

    # scipy.stats.normaltest is built on skewtest, which is only defined for
    # at least 8 observations; smaller samples are reported as "not tested".
    _MIN_NORMALTEST_SAMPLES = 8

    # Significance level above which a column is reported as normal.
    _ALPHA = 0.05

    def analyze(self, data):
        """Summarise *data* and return the findings as a plain dictionary.

        Args:
            data: The pandas DataFrame to analyse.

        Returns:
            A dict with these entries:

            * ``'basic_stats'`` -- ``data.describe()`` as a dict (``{}`` when
              the frame has no columns).
            * ``'correlations'`` -- correlation matrix of the numeric
              columns; only present when there are at least two of them.
            * ``'distribution'`` -- ``{'skewness': ..., 'kurtosis': ...}``
              for the numeric columns.
            * ``'<column>_distribution'`` -- value counts for every
              object-dtype column.
            * ``'normality_tests'`` -- per numeric column, a dict with
              ``'is_normal'`` (bool) and ``'p_value'`` (float). Both are
              ``None`` when the column has too few non-null values for the
              test to be run.
        """
        insights = {}

        # Basic statistics. describe() raises ValueError on a frame without
        # columns, so report an empty summary instead of crashing.
        if data.shape[1] == 0:
            insights['basic_stats'] = {}
        else:
            insights['basic_stats'] = data.describe().to_dict()

        # Correlation analysis (needs at least two numeric columns).
        numeric_columns = data.select_dtypes(include=[np.number]).columns
        if len(numeric_columns) > 1:
            correlation_matrix = data[numeric_columns].corr()
            insights['correlations'] = correlation_matrix.to_dict()

        # Skewness and kurtosis
        numeric_data = data[numeric_columns]
        insights['distribution'] = {
            'skewness': numeric_data.skew().to_dict(),
            'kurtosis': numeric_data.kurtosis().to_dict(),
        }

        # Categorical data analysis
        categorical_columns = data.select_dtypes(include=['object']).columns
        for column in categorical_columns:
            insights[f'{column}_distribution'] = data[column].value_counts().to_dict()

        # Check for normality
        normality_tests = {}
        for column in numeric_columns:
            sample = data[column].dropna()
            if len(sample) < self._MIN_NORMALTEST_SAMPLES:
                # Too few observations for the test to be meaningful.
                normality_tests[column] = {'is_normal': None, 'p_value': None}
                continue

            _, p_value = stats.normaltest(sample)
            # Convert NumPy scalars to built-in types so the result can be
            # serialised (e.g. with json.dumps) and compared with `is`.
            p_value = float(p_value)
            normality_tests[column] = {
                'is_normal': bool(p_value > self._ALPHA),
                'p_value': p_value,
            }
        insights['normality_tests'] = normality_tests

        return insights
text_analyzer.py DELETED
@@ -1,71 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import nltk
4
- from nltk.corpus import stopwords
5
- from nltk.tokenize import word_tokenize
6
- from nltk.sentiment import SentimentIntensityAnalyzer
7
- from collections import Counter
8
- from wordcloud import WordCloud
9
- import matplotlib.pyplot as plt
10
-
11
# Download, at import time, the NLTK resources used by the tokenizer,
# the stop-word filter and the sentiment analyzer imported above.
for _resource in ('punkt', 'stopwords', 'vader_lexicon'):
    nltk.download(_resource)
14
-
15
class TextAnalyzer:
    """Streamlit widgets for simple text analytics on a DataFrame column."""

    @staticmethod
    def _as_text(text_series):
        """Return *text_series* without missing values, with every item as str."""
        # Object columns may hold NaN or non-string values, which would break
        # str methods, the sentiment analyzer and ' '.join.
        return text_series.dropna().astype(str)

    def analyze_text(self, df):
        """Let the user choose a text column and an analysis, then run it.

        Args:
            df: DataFrame whose object-dtype columns are offered for analysis.
        """
        text_columns = df.select_dtypes(include=['object']).columns

        if len(text_columns) == 0:
            st.write("No text columns found in the dataset.")
            return

        text_column = st.selectbox("Select text column for analysis", text_columns)
        analysis_type = st.selectbox("Select analysis type", ["Word Frequency", "Sentiment Analysis", "Word Cloud"])

        if analysis_type == "Word Frequency":
            self.perform_word_frequency(df[text_column])
        elif analysis_type == "Sentiment Analysis":
            self.perform_sentiment_analysis(df[text_column])
        elif analysis_type == "Word Cloud":
            self.generate_word_cloud(df[text_column])

    def perform_word_frequency(self, text_series):
        """Show a bar chart of the most frequent non-stop-words.

        Args:
            text_series: Series of texts; missing values are ignored.
        """
        stop_words = set(stopwords.words('english'))
        word_freq = Counter()

        for text in self._as_text(text_series):
            tokens = word_tokenize(text.lower())
            word_freq.update(
                word for word in tokens
                if word.isalnum() and word not in stop_words
            )

        st.subheader("Word Frequency Analysis")
        n_words = st.slider("Select number of top words to display", min_value=5, max_value=50, value=20)

        top_words = word_freq.most_common(n_words)
        # The original code referenced `px`, which is never imported in this
        # module; draw the chart with matplotlib, which is already imported.
        fig, ax = plt.subplots()
        ax.bar([word for word, _ in top_words], [freq for _, freq in top_words])
        ax.set_title("Top Words")
        ax.tick_params(axis='x', rotation=90)
        st.pyplot(fig)
        plt.close(fig)  # pyplot keeps figures alive until they are closed

    def perform_sentiment_analysis(self, text_series):
        """Show the distribution and the mean of the VADER sentiment scores.

        Args:
            text_series: Series of texts; missing values are ignored.
        """
        sia = SentimentIntensityAnalyzer()
        sentiments = self._as_text(text_series).apply(sia.polarity_scores)

        st.subheader("Sentiment Analysis")
        sentiment_df = pd.DataFrame(sentiments.tolist())
        if sentiment_df.empty:
            # Without any scored text there is no 'compound' column to plot.
            st.write("No text available for sentiment analysis.")
            return

        # Rendered with matplotlib because `px` is not imported in this module.
        fig, ax = plt.subplots()
        ax.hist(sentiment_df['compound'])
        ax.set_title("Sentiment Distribution")
        ax.set_xlabel('compound')
        st.pyplot(fig)
        plt.close(fig)

        st.write("Average Sentiment Scores:")
        st.write(sentiment_df.mean())

    def generate_word_cloud(self, text_series):
        """Render a word cloud built from all texts in *text_series*.

        Args:
            text_series: Series of texts; missing values are ignored.
        """
        stop_words = set(stopwords.words('english'))
        text = ' '.join(self._as_text(text_series))

        st.subheader("Word Cloud")
        if not text.strip():
            # WordCloud.generate cannot build a cloud from empty input.
            st.write("No text available to build a word cloud.")
            return

        wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=stop_words).generate(text)

        fig, ax = plt.subplots()
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)
        plt.close(fig)