Manasa1 committed (verified)
Commit 074cf17 · 1 Parent(s): 5c256b4

Update tweet_analyzer.py

Files changed (1)
  1. tweet_analyzer.py +72 -68
tweet_analyzer.py CHANGED

@@ -1,4 +1,4 @@
- import os
+ import os
  from PyPDF2 import PdfReader
  import pandas as pd
  from dotenv import load_dotenv

@@ -10,6 +10,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  from sklearn.cluster import KMeans
  import random
+ from joblib import Parallel, delayed

  class TweetDatasetProcessor:
      def __init__(self):

@@ -18,6 +19,7 @@ class TweetDatasetProcessor:
          self.tweets = []
          self.personality_profile = {}
          self.vectorizer = TfidfVectorizer(stop_words='english')
+         self.used_tweets = set()  # Track used tweets to avoid repetition

      def extract_text_from_pdf(self, pdf_path):
          """Extract text content from PDF file."""

@@ -29,40 +31,33 @@ class TweetDatasetProcessor:

      def process_pdf_content(self, text):
          """Process PDF content and clean extracted tweets."""
+         if not text.strip():
+             raise ValueError("The uploaded PDF appears to be empty.")
+
          lines = text.split('\n')
-         clean_tweets = []
-         buffer = ""
-
-         for line in lines:
-             line = line.strip()
-             if not line:
-                 if buffer:  # End of a tweet
-                     clean_tweets.append(buffer.strip())
-                     buffer = ""
-             elif line.startswith('http'):  # Skip URLs
-                 continue
-             else:
-                 buffer += " " + line  # Append lines to form complete tweets
-
-         if buffer:  # Add the last tweet
-             clean_tweets.append(buffer.strip())
-
-         # Build the tweet list with metadata
-         self.tweets = [
-             {
-                 'content': tweet,
-                 'timestamp': datetime.now(),  # Assign dummy timestamp
-                 'mentions': self._extract_mentions(tweet),
-                 'hashtags': self._extract_hashtags(tweet)
-             }
-             for tweet in clean_tweets
-         ]
+         clean_tweets = Parallel(n_jobs=-1)(delayed(self._process_line)(line) for line in lines)
+         self.tweets = [tweet for tweet in clean_tweets if tweet]
+
+         if not self.tweets:
+             raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")

          # Save the processed tweets to a CSV
          df = pd.DataFrame(self.tweets)
          df.to_csv('processed_tweets.csv', index=False)
          return df

+     def _process_line(self, line):
+         """Process a single line in parallel."""
+         line = line.strip()
+         if not line or line.startswith('http'):  # Skip empty lines and URLs
+             return None
+         return {
+             'content': line,
+             'timestamp': datetime.now(),
+             'mentions': self._extract_mentions(line),
+             'hashtags': self._extract_hashtags(line)
+         }
+
      def _extract_mentions(self, text):
          """Extract mentioned users from tweet."""
          return [word for word in text.split() if word.startswith('@')]

@@ -74,6 +69,9 @@ class TweetDatasetProcessor:
      def categorize_tweets(self):
          """Cluster tweets into categories using KMeans."""
          all_tweets = [tweet['content'] for tweet in self.tweets]
+         if not all_tweets:
+             raise ValueError("No tweets available for clustering.")
+
          tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
          kmeans = KMeans(n_clusters=5, random_state=1)
          kmeans.fit(tfidf_matrix)

@@ -82,30 +80,38 @@ class TweetDatasetProcessor:
              tweet['category'] = f"Category {kmeans.labels_[i]}"
          return pd.DataFrame(self.tweets)

-     def analyze_personality(self):
-         """Comprehensive personality analysis using all tweets."""
-         all_tweets = [tweet['content'] for tweet in self.tweets]
-         # Use a broader dataset for a comprehensive profile
+     def analyze_personality(self, max_tweets=50):
+         """Comprehensive personality analysis using a limited subset of tweets."""
+         if not self.tweets:
+             raise ValueError("No tweets available for personality analysis.")
+
+         all_tweets = [tweet['content'] for tweet in self.tweets][:max_tweets]
          analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
          Core beliefs, emotional tendencies, cognitive patterns, etc.
          Tweets for analysis:
          {json.dumps(all_tweets, indent=2)}
          """
-         response = self.groq_client.chat.completions.create(
-             messages=[
-                 {"role": "system", "content": "You are an expert psychologist."},
-                 {"role": "user", "content": analysis_prompt},
-             ],
-             model="llama-3.1-70b-versatile",
-             temperature=0.1,
-         )
-         self.personality_profile = response.choices[0].message.content
-         return self.personality_profile
+         try:
+             response = self.groq_client.chat.completions.create(
+                 messages=[
+                     {"role": "system", "content": "You are an expert psychologist."},
+                     {"role": "user", "content": analysis_prompt},
+                 ],
+                 model="llama-3.1-70b-versatile",
+                 temperature=0.1,
+             )
+             self.personality_profile = response.choices[0].message.content
+             return self.personality_profile
+         except Exception as e:
+             return f"Error during personality analysis: {str(e)}"

-     def analyze_topics(self, n_topics=5):
+     def analyze_topics(self, n_topics=None):
          """Extract and identify different topics the author has tweeted about."""
          all_tweets = [tweet['content'] for tweet in self.tweets]
+         if not all_tweets:
+             return []
+
+         n_topics = n_topics or min(5, len(all_tweets) // 10)
          tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
          nmf_model = NMF(n_components=n_topics, random_state=1)
          nmf_model.fit(tfidf_matrix)

@@ -120,34 +126,33 @@ class TweetDatasetProcessor:
          """Estimate the number of tokens in the given text."""
          return len(text.split())

-     def generate_tweet(self, context=""):
-         """Generate a new tweet based on personality profile and optional context."""
-         historical_topics = self.analyze_topics(n_topics=5)
-         additional_contexts = historical_topics + [
-             "Comment on a recent technological advancement.",
-             "Share a motivational thought.",
-             "Discuss a current trending topic.",
-             "Reflect on a past experience.",
-             "Provide advice to followers."
-         ]
-         selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
-
-         # Select tweets close to the author's style
-         tfidf_matrix = self.vectorizer.transform([tweet['content'] for tweet in self.tweets])
-         similarity = cosine_similarity(tfidf_matrix)
-         tweet_sample_indices = similarity.sum(axis=1).argsort()[-5:]  # Top 5 similar tweets
-         all_tweets = [self.tweets[i]['content'] for i in tweet_sample_indices]
-
-         personality_profile_excerpt = self.personality_profile[:400]
+     def generate_tweet(self, context="", sample_size=3):
+         """Generate a new tweet by sampling random tweets and avoiding repetition."""
+         if not self.tweets:
+             return "Error: No tweets available for generation."
+
+         # Randomly sample unique tweets
+         available_tweets = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
+         if len(available_tweets) < sample_size:
+             self.used_tweets.clear()  # Reset used tweets if all have been used
+             available_tweets = self.tweets
+
+         sampled_tweets = random.sample(available_tweets, sample_size)
+         sampled_contents = [tweet['content'] for tweet in sampled_tweets]
+
+         # Update the used tweets tracker
+         self.used_tweets.update(sampled_contents)
+
+         # Truncate personality profile to avoid token overflow
+         personality_profile_excerpt = self.personality_profile[:400] if len(self.personality_profile) > 400 else self.personality_profile

+         # Construct the prompt
          prompt = f"""Based on this personality profile:
          {personality_profile_excerpt}
          Current context or topic (if any):
          {context}
-         Additionally, consider these contexts to increase diversity:
-         {', '.join(selected_contexts)}
          Tweets for context:
-         {', '.join(all_tweets)}
+         {', '.join(sampled_contents)}
          **Only generate the tweet. Do not include analysis, explanation, or any other content.**
          """
          try:

@@ -163,5 +168,4 @@ class TweetDatasetProcessor:
              tweet = response.choices[0].message.content.strip()
              return tweet
          except Exception as e:
-             print(f"Error generating tweet: {e}")
-             return "Error generating tweet"
+             return f"Error generating tweet: {str(e)}"
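For reference, a minimal usage sketch of the updated class. The Groq client setup and dotenv loading happen in lines elided from this diff, and the PDF path below is purely illustrative; treat all of those as assumptions.

    # Hypothetical usage sketch -- assumes the elided __init__ code configures
    # self.groq_client and that a tweets PDF exists at the given path.
    processor = TweetDatasetProcessor()
    raw_text = processor.extract_text_from_pdf("tweets.pdf")      # illustrative path
    processor.process_pdf_content(raw_text)                       # fills self.tweets, writes processed_tweets.csv
    processor.analyze_personality(max_tweets=50)                  # caches the profile on the instance
    print(processor.generate_tweet(context="AI", sample_size=3))  # samples 3 not-yet-used tweets per call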