Spaces:
Sleeping
Sleeping
Update tweet_analyzer.py
Browse files- tweet_analyzer.py +16 -41
tweet_analyzer.py
CHANGED
@@ -8,7 +8,6 @@ from datetime import datetime
|
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
import random
|
11 |
-
from transformers import GPT2Tokenizer
|
12 |
|
13 |
class TweetDatasetProcessor:
|
14 |
def __init__(self):
|
@@ -16,7 +15,6 @@ class TweetDatasetProcessor:
|
|
16 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
17 |
self.tweets = []
|
18 |
self.personality_profile = {}
|
19 |
-
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2') # Initialize tokenizer
|
20 |
|
21 |
def extract_text_from_pdf(self, pdf_path):
|
22 |
"""Extract text content from PDF file."""
|
@@ -74,46 +72,24 @@ class TweetDatasetProcessor:
|
|
74 |
"""Extract hashtags from tweet."""
|
75 |
return [word for word in text.split() if word.startswith('#')]
|
76 |
|
77 |
-
def truncate_to_token_limit(self, tweets, max_tokens=6000):
|
78 |
-
"""Truncate tweets to fit within token limit."""
|
79 |
-
total_tokens = 0
|
80 |
-
truncated_tweets = []
|
81 |
-
for tweet in tweets:
|
82 |
-
tokens = self.tokenizer.encode(tweet)
|
83 |
-
if total_tokens + len(tokens) > max_tokens:
|
84 |
-
break
|
85 |
-
total_tokens += len(tokens)
|
86 |
-
truncated_tweets.append(tweet)
|
87 |
-
return truncated_tweets
|
88 |
-
|
89 |
def analyze_personality(self):
|
90 |
"""Comprehensive personality analysis."""
|
91 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
92 |
-
|
93 |
-
# Truncate tweets to avoid exceeding token limit
|
94 |
-
truncated_tweets = self.truncate_to_token_limit(all_tweets, max_tokens=6000)
|
95 |
-
|
96 |
-
# Create analysis prompt with truncated tweets
|
97 |
analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets. Analyze:
|
98 |
Core beliefs, emotional tendencies, cognitive patterns, etc.
|
99 |
Tweets for analysis:
|
100 |
-
{json.dumps(
|
101 |
"""
|
102 |
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
self.personality_profile = response.choices[0].message.content
|
113 |
-
except Exception as e:
|
114 |
-
print(f"Error processing personality analysis: {e}")
|
115 |
-
self.personality_profile = {}
|
116 |
-
|
117 |
return self.personality_profile
|
118 |
|
119 |
def analyze_topics(self, n_topics=5):
|
@@ -156,12 +132,9 @@ class TweetDatasetProcessor:
|
|
156 |
{context}
|
157 |
Additionally, consider these contexts to increase diversity:
|
158 |
{', '.join(selected_contexts)}
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
3. Uses a natural communication style and vocabulary.
|
163 |
-
4. Includes relevant mentions or hashtags if applicable.
|
164 |
-
The tweet should feel diverse and authentic, touching on a variety of topics."""
|
165 |
|
166 |
try:
|
167 |
response = self.groq_client.chat.completions.create(
|
@@ -173,7 +146,9 @@ class TweetDatasetProcessor:
|
|
173 |
temperature=1.0, # Increased temperature for more diversity
|
174 |
max_tokens=150,
|
175 |
)
|
176 |
-
|
|
|
|
|
177 |
except Exception as e:
|
178 |
print(f"Error generating tweet: {e}")
|
179 |
return "Error generating tweet"
|
|
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
import random
|
|
|
11 |
|
12 |
class TweetDatasetProcessor:
|
13 |
def __init__(self):
|
|
|
15 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
16 |
self.tweets = []
|
17 |
self.personality_profile = {}
|
|
|
18 |
|
19 |
def extract_text_from_pdf(self, pdf_path):
|
20 |
"""Extract text content from PDF file."""
|
|
|
72 |
"""Extract hashtags from tweet."""
|
73 |
return [word for word in text.split() if word.startswith('#')]
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
def analyze_personality(self, max_tweets=30):
    """Comprehensive personality analysis.

    Sends the content of the first ``max_tweets`` entries of
    ``self.tweets`` to the Groq chat-completions endpoint and stores the
    model's reply in ``self.personality_profile``.

    Args:
        max_tweets: Maximum number of tweets embedded in the prompt.
            Defaults to 30, the limit that was previously hard-coded.
            Pass ``None`` to include every tweet.

    Returns:
        The message content of the first completion choice, which is also
        assigned to ``self.personality_profile``.

    Raises:
        KeyError: If an entry of ``self.tweets`` has no ``'content'`` key.
        Exception: Anything raised by the Groq client call propagates to
            the caller; this method does not catch it.
    """
    all_tweets = [tweet['content'] for tweet in self.tweets]

    # ensure_ascii=False keeps emoji and accented characters as-is instead
    # of expanding them to \uXXXX escapes, so the model reads the tweets
    # as written and the prompt is not inflated by escape sequences.
    tweets_json = json.dumps(
        all_tweets[:max_tweets], indent=2, ensure_ascii=False
    )

    analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets. Analyze:
Core beliefs, emotional tendencies, cognitive patterns, etc.
Tweets for analysis:
{tweets_json}
"""

    response = self.groq_client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are an expert psychologist."},
            {"role": "user", "content": analysis_prompt},
        ],
        model="llama-3.1-70b-versatile",
        # Low temperature: the analysis should be stable across runs.
        temperature=0.1,
    )
    self.personality_profile = response.choices[0].message.content
    return self.personality_profile
|
94 |
|
95 |
def analyze_topics(self, n_topics=5):
|
|
|
132 |
{context}
|
133 |
Additionally, consider these contexts to increase diversity:
|
134 |
{', '.join(selected_contexts)}
|
135 |
+
|
136 |
+
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
137 |
+
"""
|
|
|
|
|
|
|
138 |
|
139 |
try:
|
140 |
response = self.groq_client.chat.completions.create(
|
|
|
146 |
temperature=1.0, # Increased temperature for more diversity
|
147 |
max_tokens=150,
|
148 |
)
|
149 |
+
tweet = response.choices[0].message.content
|
150 |
+
# Ensure the response only contains the tweet text, and nothing else.
|
151 |
+
return tweet.strip().split("\n")[0] # Only return the first line (tweet)
|
152 |
except Exception as e:
|
153 |
print(f"Error generating tweet: {e}")
|
154 |
return "Error generating tweet"
|