Manasa1 committed
Commit 4c87df2 · verified · 1 Parent(s): f85caf0

Update tweet_analyzer.py

Files changed (1)
  1. tweet_analyzer.py +49 -22
tweet_analyzer.py CHANGED
@@ -1,4 +1,4 @@
- import os
+ import os
  from PyPDF2 import PdfReader
  import pandas as pd
  from dotenv import load_dotenv
@@ -8,6 +8,7 @@ from datetime import datetime
  from sklearn.decomposition import NMF
  from sklearn.feature_extraction.text import TfidfVectorizer
  import random
+ from transformers import GPT2Tokenizer

  class TweetDatasetProcessor:
      def __init__(self):
@@ -15,6 +16,7 @@ class TweetDatasetProcessor:
          self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
          self.tweets = []
          self.personality_profile = {}
+         self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # Initialize tokenizer

      def extract_text_from_pdf(self, pdf_path):
          """Extract text content from PDF file."""
@@ -72,24 +74,46 @@ class TweetDatasetProcessor:
          """Extract hashtags from tweet."""
          return [word for word in text.split() if word.startswith('#')]

+     def truncate_to_token_limit(self, tweets, max_tokens=6000):
+         """Truncate tweets to fit within token limit."""
+         total_tokens = 0
+         truncated_tweets = []
+         for tweet in tweets:
+             tokens = self.tokenizer.encode(tweet)
+             if total_tokens + len(tokens) > max_tokens:
+                 break
+             total_tokens += len(tokens)
+             truncated_tweets.append(tweet)
+         return truncated_tweets
+
      def analyze_personality(self):
          """Comprehensive personality analysis."""
          all_tweets = [tweet['content'] for tweet in self.tweets]
+
+         # Truncate tweets to avoid exceeding token limit
+         truncated_tweets = self.truncate_to_token_limit(all_tweets, max_tokens=6000)
+
+         # Create analysis prompt with truncated tweets
          analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets. Analyze:
          Core beliefs, emotional tendencies, cognitive patterns, etc.
          Tweets for analysis:
-         {json.dumps(all_tweets[:30], indent=2)}
+         {json.dumps(truncated_tweets, indent=2)}
          """

-         response = self.groq_client.chat.completions.create(
-             messages=[
-                 {"role": "system", "content": "You are an expert psychologist."},
-                 {"role": "user", "content": analysis_prompt},
-             ],
-             model="llama-3.1-70b-versatile",
-             temperature=0.1,
-         )
-         self.personality_profile = response.choices[0].message.content
+         try:
+             response = self.groq_client.chat.completions.create(
+                 messages=[
+                     {"role": "system", "content": "You are an expert psychologist."},
+                     {"role": "user", "content": analysis_prompt},
+                 ],
+                 model="llama-3.1-70b-versatile",
+                 temperature=0.1,
+             )
+             self.personality_profile = response.choices[0].message.content
+         except Exception as e:
+             print(f"Error processing personality analysis: {e}")
+             self.personality_profile = {}
+
          return self.personality_profile

      def analyze_topics(self, n_topics=5):
@@ -139,14 +163,17 @@ class TweetDatasetProcessor:
          4. Includes relevant mentions or hashtags if applicable.
          The tweet should feel diverse and authentic, touching on a variety of topics."""

-         response = self.groq_client.chat.completions.create(
-             messages=[
-                 {"role": "system", "content": "You are an expert in replicating writing and thinking patterns."},
-                 {"role": "user", "content": generation_prompt},
-             ],
-             model="llama-3.1-70b-versatile",
-             temperature=1.0,  # Increased temperature for more diversity
-             max_tokens=150,
-         )
-
-         return response.choices[0].message.content
+         try:
+             response = self.groq_client.chat.completions.create(
+                 messages=[
+                     {"role": "system", "content": "You are an expert in replicating writing and thinking patterns."},
+                     {"role": "user", "content": generation_prompt},
+                 ],
+                 model="llama-3.1-70b-versatile",
+                 temperature=1.0,  # Increased temperature for more diversity
+                 max_tokens=150,
+             )
+             return response.choices[0].message.content
+         except Exception as e:
+             print(f"Error generating tweet: {e}")
+             return "Error generating tweet"