Spaces:

Manasa1
/

Jack_Clone

Sleeping

App Files Files Community

Jack_Clone / tweet_analyzer.py

Manasa1

Update tweet_analyzer.py

60b3b65 verified 5 months ago

raw

history blame

7.51 kB

	import json
	import os
	import random
	import re
	from datetime import datetime

	import groq
	import pandas as pd
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	from sklearn.decomposition import NMF
	from sklearn.feature_extraction.text import TfidfVectorizer

	class TweetDatasetProcessor:
	    """Turn a PDF of tweets into a corpus and imitate its author through Groq.

	    Typical flow: ``extract_text_from_pdf`` -> ``process_pdf_content`` ->
	    ``analyze_personality`` -> ``generate_tweet``.

	    Attributes:
	        groq_client: Groq API client built from the ``Groq_api`` env variable.
	        tweets: List of dicts with keys ``content``, ``timestamp``,
	            ``mentions`` and ``hashtags`` (filled by ``process_pdf_content``).
	        personality_profile: Text returned by ``analyze_personality``; an
	            empty string until that method has been called.
	    """

	    # Chat model used for both analysis and generation.
	    # NOTE(review): confirm this model id is still served by Groq.
	    MODEL_NAME = "llama-3.1-70b-versatile"
	    # Prompt budget, measured with count_tokens() (whitespace-separated words).
	    MAX_PROMPT_TOKENS = 6000
	    # Leading characters of the personality profile included in the prompt.
	    PROFILE_EXCERPT_CHARS = 400

	    # The lookbehind rejects '@'/'#' glued to a preceding word character
	    # (e-mail addresses, "issue#12"); \w+ stops before trailing punctuation.
	    _MENTION_RE = re.compile(r"(?<!\w)@\w+")
	    _HASHTAG_RE = re.compile(r"(?<!\w)#\w+")

	    # Generic writing prompts mixed with the corpus topics for diversity.
	    _FALLBACK_CONTEXTS = (
	        "Comment on a recent technological advancement.",
	        "Share a motivational thought.",
	        "Discuss a current trending topic.",
	        "Reflect on a past experience.",
	        "Provide advice to followers.",
	    )

	    def __init__(self):
	        """Load environment variables and create the Groq client."""
	        load_dotenv()
	        self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
	        self.tweets = []
	        # A string (not a dict) because generate_tweet() slices it.
	        self.personality_profile = ""

	    def extract_text_from_pdf(self, pdf_path):
	        """Return the text of every page of the PDF, one page per line group.

	        Args:
	            pdf_path: Path (or file object) accepted by ``PdfReader``.

	        Returns:
	            The page texts joined with newlines. Pages for which the reader
	            yields no text contribute an empty string.
	        """
	        reader = PdfReader(pdf_path)
	        # Newline-join so the last word of a page is not fused with the
	        # first word of the next one; `or ""` guards against a None result.
	        return "\n".join(page.extract_text() or "" for page in reader.pages)

	    def process_pdf_content(self, text, output_csv='processed_tweets.csv'):
	        """Split raw text into tweets, store them and return a DataFrame.

	        Blank lines separate tweets; consecutive non-blank lines are joined
	        with single spaces. Lines starting with ``http`` are dropped.

	        Args:
	            text: Raw text, e.g. from ``extract_text_from_pdf``.
	            output_csv: Where to write the CSV copy; ``None`` skips writing.

	        Returns:
	            A DataFrame with columns ``content``, ``timestamp``,
	            ``mentions`` and ``hashtags``. ``self.tweets`` is replaced too.
	        """
	        clean_tweets = []
	        parts = []
	        for raw_line in text.split('\n'):
	            line = raw_line.strip()
	            if not line:
	                # A blank line closes the tweet collected so far.
	                if parts:
	                    clean_tweets.append(" ".join(parts))
	                    parts = []
	            elif line.startswith('http'):
	                continue
	            else:
	                parts.append(line)
	        if parts:
	            clean_tweets.append(" ".join(parts))

	        # The source carries no dates, so every tweet gets the processing time.
	        processed_at = datetime.now()
	        self.tweets = [
	            {
	                'content': tweet,
	                'timestamp': processed_at,
	                'mentions': self._extract_mentions(tweet),
	                'hashtags': self._extract_hashtags(tweet),
	            }
	            for tweet in clean_tweets
	        ]

	        # Explicit columns keep the schema stable even when no tweet was found.
	        df = pd.DataFrame(
	            self.tweets,
	            columns=['content', 'timestamp', 'mentions', 'hashtags'],
	        )
	        if output_csv is not None:
	            df.to_csv(output_csv, index=False)
	        return df

	    def _extract_timestamp(self, text):
	        """Placeholder for timestamp extraction; always returns ``None``."""
	        return None

	    def _extract_mentions(self, text):
	        """Return ``@name`` mentions in order, without trailing punctuation."""
	        return self._MENTION_RE.findall(text)

	    def _extract_hashtags(self, text):
	        """Return ``#tag`` hashtags in order, without trailing punctuation."""
	        return self._HASHTAG_RE.findall(text)

	    def analyze_personality(self, max_tweets=5):
	        """Ask the model for a psychological profile of the tweets' author.

	        Args:
	            max_tweets: How many leading tweets are included in the prompt.

	        Returns:
	            The model's answer, also stored in ``self.personality_profile``.
	        """
	        sample = [tweet['content'] for tweet in self.tweets[:max_tweets]]
	        # Built from explicit lines so that no source comment or indentation
	        # leaks into the text sent to the model.
	        analysis_prompt = "\n".join([
	            "Perform a deep psychological analysis of the author based on "
	            "these tweets. Analyze:",
	            "Core beliefs, emotional tendencies, cognitive patterns, etc.",
	            "Tweets for analysis:",
	            json.dumps(sample, indent=2),
	        ])

	        response = self.groq_client.chat.completions.create(
	            messages=[
	                {"role": "system", "content": "You are an expert psychologist."},
	                {"role": "user", "content": analysis_prompt},
	            ],
	            model=self.MODEL_NAME,
	            temperature=0.1,
	        )
	        # Keep the attribute a string even if the reply has no content.
	        self.personality_profile = response.choices[0].message.content or ""
	        return self.personality_profile

	    def analyze_topics(self, n_topics=3, n_top_words=None):
	        """Summarise the corpus as up to ``n_topics`` strings of key words.

	        Args:
	            n_topics: Number of NMF components to fit.
	            n_top_words: Words kept per topic; defaults to ``n_topics``.

	        Returns:
	            A de-duplicated list of space-joined top words, in component
	            order. Empty when there are no tweets.
	        """
	        all_tweets = [tweet['content'] for tweet in self.tweets]
	        if not all_tweets:
	            return []
	        if n_top_words is None:
	            n_top_words = n_topics

	        vectorizer = TfidfVectorizer(stop_words='english')
	        tfidf_matrix = vectorizer.fit_transform(all_tweets)
	        nmf_model = NMF(n_components=n_topics, random_state=1)
	        nmf_model.fit(tfidf_matrix)

	        # Fetched once; it is the same array for every topic.
	        feature_names = vectorizer.get_feature_names_out()
	        topics = []
	        for weights in nmf_model.components_:
	            top_indices = weights.argsort()[::-1][:n_top_words]
	            topics.append(" ".join(feature_names[i] for i in top_indices))

	        # dict.fromkeys removes duplicates while keeping a stable order.
	        return list(dict.fromkeys(topics))

	    def count_tokens(self, text):
	        """Approximate the token count as the number of whitespace-split words."""
	        return len(text.split())

	    def _build_tweet_prompt(self, profile_excerpt, context, contexts, tweets):
	        """Assemble the user prompt for tweet generation from its parts."""
	        return "\n".join([
	            "Based on this personality profile:",
	            profile_excerpt,
	            "Current context or topic (if any):",
	            str(context),
	            "Additionally, consider these contexts to increase diversity:",
	            ', '.join(contexts),
	            "",
	            "Tweets for context:",
	            ', '.join(tweets),
	            "",
	            "Only generate the tweet. Do not include analysis, explanation, "
	            "or any other content.",
	        ])

	    def generate_tweet(self, context=""):
	        """Generate one tweet in the author's voice.

	        Args:
	            context: Optional topic or situation the tweet should address.

	        Returns:
	            The first line of the model's reply, or the string
	            ``"Error generating tweet"`` if the request fails.
	        """
	        try:
	            historical_topics = self.analyze_topics(n_topics=3)
	        except ValueError:
	            # Topics are optional enrichment; scikit-learn rejects corpora
	            # whose vocabulary is empty (e.g. only stop words).
	            historical_topics = []
	        candidate_contexts = historical_topics + list(self._FALLBACK_CONTEXTS)

	        # Random picks keep successive generations from repeating themselves.
	        selected_contexts = random.sample(
	            candidate_contexts, min(3, len(candidate_contexts))
	        )
	        tweet_sample = random.sample(self.tweets, min(5, len(self.tweets)))
	        sample_texts = [tweet['content'] for tweet in tweet_sample]

	        # Tolerate a profile that was never computed or is not text.
	        profile = self.personality_profile
	        if isinstance(profile, str):
	            profile_excerpt = profile[:self.PROFILE_EXCERPT_CHARS]
	        else:
	            profile_excerpt = ""

	        prompt = self._build_tweet_prompt(
	            profile_excerpt, context, selected_contexts, sample_texts
	        )
	        if self.count_tokens(prompt) > self.MAX_PROMPT_TOKENS:
	            # Over budget: retry with fewer example tweets.
	            prompt = self._build_tweet_prompt(
	                profile_excerpt, context, selected_contexts, sample_texts[:3]
	            )

	        try:
	            response = self.groq_client.chat.completions.create(
	                messages=[
	                    {
	                        "role": "system",
	                        "content": "You are an expert in replicating writing and thinking patterns.",
	                    },
	                    {"role": "user", "content": prompt},
	                ],
	                model=self.MODEL_NAME,
	                temperature=1.0,  # high temperature for more varied output
	                max_tokens=150,
	            )
	            tweet = response.choices[0].message.content
	            # Only the first line is kept so stray commentary is dropped.
	            return tweet.strip().split("\n")[0]
	        except Exception as e:
	            # Best-effort boundary: report the failure and return a marker.
	            print(f"Error generating tweet: {e}")
	            return "Error generating tweet"