Spaces:

aus10powell
/

TwitterAccounts

Runtime error

App Files Files Community

TwitterAccounts / scripts /sentiment.py

aus10powell

Update scripts/sentiment.py

2a3e5e3 over 1 year ago

raw

history blame

6.1 kB

	import re
	import nltk
	from typing import List
	from transformers import pipeline
	from tqdm import tqdm
	import numpy as np
	import numpy as np
	import scipy

	def tweet_cleaner(tweet: str) -> str:
	    """
	    Strip URLs from a tweet and normalize its whitespace.

	    Only links starting with ``http:`` or ``https:`` are removed. @ mentions,
	    hashtags and all other words are left untouched.

	    Args:
	        tweet (str): A single tweet as a string.

	    Returns:
	        str: The tweet without URLs, with every run of whitespace (spaces,
	        tabs, newlines) collapsed to a single space and no leading or
	        trailing whitespace.
	    """
	    # Substitute a space rather than "" so that text on either side of a URL
	    # is never glued together (e.g. "this:https://x now" -> "this: now").
	    without_urls = re.sub(r"https?:\S+", " ", tweet)

	    # split() with no argument splits on any whitespace and drops empty
	    # pieces, so this both collapses repeated whitespace and trims the ends.
	    return " ".join(without_urls.split())


	def is_boring_tweet(tweet):
	    """
	    Tell whether a tweet carries too little plain text to be interesting.

	    A whitespace-separated word counts as "boring" when its lower-cased form
	    contains "http", "@" or "#" anywhere inside it.

	    Args:
	        tweet: The tweet text.

	    Returns:
	        bool: True when fewer than three words are free of those markers.
	    """
	    markers = ("http", "@", "#")
	    substantive_words = 0
	    for token in tweet.split():
	        lowered = token.lower()
	        if all(marker not in lowered for marker in markers):
	            substantive_words += 1
	    return substantive_words < 3


	def fix_text(text):
	    """
	    Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in a string.

	    Args:
	        text: Text that may contain those three entities.

	    Returns:
	        str: The text with ``&lt;`` -> ``<``, ``&gt;`` -> ``>`` and
	        ``&amp;`` -> ``&``. Any other entity is left as-is.
	    """
	    text = text.replace("&lt;", "<")
	    text = text.replace("&gt;", ">")
	    # Decode the ampersand last: doing it first would turn an escaped
	    # "&amp;lt;" into "&lt;" and then wrongly into "<" (double unescaping).
	    text = text.replace("&amp;", "&")
	    return text


	def twitter_sentiment_api_score(
	    tweet_list: list = None, return_argmax: bool = True, use_api=False
	):
	    """
	    Score the sentiment of each tweet with cardiffnlp/twitter-roberta-base-sentiment.

	    Args:
	        tweet_list (list): A list of strings, where each string represents a tweet.
	            ``None`` or an empty list yields an empty result.
	        return_argmax (bool): Whether to also return the sentiment label with the
	            highest score for each tweet under the key "argmax".
	        use_api (bool): If True, send the tweets to the hosted Hugging Face
	            Inference API; otherwise load the model locally through
	            ``transformers`` and score the tweets in-process.

	    Returns:
	        A list with one dictionary per tweet. Each dictionary maps "positive",
	        "neutral" and "negative" to that label's score. If `return_argmax` is
	        True, the dictionary also contains "argmax" with the best-scoring label.

	    Raises:
	        RuntimeError: If the Inference API answers with an error payload.
	    """
	    # Nothing to score: return early so that None / [] never triggers a model
	    # download or a network request.
	    if tweet_list is None or len(tweet_list) == 0:
	        return []

	    if use_api:
	        return _sentiment_scores_from_api(tweet_list, return_argmax)
	    return _sentiment_scores_from_local_model(tweet_list, return_argmax)


	def _sentiment_scores_from_api(tweet_list, return_argmax):
	    """Score tweets through the hosted Hugging Face Inference API."""
	    import os

	    import requests

	    api_url = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"

	    # SECURITY: the access token must not live in source control, so it is
	    # read from the HF_TOKEN environment variable instead. When it is unset
	    # the request is sent without an Authorization header.
	    # NOTE(review): confirm whether the endpoint accepts anonymous calls.
	    token = os.environ.get("HF_TOKEN")
	    headers = {"Authorization": f"Bearer {token}"} if token else {}

	    response = requests.post(api_url, headers=headers, json={"inputs": tweet_list})
	    output = response.json()

	    # A successful reply is a list (one entry per tweet). A dict carrying an
	    # "error" key is a failure report; iterating it would silently produce
	    # one empty dictionary per key, so surface it instead.
	    if isinstance(output, dict) and "error" in output:
	        raise RuntimeError(f"Hugging Face Inference API error: {output['error']}")

	    # The API reports generic label ids; map them to readable names.
	    readable_labels = {
	        "LABEL_0": "negative",
	        "LABEL_1": "neutral",
	        "LABEL_2": "positive",
	    }

	    result = []
	    for tweet_scores in output:
	        sentiment_dict = {}
	        for entry in tweet_scores:
	            # Skip anything that is not a recognised {"label", "score"} record.
	            if isinstance(entry, dict) and entry.get("label") in readable_labels:
	                sentiment_dict[readable_labels[entry["label"]]] = entry["score"]
	        if return_argmax and sentiment_dict:
	            sentiment_dict["argmax"] = max(sentiment_dict, key=sentiment_dict.get)
	        result.append(sentiment_dict)
	    return result


	def _sentiment_scores_from_local_model(tweet_list, return_argmax):
	    """Score tweets in-process with a locally loaded transformers model."""
	    from scipy.special import softmax
	    from transformers import AutoModelForSequenceClassification
	    from transformers import AutoTokenizer

	    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = AutoModelForSequenceClassification.from_pretrained(model_name)

	    # Position i of the model output corresponds to labels[i].
	    labels = ["negative", "neutral", "positive"]

	    def score_one(text):
	        encoded_input = tokenizer(tweet_cleaner(text), return_tensors="pt")
	        output = model(**encoded_input)
	        # First output element, first (only) item of the batch -> raw scores.
	        scores = softmax(output[0][0].detach().numpy())
	        # Indices ordered from highest to lowest score.
	        ranking = np.argsort(scores)[::-1]
	        results = {
	            labels[idx]: np.round(float(scores[idx]), 4) for idx in ranking
	        }
	        if return_argmax:
	            results["argmax"] = max(results, key=results.get)
	        return results

	    return [score_one(t) for t in tweet_list]