"""Helpers for cleaning tweets and scoring their sentiment."""

import os
import re
from functools import lru_cache
from typing import List, Optional

import nltk
import numpy as np
import scipy
from tqdm import tqdm
from transformers import pipeline

# Hugging Face model used by both the hosted API path and the local path.
_SENTIMENT_MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
_HF_API_URL = f"https://api-inference.huggingface.co/models/{_SENTIMENT_MODEL}"

# SECURITY(review): this credential is committed to source control and must be
# treated as leaked. Rotate it and supply the replacement through the
# HF_API_TOKEN environment variable; the literal is kept only as a fallback so
# existing deployments keep working until that happens.
_LEGACY_HF_API_TOKEN = "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"

# Label names emitted by the hosted API, mapped to readable sentiment names.
_API_LABELS = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
}

# Output index -> sentiment name for the locally executed model.
_LOCAL_LABELS = ("negative", "neutral", "positive")


def tweet_cleaner(tweet: str) -> str:
    """Strip http/https URLs from a tweet and normalise its whitespace.

    Only URLs are removed; @ mentions and hashtags are left untouched.

    Args:
        tweet: A single tweet as a string.

    Returns:
        The tweet without URLs, with every run of whitespace (including
        newlines) collapsed to a single space and no leading or trailing
        whitespace.
    """
    for scheme in ("http:", "https:"):
        url = rf"{scheme}\S+"
        # The order matters: the first four patterns also consume one piece of
        # adjacent whitespace, the last one catches whatever is left.
        patterns = (
            f" {url}",  # URL preceded by a space
            f"{url} ",  # URL followed by a space (e.g. tweet starts with it)
            f"\n{url} ",  # URL at the start of a new line
            f"\n{url}",  # URL alone on a new line
            url,  # any other position
        )
        for pattern in patterns:
            tweet = re.sub(pattern, "", tweet)

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so no further stripping is required.
    return " ".join(tweet.split())


def is_boring_tweet(tweet: str) -> bool:
    """Check if tweet is boring.

    A whitespace-separated token counts as interesting when it contains none
    of "http", "@" or "#" (compared case-insensitively). A tweet is boring when
    it has fewer than three interesting tokens.

    Args:
        tweet: The tweet text.

    Returns:
        True if the tweet has fewer than three interesting tokens.
    """
    boring_markers = ("http", "@", "#")
    interesting = [
        token
        for token in tweet.split()
        if not any(marker in token.lower() for marker in boring_markers)
    ]
    return len(interesting) < 3


def fix_text(text: str) -> str:
    """Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``text``.

    Args:
        text: Text that may contain the three escaped entities.

    Returns:
        The text with those entities replaced by ``<``, ``>`` and ``&``.
    """
    # "&amp;" is decoded last so that an escaped entity such as "&amp;lt;"
    # becomes the literal text "&lt;" instead of being decoded twice.
    for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")):
        text = text.replace(entity, char)
    return text


@lru_cache(maxsize=1)
def _load_local_sentiment_model():
    """Load the tokenizer and model once and reuse them on later calls."""
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(_SENTIMENT_MODEL)
    return tokenizer, model


def _score_locally(tweet_list, return_argmax):
    """Score each tweet with the locally loaded model."""
    from scipy.special import softmax

    tokenizer, model = _load_local_sentiment_model()

    results = []
    for text in tweet_list:
        encoded_input = tokenizer(tweet_cleaner(text), return_tensors="pt")
        logits = model(**encoded_input)[0][0].detach().numpy()
        probabilities = softmax(logits)
        # Emit the labels from most to least likely.
        ranking = np.argsort(probabilities)[::-1]
        scores = {
            _LOCAL_LABELS[idx]: np.round(float(probabilities[idx]), 4)
            for idx in ranking
        }
        if return_argmax:
            scores["argmax"] = max(scores, key=scores.get)
        results.append(scores)
    return results


def _score_via_api(tweet_list, return_argmax):
    """Score the tweets through the hosted Hugging Face inference API."""
    import requests

    token = os.environ.get("HF_API_TOKEN", _LEGACY_HF_API_TOKEN)
    response = requests.post(
        _HF_API_URL,
        headers={"Authorization": f"Bearer {token}"},
        json={"inputs": tweet_list},
    )
    output = response.json()

    # A successful response is a list with one entry per tweet. A dict is an
    # error payload; iterating it would silently yield useless empty results.
    if isinstance(output, dict):
        raise RuntimeError(
            f"Sentiment API request failed: {output.get('error', output)}"
        )

    results = []
    for tweet_scores in output:
        sentiment_dict = {}
        for entry in tweet_scores:
            if isinstance(entry, dict) and entry.get("label") in _API_LABELS:
                sentiment_dict[_API_LABELS[entry["label"]]] = entry["score"]
        if return_argmax and sentiment_dict:
            sentiment_dict["argmax"] = max(sentiment_dict, key=sentiment_dict.get)
        results.append(sentiment_dict)
    return results


def twitter_sentiment_api_score(
    tweet_list: Optional[List[str]] = None,
    return_argmax: bool = True,
    use_api=False,
):
    """Return sentiment scores for every tweet in ``tweet_list``.

    By default the ``cardiffnlp/twitter-roberta-base-sentiment`` model is run
    locally (each tweet is passed through ``tweet_cleaner`` first). With
    ``use_api=True`` the raw tweets are sent to the hosted Hugging Face
    inference API for the same model instead; the bearer token is read from
    the ``HF_API_TOKEN`` environment variable when it is set.

    Args:
        tweet_list (list): A list of strings, where each string represents a
            tweet. ``None`` or an empty list yields an empty result.
        return_argmax (bool): Whether to also return the predicted sentiment
            label with the highest confidence score for each tweet.
        use_api (bool): Use the hosted inference API instead of a local model.

    Returns:
        A list of dictionaries, one per tweet. Each dictionary maps
        "positive", "neutral" and "negative" to the confidence score of that
        label. If ``return_argmax`` is True, each non-empty dictionary also
        contains the key "argmax" holding the label with the highest score.

    Raises:
        RuntimeError: If ``use_api`` is True and the API answers with an error
            payload instead of a list of scores.
    """
    if not tweet_list:
        return []
    if use_api:
        return _score_via_api(tweet_list, return_argmax)
    return _score_locally(tweet_list, return_argmax)