Spaces:
Runtime error
Runtime error
import re | |
import nltk | |
from typing import List | |
from transformers import pipeline | |
from tqdm import tqdm | |
import numpy as np | |
import numpy as np | |
import scipy | |
def tweet_cleaner(tweet: str) -> str:
    """
    Cleans a tweet by stripping URLs and normalising whitespace.

    Every occurrence of ``http:`` or ``https:`` that is followed by at least
    one non-whitespace character is removed, up to the next whitespace. The
    match is case-sensitive. @ mentions, hashtags and all other words are left
    untouched.

    Args:
        tweet (str): A single tweet as a string.
    Returns:
        str: The tweet without URLs, with every run of whitespace (including
        newlines) collapsed to a single space and no leading/trailing space.
    """
    # Substitute a space rather than the empty string so that text on either
    # side of a URL is never glued together (e.g. "see:http://x now" must not
    # become "see:now"); surplus spaces are collapsed on the next line.
    without_urls = re.sub(r"https?:\S+", " ", tweet)
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so this both collapses whitespace and trims the ends.
    return " ".join(without_urls.split())
def is_boring_tweet(tweet):
    """Return True when the tweet has fewer than three "interesting" words.

    A whitespace-separated word is considered uninteresting when its
    lower-cased form contains ``http``, ``@`` or ``#`` anywhere in it
    (links, mentions and hashtags).
    """
    markers = ("http", "@", "#")
    interesting = 0
    for token in tweet.split():
        lowered = token.lower()
        if all(marker not in lowered for marker in markers):
            interesting += 1
    return interesting < 3
def fix_text(text):
    """Decode the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` in ``text``.

    Args:
        text (str): Text that may contain the escaped forms of ``<``, ``>``
            and ``&``.
    Returns:
        str: The text with those three entities replaced by their literal
        characters. Any other entity is left unchanged.
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    # Decode the ampersand last so an escaped entity such as "&amp;lt;" is
    # unescaped exactly once (to "&lt;") instead of twice (to "<").
    text = text.replace("&amp;", "&")
    return text
# Maps the raw class names returned by the hosted model to readable labels.
_API_LABEL_NAMES = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive",
}


def _sentiment_from_api(tweet_list, return_argmax):
    """Score tweets by POSTing them to the hosted Hugging Face inference endpoint."""
    import os

    import requests

    api_url = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
    # NOTE(security): a credential is committed in source. It is kept only as
    # a fallback so existing deployments keep working; set HF_API_TOKEN in the
    # environment instead, then rotate and remove this literal.
    token = os.environ.get(
        "HF_API_TOKEN", "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"
    )
    headers = {"Authorization": f"Bearer {token}"}

    # Without a timeout requests may block forever on a stalled connection.
    response = requests.post(
        api_url, headers=headers, json={"inputs": tweet_list}, timeout=60
    )
    output = response.json()

    # A successful call yields one list of {"label", "score"} dicts per tweet.
    # Anything else (typically an error payload, which is a dict) used to be
    # iterated key-by-key and produced meaningless empty results.
    if not isinstance(output, list):
        raise RuntimeError(
            f"Unexpected response from the sentiment API: {output!r}"
        )

    result = []
    for per_tweet in output:
        sentiment_dict = {}
        for entry in per_tweet:
            if isinstance(entry, dict):
                name = _API_LABEL_NAMES.get(entry["label"])
                if name is not None:
                    sentiment_dict[name] = entry["score"]
        if return_argmax and sentiment_dict:
            sentiment_dict["argmax"] = max(sentiment_dict, key=sentiment_dict.get)
        result.append(sentiment_dict)
    return result


def _sentiment_from_local_model(tweet_list, return_argmax):
    """Score tweets with a locally loaded cardiffnlp/twitter-roberta-base-sentiment model."""
    from scipy.special import softmax
    from transformers import AutoModelForSequenceClassification
    from transformers import AutoTokenizer

    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    # NOTE: tokenizer and model are loaded on every call; callers scoring many
    # batches should pass them in one large list.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    # Position i of the model output corresponds to labels[i].
    labels = ["negative", "neutral", "positive"]

    result = []
    for tweet in tweet_list:
        encoded_input = tokenizer(tweet_cleaner(tweet), return_tensors="pt")
        logits = model(**encoded_input)[0][0].detach().numpy()
        scores = softmax(logits)
        # Insert labels from most to least likely, scores rounded to 4 places.
        ranking = np.argsort(scores)[::-1]
        sentiment_dict = {
            labels[idx]: np.round(float(scores[idx]), 4) for idx in ranking
        }
        if return_argmax:
            sentiment_dict["argmax"] = max(sentiment_dict, key=sentiment_dict.get)
        result.append(sentiment_dict)
    return result


def twitter_sentiment_api_score(
    tweet_list: list = None, return_argmax: bool = True, use_api=False
):
    """
    Returns sentiment scores for a list of tweets using the
    cardiffnlp/twitter-roberta-base-sentiment model.

    Args:
        tweet_list (list): A list of strings, where each string represents a tweet.
            ``None`` or an empty list yields an empty result.
        return_argmax (bool): Whether to also return the predicted sentiment label
            with the highest score for each tweet.
        use_api (bool): If True, the tweets are sent as-is to the hosted Hugging Face
            inference API. If False (default), the model is loaded locally through
            ``transformers`` and each tweet is passed through ``tweet_cleaner`` first.
    Returns:
        A list of dictionaries, one per tweet, with the keys "positive", "neutral"
        and "negative" mapped to that label's score. If `return_argmax` is True,
        each non-empty dictionary also contains the key "argmax" holding the
        highest-scoring label.
    Raises:
        RuntimeError: If `use_api` is True and the API response is not a list
            (for example an error payload).
    """
    # Nothing to score: avoid iterating None and skip the model load / HTTP call.
    if not tweet_list:
        return []
    if use_api:
        return _sentiment_from_api(tweet_list, return_argmax)
    return _sentiment_from_local_model(tweet_list, return_argmax)