Commit fccd4a8
Parent(s): cd80221
first

Files changed:
- app.py +113 -0
- helper_functions.py +255 -0
- static/en_stopwords.txt +102 -0
- static/en_stopwords_ngram.txt +134 -0
- static/quartzo.ttf +0 -0
- static/twitter_mask.png +0 -0
app.py
ADDED
@@ -0,0 +1,113 @@
import streamlit as st

st.set_page_config(
    page_title="Social Media Sentiment Analyzer", page_icon="📊", layout="wide"
)

import pandas as pd
import helper_functions as hf
import plotly.express as px
import plotly.io as pio
import plotly


# Whenever the search button is clicked, the search_callback function is called
def search_callback():
    if twitter_agree:
        if len(st.session_state.search_term) == 0:
            st.error("Please enter a search term")
            return
        st.session_state.df = hf.get_tweets(
            st.session_state.search_term, st.session_state.num_tweets
        )
        st.session_state.df = hf.get_sentiment(st.session_state.df)


def twitter_form():
    with st.form(key="search_form"):
        st.subheader("Search Parameters")
        st.text_input(
            "Enter a User handle (like _@elonmusk_), Hashtag (like _#Bitcoin_) or Topic (like _climate change_)",
            key="search_term",
        )
        st.slider("Number of tweets", min_value=100, max_value=500, key="num_tweets")
        st.form_submit_button(label="Search", on_click=search_callback)
        st.markdown(
            "Note: it may take a while to load the results, especially with a large number of tweets"
        )


with st.sidebar:
    st.title("Social Media Sentiment Analyzer")
    st.subheader("Choose your platform")
    twitter_agree = st.checkbox("Twitter")

    if twitter_agree:
        twitter_form()

    st.markdown(
        "<div style='position: fixed; bottom: 0;'>Created by Taaha Bajwa</div>",
        unsafe_allow_html=True,
    )

if "df" in st.session_state:

    def make_dashboard(tweet_df, bar_color, wc_color):
        # first row: sentiment pie, top unigrams, top bigrams
        col1, col2, col3 = st.columns([28, 34, 38])
        with col1:
            sentiment_plot = hf.plot_sentiment(tweet_df)
            sentiment_plot.update_layout(height=350, title_x=0.5)
            st.plotly_chart(sentiment_plot, theme=None, use_container_width=True)
        with col2:
            top_unigram = hf.get_top_n_gram(tweet_df, ngram_range=(1, 1), n=10)
            unigram_plot = hf.plot_n_gram(
                top_unigram, title="Top 10 Occurring Words", color=bar_color
            )
            unigram_plot.update_layout(height=350)
            st.plotly_chart(unigram_plot, theme=None, use_container_width=True)
        with col3:
            top_bigram = hf.get_top_n_gram(tweet_df, ngram_range=(2, 2), n=10)
            bigram_plot = hf.plot_n_gram(
                top_bigram, title="Top 10 Occurring Bigrams", color=bar_color
            )
            bigram_plot.update_layout(height=350)
            st.plotly_chart(bigram_plot, theme=None, use_container_width=True)

        # second row: color-coded tweet table and word cloud
        col1, col2 = st.columns([60, 40])
        with col1:

            def sentiment_color(sentiment):
                if sentiment == "Positive":
                    return "background-color: #54A24B; color: white"
                elif sentiment == "Negative":
                    return "background-color: #FF7F0E"
                else:
                    return "background-color: #1F77B4"

            st.dataframe(
                tweet_df[["Sentiment", "Tweet"]].style.applymap(
                    sentiment_color, subset=["Sentiment"]
                ),
                height=350,
            )
        with col2:
            wordcloud = hf.plot_wordcloud(tweet_df, colormap=wc_color)
            st.pyplot(wordcloud)

    # enlarge the tab labels
    adjust_tab_font = """
    <style>
    button[data-baseweb="tab"] > div[data-testid="stMarkdownContainer"] > p {
        font-size: 20px;
    }
    </style>
    """
    st.write(adjust_tab_font, unsafe_allow_html=True)

    tab1, tab2, tab3, tab4 = st.tabs(["All", "Positive 😊", "Negative ☹️", "Neutral 😐"])
    with tab1:
        tweet_df = st.session_state.df
        make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
    with tab2:
        tweet_df = st.session_state.df.query("Sentiment == 'Positive'")
        make_dashboard(tweet_df, bar_color="#54A24B", wc_color="Greens")
    with tab3:
        tweet_df = st.session_state.df.query("Sentiment == 'Negative'")
        make_dashboard(tweet_df, bar_color="#FF7F0E", wc_color="Oranges")
    with tab4:
        tweet_df = st.session_state.df.query("Sentiment == 'Neutral'")
        make_dashboard(tweet_df, bar_color="#1F77B4", wc_color="Blues")
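The UI above is a thin layer over the helpers: the search callback fetches tweets, scores them, and stores the result in st.session_state.df, which the dashboard then slices per sentiment tab. As a rough sketch, the same data flow can be exercised without the Streamlit front end (assuming snscrape can still reach Twitter's public endpoints; the query string and output filename here are just illustrative):

import helper_functions as hf

# Scrape up to 100 tweets and attach a "Sentiment" column via FinBERT
df = hf.get_tweets("climate change", 100)
df = hf.get_sentiment(df)

# Build the sentiment pie and save it for inspection in a browser
fig = hf.plot_sentiment(df)
fig.write_html("sentiment.html")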
helper_functions.py
ADDED
@@ -0,0 +1,255 @@
import pandas as pd
import numpy as np
import snscrape.modules.twitter as sntwitter
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
import plotly.express as px
import plotly.io as pio
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image


# Download the NLTK data needed for tokenization, tagging, and lemmatization
@st.cache(allow_output_mutation=True)
def get_nltk():
    import nltk

    nltk.download(
        ["punkt", "wordnet", "omw-1.4", "averaged_perceptron_tagger", "universal_tagset"]
    )


get_nltk()

from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer

# Create a custom plotly theme and set it as default
pio.templates["custom"] = pio.templates["plotly_white"]
pio.templates["custom"].layout.margin = {"b": 25, "l": 25, "r": 25, "t": 50}
pio.templates["custom"].layout.width = 600
pio.templates["custom"].layout.height = 450
pio.templates["custom"].layout.autosize = False
pio.templates["custom"].layout.font.update(
    {"family": "Arial", "size": 12, "color": "#707070"}
)
pio.templates["custom"].layout.title.update(
    {
        "xref": "container",
        "yref": "container",
        "x": 0.5,
        "yanchor": "top",
        "font_size": 16,
        "y": 0.95,
        "font_color": "#353535",
    }
)
pio.templates["custom"].layout.xaxis.update(
    {"showline": True, "linecolor": "lightgray", "title_font_size": 14}
)
pio.templates["custom"].layout.yaxis.update(
    {"showline": True, "linecolor": "lightgray", "title_font_size": 14}
)
pio.templates["custom"].layout.colorway = [
    "#1F77B4",
    "#FF7F0E",
    "#54A24B",
    "#D62728",
    "#C355FA",
    "#8C564B",
    "#E377C2",
    "#7F7F7F",
    "#FFE323",
    "#17BECF",
]
pio.templates.default = "custom"


# Load the FinBERT sentiment model once and cache it across reruns
@st.cache(allow_output_mutation=True)
def get_sentiment_model():
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
    return tokenizer, model


tokenizer_sentiment, model_sentiment = get_sentiment_model()


def get_tweets(query, max_tweets):
    if query[0] == "@":
        # A user handle: scrape the user's own timeline
        scraper = sntwitter.TwitterSearchScraper("from:" + query[1:])
    else:
        # A hashtag or free-text topic: scrape matching tweets
        scraper = sntwitter.TwitterSearchScraper(query)

    tweets_list = []
    for i, tweet in enumerate(scraper.get_items()):
        if i >= max_tweets:
            break
        tweets_list.append([tweet.date, tweet.user.username, tweet.content])

    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tweets_list, columns=["Datetime", "Username", "Tweet"])
    tweets_df["Datetime"] = pd.to_datetime(tweets_df["Datetime"])
    tweets_df["Date"] = tweets_df["Datetime"].dt.date
    tweets_df["Time"] = tweets_df["Datetime"].dt.strftime("%H:%M")
    tweets_df.drop("Datetime", axis=1, inplace=True)
    return tweets_df


def text_preprocessing(text):
    stopwords = set()
    with open("static/en_stopwords.txt", "r") as file:
        for word in file:
            stopwords.add(word.rstrip("\n"))
    lemmatizer = WordNetLemmatizer()
    try:
        url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
        user_pattern = r"@[^\s]+"
        entity_pattern = r"&.*;"
        neg_contraction = r"n't\W"
        non_alpha = "[^a-z]"
        cleaned_text = text.lower()
        cleaned_text = re.sub(neg_contraction, " not ", cleaned_text)
        cleaned_text = re.sub(url_pattern, " ", cleaned_text)
        cleaned_text = re.sub(user_pattern, " ", cleaned_text)
        cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
        cleaned_text = re.sub(non_alpha, " ", cleaned_text)
        tokens = word_tokenize(cleaned_text)
        # provide a POS tag for lemmatization to yield better results
        word_tag_tuples = pos_tag(tokens, tagset="universal")
        tag_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}
        final_tokens = []
        for word, tag in word_tag_tuples:
            if len(word) > 1 and word not in stopwords:
                if tag in tag_dict:
                    final_tokens.append(lemmatizer.lemmatize(word, tag_dict[tag]))
                else:
                    final_tokens.append(lemmatizer.lemmatize(word))
        return " ".join(final_tokens)
    except Exception:
        return np.nan


def get_sentiment(df):
    # Reuse the cached FinBERT weights instead of downloading the model again
    classifier = pipeline(
        "sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_sentiment
    )
    output = []
    for sentence in df["Tweet"].tolist():
        output.extend(classifier(sentence))

    df_temp = pd.DataFrame.from_dict(output)
    df = pd.concat([df, df_temp], axis=1)
    df = df.rename(columns={"label": "Sentiment"})
    df["Sentiment"] = df["Sentiment"].replace(
        {"positive": "Positive", "negative": "Negative", "neutral": "Neutral"}
    )
    return df


def plot_sentiment(tweet_df):
    sentiment_count = tweet_df["Sentiment"].value_counts()
    fig = px.pie(
        values=sentiment_count.values,
        names=sentiment_count.index,
        hole=0.3,
        title="<b>Sentiment Distribution</b>",
        color=sentiment_count.index,
        color_discrete_map={
            "Positive": "#54A24B",
            "Negative": "#FF7F0E",
            "Neutral": "#1F77B4",
        },
    )
    fig.update_traces(
        textposition="inside",
        texttemplate="%{label}<br>%{value} (%{percent})",
        hovertemplate="<b>%{label}</b><br>Percentage=%{percent}<br>Count=%{value}",
    )
    fig.update_layout(showlegend=False)
    return fig


def get_top_n_gram(tweet_df, ngram_range, n=10):
    stopwords = set()
    with open("static/en_stopwords_ngram.txt", "r") as file:
        for word in file:
            stopwords.add(word.rstrip("\n"))
    stopwords = list(stopwords)
    corpus = tweet_df["Tweet"]
    vectorizer = CountVectorizer(
        analyzer="word", ngram_range=ngram_range, stop_words=stopwords
    )
    X = vectorizer.fit_transform(corpus.astype(str).values)
    words = vectorizer.get_feature_names_out()
    words_count = np.ravel(X.sum(axis=0))
    df = pd.DataFrame(zip(words, words_count), columns=["words", "counts"])
    df = df.sort_values(by="counts", ascending=False).head(n)
    df["words"] = df["words"].str.title()
    return df


def plot_n_gram(n_gram_df, title, color="#54A24B"):
    fig = px.bar(
        x=n_gram_df.counts,
        y=n_gram_df.words,
        title="<b>{}</b>".format(title),
        text_auto=True,
    )
    fig.update_layout(plot_bgcolor="white")
    fig.update_xaxes(title=None)
    fig.update_yaxes(autorange="reversed", title=None)
    fig.update_traces(hovertemplate="<b>%{y}</b><br>Count=%{x}", marker_color=color)
    return fig


def plot_wordcloud(tweet_df, colormap="Greens"):
    stopwords = set()
    with open("static/en_stopwords_ngram.txt", "r") as file:
        for word in file:
            stopwords.add(word.rstrip("\n"))
    # use a mid-range slice of the colormap so the words stay readable
    cmap = mpl.cm.get_cmap(colormap)(np.linspace(0, 1, 20))
    cmap = mpl.colors.ListedColormap(cmap[10:15])
    mask = np.array(Image.open("static/twitter_mask.png"))
    font = "static/quartzo.ttf"
    tweet_df["Cleaned_Tweet"] = tweet_df["Tweet"].apply(text_preprocessing)
    # text_preprocessing returns NaN on failure, so drop those rows before joining
    text = " ".join(tweet_df["Cleaned_Tweet"].dropna())
    wc = WordCloud(
        background_color="white",
        font_path=font,
        stopwords=stopwords,
        max_words=90,
        colormap=cmap,
        mask=mask,
        random_state=42,
        collocations=False,
        min_word_length=2,
        max_font_size=200,
    )
    wc.generate(text)
    fig = plt.figure(figsize=(8, 8))
    fig.add_subplot(1, 1, 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Wordcloud", fontdict={"fontsize": 16}, fontweight="heavy", pad=20, y=1.0)
    return fig
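For a quick sanity check of the text pipeline, the preprocessing and n-gram helpers can be run on a tiny in-memory frame; a minimal sketch, assuming the NLTK data and the static/ stopword files are in place (the example tweets below are made up):

import pandas as pd
import helper_functions as hf

toy = pd.DataFrame({"Tweet": [
    "Markets aren't looking great today https://example.com/x",
    "@someone the markets look great!",
]})

# Lowercases, expands "n't", strips URLs/handles, lemmatizes with POS tags
print(toy["Tweet"].apply(hf.text_preprocessing))

# Top unigrams across the toy corpus
print(hf.get_top_n_gram(toy, ngram_range=(1, 1), n=5))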
static/en_stopwords.txt
ADDED
@@ -0,0 +1,102 @@
if
his
our
they
can
into
an
same
himself
themselves
her
are
such
through
each
when
just
yourselves
hers
that
with
those
it
was
we
its
me
myself
ve
and
itself
does
doing
or
being
did
there
while
you
between
about
on
then
my
ourselves
by
too
at
ours
here
had
been
as
the
has
off
these
other
your
him
herself
now
is
theirs
whom
any
to
for
from
of
were
have
he
ll
be
but
until
yours
this
again
re
do
so
some
both
yourself
am
their
having
she
should
them
in
during
will
shall
could
would
ai
ca
sha
wo
static/en_stopwords_ngram.txt
ADDED
@@ -0,0 +1,134 @@
out
ll
during
had
but
own
re
there
your
ourselves
ours
whom
an
if
as
against
with
in
so
his
were
by
at
theirs
they
yourselves
yours
are
you
could
our
some
ai
myself
those
these
who
cannot
through
this
very
their
where
only
her
above
down
been
that
will
am
its
up
each
on
no
just
itself
once
be
from
sha
himself
what
for
yourself
me
while
being
is
more
here
over
my
would
why
she
he
ve
to
before
further
it
how
until
should
all
when
again
do
him
both
hers
too
most
about
same
between
such
shall
has
which
can
having
few
the
because
did
into
than
them
we
does
below
was
of
off
now
after
under
ca
any
nor
not
herself
ought
or
themselves
other
doing
then
have
and
wo
static/quartzo.ttf
ADDED
Binary file (116 kB)
static/twitter_mask.png
ADDED
Binary file (image)