# NOTE(review): this section was reconstructed from a whitespace-collapsed
# source; statement order is preserved, indentation is inferred from syntax.
import base64
import os
import time

from git import Repo
import googleapiclient
from matplotlib import colormaps
from matplotlib.colors import ListedColormap
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
import spacy
import streamlit as st
from transformers import pipeline

# Required environment variables; a missing one raises KeyError at start-up.
GITHUB_PAT = os.environ['GITHUB']
# NOTE(review): SENTIMENT and EMBEDDING are not referenced in the visible code
# (model names below are hard-coded) - confirm whether they are still needed.
SENTIMENT = os.environ['SENTIMENT']
EMBEDDING = os.environ['EMBEDDING']

# The helper package is imported from a local clone, so the clone must exist
# before the `repo_directory` imports below run.
if not os.path.exists('repo_directory'):
    try:
        Repo.clone_from(
            f'https://marcus-t-s:{GITHUB_PAT}@github.com/marcus-t-s/yt-comment-analyser.git',
            'repo_directory'
        )
    except Exception:
        # `except Exception` (rather than a bare `except`) so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        st.error("Error: Oops there's an issue on our end, please wait a moment and try again.")
        st.stop()

# from repo_directory.all_utils import *
from repo_directory.utils.chart_utils import *
from repo_directory.youtube_comment_class import *

# Streamlit configuration
st.set_page_config(
    page_title="ViewerVoice | YouTube Comment Analyser",
    layout="wide",
    page_icon=Image.open('images/page_icon.png')
)


# Define and load cached resources
@st.cache_resource
def load_models():
    """Load the NLP models used by the dashboard.

    Decorated with ``st.cache_resource`` so the models are created once and
    reused rather than reloaded on every script run.

    Returns:
        A tuple ``(sentiment_pipeline, embedding_model, spacy_nlp)``: the
        transformers sentiment-analysis pipeline, the SentenceTransformer
        embedding model, and the spaCy ``en_core_web_sm`` pipeline after
        ``add_custom_stopwords`` has been applied to it.
    """
    sentiment_pipeline = pipeline("sentiment-analysis",
                                  model=r"cardiffnlp/twitter-roberta-base-sentiment")
    embedding_model = SentenceTransformer('flax-sentence-embeddings/all_datasets_v4_MiniLM-L6')
    spacy_nlp = spacy.load("en_core_web_sm")
    add_custom_stopwords(spacy_nlp, {"bring", "know", "come"})
    return sentiment_pipeline, embedding_model, spacy_nlp


@st.cache_resource
def load_colors_image():
    """Load the static image assets and colour map used by the charts.

    Returns:
        A tuple ``(mask, colors, logo_image)``:
        ``mask`` is the YouTube icon image as a NumPy array,
        ``colors`` is a 256-entry ``ListedColormap`` sampled from the 0.4-0.8
        range of matplotlib's ``Reds`` colormap, and
        ``logo_image`` is the cropped logo file encoded as a base64 string.
    """
    # Context manager so the image file handle is closed once the pixel data
    # has been copied into the array.
    with Image.open('images/youtube_icon.jpg') as icon:
        mask = np.array(icon)
    Reds = colormaps['Reds']
    colors = ListedColormap(Reds(np.linspace(0.4, 0.8, 256)))
    with open("images/viewervoice_logo_crop.png", "rb") as img_file:
        logo_image = base64.b64encode(img_file.read()).decode("utf-8")
    return mask, colors, logo_image


sentiment_pipeline, embedding_model, spacy_nlp = load_models()
mask, colors, logo_image = load_colors_image()

# Hide line at the top and "made with streamlit" text
hide_decoration_bar_style = """ """
st.markdown(hide_decoration_bar_style, unsafe_allow_html=True)

# Session-state keys that hold figure objects. "table_fig" is the key the
# page actually renders for the comments table, so it is managed here too.
_FIGURE_KEYS = ("comment_fig", "table_fig", "wordcloud_fig", "topic_fig", "sentiment_fig")

# Placeholder values for the simple session-state flags. "figures_built" and
# "semantic_filter" are read later in the script, so they get defaults as well.
_SESSION_DEFAULTS = {
    'rerun_button': "INIT",
    'topic_filter': False,
    'sentiment_filter': False,
    'semantic_filter': False,
    'filter_state': "INIT",
    'figures_built': False,
    'video_link': None,
    'num_comments': None,
}

# Initialise session state once per session; existing values are left alone.
if 'YouTubeParser' not in st.session_state:
    st.session_state['YouTubeParser'] = YoutubeCommentParser()
for _key in _FIGURE_KEYS:
    if _key not in st.session_state:
        st.session_state[_key] = None
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default

# Set reference to YouTubeParser object for more concise code
yt_parser = st.session_state['YouTubeParser']
main_page = st.container()


def query_comments_button():
    """Reset the dashboard state when the "Query comments" button is clicked.

    Drops the stored figures and the current parser, puts every filter/flag
    back to its placeholder value, marks the run as "QUERYING" and stores a
    fresh ``YoutubeCommentParser``.
    """
    # Delete larger objects from session state to later replace. The
    # membership check avoids a KeyError when a key has not been set yet.
    for key in (*_FIGURE_KEYS, "YouTubeParser"):
        if key in st.session_state:
            del st.session_state[key]
    # Reset session state variables back to placeholder values
    st.session_state.rerun_button = "QUERYING"
    st.session_state['filter_state'] = "INIT"
    st.session_state["topic_filter"] = False
    st.session_state["sentiment_filter"] = False
    st.session_state["semantic_filter"] = False
    st.session_state["figures_built"] = False
    for key in _FIGURE_KEYS:
        st.session_state[key] = None
    st.session_state["YouTubeParser"] = YoutubeCommentParser()


def filter_visuals_button():
    """Mark the session as "FILTERING" when the filter button is clicked."""
    st.session_state["filter_state"] = "FILTERING"


with st.sidebar:
    st.session_state["video_link"] = st.text_input('YouTube Video URL', value="")
# NOTE(review): this section was reconstructed from a whitespace-collapsed
# source in which the embedded HTML/CSS had been stripped from the string
# literals. Statement order is preserved; block nesting is inferred - confirm
# in particular that the query/build sections belong at top level.

# Query controls in the sidebar
with st.sidebar:
    # NOTE(review): "max_comments" is stored but not read in the visible code;
    # presumably consumed by the parser - verify.
    st.session_state["max_comments"] = st.slider(label="Maximum number of comments to query",
                                                 min_value=100, max_value=3000, step=100)
    st.session_state["max_topics"] = st.slider(label="Maximum number of topics",
                                               min_value=5, max_value=20, step=1)
    st.button('Query comments :left_speech_bubble:', on_click=query_comments_button)

with main_page:
    # Reduce space at the top
    reduce_header_height_style = """ """
    st.markdown(reduce_header_height_style, unsafe_allow_html=True)
    # Title and intro section
    markdown_content = f"""

"""
    st.markdown(markdown_content, unsafe_allow_html=True)
    # LinkedIn links
    lnk = ''
    st.markdown(lnk + """

Made by Afiba Annor Marcus Singh

""", unsafe_allow_html=True)
    # The two literals below spanned several physical lines inside ordinary
    # quotes (a syntax error); the same text is written with "\n" escapes.
    st.markdown("\n\n", unsafe_allow_html=True)
    # Notes section
    st.markdown("\n\n📝 Notes\n\n", unsafe_allow_html=True)
    html_content = """

This dashboard is still under development; further updates will be implemented in due course.
Currently, the dashboard exclusively caters to comments in English and does not include comment replies.
Comments undergo cleaning and pre-processing to optimise modelling. As a result, the returned comment count may fall short of the maximum queried amount.
Please note that the sentiment analysis currently does not take emojis into account.
For optimal performance of the current topic model, we recommend retrieving thousands of comments.
Please anticipate that querying comments and running the models may require a few minutes to complete.

"""
    # Display the HTML content using st.markdown()
    st.markdown(html_content, unsafe_allow_html=True)


def _build_figures(df_source):
    """Build the four dashboard figures from ``df_source`` into session state.

    Stores the comments table, word cloud, topic treemap and sentiment chart
    under "table_fig", "wordcloud_fig", "topic_fig" and "sentiment_fig".
    """
    st.session_state["table_fig"] = comments_table(
        df_source,
        ['publishedAt', 'Comment_Formatted', 'Likes', 'Sentiment', 'Topic'],
        {'publishedAt': 'Date', 'Comment_Formatted': 'Comment'})
    st.session_state["wordcloud_fig"] = comment_wordcloud(df_source, mask, colors)
    st.session_state["topic_fig"] = topic_treemap(df_source, "Topic")
    st.session_state["sentiment_fig"] = sentiment_chart(df_source, "Sentiment")


# Query comments section
if (st.session_state.rerun_button == "QUERYING") and (st.session_state["video_link"] is not None):
    with st.spinner('Querying comments and running models'):
        yt_parser = st.session_state["YouTubeParser"]
        try:
            yt_parser.scrape_comments(st.session_state['video_link'])
            yt_parser.scrape_video_title()
        except Exception:
            # Message split into two literals: the previous backslash
            # continuation inside the string leaked into the displayed text.
            st.error("Error: Unable to query comments, incorrect YouTube URL or maximum "
                     "API call limit reached.")
            st.stop()
        # Run formatting and models
        yt_parser.format_comments()
        yt_parser.clean_comments()
        yt_parser.run_sentiment_pipeline(sentiment_pipeline)
        yt_parser.run_topic_modelling_pipeline(embedding_model,
                                               nlp=spacy_nlp,
                                               max_topics=st.session_state['max_topics'])
        # Set "QUERY COMPLETE" to bypass running this section on script re-run
        st.session_state.rerun_button = "QUERY COMPLETE"

# Once comments are queried, build charts ready to visualise
if st.session_state.rerun_button == "QUERY COMPLETE":
    # Check for built figures:
    if (not st.session_state["figures_built"]) or (st.session_state.filter_state == "FILTERING"):
        # If filtering button pressed
        if st.session_state.filter_state == "FILTERING":
            df_filtered = yt_parser.df_comments.copy()
            # Membership filters use isin() instead of interpolating the
            # selection list into a query string.
            if st.session_state["topic_filter"]:
                df_filtered = df_filtered[df_filtered["Topic"].isin(st.session_state.topic_filter)]
            if st.session_state["sentiment_filter"]:
                df_filtered = df_filtered[
                    df_filtered["Sentiment"].isin(st.session_state.sentiment_filter)]
            if st.session_state["semantic_filter"]:
                df_filtered = semantic_search(df=df_filtered,
                                              query=st.session_state["semantic_filter"],
                                              embedding_model=embedding_model,
                                              text_col='Comment_Clean')
            st.session_state['num_comments'] = len(df_filtered)
            # Can only build graphs if we have comments; with none left the
            # page shows its "no comments for this set of filters" message.
            if st.session_state['num_comments'] > 0:
                _build_figures(df_filtered)
            st.session_state["figures_built"] = True
            st.session_state.filter_state = "FILTERED"
        # No filtering selected
        else:
            st.session_state['num_comments'] = len(yt_parser.df_comments)
            # Can only build graphs if we have comments
            if st.session_state['num_comments'] > 0:
                try:
                    _build_figures(yt_parser.df_comments)
                    st.session_state["figures_built"] = True
                except Exception:
                    st.error("Error: Oops there's an issue on our end, please wait a moment and try again.")
                    st.stop()

with main_page:
    if st.session_state.rerun_button == "QUERY COMPLETE":
        st.subheader(f"{yt_parser.title}")
        st.markdown("\n\n", unsafe_allow_html=True)
        if st.session_state['num_comments'] > 0:
            table_col, word_cloud_col = st.columns([0.55, 0.45])
            with table_col:
                st.markdown(f"""

Comments

""", unsafe_allow_html=True)
                st.plotly_chart(st.session_state["table_fig"], use_container_width=True)
            with word_cloud_col:
                st.markdown(f"""

Word Cloud

""", unsafe_allow_html=True)
                st.pyplot(st.session_state["wordcloud_fig"], use_container_width=True)
            treemap_col, sentiment_donut_col = st.columns([0.55, 0.45])
            with treemap_col:
                st.markdown(f"""

Topic Proportions

""", unsafe_allow_html=True)
                st.plotly_chart(st.session_state["topic_fig"], use_container_width=True)
            with sentiment_donut_col:
                st.markdown(f"""

Sentiment Distribution

""", unsafe_allow_html=True)
                st.plotly_chart(st.session_state["sentiment_fig"], use_container_width=True)
            # st.table(yt_parser.df_comments.head())
        else:
            st.write("Unfortunately we couldn't find any comments for this set of filters, please try "
                     "editing the filters and try again")

with st.sidebar:
    # Define the HTML and CSS for the button-style container
    if st.session_state['num_comments'] is not None:
        num_comments = st.session_state['num_comments']
    else:
        num_comments = 0
    htmlstr = f"""

{num_comments}

"""
    # Display the button-style container with number of comments
    st.subheader("Number of comments")
    st.markdown(htmlstr, unsafe_allow_html=True)
    # Filters section
    st.subheader("Filters")
    if yt_parser.df_comments is not None:
        st.session_state["topic_filter"] = st.multiselect(
            "Topic", options=sorted(yt_parser.df_comments['Topic'].unique()))
        st.session_state["sentiment_filter"] = st.multiselect(
            "Sentiment", options=list(yt_parser.df_comments['Sentiment'].unique()))
        st.session_state["semantic_filter"] = st.text_input("Keyword search", max_chars=30)
        st.button('Filter visualisations :sleuth_or_spy:', on_click=filter_visuals_button)
    else:
        # Nothing queried yet: show disabled placeholders for the filters.
        st.multiselect("Topic", options=["Please query comments from a video"], disabled=True)
        st.multiselect("Sentiment", options=["Please query comments from a video"], disabled=True)
        st.text_input("Keyword search", disabled=True)
        st.button('Please query comments before filtering', disabled=True)