Spaces:
Runtime error
Runtime error
import gradio as gr | |
import os | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import re | |
from pythainlp.util import normalize | |
from pythainlp.tokenize import word_tokenize | |
from pythainlp import word_vector | |
import numpy as np | |
import keras | |
import plotly.express as px | |
################# | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.common.by import By | |
import time | |
import chromedriver_autoinstaller | |
import sys | |
# Prepend the system chromedriver location to the interpreter search path.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

# Shared Chrome options, reused by every WebDriver created in this module.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    '--headless',               # ensure GUI is off
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    chrome_options.add_argument(_chrome_flag)

# Install a chromedriver for the local Chrome via chromedriver_autoinstaller.
chromedriver_autoinstaller.install()

# Word-vector model (used for word -> index lookup) and the Keras classifier,
# both loaded once at import time.
wv = word_vector.WordVector()
word2vec = wv.get_model()
model = keras.models.load_model('my_model3.h5')
def get_comments(VIDEO_URL):
    """Scrape the comment texts currently rendered on a YouTube video page.

    A headless Chrome session (configured by the module-level
    ``chrome_options``) opens the page, waits, scrolls to the bottom once so
    that more comments get loaded, and then collects the text of every
    ``yt-formatted-string`` element whose id is ``content-text``.

    Arguments:
    VIDEO_URL -- URL of the video page, e.g.
                 'https://www.youtube.com/watch?v=VIDEO_ID'

    Returns:
    data -- list of non-empty comment strings (empty if nothing was found)
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(VIDEO_URL)
        # Fixed pause to let the page render before interacting with it.
        time.sleep(5)
        # Scroll down once to load more comments, then wait for them.
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)

        comment_elements = driver.find_elements(
            By.XPATH, '//yt-formatted-string[@id="content-text"]'
        )
        data = []
        for comment in comment_elements:
            # Compare the element's *text* (not the WebElement object itself)
            # so that empty comments are really skipped.
            text = comment.text
            if text:
                data.append(text)
                print(text)
    finally:
        # Always shut the browser down, even when scraping raises, so no
        # Chrome process is left behind.
        driver.quit()
    return data
def cosine_sim(u, v):
    """Return the cosine similarity of vectors u and v.

    Computed as dot(u, v) divided by the product of the two Euclidean norms.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return dot_product / norm_product
def sentences_to_indices(X, word2vec, max_len):
    """
    Converts a sequence of sentences (strings) into an array of word indices.
    The output shape is such that it can be given to `Embedding()`.

    Each sentence is lower-cased, split on whitespace and truncated to
    `max_len` words. Words that are missing from `word2vec.key_to_index` are
    skipped; the remaining indices are written left-aligned and the rest of
    the row stays 0.

    Arguments:
    X -- 1-D sequence (list or numpy array) of m sentences (strings)
    word2vec -- object exposing a `key_to_index` mapping from word to index
    max_len -- number of columns in the output; longer sentences are truncated

    Returns:
    X_indices -- float array of shape (m, max_len) holding the word indices
    """
    m = len(X)  # number of sentences; works for lists as well as arrays
    key_to_index = word2vec.key_to_index

    # Initialize X_indices as a numpy matrix of zeros and the correct shape
    X_indices = np.zeros((m, max_len))

    for i in range(m):
        sentence_words = X[i].lower().split()[:max_len]
        # NOTE(review): out-of-vocabulary words are dropped (no gap is left in
        # the row) — confirm this matches the preprocessing used in training.
        known = [key_to_index[w] for w in sentence_words if w in key_to_index]
        if known:
            X_indices[i, :len(known)] = known

    return X_indices
def deEmojify(text):
    """Return `text` with every run of characters from the ranges below removed.

    The ranges target emoji and pictographic symbols.
    NOTE(review): the span U+24C2..U+1F251 is very wide and also covers e.g.
    CJK ideographs — confirm that stripping those is intended.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every supplementary-plane code point
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"                 # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"                 # variation selector-16
        u"\u3030"
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return emoji_pattern.sub(r'', text)
def clean_me(data):
    """Clean and word-segment the 'text' column of `data` in place.

    Columns added, each derived from the previous one:
    clean_text           -- 'text' with '<...>' tags removed
    clean2_text          -- stripped, lower-cased, runs of CR / LF / TAB
                            each replaced by one space
    clean3_text          -- emoji removed via deEmojify
    clean4_text          -- normalized with pythainlp `normalize`
    wordseged_text       -- token list from `word_tokenize` (newmm-safe)
    wordseged_space_text -- tokens joined with single spaces

    Arguments:
    data -- pandas DataFrame with a string column 'text'

    Returns:
    data -- the same DataFrame, with the columns above added
    """
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)

    # regex=True is passed explicitly: without it, newer pandas treats the
    # pattern as a literal string and '\r+' etc. would never match.
    data['clean2_text'] = (
        data['clean_text']
        .str.strip()
        .str.lower()
        .str.replace(r'\r+', ' ', regex=True)
        .str.replace(r'\n+', ' ', regex=True)
        .str.replace(r'\t+', ' ', regex=True)
    )

    # Column-wise Series.apply (instead of row-wise DataFrame.apply) always
    # yields a Series, including when the frame has no rows.
    data['clean3_text'] = data['clean2_text'].apply(deEmojify)

    # Normalize text
    data['clean4_text'] = data['clean3_text'].apply(normalize)

    # Word segmentation: it will take a while....
    data['wordseged_text'] = data['clean4_text'].apply(
        lambda text: word_tokenize(text, engine="newmm-safe")
    )

    # Join the segmented words with spaces
    data['wordseged_space_text'] = data['wordseged_text'].apply(
        lambda tokens: " ".join(tokens)
    )
    return data
def pretty_output(lines, sentiment):
    """Build the per-class text listings and the bar chart of class counts.

    Arguments:
    lines -- the input texts, one per row of `sentiment`
    sentiment -- 2-D array of scores; the column with the highest score picks
                 the label, columns being ordered Neg, Neu, Pos

    Returns:
    (txt_pos, txt_neu, txt_neg, fig) -- one string per class with a line
    '<label> <scores>:-<text> ' for every text assigned to it, plus a plotly
    bar figure of how many texts fell into each class
    """
    class_names = np.array(['Neg', 'Neu', 'Pos'])
    predicted = class_names[np.argmax(sentiment, axis=1)]

    # Count the predictions per class and chart them in a fixed class order.
    counts = pd.Series(predicted).value_counts()
    summary = pd.DataFrame({'Sentiment': counts.index, 'Count': counts.values})
    fig = px.bar(summary, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])

    # Collect the formatted lines per class; anything that is neither
    # 'Pos' nor 'Neu' goes to the negative bucket.
    grouped = {'Pos': [], 'Neu': [], 'Neg': []}
    for line, tag, scores in zip(lines, predicted, sentiment):
        formatted = [f"{value:.2f}" for value in scores]
        entry = f'{tag} {formatted}:-{line} \n'
        if tag == 'Pos':
            grouped['Pos'].append(entry)
        elif tag == 'Neu':
            grouped['Neu'].append(entry)
        else:
            grouped['Neg'].append(entry)

    txt_pos = ''.join(grouped['Pos'])
    txt_neu = ''.join(grouped['Neu'])
    txt_neg = ''.join(grouped['Neg'])
    return (txt_pos, txt_neu, txt_neg, fig)
def combine(a, b):
    """Run sentiment analysis on pasted text or on a YouTube video's comments.

    When `b` (a YouTube URL) is non-blank, its comments are scraped with
    `get_comments` and `a` is ignored; otherwise `a` is split into one text
    per line. The texts are cleaned with `clean_me`, converted to index rows
    of length 128 and classified by the module-level Keras `model`.

    Arguments:
    a -- free text, one sentence per line
    b -- YouTube video URL, or '' to analyse `a` instead

    Returns:
    (txt_pos, txt_neu, txt_neg, fig) -- as produced by `pretty_output`

    Raises:
    gr.Error -- when there is nothing to analyse (no comments could be
                scraped, or the text input holds only blank lines)
    """
    url = (b or '').strip()
    if url:
        lines = get_comments(url)
        if not lines:
            text001 = 'CANNOT_GET DATA from Youtube'
            print(text001)
            # Stop here with a readable message instead of failing later on
            # an empty DataFrame.
            raise gr.Error(text001)
    else:
        # Drop blank lines so they are not sent to the model as sentences.
        lines = [line for line in (a or '').split('\n') if line.strip()]
        if not lines:
            raise gr.Error('No input text to analyse')

    data = clean_me(pd.DataFrame({'text': lines}))
    X_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_indices)
    return pretty_output(lines, result)
def mirror(x):
    """Identity helper: hand back the argument unchanged."""
    return x
# Gradio UI: two inputs (free text or a YouTube URL), a submit button,
# three text outputs (one per sentiment class) and a plot.
with gr.Blocks() as demo:
    # --- inputs ---
    txt = gr.Textbox(lines=2, label="Input: TEXT")
    txt_2 = gr.Textbox(label="Input: Youtube URL")
    btn = gr.Button(value="Submit")

    # --- outputs ---
    txt_POS = gr.Textbox(label="Positive comments", value="")
    txt_NEU = gr.Textbox(label="Neutral comments", value="")
    txt_NEG = gr.Textbox(label="Negative comments", value="")
    plot = gr.Plot(label="Plot")

    # Clicking the button runs `combine` on both inputs and fills all outputs.
    btn.click(
        fn=combine,
        inputs=[txt, txt_2],
        outputs=[txt_POS, txt_NEU, txt_NEG, plot],
    )

if __name__ == "__main__":
    demo.launch()