# hug101/app.py
import gradio as gr
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# chromedriver_autoinstaller downloads a chromedriver matching the installed
# Chrome/Chromium and adds it to PATH, so no manual driver path is needed.
chromedriver_autoinstaller.install()
# Thai word embeddings from PyThaiNLP (a gensim KeyedVectors model).
wv = word_vector.WordVector()
word2vec = wv.get_model()
# Pre-trained Keras sentiment classifier (3 classes: Neg / Neu / Pos).
model = keras.models.load_model('my_model3.h5')
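# Illustrative sketch (not executed here): the object returned by wv.get_model()
# behaves like gensim KeyedVectors, which is what sentences_to_indices() below
# relies on. The example word is a placeholder and assumed to be in the vocabulary.
#   word2vec.key_to_index['ดี']    # vocabulary index of a word
#   word2vec['ดี']                  # its embedding vector (numpy array)
#   len(word2vec.key_to_index)      # vocabulary size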
def get_comments(VIDEO_URL):
# Initialize the WebDriver
driver = webdriver.Chrome(options=chrome_options)
# Your scraping code here
#VIDEO_URL = 'https://www.youtube.com/watch?v=VIDEO_ID'
driver.get(VIDEO_URL)
# Wait for the comments to load
time.sleep(5)
# Scroll down to load more comments (optional, repeat as needed)
driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
time.sleep(2)
# Find and print comments
comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
data = []
    for comment in comment_elements:
        text = comment.text.strip()
        if text != '':   # skip empty comment nodes
            data.append(text)
            print(text)
# Close the WebDriver
driver.quit()
return data
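# Illustrative usage sketch (not executed; VIDEO_ID is a placeholder, and a
# Chromium install plus network access are assumed):
#   comments = get_comments('https://www.youtube.com/watch?v=VIDEO_ID')
#   print(len(comments), 'comments scraped')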
def cosine_sim(u, v):
return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
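# Illustrative check of cosine_sim on two embedding vectors (words are
# placeholders and assumed to be in the word2vec vocabulary):
#   cosine_sim(word2vec['ดี'], word2vec['ดี'])    # ~1.0 for identical vectors
#   cosine_sim(word2vec['ดี'], word2vec['แย่'])   # lower for dissimilar words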
def sentences_to_indices(X, word2vec, max_len):
"""
Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
The output shape should be such that it can be given to `Embedding()`.
Arguments:
X -- array of sentences (strings), of shape (m, 1)
    word2vec -- a gensim KeyedVectors word-embedding model (as returned by wv.get_model()); must provide key_to_index
max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this.
Returns:
X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
"""
m = X.shape[0] # number of training examples
# Initialize X_indices as a numpy matrix of zeros and the correct shape
X_indices = np.zeros((m, max_len))
    for i in range(m):  # loop over training examples
        # Lower-case the i-th sentence and split it into words, truncated to max_len.
        sentence_words = X[i].lower().split()[:max_len]
        # Initialize j to 0
        j = 0
        # Loop over the words of the sentence
        for w in sentence_words:
            # Set the (i, j)-th entry of X_indices to the vocabulary index of w;
            # out-of-vocabulary words keep the 0 (padding) index.
            if w in word2vec.key_to_index:
                X_indices[i, j] = word2vec.key_to_index[w]
            j += 1
    return X_indices
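# Illustrative example (not executed): sentences are expected to be
# space-joined tokens, as produced by clean_me() below.
#   X = np.array(['อร่อย มาก', 'ไม่ ชอบ เลย'])
#   sentences_to_indices(X, word2vec, max_len=128).shape   # -> (2, 128), zero-padded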
def deEmojify(text):
    # Strip emoji and other pictographic symbols before normalization.
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)
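# Illustrative example (not executed):
#   deEmojify('อร่อยมาก 😀👍')   # -> 'อร่อยมาก ' (emoji removed, text kept)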
def clean_me(data):
    # Text-cleaning pipeline: strip HTML tags, collapse whitespace, remove emoji,
    # normalize, then word-segment the Thai text and re-join tokens with spaces.
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'\r+', ' ', regex=True)
                           .str.replace(r'\n+', ' ', regex=True)
                           .str.replace(r'\t+', ' ', regex=True))
data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
# Normalize text
data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
# Word segmentation: it will take a while....
data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
# Join the wordsegged with space
data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
    return data
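# Illustrative example (not executed): clean_me expects a DataFrame with a
# 'text' column and adds the intermediate cleaning columns, ending with
# 'wordseged_space_text' (space-joined Thai tokens) that is fed to the model.
#   df = pd.DataFrame({'text': ['อร่อยมาก']})
#   clean_me(df)['wordseged_space_text'][0]   # e.g. 'อร่อย มาก'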
def pretty_output(lines, sentiment):
    # Map the (n, 3) softmax scores to Neg/Neu/Pos labels and build a bar chart
    # of counts plus one text block per sentiment class.
    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
seriesText = pd.Series(txt_sentiment).value_counts()
df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])
txt_pos = ''
txt_neu = ''
txt_neg = ''
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp
    return txt_pos, txt_neu, txt_neg, fig
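# Illustrative example (not executed): `sentiment` is the raw (n, 3) softmax
# output of the classifier, aligned row-by-row with `lines`.
#   scores = np.array([[0.1, 0.2, 0.7], [0.8, 0.1, 0.1]])
#   pos, neu, neg, fig = pretty_output(['ดีมาก', 'แย่'], scores)
#   # pos/neu/neg are per-class text blocks; fig is a Plotly bar chart of counts.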
def combine(a, b):
    # a: raw text, one comment per line; b: optional YouTube URL.
    # If a URL is given, the scraped comments take priority over the text box.
    data = pd.DataFrame()
    lines = a.split('\n')
    if b != "":
        lines = get_comments(b)
    if lines == []:
        text001 = 'CANNOT_GET DATA from Youtube'
        print(text001)
        return text001, '', '', None
    data['text'] = lines
    data = clean_me(data)
    X_train_indices = sentences_to_indices(data['wordseged_space_text'].values, word2vec, 128)
    result = model.predict(X_train_indices)
    txt_pos, txt_neu, txt_neg, fig = pretty_output(lines, result)
    return txt_pos, txt_neu, txt_neg, fig
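# Illustrative usage sketch (not executed): combine() can also be called outside
# Gradio, e.g. with multi-line text and an empty URL:
#   pos, neu, neg, fig = combine('อร่อยมาก\nไม่ชอบเลย', '')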
def mirror(x):
return x
with gr.Blocks() as demo:
txt = gr.Textbox(label="Input: TEXT", lines=2)
txt_2 = gr.Textbox(label="Input: Youtube URL")
btn = gr.Button(value="Submit")
txt_POS = gr.Textbox(value="", label="Positive comments")
txt_NEU = gr.Textbox(value="", label="Neutral comments")
txt_NEG = gr.Textbox(value="", label="Negative comments")
plot = gr.Plot(label="Plot")
btn.click(combine, inputs=[txt, txt_2], outputs=[txt_POS, txt_NEU, txt_NEG, plot])
if __name__ == "__main__":
demo.launch()