import gradio as gr
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
from pythainlp.util import normalize
from pythainlp.tokenize import word_tokenize
from pythainlp import word_vector
import numpy as np
import keras
import plotly.express as px
#################
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import sys

sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# set path to chromedriver as per your configuration
chromedriver_autoinstaller.install()

wv = word_vector.WordVector()
word2vec = wv.get_model()
model = keras.models.load_model('my_model3.h5')


def get_comments(VIDEO_URL):
    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    # Your scraping code here
    # VIDEO_URL = 'https://www.youtube.com/watch?v=VIDEO_ID'
    driver.get(VIDEO_URL)

    # Wait for the comments to load
    time.sleep(5)

    # Scroll down to load more comments (optional, repeat as needed)
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(2)

    # Find and print comments
    comment_elements = driver.find_elements(By.XPATH, '//yt-formatted-string[@id="content-text"]')
    data = []
    for comment in comment_elements:
        if comment.text != '':
            data.append(comment.text)
            print(comment.text)

    # Close the WebDriver
    driver.quit()
    return data


def cosine_sim(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))


def sentences_to_indices(X, word2vec, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()`.

    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word2vec -- a trained Word2Vec model from gensim
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this.

    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    m = X.shape[0]  # number of training examples

    # Initialize X_indices as a numpy matrix of zeros and the correct shape
    X_indices = np.zeros((m, max_len))

    for i in range(m):  # loop over training examples
        # Convert the ith training sentence to lower case and split it into words. You should get a list of words.
        # print(X)
        # print(len(X[i].lower().split()))
        sentence_words = X[i].lower().split()[:max_len]

        # Initialize j to 0
        j = 0
        try:
            # Loop over the words of sentence_words
            for w in sentence_words:
                # Set the (i,j)th entry of X_indices to the index of the correct word.
                if w in word2vec.key_to_index:
                    X_indices[i, j] = word2vec.key_to_index[w]
                # Increment j to j + 1
                j += 1
        except:
            print('key error: ', w)

    return X_indices


def deEmojify(text):
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)


def clean_me(data):
    data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
    data['clean2_text'] = (data['clean_text'].str.strip().str.lower()
                           .str.replace(r'\r+', ' ', regex=True)
                           .str.replace(r'\n+', ' ', regex=True)
                           .str.replace(r'\t+', ' ', regex=True))
    data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
    # Normalize text
    data['clean4_text'] = data.apply(lambda row: normalize(row['clean3_text']), axis=1)
    # Word segmentation: it will take a while....
    data['wordseged_text'] = data.apply(lambda row: word_tokenize(row['clean4_text'], engine="newmm-safe"), axis=1)
    # Join the segmented words with spaces
    data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
    return data


def pretty_output(lines, sentiment):
    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]
    seriesText = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': seriesText.index, 'Count': seriesText.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])

    txt_pos = ''
    txt_neu = ''
    txt_neg = ''
    for (x, y, score) in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        tmp = f'{y} {txt_score}:-{x} \n'
        if y == 'Pos':
            txt_pos += tmp
        elif y == 'Neu':
            txt_neu += tmp
        else:
            txt_neg += tmp
    return (txt_pos, txt_neu, txt_neg, fig)


def fx(t):
    return 16 * np.sin(t) ** 3


def fy(t):
    return 13 * np.cos(t) - 5 * np.cos(2 * t) - 2 * np.cos(3 * t) - np.cos(4 * t)


def combine(a, b):
    data = pd.DataFrame()
    embedded_url = ''
    lines = a.split('\n')
    if b != "":
        lines = get_comments(b)
        if not lines:
            text001 = 'CANNOT GET DATA from YouTube'
            print(text001)
    if not lines:
        t = np.linspace(-2 * np.pi, 2 * np.pi)
        xs = fx(t)
        ys = fy(t)
        plt.plot(xs, ys, "o")
        plt.title('My Heart')
        plt.xlabel(' CANNOT LOAD VDO ')
        plt.ylabel(' CANNOT LOAD VDO ')
        str_output = 'www.youtube.com/embed/KRhPBvrBhro?si=U8sOh4ighEG9hkTI'
        embed_html = f'