ThanaphonJoe committed on
Commit
77f9dcf
1 Parent(s): 43452d4
Files changed (1) hide show
  1. app.py +144 -48
app.py CHANGED
@@ -1,15 +1,107 @@
1
  import gradio as gr
2
- import pickle
3
  import os
4
- import numpy as np
5
- import pandas as pd
6
- from sklearn.model_selection import train_test_split
7
- from sklearn.metrics import confusion_matrix
8
  import matplotlib.pyplot as plt
 
9
  import re
10
  from pythainlp.util import normalize
11
- from pythainlp.corpus import thai_stopwords
12
  from pythainlp.tokenize import word_tokenize
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  def deEmojify(text):
@@ -37,13 +129,6 @@ def deEmojify(text):
37
 
38
 
39
  def clean_me(data):
40
- stopwords = list(thai_stopwords())
41
- stopwords.append("nan")
42
- stopwords.append("-")
43
- stopwords.append("_")
44
- stopwords.append("")
45
- stopwords.append(" ")
46
-
47
  data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
48
  data['clean2_text']= data['clean_text'].str.strip().str.lower().str.replace('\r+', ' ').str.replace('\n+',' ').str.replace('\t+',' ')
49
  data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
@@ -54,16 +139,49 @@ def clean_me(data):
54
  # Join the wordsegged with space
55
  data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
56
 
57
-
58
  return(data)
59
 
60
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def combine(a, b):
62
  data = pd.DataFrame()
63
- data['text'] = [a]
 
 
 
 
 
 
 
64
  data = clean_me(data)
65
- a = data['wordseged_space_text'][0] + '123'
66
- return a + " " + b
 
 
 
 
 
67
 
68
 
69
  def mirror(x):
@@ -72,36 +190,14 @@ def mirror(x):
72
 
73
  with gr.Blocks() as demo:
74
 
75
- txt = gr.Textbox(label="Input", lines=2)
76
- txt_2 = gr.Textbox(label="Input 2")
77
- txt_3 = gr.Textbox(value="", label="Output")
78
- btn = gr.Button(value="Submit")
79
- btn.click(combine, inputs=[txt, txt_2], outputs=[txt_3])
80
-
81
- with gr.Row():
82
- im = gr.Image()
83
- im_2 = gr.Image()
84
-
85
- btn = gr.Button(value="Mirror Image")
86
- btn.click(mirror, inputs=[im], outputs=[im_2])
87
-
88
- gr.Markdown("## Text Examples")
89
- gr.Examples(
90
- [["hi", "Adam"], ["hello", "Eve"]],
91
- [txt, txt_2],
92
- txt_3,
93
- combine,
94
- cache_examples=True,
95
- )
96
- gr.Markdown("## Image Examples")
97
- gr.Examples(
98
- examples=[os.path.join(os.path.dirname(__file__), "lion.jpg")],
99
- inputs=im,
100
- outputs=im_2,
101
- fn=mirror,
102
- cache_examples=True,
103
- )
104
-
105
 
106
 
107
  if __name__ == "__main__":
 
1
  import gradio as gr
 
2
  import os
 
 
 
 
3
  import matplotlib.pyplot as plt
4
+ import pandas as pd
5
  import re
6
  from pythainlp.util import normalize
 
7
  from pythainlp.tokenize import word_tokenize
8
+ from pythainlp import word_vector
9
+ import numpy as np
10
+ import keras
11
+ import plotly.express as px
12
+ #################
13
+ from selenium import webdriver
14
+ from selenium.webdriver.common.keys import Keys
15
+ from selenium.webdriver.common.by import By
16
+ import time
17
+ import chromedriver_autoinstaller
18
+ import sys
19
# NOTE(review): sys.path is Python's module search path, so inserting the
# chromedriver location here does not help Selenium find the binary. It looks
# like a leftover from a notebook recipe -- confirm before removing.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

# Chrome options shared by every WebDriver session opened in this module.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Make a chromedriver binary available before any WebDriver is created.
chromedriver_autoinstaller.install()

# Thai word-vector model; its vocabulary is used by sentences_to_indices().
wv = word_vector.WordVector()
word2vec = wv.get_model()

# Sentiment classifier loaded once at import time and reused for every request.
model = keras.models.load_model('my_model3.h5')
33
+
34
def get_comments(VIDEO_URL):
    """Scrape the currently loaded comment texts from a YouTube video page.

    Opens a headless Chrome session (using the module-level ``chrome_options``),
    loads the page, scrolls to the bottom once so the comment section is
    requested, and collects the text of every rendered comment element.

    Arguments:
    VIDEO_URL -- URL of the video page to open.

    Returns:
    data -- list of non-blank comment strings, in page order. Empty when no
            comment element was found.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(VIDEO_URL)

        # Fixed waits: give the page, and then the lazily loaded comment
        # section, time to render.
        time.sleep(5)

        # Scroll down to trigger loading of the comments.
        driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
        time.sleep(2)

        comment_elements = driver.find_elements(
            By.XPATH, '//yt-formatted-string[@id="content-text"]'
        )

        data = []
        for comment in comment_elements:
            # Compare the element's text, not the element object itself,
            # so that empty comments are really skipped.
            text = comment.text
            if text.strip():
                data.append(text)
                print(text)
        return data
    finally:
        # Always release the browser, even when navigation or lookup fails.
        driver.quit()
60
def cosine_sim(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Arguments:
    u, v -- 1-D array-likes of equal length.

    Returns:
    dot(u, v) / (|u| * |v|), or 0.0 when either vector has zero norm
    (the similarity is undefined there; this avoids a NaN and a
    division-by-zero warning).
    """
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 0.0
    return np.dot(u, v) / denom
62
+
63
def sentences_to_indices(X, word2vec, max_len):
    """
    Convert space-separated sentences into rows of vocabulary indices.

    Each sentence is lower-cased, split on whitespace and truncated to its
    first `max_len` tokens. Tokens found in `word2vec.key_to_index` are written
    left to right into the row; unknown tokens are skipped, and the rest of
    the row stays 0.

    Arguments:
    X -- 1-D sequence of sentences (strings), of length m
    word2vec -- object exposing a `key_to_index` mapping from token to index
    max_len -- number of columns in the output; longer sentences are truncated

    Returns:
    X_indices -- float array of shape (m, max_len) holding the token indices
    """
    vocab = word2vec.key_to_index

    # Zero-initialised, so short sentences are implicitly padded with 0.
    X_indices = np.zeros((len(X), max_len))

    for i, sentence in enumerate(X):
        sentence_words = sentence.lower().split()[:max_len]

        # NOTE(review): the original indentation was lost; this assumes the
        # column counter only advanced for in-vocabulary words, i.e. known
        # indices are packed contiguously from column 0 -- confirm against
        # the preprocessing used when the model was trained.
        known = [vocab[w] for w in sentence_words if w in vocab]
        X_indices[i, :len(known)] = known

    return X_indices
105
 
106
 
107
  def deEmojify(text):
 
129
 
130
 
131
  def clean_me(data):
 
 
 
 
 
 
 
132
  data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
133
  data['clean2_text']= data['clean_text'].str.strip().str.lower().str.replace('\r+', ' ').str.replace('\n+',' ').str.replace('\t+',' ')
134
  data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
 
139
  # Join the wordsegged with space
140
  data['wordseged_space_text'] = data.apply(lambda row: " ".join(row["wordseged_text"]), axis=1)
141
 
 
142
  return(data)
143
 
144
def pretty_output(lines, sentiment):
    """Format per-line predictions as three text reports and a bar chart.

    Arguments:
    lines -- sequence of the original input strings.
    sentiment -- 2-D array of scores, one row per line; the column order is
                 taken to be (Neg, Neu, Pos) and the arg-max column is the
                 predicted label.

    Returns:
    (txt_pos, txt_neu, txt_neg, fig) -- one newline-separated report per
    label, each row being "<label> <scores>:-<line>", and a plotly bar
    figure with the number of lines per label.
    """
    label = np.array(['Neg', 'Neu', 'Pos'])
    txt_sentiment = label[np.argmax(sentiment, axis=1)]

    # Bar chart of how many lines fell into each class, in a fixed axis order.
    counts = pd.Series(txt_sentiment).value_counts()
    df = pd.DataFrame({'Sentiment': counts.index, 'Count': counts.values})
    fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment')
    fig.update_xaxes(categoryorder='array', categoryarray=['Neg', 'Neu', 'Pos'])

    # Collect report rows per label and join once at the end instead of
    # growing three strings with += inside the loop.
    rows = {'Pos': [], 'Neu': [], 'Neg': []}
    for text, tag, score in zip(lines, txt_sentiment, sentiment):
        txt_score = [f"{i:.2f}" for i in score]
        rows[tag].append(f'{tag} {txt_score}:-{text} \n')

    return (''.join(rows['Pos']), ''.join(rows['Neu']), ''.join(rows['Neg']), fig)
def combine(a, b):
    """Classify the sentiment of pasted text lines or of a video's comments.

    Arguments:
    a -- free text; each non-blank line is classified separately.
    b -- YouTube URL. When non-empty it takes precedence over `a`, and the
         comments returned by get_comments() are classified instead.

    Returns:
    (txt_pos, txt_neu, txt_neg, fig) as produced by pretty_output(). When
    there is nothing to classify (blank input, or no comment could be
    scraped) the three texts are empty and fig is None, instead of failing
    on an empty DataFrame.
    """
    # NOTE(review): presumably the sequence length my_model3.h5 was trained
    # with -- confirm before changing.
    max_len = 128

    url = (b or '').strip()
    if url:
        lines = get_comments(url)
        if not lines:
            print('CANNOT_GET DATA from Youtube')
    else:
        lines = (a or '').split('\n')

    # Blank lines carry no tokens; drop them so they are not classified and
    # so `lines` stays aligned with the rows fed to the model.
    lines = [line for line in lines if line.strip()]
    if not lines:
        return '', '', '', None

    data = pd.DataFrame({'text': lines})
    data = clean_me(data)  # adds the 'wordseged_space_text' column

    X_indices = sentences_to_indices(
        data['wordseged_space_text'].values, word2vec, max_len
    )
    result = model.predict(X_indices)

    return pretty_output(lines, result)
185
 
186
 
187
  def mirror(x):
 
190
 
with gr.Blocks() as demo:
    # Inputs: free text (split into lines by combine) or a YouTube URL.
    txt = gr.Textbox(label="Input: TEXT", lines=2)
    txt_2 = gr.Textbox(label="Input: Youtube URL")
    btn = gr.Button(value="Submit")

    # Outputs: one report box per sentiment class plus the count chart.
    txt_POS = gr.Textbox(value="", label="Positive comments")
    txt_NEU = gr.Textbox(value="", label="Neutral comments")
    txt_NEG = gr.Textbox(value="", label="Negative comments")
    plot = gr.Plot(label="Plot")

    # The output order must match the tuple returned by combine().
    btn.click(combine, inputs=[txt, txt_2], outputs=[txt_POS, txt_NEU, txt_NEG, plot])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
 
203
  if __name__ == "__main__":