pknayak commited on
Commit
c5b6285
·
1 Parent(s): 7e56340

Combining both sentiment and news collector

Browse files

1. adding the sentiment analysis code.
1. adding helper functions for the sentiment analyzer.
1. combining both the gradio rows for news collector and sentiment analyzer to work on the same page

Files changed (1) hide show
  1. app.py +185 -6
app.py CHANGED
@@ -4,6 +4,15 @@ import os
4
  import json
5
  import pandas as pd
6
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def creating_data_dir(directory_path):
@@ -147,11 +156,181 @@ def call_functions(domain):
147
  # iface.launch(debug=True)
148
 
149
  # GRADIO APP USING BLOCKS
150
- with gr.Blocks() as demo:
151
- ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
152
- df_output = gr.Dataframe(type="pandas")
153
- retrieve_button = gr.Button("Retrieve news")
154
 
155
- retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
156
 
157
- demo.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import json
5
  import pandas as pd
6
 
7
+ # -----imports for Sentiment Analyzer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.base import BaseEstimator, TransformerMixin
10
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
11
+
12
+
13
+ #--------------------------------------------------------------------------------------
14
+ #------------------------ NEWS DATA RETRIEVER------------------------------------------
15
+ #--------------------------------------------------------------------------------------
16
 
17
 
18
  def creating_data_dir(directory_path):
 
156
  # iface.launch(debug=True)
157
 
158
  # GRADIO APP USING BLOCKS
 
 
 
 
159
 
 
160
 
161
+
162
+
163
+ #--------------------------------------------------------------------------------------
164
+ #------------------------ SENTIMENT ANALYZER------------------------------------------
165
+ #--------------------------------------------------------------------------------------
166
+
167
# ---------------- Data Preprocessing ----------
def re_breakline(text_list):
    """Replace newline and carriage-return characters with a space in each text."""
    return [re.sub('[\n\r]', ' ', text) for text in text_list]
170
+
171
def re_hyperlinks(text_list):
    """Replace every http/https URL in each text with the token ' link '."""
    # Raw string: the original used '\(' '\)' in a plain string, which
    # triggers invalid-escape SyntaxWarnings on Python 3.12+. The matched
    # characters are identical.
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return [re.sub(pattern, ' link ', r) for r in text_list]
175
+
176
def re_dates(text_list):
    """Replace dd/mm/yyyy (or dd.mm.yy) dates with the token ' date '."""
    # Raw string avoids invalid-escape warnings for \/ and \d; the
    # pattern itself is unchanged.
    pattern = r'([0-2][0-9]|(3)[0-1])(\/|\.)(((0)[0-9])|((1)[0-2]))(\/|\.)\d{2,4}'
    return [re.sub(pattern, ' date ', r) for r in text_list]
180
+
181
+
182
def re_money(text_list):
    """Replace money amounts like 'R$ 10,50' or '$3.99' with the token ' paisa '."""
    # Raw string avoids invalid-escape warnings for \$ and \d; the
    # pattern itself is unchanged.
    pattern = r'[R]{0,1}\$[ ]{0,}\d+(,|\.)\d+'
    return [re.sub(pattern, ' paisa ', r) for r in text_list]
186
+
187
def re_numbers(text_list):
    """Replace every run of digits with the token ' num '."""
    return [re.sub('[0-9]+', ' num ', entry) for entry in text_list]
190
+
191
def re_negation(text_list):
    """Replace Portuguese negation words (nao/não/ñ/standalone n) with ' negate '."""
    return [re.sub('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' negate ', entry) for entry in text_list]
194
+
195
def re_special_chars(text_list):
    """Replace every non-word character with a space."""
    # Raw string for the regex; matching behavior is unchanged.
    return [re.sub(r'\W', ' ', r) for r in text_list]
198
def re_whitespaces(text_list):
    """Collapse runs of whitespace to a single space and strip trailing blanks."""
    # Raw strings for the regexes; matching behavior is unchanged.
    collapsed = [re.sub(r'\s+', ' ', r) for r in text_list]
    return [re.sub(r'[ \t]+$', '', r) for r in collapsed]
203
+
204
# Sklearn-style transformer that runs a set of regex-cleaning functions
# over a corpus (a list of strings), in insertion order.
class ApplyRegex(BaseEstimator, TransformerMixin):
    """Apply every function in ``regex_transformers`` to the corpus."""

    def __init__(self, regex_transformers):
        # Mapping of label -> function(list[str]) -> list[str].
        self.regex_transformers = regex_transformers

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        corpus = X
        for transformer in self.regex_transformers.values():
            corpus = transformer(corpus)
        return corpus
219
+
220
# Sklearn-style transformer that strips stopwords from each document.
class StopWordsRemoval(BaseEstimator, TransformerMixin):
    """Remove the configured stopwords from every comment in the corpus."""

    def __init__(self, text_stopwords):
        self.text_stopwords = text_stopwords

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # `stopwords_removal` is a helper defined elsewhere in this module;
        # it is expected to return a token list per comment.
        tokenized = [stopwords_removal(comment, self.text_stopwords) for comment in X]
        return [' '.join(tokens) for tokens in tokenized]
230
+
231
# Sklearn-style transformer that stems every document in the corpus.
class StemmingProcess(BaseEstimator, TransformerMixin):
    """Apply the configured stemmer to every comment in the corpus."""

    def __init__(self, stemmer):
        self.stemmer = stemmer

    def fit(self, X, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        # `stemming_process` is a helper defined elsewhere in this module;
        # it is expected to return a token list per comment.
        stemmed = [stemming_process(comment, self.stemmer) for comment in X]
        return [' '.join(tokens) for tokens in stemmed]
242
+
243
# Sklearn-style transformer that turns the text corpus into a dense
# feature matrix using the wrapped vectorizer (e.g. TF-IDF).
class TextFeatureExtraction(BaseEstimator, TransformerMixin):
    """Vectorize the corpus and return a dense numpy array."""

    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        # Learn the vocabulary here, once, so transform() can reuse it.
        self.vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        # BUGFIX: the original called vectorizer.fit_transform() here,
        # re-learning the vocabulary on every transform — so features
        # produced at inference time did not match those the model was
        # trained on. transform() now reuses the vocabulary from fit().
        # (Pipeline.fit_transform calls fit then transform, so the
        # training path produces the same output as before.)
        return self.vectorizer.transform(X).toarray()
254
+
255
+
256
#----------------------------Creating Pipeline for Preparing the data-----
# Defining regex transformers to be applied
# The dict keys are labels only; the functions run in insertion order.
regex_transformers = {
    'break_line': re_breakline,
    'hiperlinks': re_hyperlinks,
    'dates': re_dates,
    'money': re_money,
    'numbers': re_numbers,
    'negation': re_negation,
    'special_chars': re_special_chars,
    'whitespaces': re_whitespaces
}

# Defining the vectorizer to extract features from text
# NOTE(review): stop_words=en_stopwords removes ENGLISH stopwords here,
# while the pipeline below removes PORTUGUESE stopwords and applies the
# Portuguese RSLP stemmer — confirm which language the corpus is in.
vectorizer = TfidfVectorizer(max_features=300, min_df=7, max_df=0.8, stop_words=en_stopwords)

# Building the Pipeline
# Order: regex cleaning -> stopword removal -> stemming -> TF-IDF features.
text_pipeline = Pipeline([
    ('regex', ApplyRegex(regex_transformers)),
    ('stopwords', StopWordsRemoval(stopwords.words('portuguese'))),
    ('stemming', StemmingProcess(RSLPStemmer())),
    ('text_features', TextFeatureExtraction(vectorizer))
])
279
+
280
+
281
+
282
#----------------- Analyzing the Sentiments of whole dataset-------

def sentiment_analyzer(csv_file_name='combined_news_response.csv'):
    """Predict a sentiment label for every article in the given CSV.

    Reads the CSV produced by the news collector, pushes its 'content'
    column through the text-preparation pipeline, scores it with the
    persisted Naive Bayes model, and returns a DataFrame with columns
    'content' and 'sentiment' ('negative'/'neutral'/'positive').

    Args:
        csv_file_name: Path to the collected-news CSV file.

    Returns:
        pandas.DataFrame with 'content' and 'sentiment' columns.
    """
    df = pd.read_csv(csv_file_name)
    # BUGFIX: drop the stray index column only if present — the original
    # raised KeyError whenever the CSV had no 'Unnamed: 0' column.
    df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

    # Texts to classify.
    X = list(df['content'].values)

    # NOTE(review): fit_transform re-learns the TF-IDF vocabulary on the
    # inference data, so the feature space may not match the one the
    # saved model was trained on — confirm against the training script.
    X_processed = text_pipeline.fit_transform(X)

    # Load the persisted model and run inference.
    loaded_model_nb = joblib.load("Naive Bayes_model.joblib")
    sentiments = loaded_model_nb.predict(X_processed)

    # Map integer class ids to human-readable labels.
    sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}

    print(f"df['content'].values ==> {len(df['content'].values)} \n sentiments length ==> {len(sentiments)}")

    # Pair every article with its predicted label.
    sentiment_df = pd.DataFrame({
        'content': df['content'].values,
        'sentiment': [sentiment_mapping[sent] for sent in sentiments]
    })

    return sentiment_df
312
+
313
+
314
+
315
+
316
+
317
# Gradio UI: one page with the news collector on top and the sentiment
# analyzer below.
with gr.Blocks() as demo:
    # --- Row 1: news collector -------------------------------------
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
            news_output = gr.Dataframe(type="pandas", wrap=True)
            retrieve_button = gr.Button("Retrieve news")

            retrieve_button.click(call_functions, inputs=ui_domain, outputs=news_output)

    # --- Row 2: sentiment analyzer ---------------------------------
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            # Hidden textbox only carries the CSV filename to the handler.
            ui_input = gr.Textbox(value='combined_news_response.csv', visible=False)
            view_sentiment_bttn = gr.Button("Analyze Sentiment")
            # Renamed from df_output: the original reused the same name
            # for both rows' outputs, shadowing the first component.
            sentiment_output = gr.Dataframe(type="pandas", wrap=True)

            view_sentiment_bttn.click(sentiment_analyzer, inputs=ui_input, outputs=sentiment_output)

demo.launch(debug=True)