Combining the sentiment analyzer and the news collector
1. Adding the sentiment analysis code.
2. Adding helper functions for the sentiment analyzer.
3. Combining the Gradio rows for the news collector and the sentiment analyzer so both work on the same page.
app.py
CHANGED
@@ -4,6 +4,15 @@ import os
 import json
 import pandas as pd
 
+# ----- imports for Sentiment Analyzer
+from sklearn.pipeline import Pipeline
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+
+
+#--------------------------------------------------------------------------------------
+#------------------------ NEWS DATA RETRIEVER ------------------------------------------
+#--------------------------------------------------------------------------------------
 
 
 def creating_data_dir(directory_path):
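Note: besides the sklearn imports added above, the sentiment code in the next hunk also uses `re`, `joblib`, NLTK's `stopwords` corpus and `RSLPStemmer`, an `en_stopwords` list, and the helpers `stopwords_removal` and `stemming_process`, none of which appear in this diff; presumably they already exist elsewhere in app.py. If not, a minimal sketch of what those definitions could look like (the download calls and helper bodies here are assumptions inferred from usage, not part of the commit):

```python
# Hypothetical support code the sentiment hunk below appears to rely on;
# assumed to live elsewhere in app.py, not added by this commit.
import re
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

nltk.download('stopwords')   # stopword lists used by the pipeline
nltk.download('rslp')        # data for the Portuguese RSLP stemmer

en_stopwords = stopwords.words('english')

def stopwords_removal(text, text_stopwords):
    # Drop tokens that appear in the given stopword list
    return [word for word in text.split() if word.lower() not in text_stopwords]

def stemming_process(text, stemmer):
    # Stem each token of a single comment/article
    return [stemmer.stem(word) for word in text.split()]
```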
@@ -147,11 +156,181 @@ def call_functions(domain)
 # iface.launch(debug=True)
 
 # GRADIO APP USING BLOCKS
-with gr.Blocks() as demo:
-    ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
-    df_output = gr.Dataframe(type="pandas")
-    retrieve_button = gr.Button("Retrieve news")
 
-    retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
 
-
+
+
+#--------------------------------------------------------------------------------------
+#------------------------ SENTIMENT ANALYZER ------------------------------------------
+#--------------------------------------------------------------------------------------
+
+#---------------- Data Preprocessing ----------
+def re_breakline(text_list):
+    return [re.sub('[\n\r]', ' ', r) for r in text_list]
+
+def re_hyperlinks(text_list):
+    # Applying regex
+    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
+    return [re.sub(pattern, ' link ', r) for r in text_list]
+
+def re_dates(text_list):
+    # Applying regex
+    pattern = '([0-2][0-9]|(3)[0-1])(\/|\.)(((0)[0-9])|((1)[0-2]))(\/|\.)\d{2,4}'
+    return [re.sub(pattern, ' date ', r) for r in text_list]
+
+
+def re_money(text_list):
+    # Applying regex
+    pattern = '[R]{0,1}\$[ ]{0,}\d+(,|\.)\d+'
+    return [re.sub(pattern, ' paisa ', r) for r in text_list]
+
+def re_numbers(text_list):
+    # Applying regex
+    return [re.sub('[0-9]+', ' num ', r) for r in text_list]
+
+def re_negation(text_list):
+    # Applying regex
+    return [re.sub('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' negate ', r) for r in text_list]
+
+def re_special_chars(text_list):
+    # Applying regex
+    return [re.sub('\W', ' ', r) for r in text_list]
+def re_whitespaces(text_list):
+    # Applying regex
+    white_spaces = [re.sub('\s+', ' ', r) for r in text_list]
+    white_spaces_end = [re.sub('[ \t]+$', '', r) for r in white_spaces]
+    return white_spaces_end
+
+# Class for regular expressions application
+class ApplyRegex(BaseEstimator, TransformerMixin):
+
+    def __init__(self, regex_transformers):
+        self.regex_transformers = regex_transformers
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        # Applying all regex functions in the regex_transformers dictionary
+        for regex_name, regex_function in self.regex_transformers.items():
+            X = regex_function(X)
+
+        return X
+
+# Class for stopwords removal from the corpus
+class StopWordsRemoval(BaseEstimator, TransformerMixin):
+
+    def __init__(self, text_stopwords):
+        self.text_stopwords = text_stopwords
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return [' '.join(stopwords_removal(comment, self.text_stopwords)) for comment in X]
+
+# Class for applying the stemming process
+class StemmingProcess(BaseEstimator, TransformerMixin):
+
+    def __init__(self, stemmer):
+        self.stemmer = stemmer
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return [' '.join(stemming_process(comment, self.stemmer)) for comment in X]
+
+# Class for extracting features from the corpus
+class TextFeatureExtraction(BaseEstimator, TransformerMixin):
+
+    def __init__(self, vectorizer):
+        self.vectorizer = vectorizer
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X, y=None):
+        return self.vectorizer.fit_transform(X).toarray()
+
+
+#---------------------------- Creating Pipeline for Preparing the data -----
+# Defining regex transformers to be applied
+regex_transformers = {
+    'break_line': re_breakline,
+    'hiperlinks': re_hyperlinks,
+    'dates': re_dates,
+    'money': re_money,
+    'numbers': re_numbers,
+    'negation': re_negation,
+    'special_chars': re_special_chars,
+    'whitespaces': re_whitespaces
+}
+
+# Defining the vectorizer to extract features from text
+vectorizer = TfidfVectorizer(max_features=300, min_df=7, max_df=0.8, stop_words=en_stopwords)
+
+# Building the Pipeline
+text_pipeline = Pipeline([
+    ('regex', ApplyRegex(regex_transformers)),
+    ('stopwords', StopWordsRemoval(stopwords.words('portuguese'))),
+    ('stemming', StemmingProcess(RSLPStemmer())),
+    ('text_features', TextFeatureExtraction(vectorizer))
+])
+
+
+
+#----------------- Analyzing the Sentiments of the whole dataset -------
+
+def sentiment_analyzer(csv_file_name='combined_news_response.csv'):
+
+    df = pd.read_csv(csv_file_name)
+    df.drop('Unnamed: 0', axis=1, inplace=True)
+
+    # Splitting into X and y
+    X = list(df['content'].values)
+    # Applying the pipeline
+    X_processed = text_pipeline.fit_transform(X)
+
+    # Load a saved model
+    loaded_model_nb = joblib.load("Naive Bayes_model.joblib")
+
+    # Use the loaded model for inference
+    loaded_predictions_nb = loaded_model_nb.predict(X_processed)
+    sentiments = loaded_predictions_nb
+
+    # Sentiment mapping
+    sentiment_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
+
+    print(f"df['content'].values ==> {len(df['content'].values)} \n sentiments length ==> {len(sentiments)}")
+    # Create a DataFrame
+    sentiment_df = pd.DataFrame({
+        'content': df['content'].values,
+        'sentiment': [sentiment_mapping[sent] for sent in sentiments]
+    })
+
+    return sentiment_df
+
+
+
+
+
+# Creating the app for both
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(scale=1, min_width=600):
+            ui_domain = gr.Dropdown(["bbc", "forbes", "businessinsider_us"], label="Select Domain")
+            df_output = gr.Dataframe(type="pandas", wrap=True)
+            retrieve_button = gr.Button("Retrieve news")
+
+            retrieve_button.click(call_functions, inputs=ui_domain, outputs=df_output)
+
+    with gr.Row():
+        with gr.Column(scale=1, min_width=600):
+            ui_input = gr.Textbox(value='combined_news_response.csv', visible=False)
+            view_sentiment_bttn = gr.Button("Analyze Sentiment")
+            df_output = gr.Dataframe(type="pandas", wrap=True)
+
+            view_sentiment_bttn.click(sentiment_analyzer, inputs=ui_input, outputs=df_output)
+
+demo.launch(debug=True)
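Two notes on the combined app. The hidden `gr.Textbox` is a simple way to feed the fixed CSV filename into `sentiment_analyzer` without exposing an input, and reusing the name `df_output` for the second `gr.Dataframe` is harmless because the first click handler is wired up before the name is rebound. More importantly, `TextFeatureExtraction.transform` calls `self.vectorizer.fit_transform(X)`, so the TF-IDF vocabulary is re-learned from whatever CSV is being scored rather than reusing the vocabulary that `Naive Bayes_model.joblib` was trained on; with `max_features=300`, the columns fed to the model can mean different things on every run. A minimal sketch of the usual fix, assuming the fitted vectorizer is persisted at training time (the file name `fitted_vectorizer.joblib` is hypothetical):

```python
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

class TextFeatureExtraction(BaseEstimator, TransformerMixin):
    """Feature extractor that reuses an already-fitted vectorizer."""

    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # transform() only: keeps feature columns aligned with training time
        return self.vectorizer.transform(X).toarray()

# At training time (outside app.py): fit once and persist.
#     vectorizer.fit(X_train_preprocessed)
#     joblib.dump(vectorizer, "fitted_vectorizer.joblib")
# At inference time: load it and plug it into the pipeline instead of a fresh one.
#     fitted_vectorizer = joblib.load("fitted_vectorizer.joblib")
#     ('text_features', TextFeatureExtraction(fitted_vectorizer))
```

It is also worth checking that the pipeline's language choices match the data: it removes Portuguese stopwords and stems with the Portuguese RSLP stemmer, while the vectorizer is given `en_stopwords` and the scraped domains (bbc, forbes, businessinsider_us) are English-language sources.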