ksvmuralidhar commited on
Commit
078c1e1
1 Parent(s): 0f43b71

Upload files

Browse files
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Image for the News Aggregator web app: a Flask app served by gunicorn, with a
# Redis server running inside the same container as a cache (see start.sh).
FROM python:3.7-slim
WORKDIR /webapp

# Install Redis in a single layer and remove the apt package lists afterwards
# so they do not end up in the image. apt-get is used because the "apt"
# front end does not guarantee a stable command-line interface for scripts.
RUN apt-get update \
    && apt-get install -y --no-install-recommends redis-server \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements.txt on its own first so the dependency layers are reused
# from cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

COPY . .
RUN chmod +x /webapp/start.sh

# 7860 is the gunicorn port, 6379 is the Redis port.
EXPOSE 7860 6379
CMD ["/webapp/start.sh"]
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: News Aggregator
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
  license: mit
 
1
  ---
2
  title: News Aggregator
3
+ emoji:
4
+ colorFrom: indigo
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  license: mit
app.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from dateutil import parser
4
+ from flask import Flask, render_template
5
+ from flask_cors import cross_origin, CORS
6
+ from db_operations.db_operations import DBOperations
7
+ import logging
8
+ import traceback
9
+ import redis
10
+ from datetime import datetime
11
+ from functools import lru_cache
12
+ from word_cloud import get_frequent_words_html
13
+ from config import NEWS_RETENTION_SECONDS
14
+
15
+
16
# Flask application object; CORS is enabled for the whole app.
app = Flask(__name__)
CORS(app)
# Redis client used as the cache for the news data and the last-fetch timestamp.
# decode_responses=True makes get() return str instead of bytes.
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
# Reader for the MongoDB news collection.
db = DBOperations()


# Minimum number of seconds between two reads from the database.
REFRESH_FREQ = 300 # 300 secs = 5 mins
23
+
24
def is_db_fetch_reqd():
    """
    Decide whether the news must be re-read from the database.

    The time of the last refresh is stored in Redis under ``NEWSFETCHTIME``.
    A refresh is required when the key is missing or when more than
    ``REFRESH_FREQ`` seconds have passed since it was written; in both cases
    the key is reset to the current time. Any failure (Redis unreachable,
    unparsable timestamp) also results in a refresh.
    :return: int, 1 if the database must be read, 0 if the cache can be used.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    try:
        env_news_time = redis_client.get('NEWSFETCHTIME')
        logging.warning(f'fetch_time_env_var: {env_news_time}')
        now = datetime.now()
        if env_news_time is not None:
            # total_seconds() includes whole days; timedelta.seconds wraps around every
            # 24 hours and would make a day-old timestamp look fresh.
            lapse_seconds = (now - datetime.strptime(env_news_time, time_format)).total_seconds()
            # A negative lapse (clock moved backwards) is treated as stale.
            if 0 <= lapse_seconds <= REFRESH_FREQ:
                return 0
        # strftime always writes the microseconds; str(datetime) drops them when they
        # are 0, which the '%f' in time_format could not parse back.
        redis_client.set("NEWSFETCHTIME", now.strftime(time_format))
        return 1
    except Exception as e:
        logging.warning(f'is_db_fetch_reqd() error: {e}')
        return 1
44
+
45
+
46
def correct_date(x):
    """
    Replace a value that cannot be a timestamp with a fixed placeholder date.

    A value is accepted only if it is a string containing ":".
    :param x: value read from the parsed_date column
    :return: str, x unchanged when accepted, otherwise "2020-11-07 00:36:44+05:30"
    """
    if isinstance(x, str) and ":" in x:
        return x
    logging.warning(f'correct_date() error: {x} is not the right date format')
    # Fixed date in the past, so the caller still gets a parsable value.
    return "2020-11-07 00:36:44+05:30"
51
+
52
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.
    :param dt: date, a datetime.datetime or pandas.Timestamp (timezone-aware or naive)
    :return: int, minutes elapsed; 100000 when dt is not a usable date.
    """
    try:
        # "tzinfo" exists on both datetime.datetime and pandas.Timestamp, whereas "tz"
        # is pandas-only; "now" is taken in the same timezone as dt.
        now = dt.now(dt.tzinfo)
        return int(np.round((now - dt).total_seconds() / 60, 0))
    except Exception:
        logging.warning(f'date_time_parser() error: {dt} is not the right date format')
        return 100000
63
+
64
+
65
def elapsed_time_str(mins):
    """
    Return the time elapsed string from minutes passed as an argument.
    :param mins: int, minutes elapsed. Negative values are treated as 0.
    :return: str, time elapsed string; "-" when mins is not a usable number.
    """
    try:
        # A publish time slightly in the future would otherwise be shown as
        # "-3 minutes ago".
        if mins < 0:
            mins = 0
        hours = int(mins / 60)
        # Days are rounded to one decimal, so anything from 0.95 days up counts as a day.
        days = np.round(mins / (60 * 24), 1)
        remaining_mins = int(mins - (hours * 60))

        if days >= 1:
            return 'a day ago' if days == 1 else f'{str(days)} days ago'

        if mins >= 60:
            hours_txt = 'an hour' if hours == 1 else f'{str(hours)} hours'
            if remaining_mins == 0:
                return f'{hours_txt} ago'
            if remaining_mins == 1:
                return f'{hours_txt} and a min ago'
            return f'{hours_txt} and {str(remaining_mins)} mins ago'

        if mins == 0:
            return 'Just in'
        if mins == 1:
            return 'a minute ago'
        return f'{str(mins)} minutes ago'
    except Exception:
        return "-"
101
+
102
+
103
+
104
def fetch_from_db(fetch_flag):
    """
    Return the news and the word-cloud HTML, either from the database or from the Redis cache.

    When fetch_flag is 1 the news is read from the database, the word cloud is
    rebuilt and both are written to Redis ("NEWSDF" and "NEWSWORDCLOUD").
    Otherwise both are read back from Redis; if either key is missing the
    database is read instead, so an empty cache does not fail the request.
    :param fetch_flag: int, 1 to read from the database, anything else to use the cache.
    :return: tuple (pandas.DataFrame of news, str of word-cloud HTML)
    :raises Exception: any database, Redis or parsing error is logged and re-raised.
    """
    try:
        logging.warning(f'fetch_flag: {fetch_flag}')
        if fetch_flag != 1:
            cached_news = redis_client.get("NEWSDF")
            cached_tokens = redis_client.get("NEWSWORDCLOUD")
            if cached_news is not None and cached_tokens is not None:
                final_df = pd.read_json(cached_news)
                logging.warning('Fetched From Cache\n\n')
                return final_df, cached_tokens
            logging.warning('Cache is empty, reading from DB instead')

        final_df = db.read_news_from_db()
        freq_tokens = get_frequent_words_html(final_df)
        logging.warning('Fetched From DB\n\n')

        # The _id column is cast to str before the frame is serialised with to_json().
        final_df['_id'] = final_df['_id'].astype('str')

        redis_client.set("NEWSDF", final_df.to_json())
        redis_client.set("NEWSWORDCLOUD", freq_tokens)
        return final_df, freq_tokens
    except Exception:
        logging.exception('fetch_from_db() failed')
        raise
127
+
128
+
129
def _empty_news_df():
    """
    Build the single blank row used when no news can be shown.
    :return: pandas.DataFrame with the columns title, url, description and src_time
    """
    return pd.DataFrame({'title': '', 'url': '',
                         'description': '', 'src_time': ''}, index=[0])


def _news_item_html(row):
    """
    Render one news article as an HTML card.
    :param row: row of the news DataFrame; url, category, description, title,
                src_time and similar_news are read from it.
    :return: str, HTML of the card
    """
    # NOTE(review): the values are inserted into the markup unescaped and the template
    # renders the body with "| safe" - confirm they are sanitised before they are stored.
    href = row["url"]
    category = row["category"]
    description = row["description"]
    url_txt = row["title"]
    src_time = row["src_time"]
    sim_news = row['similar_news']
    return f'''<div class="news-item"><div style="padding-top: 7px;">
                <a href="{href}" target="_blank" class="article-category">{category}
                </a>
                </div>
                <div>
                <a href="{href}" target="_blank" class="headline">{url_txt}
                </a>
                </div>
                <div>
                <a href="{href}" target="_blank" class="description">
                {description}
                </a>
                </div>
                <div>
                <a href="{href}" target="_blank" class="time">
                {src_time}
                </a>
                </div>


                <div class="container">
                <div class="content" style="display: none;">
                {sim_news}
                </div>
                <div class="show-similar-button-container">
                <button type="button" class="show-more">Show similar news</button>
                <button type="button" class="show-less">Hide similar news</button>
                </div>
                </div>



                <div>
                <p></p>
                </div></div>
                '''


@app.route("/")
@cross_origin()
def index():
    """
    Entry point

    Loads the news (from the database or the cache), computes how long ago each
    item was published, drops items that are too old, duplicated or untitled,
    and renders them into templates/index.html. When the news cannot be loaded
    or at most one row is left, an "unavailable" message is rendered instead.
    """
    src_str = ''
    freq_tokens = ''
    try:
        final_df, freq_tokens = fetch_from_db(is_db_fetch_reqd())
        if len(final_df) > 1:
            final_df["parsed_date"] = [parser.parse(correct_date(date_)) for date_ in final_df['parsed_date']]
            final_df["elapsed_time"] = [date_time_parser(date_) for date_ in final_df['parsed_date']]
            # NOTE(review): elapsed_time is in minutes (see date_time_parser) while the
            # constant is named *_SECONDS - confirm which unit is intended.
            final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
            final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
            final_df.sort_values(by="elapsed_time", inplace=True)
            src_str = ", ".join(sorted([*final_df['src'].unique()]))
            final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
            final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
            final_df.drop_duplicates(subset='description', inplace=True)
            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
        else:
            final_df = _empty_news_df()
    except Exception:
        final_df = _empty_news_df()
        # logging.exception records the traceback; traceback.print_exc() returns None,
        # so logging its return value only ever logged "None".
        logging.exception('index() could not prepare the news')

    # The page is collected as a list of fragments and joined once at the end.
    parts = ['''
            <div class="box" id="main">
            <form>

            <div class="banner">
            <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
            <h1 style="display:inline-block; vertical-align: middle;">Latest News</h1>
            </div>
            ''']

    if len(final_df) <= 1:
        parts.append('''<div><p class="unavailable">This app is temporarily unavailable</p></div>''')
    else:
        try:
            last_update_utc = datetime.strptime(redis_client.get('NEWSFETCHTIME'), '%Y-%m-%d %H:%M:%S.%f')
            last_update_utc = last_update_utc.strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            # A missing or unreadable timestamp should not fail the whole page.
            logging.exception('index() could not read NEWSFETCHTIME')
            last_update_utc = '-'
        # NOTE(review): the timestamp is written with datetime.now() (local time) but is
        # labelled UTC here - correct only if the server clock runs in UTC.
        parts.append(f'<p class="srctxt">News aggregated from <b>{src_str}</b>.<br><br>Last updated: {last_update_utc} UTC</p>')

        parts.append('''
            <div class="input-container">
            <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
            <div class="clear-btn" id="clearBtn" onclick="clearFilter()">&times;</div>
            </div>
            ''')

        parts.append(f"{freq_tokens} ")
        parts.append('<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>')

        parts.append('''<div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
            News categories and similar news are AI-generated</div>''')

        for _, row in final_df.iterrows():  # one card per news item
            parts.append(_news_item_html(row))

    parts.append('</form></div>')
    return render_template("index.html", body="".join(parts))
238
+
239
+
240
if __name__ == "__main__":
    # Development entry point only; start.sh serves the app with gunicorn
    # (--workers 5 --threads 5). Flask's app.run() forwards extra keyword arguments
    # to werkzeug's run_simple(), which has no "workers" or "threads" parameter, so
    # passing them raises TypeError. threaded=True is the dev-server option for
    # handling requests concurrently.
    app.run(host="0.0.0.0", port=7860, threaded=True)
242
+
config.py ADDED
@@ -0,0 +1 @@
 
 
1
# Maximum age of a news item that is still shown on the page.
# NOTE(review): despite the name, app.py compares this value against the elapsed time
# in minutes returned by date_time_parser() - confirm the intended unit.
NEWS_RETENTION_SECONDS = 300
db_operations/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from db_operations.db_operations import *
db_operations/db_operations.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymongo
2
+ import os
3
+ import pandas as pd
4
+
5
+
6
class DBOperations:
    """
    Reads news from MongoDB

    The connection URL is taken from the DB_URL environment variable. Each call
    to read_news_from_db() opens a connection, reads the whole collection and
    closes the connection again.
    """
    def __init__(self):
        self.url = os.getenv('DB_URL')
        self.database = "rss_news_db_cat_pred_sim_news"
        self.collection = "rss_news_cat_pred_sim_news"
        self.__client = None
        # 1 when the current read attempt has failed, 0 otherwise.
        self.__error = 0

    @staticmethod
    def _blank_df():
        """
        Build the single blank row returned when no news could be read.
        :return: pandas.DataFrame with the columns _id, title, url, description, parsed_date and src
        """
        return pd.DataFrame({'_id': '', 'title': '', 'url': '',
                             'description': '', 'parsed_date': '',
                             'src': ''}, index=[0])

    def __connect(self):
        """
        Open the MongoDB client and check that the server answers.
        :raises Exception: whatever pymongo raises when the connection fails.
        """
        try:
            self.__client = pymongo.MongoClient(self.url)
            # Listing the databases needs a reply from the server, so connection
            # problems surface here instead of during the read.
            _ = self.__client.list_database_names()
        except Exception:
            self.__error = 1
            # Close the client that failed the check instead of just dropping it.
            self.__close_connection()
            raise

    def __read(self):
        """
        Read every document of the news collection.
        :return: pandas.DataFrame of the documents; the blank row if the read fails.
        """
        try:
            coll = self.__client[self.database][self.collection]
            rss_df = pd.DataFrame(list(coll.find()))
        except Exception as read_err:
            self.__error = 1
            print(f"Read failed: {read_err}")
            rss_df = self._blank_df()
        return rss_df

    def __close_connection(self):
        """Close the MongoDB client if one is open."""
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    def read_news_from_db(self):
        """
        Read the news from MongoDB.
        :return: pandas.DataFrame of news; the blank row when DB_URL is not set
                 or the read fails.
        :raises Exception: when the connection to MongoDB cannot be established.
        """
        rss_df = self._blank_df()
        if self.url is None:
            return rss_df
        # Start every call with a clean flag; otherwise one failed attempt would
        # stop this object from ever reading the database again.
        self.__error = 0
        try:
            self.__connect()
            rss_df = self.__read()
            if self.__error == 0:
                print("Read Successful")
        finally:
            self.__close_connection()
        return rss_df
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
regex==2021.8.3
lxml==4.6.3
numpy==1.21.1
python-dateutil==2.8.2
pandas==1.3.1
requests==2.26.0
bs4==0.0.1
flask==2.2.2
flask_cors==3.0.10
gunicorn==20.1.0
pymongo==4.3.3
redis
# word_cloud.py imports unidecode, which was missing from this list
Unidecode
start.sh ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
#!/bin/bash
# Container entry point: start Redis in the background, then serve the Flask app with gunicorn.

# Stop immediately if Redis cannot be started instead of serving without a cache.
set -e

redis-server --daemonize yes

# exec replaces this shell with gunicorn, so the container's stop signals reach gunicorn directly.
exec gunicorn -b 0.0.0.0:7860 --timeout 120 --workers 5 --threads 5 app:app
static/favicon_new.png ADDED
static/loader.gif ADDED
static/refresh_reload_icon.png ADDED
static/styles.css ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ html {
2
+ scroll-behavior: smooth;
3
+ }
4
+
5
+ @media screen and (min-width: 800px) {
6
+ a.headline {
7
+ background-color: #E5E4E2;
8
+ display: block;
9
+ width: relative;
10
+ text-decoration: none;
11
+ color: black;
12
+ line-height: 1.2;
13
+ align: justify;
14
+ border-left: 5px solid transparent;
15
+ border-top: 5px solid transparent;
16
+ border-bottom: 5px solid transparent;
17
+ border-right: 0px;
18
+ font-weight: bold;
19
+ font-size: 18px;
20
+ padding-right: 5px;
21
+ font-family: Arial, Helvetica, sans-serif;
22
+ }
23
+ }
24
+
25
+ @media screen and (max-width: 800px) {
26
+ a.headline {
27
+ background-color: #E5E4E2;
28
+ display: block;
29
+ width: relative;
30
+ text-decoration: none;
31
+ color: black;
32
+ line-height: 1.2;
33
+ align: justify;
34
+ border-left: 5px solid transparent;
35
+ border-top: 5px solid transparent;
36
+ border-bottom: 5px solid transparent;
37
+ border-right: 0px;
38
+ font-weight: bold;
39
+ font-size: 16.5px;
40
+ padding-right: 5px;
41
+ font-family: Arial, Helvetica, sans-serif;
42
+ }
43
+ }
44
+
45
+ @media screen and (min-width: 800px) {
46
+ a.description {
47
+ background-color: #E5E4E2;
48
+ align:justify;
49
+ text-align: justify;
50
+ display: block;
51
+ height:100%;
52
+ width: relative;
53
+ text-decoration: none;
54
+ border-left: 5px solid transparent;
55
+ border-top: 0px;
56
+ border-bottom: 7px solid transparent;
57
+ border-right: 0px;
58
+ font-size: 14px;
59
+ padding-right: 5px;
60
+ font-family: Arial, Helvetica, sans-serif;
61
+ color: dimgrey;
62
+ }
63
+ }
64
+
65
+ @media screen and (max-width: 800px) {
66
+ a.description {
67
+ background-color: #E5E4E2;
68
+ align:justify;
69
+ text-align: justify;
70
+ display: block;
71
+ height:100%;
72
+ width: relative;
73
+ text-decoration: none;
74
+ border-left: 5px solid transparent;
75
+ border-top: 0px;
76
+ border-bottom: 7px solid transparent;
77
+ border-right: 0px;
78
+ font-size: 12.5px;
79
+ padding-right: 5px;
80
+ font-family: Arial, Helvetica, sans-serif;
81
+ color: dimgrey;
82
+ }
83
+ }
84
+
85
+ @media screen and (min-width: 800px) {
86
+ a.time {
87
+ background-color: #E5E4E2;
88
+ align:justify;
89
+ display: block;
90
+ height:100%;
91
+ width: relative;
92
+ text-decoration: none;
93
+ border-left: 5px solid transparent;
94
+ border-top: 0px;
95
+ border-bottom: 1px solid transparent;
96
+ border-right: 0px;
97
+ padding-right: 5px;
98
+ font-size: 11px;
99
+ padding-bottom: 5px;
100
+ font-family: Arial, Helvetica, sans-serif;
101
+ color: green;
102
+ }
103
+ }
104
+
105
+ @media screen and (max-width: 800px) {
106
+ a.time {
107
+ background-color: #E5E4E2;
108
+ align:justify;
109
+ display: block;
110
+ height:100%;
111
+ width: relative;
112
+ text-decoration: none;
113
+ border-left: 5px solid transparent;
114
+ border-top: 0px;
115
+ border-bottom: 1px solid transparent;
116
+ border-right: 0px;
117
+ padding-right: 5px;
118
+ font-size: 10px;
119
+ padding-bottom: 5px;
120
+ font-family: Arial, Helvetica, sans-serif;
121
+ color: green;
122
+ }
123
+ }
124
+
125
+ .box {
126
+ display: flex;
127
+ justify-content: center;
128
+ align-items: center;
129
+ height: inherit;
130
+ padding: 20px;
131
+ }
132
+ @media screen and (min-width: 800px) {
133
+ form {
134
+ width: 50%;
135
+ overflow-x: hidden;
136
+ padding: 20px;
137
+ border-radius: 10px;
138
+ background: #fff;
139
+ box-shadow: 0 0 20px 0 #095484;
140
+ }}
141
+
142
+ @media screen and (max-width: 800px) {
143
+ form {
144
+ width: 100%;
145
+ overflow-x: hidden;
146
+ padding: 20px;
147
+ border-radius: 10px;
148
+ background: #fff;
149
+ box-shadow: 0 0 15px 0 #095484;
150
+ }}
151
+ .banner {
152
+ position: relative;
153
+ height: 30px;
154
+ /* background-size: cover; */
155
+ display: flex;
156
+ /* justify-content: center; */
157
+ /* align-items: center; */
158
+ /* text-align: center; */
159
+ }
160
+ @media screen and (min-width: 800px) {
161
+ h1 {
162
+ position: absolute;
163
+ margin: 0;
164
+ padding-left: 50px;
165
+ font-size: 25px;
166
+ color: black;
167
+ z-index: 2;
168
+ font-family: Arial, Helvetica, sans-serif;
169
+ }
170
+ }
171
+
172
+ @media screen and (max-width: 800px) {
173
+ h1 {
174
+ position: absolute;
175
+ margin: 0;
176
+ padding-left: 40px;
177
+ font-size: 24px;
178
+ color: black;
179
+ z-index: 2;
180
+ font-family: Arial, Helvetica, sans-serif;
181
+ }
182
+ }
183
+
184
+ p.unavailable {
185
+ background-color: #E5E4E2;
186
+ display: block;
187
+ width: 100%;
188
+ text-decoration: none;
189
+ color: black;
190
+ line-height: 1.2;
191
+ align: justify;
192
+ border-left: 5px solid transparent;
193
+ border-top: 5px solid transparent;
194
+ border-bottom: 5px solid transparent;
195
+ border-right: 0px;
196
+ font-weight: bold;
197
+ font-size: 18px;
198
+ padding-right: 5px;
199
+ font-family: Arial, Helvetica, sans-serif;
200
+ }
201
+ div.news-item{
202
+ background-color: #E5E4E2;
203
+ /*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
204
+ box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
205
+
206
+
207
+ }
208
+ div.news-item:hover{
209
+ box-shadow: none;
210
+ }
211
+
212
+ @media screen and (min-width: 800px) {
213
+ p.srctxt {
214
+ align:justify;
215
+ text-align: justify;
216
+ word-break: break-all;
217
+ font-size: 11px;
218
+ font-family: Arial, Helvetica, sans-serif;
219
+ }
220
+ .logo-img{
221
+ margin-right: 10px;
222
+ vertical-align: center;
223
+ /* position: relative; */
224
+ width: 34px;
225
+ height: 34px;
226
+
227
+ }
228
+ }
229
+
230
+ @media screen and (max-width: 800px) {
231
+ p.srctxt {
232
+ align:justify;
233
+ text-align: justify;
234
+ word-break: break-all;
235
+ font-size: 9px;
236
+ font-family: Arial, Helvetica, sans-serif;
237
+ }
238
+ .logo-img{
239
+ margin-right: 10px;
240
+ vertical-align: top;
241
+ /* position: absolute; */
242
+ width: 30px;
243
+ height: 30px;
244
+ }
245
+ }
246
+
247
+ .float{
248
+ position:fixed;
249
+ width:25px;
250
+ height:25px;
251
+ bottom:15px;
252
+ right:12px;
253
+ background-color: white;
254
+ border-radius:50%;
255
+ text-align:center;
256
+ vertical-align:center;
257
+ z-index: 99999998;
258
+ font-size:0;
259
+ cursor:pointer;
260
+ animation: beatan 0.8s infinite alternate;
261
+
262
+ }
263
+ .top-float{
264
+ position:fixed;
265
+ width:25px;
266
+ height:25px;
267
+ bottom:52px;
268
+ right:12px;
269
+ background-color: white;
270
+ border-radius:50%;
271
+ text-align:center;
272
+ vertical-align:center;
273
+ z-index: 99999998;
274
+ font-size:0;
275
+ cursor:pointer;
276
+ animation: beatan 0.8s infinite alternate;
277
+
278
+ }
279
+ .my-float{
280
+ margin-top:22px;
281
+ }
282
+
283
+ @keyframes beatan{
284
+ to { transform: scale(1.1); }
285
+ }
286
+
287
+ .loader {
288
+ position: fixed;
289
+ left: 0px;
290
+ top: 0px;
291
+ width: 100%;
292
+ height: 100%;
293
+ z-index: 99999999999;
294
+ background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
295
+ }
296
+
297
+ .highlight {
298
+ background-color: yellow;
299
+ font-weight: bold;
300
+ }
301
+
302
+ .input-container {
303
+ position: relative;
304
+ padding-bottom: 10px;
305
+ }
306
+
307
+ .keyword-input {
308
+
309
+ border-radius: 5px;
310
+ transition: border-color 0.3s ease;
311
+ border: 1px solid silver;
312
+ width: 10em;
313
+ height: 1.5em;
314
+ padding-left: 0.5em;
315
+ outline: none;
316
+ overflow: hidden;
317
+
318
+ }
319
+
320
+ .clear-btn {
321
+ position: absolute;
322
+ font-size: 20px;
323
+ left: 129px;
324
+ transform: translateY(-105%);
325
+ cursor: pointer;
326
+ opacity: 0;
327
+ transition: opacity 0.3s ease;
328
+ }
329
+
330
+ .clear-btn.show {
331
+ opacity: 1;
332
+ }
333
+
334
+ @media screen and (min-width: 800px) {
335
+ a.article-category {
336
+ background-color: #E5E4E2;
337
+ align:justify;
338
+ display: block;
339
+ height:100%;
340
+ width: relative;
341
+ text-decoration: none;
342
+ border-left: 5px solid transparent;
343
+ border-top: 0px;
344
+ font-weight: bold;
345
+ border-bottom: 1px solid transparent;
346
+ border-right: 0px;
347
+ padding-right: 5px;
348
+ font-size: 11px;
349
+ padding-bottom: 0px;
350
+ font-family: Arial, Helvetica, sans-serif;
351
+ color: green;
352
+ }
353
+ }
354
+
355
+ @media screen and (max-width: 800px) {
356
+ a.article-category {
357
+ background-color: #E5E4E2;
358
+ align:justify;
359
+ display: block;
360
+ height:100%;
361
+ font-weight: bold;
362
+ width: relative;
363
+ text-decoration: none;
364
+ border-left: 5px solid transparent;
365
+ border-top: 0px;
366
+ border-bottom: 1px solid transparent;
367
+ border-right: 0px;
368
+ padding-right: 5px;
369
+ font-size: 10px;
370
+ padding-bottom: 0px;
371
+ font-family: Arial, Helvetica, sans-serif;
372
+ color: green;
373
+ }
374
+ }
375
+
376
+ .content {
377
+ display: none;
378
+ font-family: Arial, Helvetica, sans-serif;
379
+
380
+ padding-right: 5px;
381
+
382
+ padding-top: 5px;
383
+ border-left: 5px solid transparent;
384
+ }
385
+
386
+ .container{
387
+ padding-bottom:10px;
388
+ }
389
+
390
+ .show-similar-button-container{
391
+ display: flex;
392
+ flex-direction: column;
393
+ align-items: center;
394
+ }
395
+
396
+ .similar-news-item:hover {
397
+ text-decoration: none;
398
+ }
399
+
400
+ @media screen and (min-width: 800px) {
401
+ .similar-news-item {
402
+ text-align: justify;
403
+ text-decoration: underline;
404
+ font-size: 14px;
405
+ font-family: Arial, Helvetica, sans-serif;
406
+ color: black;
407
+ display:inline-block;
408
+ padding-bottom: 10px;
409
+ width:100%;
410
+ /*white-space: nowrap;
411
+ overflow: hidden;
412
+ text-overflow: ellipsis;*/
413
+
414
+ }
415
+ }
416
+
417
+ @media screen and (max-width: 800px) {
418
+ .similar-news-item {
419
+ text-align: justify;
420
+ text-decoration: underline;
421
+ font-size: 12px;
422
+ font-family: Arial, Helvetica, sans-serif;
423
+ color: black;
424
+ display:inline-block;
425
+ padding-bottom: 8px;
426
+ width:100%;
427
+ /*white-space: nowrap;
428
+ overflow: hidden;
429
+ text-overflow: ellipsis;*/
430
+ }
431
+ }
432
+
433
+
434
+
435
+ .show-more {
436
+ background-color: #E5E4E2;
437
+ font-family: Arial, Helvetica, sans-serif;
438
+ border-radius:4px;
439
+ padding-top:3px;
440
+ padding-bottom:3px;
441
+ padding-left:3px;
442
+ padding-right:3px;
443
+ font-size: 12px;
444
+ display: box;
445
+ border: none;
446
+
447
+ }
448
+
449
+ .show-more:hover {
450
+ background-color: black;
451
+ color: white;
452
+ }
453
+
454
+ .show-less {
455
+ background-color: #E5E4E2;
456
+ font-family: Arial, Helvetica, sans-serif;
457
+ border-radius:4px;
458
+ padding-top:3px;
459
+ padding-bottom:3px;
460
+ padding-left:3px;
461
+ padding-right:3px;
462
+ font-size: 12px;
463
+ border: none;
464
+ display: none;
465
+ }
466
+
467
+ .show-less:hover {
468
+ background-color: black;
469
+ color: white;
470
+ }
471
+
472
+ .word-cloud-container{
473
+ word-wrap: break-word;
474
+ padding-bottom: 10px;
475
+
476
+ }
477
+
478
+ .wc-tokens{
479
+ font-family: Arial, Helvetica, sans-serif;
480
+ font-size: 13.2px;
481
+ cursor: pointer;
482
+ }
483
+
484
+ .wc-tokens:hover{
485
+ text-decoration: underline;
486
+ }
487
+
488
+ .word-cloud-section{
489
+ padding-bottom: 10px;
490
+ display: none;
491
+ word-wrap: break-word;
492
+ }
493
+
494
+ .show-more-word-cloud{
495
+ padding-bottom: 23px;
496
+ text-align: center;
497
+ }
498
+
499
+ .three-dots{
500
+ font-size: 30px;
501
+ margin: 0;
502
+ line-height:0;
503
+ vertical-align: top;
504
+ padding: 0;
505
+ cursor: pointer;
506
+ }
507
+
508
+ .three-dots:hover{
509
+ font-size: 25px;
510
+ }
static/top-icon.png ADDED
templates/index.html ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <link rel="preload" href="../static/loader.gif" as="image">
7
+ <link rel="preload" href="../static/favicon_new.png" as="image">
8
+ <link rel="preload" href="../static/refresh_reload_icon.png" as="image">
9
+ <link rel="preload" href="../static/top-icon.png" as="image">
10
+ <link rel="icon" href="../static/favicon_new.png" type="image/png">
11
+
12
+ <meta charset="UTF-8">
13
+ <title>Latest News</title>
14
+ <link rel="stylesheet" href="static/styles.css">
15
+ <a id="top-loc"></a>
16
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
17
+ <script>
18
+ $(window).load(function(){
19
+ $('.loader').fadeOut();
20
+ });
21
+ </script>
22
+
23
+
24
+
25
+ <script>
26
/*
 * Show only the news items that contain the text typed in the search box and
 * highlight the matches.
 * match_case == true : case-sensitive substring search over headline, description,
 *                      category and source/time.
 * match_case == false: case-insensitive whole-word search over headline and description.
 * An empty search box resets the list through clearFilter().
 */
function filterContent(match_case) {
    var keyword = document.getElementById("keywordInput").value;
    var clearbtn = document.getElementById("clearBtn");

    if (keyword === "")
    {
        clearFilter();
        return;
    }

    clearbtn.style.opacity = 1;

    // Regex metacharacters in the keyword are escaped so that input such as "(" or
    // "c++" is searched literally instead of throwing a SyntaxError. The "g" flag is
    // left out on purpose: test() is then stateless and one regex serves every item.
    var wordRegex = new RegExp("\\b" + keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "i");

    var items = document.getElementsByClassName("news-item");
    for (var i = 0; i < items.length; i++)
    {
        var headline = items[i].querySelector('.headline');
        var description = items[i].querySelector('.description');
        var article_category = items[i].querySelector('.article-category');
        var src_time = items[i].querySelector('.time');
        var matched;
        var targets;

        if (match_case == true)
        {
            var itemText = headline.textContent.concat(" ", description.textContent, " ", article_category.textContent, " ", src_time.textContent);
            matched = itemText.includes(keyword);
            targets = [headline, description, article_category, src_time];
        }
        else
        {
            matched = wordRegex.test(headline.textContent.concat(" ", description.textContent, " "));
            targets = [headline, description];
        }

        if (matched)
        {
            items[i].style.display = "block";
            for (var j = 0; j < targets.length; j++)
            {
                highlightKeyword(targets[j], keyword, match_case);
            }
        }
        else
        {
            items[i].style.display = "none";
        }
    }
}
91
+
92
/*
 * Reset the search: show every news item again, remove the highlighting,
 * hide the clear button and empty the search box.
 */
function clearFilter() {
    var items = document.getElementsByClassName("news-item");
    var clearbtn = document.getElementById("clearBtn");
    var selectors = ['.headline', '.description', '.article-category', '.time'];
    clearbtn.style.opacity=0;
    for (var i = 0; i < items.length; i++) {
        items[i].style.display = "block";
        for (var j = 0; j < selectors.length; j++) {
            var part = items[i].querySelector(selectors[j]);
            if (part) {
                // Assigning textContent replaces the highlight spans with plain text
                // without re-parsing the text as HTML (innerHTML = textContent would
                // turn text such as "<b>" into markup).
                part.textContent = part.textContent;
            }
        }
    }

    document.getElementById("keywordInput").value = ""; // Clear input field
}
110
+
111
/*
 * Wrap every occurrence of keyword inside element in <span class="highlight">.
 * element   : object whose textContent is read and whose innerHTML is written.
 * keyword   : text to highlight; it is matched literally, not as a regex.
 * match_case: false -> case-insensitive whole-word match, otherwise case-sensitive
 *             substring match.
 */
function highlightKeyword(element, keyword, match_case) {
    // Escape text before it is written back as markup.
    function escapeHtml(text) {
        return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
    }

    var text = element.textContent;
    if (keyword === "") {
        element.innerHTML = escapeHtml(text);
        return;
    }

    // Escape regex metacharacters so keywords like "c++" or "(" cannot break the pattern.
    var pattern = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    var regex = (match_case == false) ? new RegExp("\\b" + pattern + "\\b", "gi") : new RegExp(pattern, "g");

    // The text is rebuilt piece by piece so that both the matches and the text
    // between them are escaped; only the highlight spans are real markup.
    var pieces = [];
    var cursor = 0;
    var match;
    while ((match = regex.exec(text)) !== null) {
        if (match[0] === "") {
            // Zero-length match: step forward so the loop cannot stall.
            regex.lastIndex++;
            continue;
        }
        pieces.push(escapeHtml(text.slice(cursor, match.index)));
        pieces.push('<span class="highlight">' + escapeHtml(match[0]) + '</span>');
        cursor = match.index + match[0].length;
    }
    pieces.push(escapeHtml(text.slice(cursor)));
    element.innerHTML = pieces.join("");
}
121
+
122
+ </script>
123
+
124
+ <script>
125
// Wire up the "Show similar news" / "Hide similar news" buttons of every news item
// once the document has been parsed.
document.addEventListener('DOMContentLoaded', function() {
    const containers = document.querySelectorAll('.container');

    containers.forEach(container => {
        const content = container.querySelector('.content');
        const showMoreBtn = container.querySelector('.show-more');
        const showLessBtn = container.querySelector('.show-less');

        // Expand the similar-news section and swap the two buttons.
        showMoreBtn.addEventListener('click', function() {
            /* var similar_news_items = document.getElementsByClassName("content");
            var show_less_items = document.getElementsByClassName("show-less");
            var show_more_items = document.getElementsByClassName("show-more");
            for (var i = 0; i < similar_news_items.length; i++) {
                similar_news_items[i].style.display = 'none';
                show_more_items[i].style.display = 'block';
                show_less_items[i].style.display = 'none';
            }
            */

            content.style.display = 'block';
            content.style.opacity = 1;
            showMoreBtn.style.display = 'none';
            showLessBtn.style.display = 'block';
        });

        // Collapse the section and scroll up by the height that disappeared, so the
        // rest of the page does not appear to jump.
        showLessBtn.addEventListener('click', function() {
            // Smooth scrolling is switched off while the position is corrected.
            document.documentElement.style.scrollBehavior = "auto";
            // Height of the enclosing element before collapsing ...
            var max_h = content.parentElement.parentElement.clientHeight;
            content.style.display = 'none';
            showMoreBtn.style.display = 'block';
            showLessBtn.style.display = 'none';
            // ... and after collapsing; the difference is the amount to scroll back.
            var min_h = content.parentElement.parentElement.clientHeight;
            $(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
            document.documentElement.style.scrollBehavior = "smooth";
        });
    });
});
162
+ </script>
163
+
164
+ <script>
165
// Search for a keyword programmatically: reset the current filter, put the keyword
// into the search box and run the filter with match_case = false.
function wc_search(keyword)
{
    clearFilter();
    document.getElementById("keywordInput").value = keyword;
    filterContent(false);
}
171
+
172
// Toggle the visibility of the element with id "word-cloud-section-id".
function word_cloud_display()
{
    var word_cloud_section = document.getElementById("word-cloud-section-id");
    // The section is part of server-generated markup; do nothing if it is not on the page.
    if (!word_cloud_section)
    {
        return;
    }
    word_cloud_section.style.display = (word_cloud_section.style.display == 'block') ? 'none' : 'block';
}
184
+ </script>
185
+
186
+
187
+
188
+ </head>
189
+ <body>
190
+ <div class="loader"></div>
191
+ {{body | safe}}
192
+
193
+ <a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
194
+ <img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
195
+ </a>
196
+ <a href="javascript:window.location.reload(true)" id="theme" class="float">
197
+ <img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
198
+ </a>
199
+ </body>
200
+ </html>
word_cloud.py ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import string
4
+ from unidecode import unidecode
5
+ from collections import Counter
6
+
7
+
8
class TextPreprocessor:
    """Normalise a pandas Series of raw text into space-separated tokens.

    The pipeline (see :meth:`preprocess`) lower-cases the text, transliterates
    it to ASCII, strips URLs, expands English contractions and then optionally
    removes punctuation, digits, stop words, short words and the most / least
    frequent words.

    Instance attributes ``stop_words`` (list) and ``contraction_to_expansion``
    (dict) are per-instance copies of the class defaults and may be edited
    after construction to customise the behaviour.
    """

    # Matches http(s):// and bare "www." URLs up to the next whitespace.
    _URL_PATTERN = (
        r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
        r"|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
        r"|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}"
        r"|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    )

    # Default stop words. Order and duplicates are kept as originally shipped
    # so that ``stop_words`` has the same content as before.
    _DEFAULT_STOP_WORDS = (
        "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        'about', 'above', 'across', 'after', 'afterwards', 'again', 'against',
        'ain', 'all', 'almost', 'alone', 'along', 'already', 'also',
        'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and',
        'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway',
        'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became',
        'because', 'become', 'becomes', 'becoming', 'been', 'before',
        'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
        'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call',
        'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'did', 'do',
        'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight',
        'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even',
        'ever', 'every', 'everyone', 'everything', 'everywhere', 'except',
        'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former',
        'formerly', 'forty', 'four', 'from', 'front', 'full', 'further',
        'get', 'give', 'go', 'had', 'has', 'have', 'having', 'he', 'hence',
        'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
        'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
        'if', 'in', 'indeed', 'into', 'is', 'it', "it's", 'its', 'itself',
        'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll',
        'm', 'ma', 'made', 'make', 'many', 'say', 'said', 'says', 'told',
        'tell', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover',
        'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name',
        'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no',
        'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere',
        'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or',
        'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
        'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite',
        'rather', 're', 'rs', 'really', 'regarding', 's', 'same', 'say',
        'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several',
        'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
        "shouldn't", 'show', 'side', 'since', 'six', 'sixty', 'so', 'some',
        'somehow', 'someone', 'something', 'sometime', 'sometimes',
        'somewhere', 'still', 'such', 't', 'take', 'ten', 'than', 'that',
        "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
        'thereupon', 'these', 'they', 'third', 'this', 'those', 'though',
        'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together',
        'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under',
        'unless', 'until', 'up', 'upon', 'us', 'used', 'using', 'various',
        've', 'very', 'via', 'was', 'wasn', "wasn't", 'we', 'well', 'were',
        'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
        'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
        'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
        'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
        'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you',
        "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
        'yourselves',
        '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve', '’d', '’ll', '’m', '’re',
        'new', 'old', '’s', '’ve',
    )

    # Default contraction -> expansion table (keys are lower-case).
    _DEFAULT_CONTRACTIONS = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'alls": "you alls",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have",
    }

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        Configure which cleaning steps :meth:`preprocess` applies.
        :param remove_punct: bool, replace punctuation with spaces.
        :param remove_digits: bool, replace digits with spaces.
        :param remove_stop_words: bool, drop tokens listed in ``stop_words``.
        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
        :param minlen: int, lower length bound used by remove_short_words.
        :param maxlen: int, upper length bound used by remove_short_words.
        :param top_p: float, fraction of most frequent distinct words to remove (None = none).
        :param bottom_p: float, fraction of rarest distinct words to remove (None = none).
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Filled by __remove_top_bottom_words on "train" data and reused afterwards.
        self.words_to_remove = []
        # Per-instance copies so callers can customise them without touching the defaults.
        self.stop_words = list(self._DEFAULT_STOP_WORDS)
        self.contraction_to_expansion = dict(self._DEFAULT_CONTRACTIONS)

    @staticmethod
    def __remove_double_whitespaces(string: str):
        """Collapse every run of whitespace into one space and trim both ends."""
        return " ".join(string.split())

    def __remove_url(self, string_series: pd.Series):
        """
        Removes URLs from text.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        without_urls = string_series.str.replace(pat=self._URL_PATTERN, repl=" ", regex=True)
        return without_urls.map(self.__remove_double_whitespaces)

    def __expand(self, string_series: pd.Series):
        """
        Replaces contractions with expansions, e.g. "don't" with "do not".
        :param string_series: pd.Series, input string series (expected lower-case)
        :return: pd.Series, cleaned string series
        """
        # Longest contraction first: otherwise "can't" would consume the start of
        # "can't've" and leave an unexpandable "cannot've" behind.
        ordered = sorted(self.contraction_to_expansion.items(),
                         key=lambda pair: len(pair[0]), reverse=True)
        expanded = string_series
        for contraction, expansion in ordered:
            expanded = expanded.str.replace(pat=contraction, repl=expansion, regex=False)
        return expanded.map(self.__remove_double_whitespaces)

    def __remove_punct(self, string_series: pd.Series):
        """
        Removes punctuations from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        cleaned = string_series
        # These are the literal two-character sequences (backslash + letter), not
        # control characters; real newlines/tabs are whitespace and are collapsed below.
        for escaped in (r'\n', r'\r', r'\t'):
            cleaned = cleaned.str.replace(pat=escaped, repl=" ", regex=False)
        # One pass over the text instead of one replace per punctuation character.
        punct_to_space = str.maketrans({char: " " for char in string.punctuation})
        cleaned = cleaned.str.translate(punct_to_space)
        return cleaned.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
        Removes digits from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        without_digits = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return without_digits.map(self.__remove_double_whitespaces)

    @staticmethod
    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """
        Removes words/tokens where minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        def drop_short(text: str):
            return " ".join(word for word in text.split()
                            if len(word) > maxlen or len(word) < minlen)

        return string_series.map(drop_short)

    def __remove_stop_words(self, string_series: pd.Series):
        """
        Removes stop words from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Built once per call: constant-time lookups, and still honours any edits
        # the caller made to self.stop_words.
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                  bottom_p: float = None, dataset: str = 'train'):
        """
        Removes top_p fraction (frequent) words and bottom_p fraction (rare) words.
        :param string_series: pd.Series, input string series
        :param top_p: float, fraction of distinct words (most frequent first) to remove.
        :param bottom_p: float, fraction of distinct words (rarest) to remove.
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
                        Only "train" adds to self.words_to_remove; any other value
                        just applies the words collected so far.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            top_p = top_p or 0
            bottom_p = bottom_p or 0

            if top_p > 0 or bottom_p > 0:
                # value_counts sorts by descending frequency, so the head holds the
                # most frequent words and the tail the rarest.
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

                if top_p > 0:
                    n_top = int(np.ceil(top_p * n_words))
                    self.words_to_remove.extend(word_freq.index[:n_top])

                if bottom_p > 0:
                    n_bottom = int(np.ceil(bottom_p * n_words))
                    self.words_to_remove.extend(word_freq.index[-n_bottom:])

        if not self.words_to_remove:
            return string_series

        blocked = set(self.words_to_remove)
        return string_series.map(
            lambda text: " ".join(word for word in text.split() if word not in blocked))

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point: runs the full cleaning pipeline.

        Steps, in order: fill missing values with "", lower-case, transliterate
        to ASCII, remove URLs, expand contractions, then the optional steps
        enabled in the constructor (punctuation, digits, stop words, short
        words) and finally frequent/rare word removal. Entries that end up
        empty are replaced by the text "this is an empty message".

        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        # Missing values would otherwise reach unidecode as floats and raise.
        string_series = string_series.fillna("")
        string_series = string_series.str.lower()
        string_series = string_series.map(unidecode)
        string_series = self.__remove_url(string_series=string_series)
        string_series = self.__expand(string_series=string_series)

        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = self.__remove_short_words(string_series=string_series,
                                                      minlen=self.minlen,
                                                      maxlen=self.maxlen)
        string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                       top_p=self.top_p,
                                                       bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        string_series = string_series.replace(to_replace="", value="this is an empty message")

        return string_series
638
+
639
+
640
def get_frequent_words_html(df):
    """Build the word-cloud HTML for the 25 most frequent words in the news.

    The ``title`` and ``description`` columns of *df* are concatenated,
    cleaned with a default ``TextPreprocessor`` and tokenised on whitespace.
    The first five words are placed directly in the container; the remaining
    ones go into the ``word-cloud-section-id`` div. Each word is a link that
    calls the page's ``wc_search`` function and is followed by 3-6 ``&nbsp;``
    entities chosen at random.

    :param df: pd.DataFrame with string columns ``title`` and ``description``.
    :return: str, HTML fragment with balanced ``<div>`` tags.
    """
    # Text that TextPreprocessor.preprocess substitutes for rows that end up empty;
    # it is not real content and must not contribute words to the cloud.
    empty_placeholder = "this is an empty message"
    n_always_visible = 5

    text_preprocess = TextPreprocessor()
    # Fill each column separately so one missing field does not turn the whole
    # concatenated row into NaN.
    raw_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
    preprocessed_txt = text_preprocess.preprocess(raw_text)
    counter = Counter(
        token
        for text in preprocessed_txt
        if text != empty_placeholder
        for token in text.split()
    )

    def token_link(token):
        # Tokens contain no HTML-special characters: the default preprocessor
        # replaces every string.punctuation character with a space.
        padding = "&nbsp;" * np.random.randint(3, 7, 1)[0]
        return f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{padding}'

    links = [token_link(token) for token, _ in counter.most_common(25)]

    # The section div is always emitted (possibly empty) so the two closing tags
    # below are balanced even when fewer than five words are available.
    parts = ['<div class="word-cloud-container">']
    parts.extend(links[:n_always_visible])
    parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
    parts.extend(links[n_always_visible:])
    parts.append('</div></div>')
    return ''.join(parts)