ksvmuralidhar committed on
Commit
6406b8d
·
verified ·
1 Parent(s): bb6a90e

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,9 +1,9 @@
1
- FROM python:3.9-slim
2
- WORKDIR /webapp
3
- COPY . .
4
- RUN chmod +x /webapp/start.sh
5
- RUN pip install --upgrade pip
6
- RUN pip install --no-cache-dir -r requirements.txt
7
- RUN apt update && apt install -y redis-server
8
- EXPOSE 7860 6379
9
- CMD ["/webapp/start.sh"]
 
1
+ FROM python:3.9-slim
2
+ WORKDIR /webapp
3
+ COPY . .
4
+ RUN chmod +x /webapp/start.sh
5
+ RUN pip install --upgrade pip
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+ RUN apt update && apt install -y redis-server
8
+ EXPOSE 7860 6379
9
+ CMD ["newrelic-admin", "run-program", "/webapp/start.sh"]
README.md CHANGED
@@ -1,11 +1,11 @@
1
- ---
2
- title: News Aggregator
3
- emoji: ⚡
4
- colorFrom: indigo
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: News Aggregator
3
+ emoji: ⚡
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,262 +1,258 @@
1
- import numpy as np
2
- import pandas as pd
3
- from dateutil import parser
4
- from quart_cors import cors
5
- from quart import Quart
6
- from quart import render_template
7
- from db_operations.db_operations import DBOperations
8
- import logging
9
- import traceback
10
- import redis
11
- import uuid
12
- from datetime import datetime
13
- from functools import lru_cache
14
- import gc
15
- from word_cloud import get_frequent_words_html
16
- from config import NEWS_RETENTION_SECONDS, UK_EDITION_URL
17
-
18
-
19
- app = Quart(__name__)
20
- app = cors(app, allow_origin="*")
21
- redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
22
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
23
- logging.warning(f'Is Redis available?: {redis_client.ping()}')
24
- db = DBOperations()
25
- session_id = None
26
-
27
-
28
- REFRESH_FREQ = 300 # 300 secs = 5 mins
29
-
30
- def is_db_fetch_reqd():
31
- try:
32
- env_news_time = redis_client.get('NEWSFETCHTIME')
33
- logging.warning(f'[session_id: {session_id}] fetch_time_env_var: {env_news_time}')
34
- fetch_flag = 1
35
- if env_news_time is None:
36
- redis_client.set("NEWSFETCHTIME", str(datetime.now()))
37
- fetch_flag = 1
38
-
39
- if env_news_time is not None:
40
- fetch_time_lapse_seconds = (datetime.now() - datetime.strptime(env_news_time, '%Y-%m-%d %H:%M:%S.%f')).seconds
41
- if fetch_time_lapse_seconds <= REFRESH_FREQ:
42
- fetch_flag = 0
43
- else:
44
- redis_client.set("NEWSFETCHTIME", str(datetime.now()))
45
- fetch_flag = 1
46
- except Exception as e:
47
- print(e)
48
- fetch_flag = 1
49
- return fetch_flag
50
-
51
-
52
- def correct_date(x):
53
- if (not isinstance(x, str)) or (str(x).find(":") == -1):
54
- logging.error(f'[session_id: {session_id}] correct_date() error: {x} is not the right date format')
55
- return "2020-11-07 00:36:44+05:30"
56
- return x
57
-
58
- def date_time_parser(dt):
59
- """
60
- Computes the minutes elapsed since published time.
61
- :param dt: date
62
- :return: int, minutes elapsed.
63
- """
64
- try:
65
- return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
66
- except:
67
- logging.error(f'[session_id: {session_id}] date_time_parser() error: {dt} is not the right date format')
68
- return 100000
69
-
70
-
71
- def elapsed_time_str(mins):
72
- """
73
- Return the time elapsed string from minutes passed as an argument.
74
- :param mins: int, minutes elapsed.
75
- :return: str, time elapsed string
76
- """
77
- try:
78
- time_str = ''
79
- hours = int(mins / 60)
80
- days = np.round(mins / (60 * 24), 1)
81
- remaining_mins = int(mins - (hours * 60))
82
- if days >= 1:
83
- time_str = f'{str(days)} days ago'
84
- if days == 1:
85
- time_str = 'a day ago'
86
- elif (days < 1) & (hours < 24) & (mins >= 60):
87
- time_str = f'{str(hours)} hours and {str(remaining_mins)} mins ago'
88
- if (hours == 1) & (remaining_mins > 1):
89
- time_str = f'an hour and {str(remaining_mins)} mins ago'
90
- if (hours == 1) & (remaining_mins == 1):
91
- time_str = f'an hour and a min ago'
92
- if (hours > 1) & (remaining_mins == 1):
93
- time_str = f'{str(hours)} hours and a min ago'
94
- if (hours > 1) & (remaining_mins == 0):
95
- time_str = f'{str(hours)} hours ago'
96
- if ((mins / 60) == 1) & (remaining_mins == 0):
97
- time_str = 'an hour ago'
98
- elif (days < 1) & (hours < 24) & (mins == 0):
99
- time_str = 'Just in'
100
- else:
101
- time_str = f'{str(mins)} minutes ago'
102
- if mins == 1:
103
- time_str = 'a minute ago'
104
- return time_str
105
- except:
106
- return "-"
107
-
108
-
109
-
110
- async def fetch_from_db(fetch_flag):
111
- try:
112
- logging.warning(f'[session_id: {session_id}] fetch_flag: {fetch_flag}')
113
- if fetch_flag == 1:
114
- final_df = await db.read_news_from_db()
115
- freq_tokens = await get_frequent_words_html(final_df)
116
- logging.warning(f'[session_id: {session_id}] Fetched From DB')
117
-
118
- final_df['_id'] = final_df['_id'].astype('str')
119
-
120
- redis_client.set("NEWSDF", final_df.to_json())
121
- redis_client.set("NEWSWORDCLOUD", freq_tokens)
122
- else:
123
- final_df = pd.read_json(redis_client.get("NEWSDF"))
124
- freq_tokens = redis_client.get("NEWSWORDCLOUD")
125
- logging.warning(f'[session_id: {session_id}] Fetched From Cache')
126
-
127
- except Exception as e:
128
- print(e)
129
- final_df = []
130
- freq_tokens = ""
131
- # raise
132
- return final_df, freq_tokens
133
-
134
-
135
- @app.route("/")
136
- async def index():
137
- """
138
- Entry point
139
- """
140
- try:
141
- global session_id
142
- session_id = uuid.uuid4().hex
143
- src_str = ''
144
- status_code = 200
145
- logging.warning(f'[session_id: {session_id}] Entering the application')
146
- final_df, freq_tokens = await fetch_from_db(is_db_fetch_reqd())
147
- if len(final_df) == 0:
148
- final_df, freq_tokens = await fetch_from_db(1)
149
- if len(final_df) == 0:
150
- raise Exception("Unable to fetch news")
151
- if len(final_df) > 1:
152
-
153
- final_df["parsed_date"] = [correct_date(date_) for date_ in final_df['parsed_date']]
154
- final_df["parsed_date"] = [parser.parse(date_) for date_ in final_df['parsed_date']]
155
- final_df["elapsed_time"] =[date_time_parser(date_) for date_ in final_df['parsed_date']]
156
- final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
157
- final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
158
- final_df.sort_values(by="elapsed_time", inplace=True)
159
- src_str = ", ".join(sorted([*final_df['src'].unique()]))
160
- final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
161
- final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
162
- final_df.drop_duplicates(subset='description', inplace=True)
163
- final_df = final_df.loc[(final_df["title"] != ""), :].copy()
164
- else:
165
- final_df = pd.DataFrame({'title': '', 'url': '',
166
- 'description': '', 'src_time': ''}, index=[0])
167
-
168
- except Exception as e:
169
- final_df = pd.DataFrame({'title': '', 'url': '',
170
- 'description': '', 'src_time': ''}, index=[0])
171
- logging.error(f'[session_id: {session_id}] {traceback.print_exc()}')
172
-
173
- result_str = f'''
174
- <div class="box" id="main">
175
- <form>
176
-
177
- <div class="banner">
178
- <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
179
- <h1 style="display:inline-block; vertical-align: middle;">Latest Indian News</h1>
180
- </div>
181
- '''
182
-
183
- if len(final_df) <= 1:
184
- result_str += f'''<div><p class="unavailable">This app is temporarily unavailable</p></div>'''
185
- status_code = 500
186
- else:
187
- last_update_utc = datetime.strptime(redis_client.get('NEWSFETCHTIME'), '%Y-%m-%d %H:%M:%S.%f')
188
- last_update_mins = int(np.ceil((datetime.now() - last_update_utc).seconds / 60))
189
- last_update_str = f'Updated {last_update_mins} {"minutes" if last_update_mins > 1 else "minute"} ago'
190
- result_str += f'<p class="srctxt">News aggregated from <b>{src_str}</b>.<br><br>{last_update_str}&nbsp;&nbsp;&nbsp;&nbsp;<a href="{UK_EDITION_URL}"><b>Switch to UK edition</b></a></p>'
191
-
192
- result_str += '''
193
- <div class="input-container">
194
- <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
195
- <div class="clear-btn" id="clearBtn" onclick="clearFilter()">&times;</div>
196
- <img src="static/info.png" alt="info" width="18" height="18" align="center" onclick="showSearchInfo()" style="cursor: pointer;">
197
- </div>
198
- '''
199
-
200
- result_str += f"{freq_tokens} "
201
- result_str += '<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>'
202
-
203
- result_str += f'''<div style="padding-bottom: 6px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
204
- News categories and similar news are AI-generated</div>
205
- <div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif; font-weight: bold;">
206
- {len(final_df)} news articles available</div>
207
- '''
208
-
209
-
210
- for n, i in final_df.iterrows(): # iterating through the search results
211
- href = i["url"]
212
- category = i["category"]
213
- description = i["description"]
214
- url_txt = i["title"]
215
- src_time = i["src_time"]
216
- sim_news = i['similar_news']
217
- result_str += f'''<div class="news-item"><div style="padding-top: 7px;">
218
- <a href="{href}" target="_blank" class="article-category">{category}
219
- </a>
220
- </div>
221
- <div>
222
- <a href="{href}" target="_blank" class="headline">{url_txt}
223
- </a>
224
- </div>
225
- <div>
226
- <a href="{href}" target="_blank" class="description">
227
- {description}
228
- </a>
229
- </div>
230
- <div>
231
- <a href="{href}" target="_blank" class="time">
232
- {src_time}
233
- </a>
234
- </div>
235
-
236
-
237
- <div class="container">
238
- <div class="content" style="display: none;">
239
- {sim_news}
240
- </div>
241
- <div class="show-similar-button-container">
242
- <button type="button" class="show-more">Show similar news</button>
243
- <button type="button" class="show-less">Hide similar news</button>
244
- </div>
245
- </div>
246
-
247
-
248
-
249
- <div>
250
- <p></p>
251
- </div></div>
252
- '''
253
-
254
- result_str += '</form></div>'
255
- logging.warning(f'[session_id: {session_id}] Successfully rendered template')
256
- gc.collect()
257
- return await render_template("index.html", body=result_str), status_code
258
-
259
-
260
- if __name__ == "__main__":
261
- app.run(host="0.0.0.0", port=7860, workers=5, threads=5) # workers=(2*ncores) + 1, threads= (2 to 4*ncores) + 1
262
-
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from dateutil import parser
4
+ from quart_cors import cors
5
+ from quart import Quart
6
+ from quart import render_template
7
+ from db_operations.db_operations import DBOperations
8
+ import logging
9
+ import traceback
10
+ import redis
11
+ import uuid
12
+ from datetime import datetime
13
+ from functools import lru_cache
14
+ import gc
15
+ from word_cloud import get_frequent_words_html
16
+ from config import NEWS_RETENTION_SECONDS, UK_EDITION_URL
17
+
18
+
19
+ app = Quart(__name__)
20
+ app = cors(app, allow_origin="*")
21
+ redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
22
+ logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)
23
+ logging.warning(f'Is Redis available?: {redis_client.ping()}')
24
+ db = DBOperations()
25
+ session_id = None
26
+
27
+
28
+ REFRESH_FREQ = 300 # 300 secs = 5 mins
29
+
30
+ def is_db_fetch_reqd():
31
+ try:
32
+ env_news_time = redis_client.get('NEWSFETCHTIME')
33
+ logging.warning(f'[session_id: {session_id}] fetch_time_env_var: {env_news_time}')
34
+ fetch_flag = 1
35
+ if env_news_time is None:
36
+ redis_client.set("NEWSFETCHTIME", str(datetime.now()))
37
+ fetch_flag = 1
38
+
39
+ if env_news_time is not None:
40
+ fetch_time_lapse_seconds = (datetime.now() - datetime.strptime(env_news_time, '%Y-%m-%d %H:%M:%S.%f')).seconds
41
+ if fetch_time_lapse_seconds <= REFRESH_FREQ:
42
+ fetch_flag = 0
43
+ else:
44
+ redis_client.set("NEWSFETCHTIME", str(datetime.now()))
45
+ fetch_flag = 1
46
+ except Exception as e:
47
+ print(e)
48
+ fetch_flag = 1
49
+ return fetch_flag
50
+
51
+
52
+ def correct_date(x):
53
+ if (not isinstance(x, str)) or (str(x).find(":") == -1):
54
+ logging.error(f'[session_id: {session_id}] correct_date() error: {x} is not the right date format')
55
+ return "2020-11-07 00:36:44+05:30"
56
+ return x
57
+
58
+ def date_time_parser(dt):
59
+ """
60
+ Computes the minutes elapsed since published time.
61
+ :param dt: date
62
+ :return: int, minutes elapsed.
63
+ """
64
+ try:
65
+ return int(np.round((dt.now(dt.tz) - dt).total_seconds() / 60, 0))
66
+ except:
67
+ logging.error(f'[session_id: {session_id}] date_time_parser() error: {dt} is not the right date format')
68
+ return 100000
69
+
70
+
71
+ def elapsed_time_str(mins):
72
+ """
73
+ Return the time elapsed string from minutes passed as an argument.
74
+ :param mins: int, minutes elapsed.
75
+ :return: str, time elapsed string
76
+ """
77
+ try:
78
+ time_str = ''
79
+ hours = int(mins / 60)
80
+ days = np.round(mins / (60 * 24), 1)
81
+ remaining_mins = int(mins - (hours * 60))
82
+ if days >= 1:
83
+ time_str = f'{str(days)} days ago'
84
+ if days == 1:
85
+ time_str = 'a day ago'
86
+ elif (days < 1) & (hours < 24) & (mins >= 60):
87
+ time_str = f'{str(hours)} hours and {str(remaining_mins)} mins ago'
88
+ if (hours == 1) & (remaining_mins > 1):
89
+ time_str = f'an hour and {str(remaining_mins)} mins ago'
90
+ if (hours == 1) & (remaining_mins == 1):
91
+ time_str = f'an hour and a min ago'
92
+ if (hours > 1) & (remaining_mins == 1):
93
+ time_str = f'{str(hours)} hours and a min ago'
94
+ if (hours > 1) & (remaining_mins == 0):
95
+ time_str = f'{str(hours)} hours ago'
96
+ if ((mins / 60) == 1) & (remaining_mins == 0):
97
+ time_str = 'an hour ago'
98
+ elif (days < 1) & (hours < 24) & (mins == 0):
99
+ time_str = 'Just in'
100
+ else:
101
+ time_str = f'{str(mins)} minutes ago'
102
+ if mins == 1:
103
+ time_str = 'a minute ago'
104
+ return time_str
105
+ except:
106
+ return "-"
107
+
108
+
109
+
110
+ async def fetch_from_db(fetch_flag):
111
+ try:
112
+ logging.warning(f'[session_id: {session_id}] fetch_flag: {fetch_flag}')
113
+ if fetch_flag == 1:
114
+ final_df = await db.read_news_from_db()
115
+ freq_tokens = await get_frequent_words_html(final_df)
116
+ logging.warning(f'[session_id: {session_id}] Fetched From DB')
117
+
118
+ final_df['_id'] = final_df['_id'].astype('str')
119
+
120
+ redis_client.set("NEWSDF", final_df.to_json())
121
+ redis_client.set("NEWSWORDCLOUD", freq_tokens)
122
+ else:
123
+ final_df = pd.read_json(redis_client.get("NEWSDF"))
124
+ freq_tokens = redis_client.get("NEWSWORDCLOUD")
125
+ logging.warning(f'[session_id: {session_id}] Fetched From Cache')
126
+
127
+ except Exception as e:
128
+ print(e)
129
+ final_df = []
130
+ freq_tokens = ""
131
+ raise
132
+ return final_df, freq_tokens
133
+
134
+
135
+ @app.route("/")
136
+ async def index():
137
+ """
138
+ Entry point
139
+ """
140
+ try:
141
+ global session_id
142
+ session_id = uuid.uuid4().hex
143
+ src_str = ''
144
+ status_code = 200
145
+ logging.warning(f'[session_id: {session_id}] Entering the application')
146
+ final_df, freq_tokens = await fetch_from_db(is_db_fetch_reqd())
147
+ if len(final_df) > 1:
148
+
149
+ final_df["parsed_date"] = [correct_date(date_) for date_ in final_df['parsed_date']]
150
+ final_df["parsed_date"] = [parser.parse(date_) for date_ in final_df['parsed_date']]
151
+ final_df["elapsed_time"] =[date_time_parser(date_) for date_ in final_df['parsed_date']]
152
+ final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
153
+ final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
154
+ final_df.sort_values(by="elapsed_time", inplace=True)
155
+ src_str = ", ".join(sorted([*final_df['src'].unique()]))
156
+ final_df['src_time'] = final_df['src'] + ("&nbsp;" * 5) + final_df["elapsed_time_str"]
157
+ final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
158
+ final_df.drop_duplicates(subset='description', inplace=True)
159
+ final_df = final_df.loc[(final_df["title"] != ""), :].copy()
160
+ else:
161
+ final_df = pd.DataFrame({'title': '', 'url': '',
162
+ 'description': '', 'src_time': ''}, index=[0])
163
+
164
+ except Exception as e:
165
+ final_df = pd.DataFrame({'title': '', 'url': '',
166
+ 'description': '', 'src_time': ''}, index=[0])
167
+ logging.error(f'[session_id: {session_id}] {traceback.print_exc()}')
168
+
169
+ result_str = f'''
170
+ <div class="box" id="main">
171
+ <form>
172
+
173
+ <div class="banner">
174
+ <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
175
+ <h1 style="display:inline-block; vertical-align: middle;">Latest Indian News</h1>
176
+ </div>
177
+ '''
178
+
179
+ if len(final_df) <= 1:
180
+ result_str += f'''<div><p class="unavailable">This app is temporarily unavailable</p></div>'''
181
+ status_code = 500
182
+ else:
183
+ last_update_utc = datetime.strptime(redis_client.get('NEWSFETCHTIME'), '%Y-%m-%d %H:%M:%S.%f')
184
+ last_update_mins = int(np.ceil((datetime.now() - last_update_utc).seconds / 60))
185
+ last_update_str = f'Updated {last_update_mins} {"minutes" if last_update_mins > 1 else "minute"} ago'
186
+ result_str += f'<p class="srctxt">News aggregated from <b>{src_str}</b>.<br><br>{last_update_str}&nbsp;&nbsp;&nbsp;&nbsp;<a href="{UK_EDITION_URL}"><b>Switch to UK edition</b></a></p>'
187
+
188
+ result_str += '''
189
+ <div class="input-container">
190
+ <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
191
+ <div class="clear-btn" id="clearBtn" onclick="clearFilter()">&times;</div>
192
+ <img src="static/info.png" alt="info" width="18" height="18" align="center" onclick="showSearchInfo()" style="cursor: pointer;">
193
+ </div>
194
+ '''
195
+
196
+ result_str += f"{freq_tokens} "
197
+ result_str += '<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>'
198
+
199
+ result_str += f'''<div style="padding-bottom: 6px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
200
+ News categories and similar news are AI-generated</div>
201
+ <div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif; font-weight: bold;">
202
+ {len(final_df)} news articles available</div>
203
+ '''
204
+
205
+
206
+ for n, i in final_df.iterrows(): # iterating through the search results
207
+ href = i["url"]
208
+ category = i["category"]
209
+ description = i["description"]
210
+ url_txt = i["title"]
211
+ src_time = i["src_time"]
212
+ sim_news = i['similar_news']
213
+ result_str += f'''<div class="news-item"><div style="padding-top: 7px;">
214
+ <a href="{href}" target="_blank" class="article-category">{category}
215
+ </a>
216
+ </div>
217
+ <div>
218
+ <a href="{href}" target="_blank" class="headline">{url_txt}
219
+ </a>
220
+ </div>
221
+ <div>
222
+ <a href="{href}" target="_blank" class="description">
223
+ {description}
224
+ </a>
225
+ </div>
226
+ <div>
227
+ <a href="{href}" target="_blank" class="time">
228
+ {src_time}
229
+ </a>
230
+ </div>
231
+
232
+
233
+ <div class="container">
234
+ <div class="content" style="display: none;">
235
+ {sim_news}
236
+ </div>
237
+ <div class="show-similar-button-container">
238
+ <button type="button" class="show-more">Show similar news</button>
239
+ <button type="button" class="show-less">Hide similar news</button>
240
+ </div>
241
+ </div>
242
+
243
+
244
+
245
+ <div>
246
+ <p></p>
247
+ </div></div>
248
+ '''
249
+
250
+ result_str += '</form></div>'
251
+ logging.warning(f'[session_id: {session_id}] Successfully rendered template')
252
+ gc.collect()
253
+ return await render_template("index.html", body=result_str), status_code
254
+
255
+
256
+ if __name__ == "__main__":
257
+ app.run(host="0.0.0.0", port=7860, workers=5, threads=5) # workers=(2*ncores) + 1, threads= (2 to 4*ncores) + 1
258
+
 
 
 
 
config.py CHANGED
@@ -1,2 +1,2 @@
1
- NEWS_RETENTION_SECONDS = 300
2
- UK_EDITION_URL = "https://ksvmuralidhar-uk-news-aggregator.hf.space"
 
1
+ NEWS_RETENTION_SECONDS = 300
2
+ UK_EDITION_URL = "https://ksvmuralidhar-uk-news-aggregator.hf.space"
db_operations/db_operations.py CHANGED
@@ -1,69 +1,69 @@
1
- import pymongo
2
- import os
3
- import pandas as pd
4
- import logging
5
-
6
-
7
- class DBOperations:
8
- """
9
- Reads news from MongoDB
10
- """
11
- def __init__(self):
12
- self.url = os.getenv('DB_URL')
13
- self.database = "rss_news_db_cat_pred_sim_news"
14
- self.collection = "rss_news_cat_pred_sim_news"
15
- self.__client = None
16
- self.__error = 0
17
-
18
- async def __connect(self):
19
- try:
20
- self.__client = pymongo.MongoClient(self.url)
21
- _ = self.__client.list_database_names()
22
- except Exception as conn_exception:
23
- self.__error = 1
24
- logging.critical(f"Error in DBOperations.connect(): {conn_exception}")
25
- self.__client = None
26
- raise
27
-
28
- async def __read(self):
29
- try:
30
- db = self.__client[self.database]
31
- coll = db[self.collection]
32
- docs = []
33
- maxtries = 5
34
- ntry = 0
35
-
36
- while (len(docs) == 0) and (ntry < maxtries):
37
- for doc in coll.find():
38
- docs.append(doc)
39
- ntry += 1
40
- logging.info(f"DB Read try: {ntry}")
41
-
42
- rss_df = pd.DataFrame(docs)
43
- except Exception as insert_err:
44
- self.__error = 1
45
- logging.critical(f"Error in DBOperations.read(): {insert_err}")
46
- rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
47
- 'description': '', 'parsed_date': '',
48
- 'src': ''}, index=[0])
49
- return rss_df
50
-
51
- def __close_connection(self):
52
- if self.__client is not None:
53
- self.__client.close()
54
- self.__client = None
55
-
56
- async def read_news_from_db(self):
57
- rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
58
- 'description': '', 'parsed_date': '',
59
- 'src': ''}, index=[0])
60
- if self.url is not None:
61
- if self.__error == 0:
62
- await self.__connect()
63
- if self.__error == 0:
64
- rss_df = await self.__read()
65
- if self.__error == 0:
66
- logging.info("Read Successful")
67
- if self.__client is not None:
68
- self.__close_connection()
69
- return rss_df
 
1
+ import pymongo
2
+ import os
3
+ import pandas as pd
4
+ import logging
5
+
6
+
7
+ class DBOperations:
8
+ """
9
+ Reads news from MongoDB
10
+ """
11
+ def __init__(self):
12
+ self.url = os.getenv('DB_URL')
13
+ self.database = "rss_news_db_cat_pred_sim_news"
14
+ self.collection = "rss_news_cat_pred_sim_news"
15
+ self.__client = None
16
+ self.__error = 0
17
+
18
+ async def __connect(self):
19
+ try:
20
+ self.__client = pymongo.MongoClient(self.url)
21
+ _ = self.__client.list_database_names()
22
+ except Exception as conn_exception:
23
+ self.__error = 1
24
+ logging.critical(f"Error in DBOperations.connect(): {conn_exception}")
25
+ self.__client = None
26
+ raise
27
+
28
+ async def __read(self):
29
+ try:
30
+ db = self.__client[self.database]
31
+ coll = db[self.collection]
32
+ docs = []
33
+ maxtries = 5
34
+ ntry = 0
35
+
36
+ while (len(docs) == 0) and (ntry < maxtries):
37
+ for doc in coll.find():
38
+ docs.append(doc)
39
+ ntry += 1
40
+ logging.info(f"DB Read try: {ntry}")
41
+
42
+ rss_df = pd.DataFrame(docs)
43
+ except Exception as insert_err:
44
+ self.__error = 1
45
+ logging.critical(f"Error in DBOperations.read(): {insert_err}")
46
+ rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
47
+ 'description': '', 'parsed_date': '',
48
+ 'src': ''}, index=[0])
49
+ return rss_df
50
+
51
+ def __close_connection(self):
52
+ if self.__client is not None:
53
+ self.__client.close()
54
+ self.__client = None
55
+
56
+ async def read_news_from_db(self):
57
+ rss_df = pd.DataFrame({'_id': '', 'title': '', 'url': '',
58
+ 'description': '', 'parsed_date': '',
59
+ 'src': ''}, index=[0])
60
+ if self.url is not None:
61
+ if self.__error == 0:
62
+ await self.__connect()
63
+ if self.__error == 0:
64
+ rss_df = await self.__read()
65
+ if self.__error == 0:
66
+ logging.info("Read Successful")
67
+ if self.__client is not None:
68
+ self.__close_connection()
69
+ return rss_df
indian_news_app_load_tests/indian_news_app_100_conc_users.html CHANGED
The diff for this file is too large to render. See raw diff
 
indian_news_app_load_tests/indian_news_app_1_user.html CHANGED
The diff for this file is too large to render. See raw diff
 
indian_news_app_load_tests/indian_news_app_25_conc_users.html CHANGED
The diff for this file is too large to render. See raw diff
 
indian_news_app_load_tests/indian_news_app_50_conc_users.html CHANGED
The diff for this file is too large to render. See raw diff
 
indian_news_app_load_tests/indian_news_app_5_conc_users.html CHANGED
The diff for this file is too large to render. See raw diff
 
newrelic.ini ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---------------------------------------------------------------------------
2
+
3
+ #
4
+ # This file configures the New Relic Python Agent.
5
+ #
6
+ # The path to the configuration file should be supplied to the function
7
+ # newrelic.agent.initialize() when the agent is being initialized.
8
+ #
9
+ # The configuration file follows a structure similar to what you would
10
+ # find for Microsoft Windows INI files. For further information on the
11
+ # configuration file format see the Python ConfigParser documentation at:
12
+ #
13
+ # http://docs.python.org/library/configparser.html
14
+ #
15
+ # For further discussion on the behaviour of the Python agent that can
16
+ # be configured via this configuration file see:
17
+ #
18
+ # https://docs.newrelic.com/docs/apm/agents/python-agent/configuration/python-agent-configuration/
19
+ #
20
+
21
+ # ---------------------------------------------------------------------------
22
+
23
+ # Here are the settings that are common to all environments.
24
+
25
+ [newrelic]
26
+
27
+ # You must specify the license key associated with your New
28
+ # Relic account. This may also be set using the NEW_RELIC_LICENSE_KEY
29
+ # environment variable. This key binds the Python Agent's data to
30
+ # your account in the New Relic service. For more information on
31
+ # storing and generating license keys, see
32
+ # https://docs.newrelic.com/docs/apis/intro-apis/new-relic-api-keys/#ingest-license-key
33
+
34
+
35
+ # The application name. Set this to be the name of your
36
+ # application as you would like it to show up in New Relic UI.
37
+ # You may also set this using the NEW_RELIC_APP_NAME environment variable.
38
+ # The UI will then auto-map instances of your application into an
39
+ # entry on your home dashboard page. You can also specify multiple
40
+ # app names to group your aggregated data. For further details,
41
+ # please see:
42
+ # https://docs.newrelic.com/docs/apm/agents/manage-apm-agents/app-naming/use-multiple-names-app/
43
+ app_name = news_aggregator
44
+
45
+ # When "true", the agent collects performance data about your
46
+ # application and reports this data to the New Relic UI at
47
+ # newrelic.com. This global switch is normally overridden for
48
+ # each environment below. It may also be set using the
49
+ # NEW_RELIC_MONITOR_MODE environment variable.
50
+ monitor_mode = true
51
+
52
+ # Sets the name of a file to log agent messages to. Whatever you
53
+ # set this to, you must ensure that the permissions for the
54
+ # containing directory and the file itself are correct, and
55
+ # that the user that your web application runs as can write out
56
+ # to the file. If not able to write out a log file, it is also
57
+ # possible to say "stderr" and output to standard error output.
58
+ # This would normally result in output appearing in your web
59
+ # server log. It can also be set using the NEW_RELIC_LOG
60
+ # environment variable.
61
+ log_file = stdout
62
+
63
+ # Sets the level of detail of messages sent to the log file, if
64
+ # a log file location has been provided. Possible values, in
65
+ # increasing order of detail, are: "critical", "error", "warning",
66
+ # "info" and "debug". When reporting any agent issues to New
67
+ # Relic technical support, the most useful setting for the
68
+ # support engineers is "debug". However, this can generate a lot
69
+ # of information very quickly, so it is best not to keep the
70
+ # agent at this level for longer than it takes to reproduce the
71
+ # problem you are experiencing. This may also be set using the
72
+ # NEW_RELIC_LOG_LEVEL environment variable.
73
+ log_level = info
74
+
75
+ # High Security Mode enforces certain security settings, and prevents
76
+ # them from being overridden, so that no sensitive data is sent to New
77
+ # Relic. Enabling High Security Mode means that request parameters are
78
+ # not collected and SQL can not be sent to New Relic in its raw form.
79
+ # To activate High Security Mode, it must be set to 'true' in this
80
+ # local .ini configuration file AND be set to 'true' in the
81
+ # server-side configuration in the New Relic user interface. It can
82
+ # also be set using the NEW_RELIC_HIGH_SECURITY environment variable.
83
+ # For details, see
84
+ # https://docs.newrelic.com/docs/subscriptions/high-security
85
+ high_security = false
86
+
87
+ # The Python Agent will attempt to connect directly to the New
88
+ # Relic service. If there is an intermediate firewall between
89
+ # your host and the New Relic service that requires you to use a
90
+ # HTTP proxy, then you should set both the "proxy_host" and
91
+ # "proxy_port" settings to the required values for the HTTP
92
+ # proxy. The "proxy_user" and "proxy_pass" settings should
93
+ # additionally be set if proxy authentication is implemented by
94
+ # the HTTP proxy. The "proxy_scheme" setting dictates what
95
+ # protocol scheme is used in talking to the HTTP proxy. This
96
+ # would normally always be set as "http" which will result in the
97
+ # agent then using a SSL tunnel through the HTTP proxy for end to
98
+ # end encryption.
99
+ # See https://docs.newrelic.com/docs/apm/agents/python-agent/configuration/python-agent-configuration/#proxy
100
+ # for information on proxy configuration via environment variables.
101
+ # proxy_scheme = http
102
+ # proxy_host = hostname
103
+ # proxy_port = 8080
104
+ # proxy_user =
105
+ # proxy_pass =
106
+
107
+ # Capturing request parameters is off by default. To enable the
108
+ # capturing of request parameters, first ensure that the setting
109
+ # "attributes.enabled" is set to "true" (the default value), and
110
+ # then add "request.parameters.*" to the "attributes.include"
111
+ # setting. For details about attributes configuration, please
112
+ # consult the documentation.
113
+ # attributes.include = request.parameters.*
114
+
115
+ # The transaction tracer captures deep information about slow
116
+ # transactions and sends this to the UI on a periodic basis. The
117
+ # transaction tracer is enabled by default. Set this to "false"
118
+ # to turn it off.
119
+ transaction_tracer.enabled = true
120
+
121
+ # Threshold in seconds for when to collect a transaction trace.
122
+ # When the response time of a controller action exceeds this
123
+ # threshold, a transaction trace will be recorded and sent to
124
+ # the UI. Valid values are any positive float value, or (default)
125
+ # "apdex_f", which will use the threshold for a dissatisfying
126
+ # Apdex controller action - four times the Apdex T value.
127
+ transaction_tracer.transaction_threshold = apdex_f
128
+
129
+ # When the transaction tracer is on, SQL statements can
130
+ # optionally be recorded. The recorder has three modes, "off"
131
+ # which sends no SQL, "raw" which sends the SQL statement in its
132
+ # original form, and "obfuscated", which strips out numeric and
133
+ # string literals.
134
+ transaction_tracer.record_sql = obfuscated
135
+
136
+ # Threshold in seconds for when to collect stack trace for a SQL
137
+ # call. In other words, when SQL statements exceed this
138
+ # threshold, then capture and send to the UI the current stack
139
+ # trace. This is helpful for pinpointing where long SQL calls
140
+ # originate from in an application.
141
+ transaction_tracer.stack_trace_threshold = 0.5
142
+
143
+ # Determines whether the agent will capture query plans for slow
144
+ # SQL queries. Only supported in MySQL and PostgreSQL. Set this
145
+ # to "false" to turn it off.
146
+ transaction_tracer.explain_enabled = true
147
+
148
+ # Threshold for query execution time below which query plans
149
+ # will not be captured. Relevant only when "explain_enabled"
150
+ # is true.
151
+ transaction_tracer.explain_threshold = 0.5
152
+
153
+ # Space separated list of function or method names in form
154
+ # 'module:function' or 'module:class.function' for which
155
+ # additional function timing instrumentation will be added.
156
+ transaction_tracer.function_trace =
157
+
158
+ # The error collector captures information about uncaught
159
+ # exceptions or logged exceptions and sends them to UI for
160
+ # viewing. The error collector is enabled by default. Set this
161
+ # to "false" to turn it off. For more details on errors, see
162
+ # https://docs.newrelic.com/docs/apm/agents/manage-apm-agents/agent-data/manage-errors-apm-collect-ignore-or-mark-expected/
163
+ error_collector.enabled = true
164
+
165
+ # To stop specific errors from reporting to the UI, set this to
166
+ # a space separated list of the Python exception type names to
167
+ # ignore. The exception name should be of the form 'module:class'.
168
+ error_collector.ignore_classes =
169
+
170
+ # Expected errors are reported to the UI but will not affect the
171
+ # Apdex or error rate. To mark specific errors as expected, set this
172
+ # to a space separated list of the Python exception type names to
173
+ # expect. The exception name should be of the form 'module:class'.
174
+ error_collector.expected_classes =
175
+
176
+ # Browser monitoring is the Real User Monitoring feature of the UI.
177
+ # For those Python web frameworks that are supported, this
178
+ # setting enables the auto-insertion of the browser monitoring
179
+ # JavaScript fragments.
180
+ browser_monitoring.auto_instrument = true
181
+
182
+ # A thread profiling session can be scheduled via the UI when
183
+ # this option is enabled. The thread profiler will periodically
184
+ # capture a snapshot of the call stack for each active thread in
185
+ # the application to construct a statistically representative
186
+ # call tree. For more details on the thread profiler tool, see
187
+ # https://docs.newrelic.com/docs/apm/apm-ui-pages/events/thread-profiler-tool/
188
+ thread_profiler.enabled = true
189
+
190
+ # Your application deployments can be recorded through the
191
+ # New Relic REST API. To use this feature provide your API key
192
+ # below then use the `newrelic-admin record-deploy` command.
193
+ # This can also be set using the NEW_RELIC_API_KEY
194
+ # environment variable.
195
+ # api_key =
196
+
197
+ # Distributed tracing lets you see the path that a request takes
198
+ # through your distributed system. For more information, please
199
+ # consult our distributed tracing planning guide.
200
+ # https://docs.newrelic.com/docs/transition-guide-distributed-tracing
201
+ distributed_tracing.enabled = true
202
+
203
+ # This setting enables log decoration, the forwarding of log events,
204
+ # and the collection of logging metrics if these sub-feature
205
+ # configurations are also enabled. If this setting is false, no
206
+ # logging instrumentation features are enabled. This can also be
207
+ # set using the NEW_RELIC_APPLICATION_LOGGING_ENABLED environment
208
+ # variable.
209
+ # application_logging.enabled = true
210
+
211
+ # If true, the agent captures log records emitted by your application
212
+ # and forwards them to New Relic. `application_logging.enabled` must
213
+ # also be true for this setting to take effect. You can also set
214
+ # this using the NEW_RELIC_APPLICATION_LOGGING_FORWARDING_ENABLED
215
+ # environment variable.
216
+ # application_logging.forwarding.enabled = true
217
+
218
+ # If true, the agent decorates logs with metadata to link to entities,
219
+ # hosts, traces, and spans. `application_logging.enabled` must also
220
+ # be true for this setting to take effect. This can also be set
221
+ # using the NEW_RELIC_APPLICATION_LOGGING_LOCAL_DECORATING_ENABLED
222
+ # environment variable.
223
+ # application_logging.local_decorating.enabled = true
224
+
225
+ # If true, the agent captures metrics related to the log lines
226
+ # being sent up by your application. This can also be set
227
+ # using the NEW_RELIC_APPLICATION_LOGGING_METRICS_ENABLED
228
+ # environment variable.
229
+ # application_logging.metrics.enabled = true
230
+
231
+
232
+ # ---------------------------------------------------------------------------
233
+
234
+ #
235
+ # The application environments. These are specific settings which
236
+ # override the common environment settings. The settings related to a
237
+ # specific environment will be used when the environment argument to the
238
+ # newrelic.agent.initialize() function has been defined to be either
239
+ # "development", "test", "staging" or "production".
240
+ #
241
+
242
+ [newrelic:development]
243
+ monitor_mode = false
244
+
245
+ [newrelic:test]
246
+ monitor_mode = false
247
+
248
+ [newrelic:staging]
249
+ app_name = (Staging)
250
+ monitor_mode = true
251
+
252
+ [newrelic:production]
253
+ monitor_mode = true
254
+
255
+ # ---------------------------------------------------------------------------
requirements.txt CHANGED
@@ -1,15 +1,16 @@
1
- regex==2021.8.3
2
- lxml==4.6.3
3
- numpy==1.21.1
4
- python-dateutil==2.8.2
5
- pandas==1.3.1
6
- requests==2.26.0
7
- bs4==0.0.1
8
- gunicorn
9
- pymongo==4.3.3
10
- unidecode
11
- redis
12
- asyncio
13
- uvicorn
14
- Quart
15
- quart-cors
 
 
1
+ regex==2021.8.3
2
+ lxml==4.6.3
3
+ numpy==1.21.1
4
+ python-dateutil==2.8.2
5
+ pandas==1.3.1
6
+ requests==2.26.0
7
+ bs4==0.0.1
8
+ gunicorn
9
+ pymongo==4.3.3
10
+ unidecode
11
+ redis
12
+ newrelic
13
+ asyncio
14
+ uvicorn
15
+ Quart
16
+ quart-cors
start.sh CHANGED
@@ -1,6 +1,6 @@
1
- #!/bin/bash
2
- redis-server --daemonize yes
3
- redis-cli config set save ""
4
- redis-cli config set appendonly no
5
- redis-cli config set stop-writes-on-bgsave-error no
6
  gunicorn -b 0.0.0.0:7860 --timeout 120 --worker-class uvicorn.workers.UvicornWorker --workers 5 --threads 5 app:app
 
1
+ #!/bin/bash
2
+ redis-server --daemonize yes
3
+ redis-cli config set save ""
4
+ redis-cli config set appendonly no
5
+ redis-cli config set stop-writes-on-bgsave-error no
6
  gunicorn -b 0.0.0.0:7860 --timeout 120 --worker-class uvicorn.workers.UvicornWorker --workers 5 --threads 5 app:app
static/styles.css CHANGED
@@ -1,510 +1,510 @@
1
- html {
2
- scroll-behavior: smooth;
3
- }
4
-
5
- @media screen and (min-width: 800px) {
6
- a.headline {
7
- background-color: #E5E4E2;
8
- display: block;
9
- width: relative;
10
- text-decoration: none;
11
- color: black;
12
- line-height: 1.2;
13
- align: justify;
14
- border-left: 5px solid transparent;
15
- border-top: 5px solid transparent;
16
- border-bottom: 5px solid transparent;
17
- border-right: 0px;
18
- font-weight: bold;
19
- font-size: 18px;
20
- padding-right: 5px;
21
- font-family: Arial, Helvetica, sans-serif;
22
- }
23
- }
24
-
25
- @media screen and (max-width: 800px) {
26
- a.headline {
27
- background-color: #E5E4E2;
28
- display: block;
29
- width: relative;
30
- text-decoration: none;
31
- color: black;
32
- line-height: 1.2;
33
- align: justify;
34
- border-left: 5px solid transparent;
35
- border-top: 5px solid transparent;
36
- border-bottom: 5px solid transparent;
37
- border-right: 0px;
38
- font-weight: bold;
39
- font-size: 16.5px;
40
- padding-right: 5px;
41
- font-family: Arial, Helvetica, sans-serif;
42
- }
43
- }
44
-
45
- @media screen and (min-width: 800px) {
46
- a.description {
47
- background-color: #E5E4E2;
48
- align:justify;
49
- text-align: justify;
50
- display: block;
51
- height:100%;
52
- width: relative;
53
- text-decoration: none;
54
- border-left: 5px solid transparent;
55
- border-top: 0px;
56
- border-bottom: 7px solid transparent;
57
- border-right: 0px;
58
- font-size: 14px;
59
- padding-right: 5px;
60
- font-family: Arial, Helvetica, sans-serif;
61
- color: dimgrey;
62
- }
63
- }
64
-
65
- @media screen and (max-width: 800px) {
66
- a.description {
67
- background-color: #E5E4E2;
68
- align:justify;
69
- text-align: justify;
70
- display: block;
71
- height:100%;
72
- width: relative;
73
- text-decoration: none;
74
- border-left: 5px solid transparent;
75
- border-top: 0px;
76
- border-bottom: 7px solid transparent;
77
- border-right: 0px;
78
- font-size: 12.5px;
79
- padding-right: 5px;
80
- font-family: Arial, Helvetica, sans-serif;
81
- color: dimgrey;
82
- }
83
- }
84
-
85
- @media screen and (min-width: 800px) {
86
- a.time {
87
- background-color: #E5E4E2;
88
- align:justify;
89
- display: block;
90
- height:100%;
91
- width: relative;
92
- text-decoration: none;
93
- border-left: 5px solid transparent;
94
- border-top: 0px;
95
- border-bottom: 1px solid transparent;
96
- border-right: 0px;
97
- padding-right: 5px;
98
- font-size: 11px;
99
- padding-bottom: 5px;
100
- font-family: Arial, Helvetica, sans-serif;
101
- color: green;
102
- }
103
- }
104
-
105
- @media screen and (max-width: 800px) {
106
- a.time {
107
- background-color: #E5E4E2;
108
- align:justify;
109
- display: block;
110
- height:100%;
111
- width: relative;
112
- text-decoration: none;
113
- border-left: 5px solid transparent;
114
- border-top: 0px;
115
- border-bottom: 1px solid transparent;
116
- border-right: 0px;
117
- padding-right: 5px;
118
- font-size: 10px;
119
- padding-bottom: 5px;
120
- font-family: Arial, Helvetica, sans-serif;
121
- color: green;
122
- }
123
- }
124
-
125
- .box {
126
- display: flex;
127
- justify-content: center;
128
- align-items: center;
129
- height: inherit;
130
- padding: 20px;
131
- }
132
- @media screen and (min-width: 800px) {
133
- form {
134
- width: 50%;
135
- overflow-x: hidden;
136
- padding: 20px;
137
- border-radius: 10px;
138
- background: #fff;
139
- box-shadow: 0 0 20px 0 #095484;
140
- }}
141
-
142
- @media screen and (max-width: 800px) {
143
- form {
144
- width: 100%;
145
- overflow-x: hidden;
146
- padding: 20px;
147
- border-radius: 10px;
148
- background: #fff;
149
- box-shadow: 0 0 15px 0 #095484;
150
- }}
151
- .banner {
152
- position: relative;
153
- height: 30px;
154
- /* background-size: cover; */
155
- display: flex;
156
- /* justify-content: center; */
157
- /* align-items: center; */
158
- /* text-align: center; */
159
- }
160
- @media screen and (min-width: 800px) {
161
- h1 {
162
- position: absolute;
163
- margin: 0;
164
- padding-left: 50px;
165
- font-size: 25px;
166
- color: black;
167
- z-index: 2;
168
- font-family: Arial, Helvetica, sans-serif;
169
- }
170
- }
171
-
172
- @media screen and (max-width: 800px) {
173
- h1 {
174
- position: absolute;
175
- margin: 0;
176
- padding-left: 40px;
177
- font-size: 24px;
178
- color: black;
179
- z-index: 2;
180
- font-family: Arial, Helvetica, sans-serif;
181
- }
182
- }
183
-
184
- p.unavailable {
185
- background-color: #E5E4E2;
186
- display: block;
187
- width: 100%;
188
- text-decoration: none;
189
- color: black;
190
- line-height: 1.2;
191
- align: justify;
192
- border-left: 5px solid transparent;
193
- border-top: 5px solid transparent;
194
- border-bottom: 5px solid transparent;
195
- border-right: 0px;
196
- font-weight: bold;
197
- font-size: 18px;
198
- padding-right: 5px;
199
- font-family: Arial, Helvetica, sans-serif;
200
- }
201
- div.news-item{
202
- background-color: #E5E4E2;
203
- /*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
204
- box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
205
-
206
-
207
- }
208
- div.news-item:hover{
209
- box-shadow: none;
210
- }
211
-
212
- @media screen and (min-width: 800px) {
213
- p.srctxt {
214
- align:justify;
215
- text-align: justify;
216
- word-break: break-all;
217
- font-size: 11px;
218
- font-family: Arial, Helvetica, sans-serif;
219
- }
220
- .logo-img{
221
- margin-right: 10px;
222
- vertical-align: center;
223
- /* position: relative; */
224
- width: 34px;
225
- height: 34px;
226
-
227
- }
228
- }
229
-
230
- @media screen and (max-width: 800px) {
231
- p.srctxt {
232
- align:justify;
233
- text-align: justify;
234
- word-break: break-all;
235
- font-size: 9px;
236
- font-family: Arial, Helvetica, sans-serif;
237
- }
238
- .logo-img{
239
- margin-right: 10px;
240
- vertical-align: top;
241
- /* position: absolute; */
242
- width: 30px;
243
- height: 30px;
244
- }
245
- }
246
-
247
- .float{
248
- position:fixed;
249
- width:25px;
250
- height:25px;
251
- bottom:15px;
252
- right:12px;
253
- background-color: white;
254
- border-radius:50%;
255
- text-align:center;
256
- vertical-align:center;
257
- z-index: 99999998;
258
- font-size:0;
259
- cursor:pointer;
260
- animation: beatan 0.8s infinite alternate;
261
-
262
- }
263
- .top-float{
264
- position:fixed;
265
- width:25px;
266
- height:25px;
267
- bottom:52px;
268
- right:12px;
269
- background-color: white;
270
- border-radius:50%;
271
- text-align:center;
272
- vertical-align:center;
273
- z-index: 99999998;
274
- font-size:0;
275
- cursor:pointer;
276
- animation: beatan 0.8s infinite alternate;
277
-
278
- }
279
- .my-float{
280
- margin-top:22px;
281
- }
282
-
283
- @keyframes beatan{
284
- to { transform: scale(1.1); }
285
- }
286
-
287
- .loader {
288
- position: fixed;
289
- left: 0px;
290
- top: 0px;
291
- width: 100%;
292
- height: 100%;
293
- z-index: 99999999999;
294
- background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
295
- }
296
-
297
- .highlight {
298
- background-color: yellow;
299
- font-weight: bold;
300
- }
301
-
302
- .input-container {
303
- position: relative;
304
- padding-bottom: 10px;
305
- }
306
-
307
- .keyword-input {
308
-
309
- border-radius: 5px;
310
- transition: border-color 0.3s ease;
311
- border: 1px solid silver;
312
- width: 10em;
313
- height: 1.5em;
314
- padding-left: 0.5em;
315
- outline: none;
316
- overflow: hidden;
317
-
318
- }
319
-
320
- .clear-btn {
321
- position: absolute;
322
- font-size: 20px;
323
- left: 129px;
324
- transform: translateY(-105%);
325
- cursor: pointer;
326
- opacity: 0;
327
- transition: opacity 0.3s ease;
328
- }
329
-
330
- .clear-btn.show {
331
- opacity: 1;
332
- }
333
-
334
- @media screen and (min-width: 800px) {
335
- a.article-category {
336
- background-color: #E5E4E2;
337
- align:justify;
338
- display: block;
339
- height:100%;
340
- width: relative;
341
- text-decoration: none;
342
- border-left: 5px solid transparent;
343
- border-top: 0px;
344
- font-weight: bold;
345
- border-bottom: 1px solid transparent;
346
- border-right: 0px;
347
- padding-right: 5px;
348
- font-size: 11px;
349
- padding-bottom: 0px;
350
- font-family: Arial, Helvetica, sans-serif;
351
- color: green;
352
- }
353
- }
354
-
355
- @media screen and (max-width: 800px) {
356
- a.article-category {
357
- background-color: #E5E4E2;
358
- align:justify;
359
- display: block;
360
- height:100%;
361
- font-weight: bold;
362
- width: relative;
363
- text-decoration: none;
364
- border-left: 5px solid transparent;
365
- border-top: 0px;
366
- border-bottom: 1px solid transparent;
367
- border-right: 0px;
368
- padding-right: 5px;
369
- font-size: 10px;
370
- padding-bottom: 0px;
371
- font-family: Arial, Helvetica, sans-serif;
372
- color: green;
373
- }
374
- }
375
-
376
- .content {
377
- display: none;
378
- font-family: Arial, Helvetica, sans-serif;
379
-
380
- padding-right: 5px;
381
-
382
- padding-top: 5px;
383
- border-left: 5px solid transparent;
384
- }
385
-
386
- .container{
387
- padding-bottom:10px;
388
- }
389
-
390
- .show-similar-button-container{
391
- display: flex;
392
- flex-direction: column;
393
- align-items: center;
394
- }
395
-
396
- .similar-news-item:hover {
397
- text-decoration: none;
398
- }
399
-
400
- @media screen and (min-width: 800px) {
401
- .similar-news-item {
402
- text-align: justify;
403
- text-decoration: underline;
404
- font-size: 14px;
405
- font-family: Arial, Helvetica, sans-serif;
406
- color: black;
407
- display:inline-block;
408
- padding-bottom: 10px;
409
- width:100%;
410
- /*white-space: nowrap;
411
- overflow: hidden;
412
- text-overflow: ellipsis;*/
413
-
414
- }
415
- }
416
-
417
- @media screen and (max-width: 800px) {
418
- .similar-news-item {
419
- text-align: justify;
420
- text-decoration: underline;
421
- font-size: 12px;
422
- font-family: Arial, Helvetica, sans-serif;
423
- color: black;
424
- display:inline-block;
425
- padding-bottom: 8px;
426
- width:100%;
427
- /*white-space: nowrap;
428
- overflow: hidden;
429
- text-overflow: ellipsis;*/
430
- }
431
- }
432
-
433
-
434
-
435
- .show-more {
436
- background-color: #E5E4E2;
437
- font-family: Arial, Helvetica, sans-serif;
438
- border-radius:4px;
439
- padding-top:3px;
440
- padding-bottom:3px;
441
- padding-left:3px;
442
- padding-right:3px;
443
- font-size: 12px;
444
- display: box;
445
- border: none;
446
-
447
- }
448
-
449
- .show-more:hover {
450
- background-color: black;
451
- color: white;
452
- }
453
-
454
- .show-less {
455
- background-color: #E5E4E2;
456
- font-family: Arial, Helvetica, sans-serif;
457
- border-radius:4px;
458
- padding-top:3px;
459
- padding-bottom:3px;
460
- padding-left:3px;
461
- padding-right:3px;
462
- font-size: 12px;
463
- border: none;
464
- display: none;
465
- }
466
-
467
- .show-less:hover {
468
- background-color: black;
469
- color: white;
470
- }
471
-
472
- .word-cloud-container{
473
- word-wrap: break-word;
474
- padding-bottom: 10px;
475
-
476
- }
477
-
478
- .wc-tokens{
479
- font-family: Arial, Helvetica, sans-serif;
480
- font-size: 13.2px;
481
- cursor: pointer;
482
- }
483
-
484
- .wc-tokens:hover{
485
- text-decoration: underline;
486
- }
487
-
488
- .word-cloud-section{
489
- padding-bottom: 10px;
490
- display: none;
491
- word-wrap: break-word;
492
- }
493
-
494
- .show-more-word-cloud{
495
- padding-bottom: 23px;
496
- text-align: center;
497
- }
498
-
499
- .three-dots{
500
- font-size: 30px;
501
- margin: 0;
502
- line-height:0;
503
- vertical-align: top;
504
- padding: 0;
505
- cursor: pointer;
506
- }
507
-
508
- .three-dots:hover{
509
- font-size: 25px;
510
  }
 
1
+ html {
2
+ scroll-behavior: smooth;
3
+ }
4
+
5
+ @media screen and (min-width: 800px) {
6
+ a.headline {
7
+ background-color: #E5E4E2;
8
+ display: block;
9
+ width: relative;
10
+ text-decoration: none;
11
+ color: black;
12
+ line-height: 1.2;
13
+ align: justify;
14
+ border-left: 5px solid transparent;
15
+ border-top: 5px solid transparent;
16
+ border-bottom: 5px solid transparent;
17
+ border-right: 0px;
18
+ font-weight: bold;
19
+ font-size: 18px;
20
+ padding-right: 5px;
21
+ font-family: Arial, Helvetica, sans-serif;
22
+ }
23
+ }
24
+
25
+ @media screen and (max-width: 800px) {
26
+ a.headline {
27
+ background-color: #E5E4E2;
28
+ display: block;
29
+ width: relative;
30
+ text-decoration: none;
31
+ color: black;
32
+ line-height: 1.2;
33
+ align: justify;
34
+ border-left: 5px solid transparent;
35
+ border-top: 5px solid transparent;
36
+ border-bottom: 5px solid transparent;
37
+ border-right: 0px;
38
+ font-weight: bold;
39
+ font-size: 16.5px;
40
+ padding-right: 5px;
41
+ font-family: Arial, Helvetica, sans-serif;
42
+ }
43
+ }
44
+
45
+ @media screen and (min-width: 800px) {
46
+ a.description {
47
+ background-color: #E5E4E2;
48
+ align:justify;
49
+ text-align: justify;
50
+ display: block;
51
+ height:100%;
52
+ width: relative;
53
+ text-decoration: none;
54
+ border-left: 5px solid transparent;
55
+ border-top: 0px;
56
+ border-bottom: 7px solid transparent;
57
+ border-right: 0px;
58
+ font-size: 14px;
59
+ padding-right: 5px;
60
+ font-family: Arial, Helvetica, sans-serif;
61
+ color: dimgrey;
62
+ }
63
+ }
64
+
65
+ @media screen and (max-width: 800px) {
66
+ a.description {
67
+ background-color: #E5E4E2;
68
+ align:justify;
69
+ text-align: justify;
70
+ display: block;
71
+ height:100%;
72
+ width: relative;
73
+ text-decoration: none;
74
+ border-left: 5px solid transparent;
75
+ border-top: 0px;
76
+ border-bottom: 7px solid transparent;
77
+ border-right: 0px;
78
+ font-size: 12.5px;
79
+ padding-right: 5px;
80
+ font-family: Arial, Helvetica, sans-serif;
81
+ color: dimgrey;
82
+ }
83
+ }
84
+
85
+ @media screen and (min-width: 800px) {
86
+ a.time {
87
+ background-color: #E5E4E2;
88
+ align:justify;
89
+ display: block;
90
+ height:100%;
91
+ width: relative;
92
+ text-decoration: none;
93
+ border-left: 5px solid transparent;
94
+ border-top: 0px;
95
+ border-bottom: 1px solid transparent;
96
+ border-right: 0px;
97
+ padding-right: 5px;
98
+ font-size: 11px;
99
+ padding-bottom: 5px;
100
+ font-family: Arial, Helvetica, sans-serif;
101
+ color: green;
102
+ }
103
+ }
104
+
105
+ @media screen and (max-width: 800px) {
106
+ a.time {
107
+ background-color: #E5E4E2;
108
+ align:justify;
109
+ display: block;
110
+ height:100%;
111
+ width: relative;
112
+ text-decoration: none;
113
+ border-left: 5px solid transparent;
114
+ border-top: 0px;
115
+ border-bottom: 1px solid transparent;
116
+ border-right: 0px;
117
+ padding-right: 5px;
118
+ font-size: 10px;
119
+ padding-bottom: 5px;
120
+ font-family: Arial, Helvetica, sans-serif;
121
+ color: green;
122
+ }
123
+ }
124
+
125
+ .box {
126
+ display: flex;
127
+ justify-content: center;
128
+ align-items: center;
129
+ height: inherit;
130
+ padding: 20px;
131
+ }
132
+ @media screen and (min-width: 800px) {
133
+ form {
134
+ width: 50%;
135
+ overflow-x: hidden;
136
+ padding: 20px;
137
+ border-radius: 10px;
138
+ background: #fff;
139
+ box-shadow: 0 0 20px 0 #095484;
140
+ }}
141
+
142
+ @media screen and (max-width: 800px) {
143
+ form {
144
+ width: 100%;
145
+ overflow-x: hidden;
146
+ padding: 20px;
147
+ border-radius: 10px;
148
+ background: #fff;
149
+ box-shadow: 0 0 15px 0 #095484;
150
+ }}
151
+ .banner {
152
+ position: relative;
153
+ height: 30px;
154
+ /* background-size: cover; */
155
+ display: flex;
156
+ /* justify-content: center; */
157
+ /* align-items: center; */
158
+ /* text-align: center; */
159
+ }
160
+ @media screen and (min-width: 800px) {
161
+ h1 {
162
+ position: absolute;
163
+ margin: 0;
164
+ padding-left: 50px;
165
+ font-size: 25px;
166
+ color: black;
167
+ z-index: 2;
168
+ font-family: Arial, Helvetica, sans-serif;
169
+ }
170
+ }
171
+
172
+ @media screen and (max-width: 800px) {
173
+ h1 {
174
+ position: absolute;
175
+ margin: 0;
176
+ padding-left: 40px;
177
+ font-size: 24px;
178
+ color: black;
179
+ z-index: 2;
180
+ font-family: Arial, Helvetica, sans-serif;
181
+ }
182
+ }
183
+
184
+ p.unavailable {
185
+ background-color: #E5E4E2;
186
+ display: block;
187
+ width: 100%;
188
+ text-decoration: none;
189
+ color: black;
190
+ line-height: 1.2;
191
+ align: justify;
192
+ border-left: 5px solid transparent;
193
+ border-top: 5px solid transparent;
194
+ border-bottom: 5px solid transparent;
195
+ border-right: 0px;
196
+ font-weight: bold;
197
+ font-size: 18px;
198
+ padding-right: 5px;
199
+ font-family: Arial, Helvetica, sans-serif;
200
+ }
201
+ div.news-item{
202
+ background-color: #E5E4E2;
203
+ /*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
204
+ box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
205
+
206
+
207
+ }
208
+ div.news-item:hover{
209
+ box-shadow: none;
210
+ }
211
+
212
/* Source line and source logo. Shared declarations apply on every screen;
   viewports of 800px or less shrink the text and the logo. */
@media screen {
    p.srctxt {
        text-align: justify;
        word-break: break-all;
        font-size: 11px;
        font-family: Arial, Helvetica, sans-serif;
    }

    .logo-img {
        margin-right: 10px;
        width: 34px;
        height: 34px;
        /* `vertical-align: center` was removed: `center` is not a valid value,
           so the declaration was ignored. `middle` is the valid keyword if
           vertical centring is wanted on wide screens. */
    }
}

@media screen and (max-width: 800px) {
    p.srctxt {
        font-size: 9px;
    }

    .logo-img {
        vertical-align: top;
        width: 30px;
        height: 30px;
    }
}
/* The invalid `align:justify` declarations were dropped from p.srctxt;
   `text-align: justify` already provides the justification. */
246
+
247
/* Two round, pulsing buttons pinned to the bottom-right corner of the
   viewport. They differ only in their distance from the bottom edge. */
.float,
.top-float {
    position: fixed;
    right: 12px;
    width: 25px;
    height: 25px;
    background-color: white;
    border-radius: 50%;
    text-align: center;
    z-index: 99999998;
    font-size: 0;
    cursor: pointer;
    animation: beatan 0.8s infinite alternate;
    /* `vertical-align:center` was removed: `center` is not a valid value, so
       the declaration was ignored. */
}

.float {
    bottom: 15px;
}

.top-float {
    bottom: 52px;
}

.my-float {
    margin-top: 22px;
}

/* Pulse: grow to 110%; `alternate` on the animation plays it back again. */
@keyframes beatan {
    to { transform: scale(1.1); }
}
286
+
287
/* Full-viewport white overlay with the loader image centred on it. */
.loader {
    position: fixed;
    top: 0px;
    left: 0px;
    width: 100%;
    height: 100%;
    z-index: 99999999999;
    background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
}

/* Marker applied to keyword matches. */
.highlight {
    font-weight: bold;
    background-color: yellow;
}
301
+
302
/* Positioning context for the clear button that overlays the input. */
.input-container {
    position: relative;
    padding-bottom: 10px;
}

/* Keyword search box. */
.keyword-input {
    width: 10em;
    height: 1.5em;
    padding-left: 0.5em;
    border: 1px solid silver;
    border-radius: 5px;
    outline: none;
    overflow: hidden;
    transition: border-color 0.3s ease;
}

/* Clear button: invisible by default and faded in over 0.3s when its
   opacity is raised. */
.clear-btn {
    position: absolute;
    left: 129px;
    transform: translateY(-105%);
    font-size: 20px;
    cursor: pointer;
    opacity: 0;
    transition: opacity 0.3s ease;
}

.clear-btn.show {
    opacity: 1;
}
333
+
334
/* Category link of a news card. Shared declarations apply on every screen;
   viewports of 800px or less only reduce the font size. */
@media screen {
    a.article-category {
        display: block;
        height: 100%;
        background-color: #E5E4E2;
        color: green;
        font-family: Arial, Helvetica, sans-serif;
        font-size: 11px;
        font-weight: bold;
        text-decoration: none;
        padding-right: 5px;
        padding-bottom: 0px;
        border-left: 5px solid transparent;
        border-top: 0px;
        border-bottom: 1px solid transparent;
        border-right: 0px;
        /* Removed `align:justify` (not a CSS property) and `width: relative`
           (not a valid width); both were ignored by browsers. */
    }
}

@media screen and (max-width: 800px) {
    a.article-category {
        font-size: 10px;
    }
}
375
+
376
/* Collapsible section: hidden until its display value is changed. */
.content {
    display: none;
    padding-top: 5px;
    padding-right: 5px;
    border-left: 5px solid transparent;
    font-family: Arial, Helvetica, sans-serif;
}

.container {
    padding-bottom: 10px;
}

/* Stacks its children vertically and centres them horizontally. */
.show-similar-button-container {
    display: flex;
    flex-direction: column;
    align-items: center;
}
395
+
396
/* The underline disappears on hover. This selector is more specific than the
   plain class rules below, so it wins regardless of source order. */
.similar-news-item:hover {
    text-decoration: none;
}

/* Similar-news entry. Shared declarations apply on every screen; viewports
   of 800px or less use smaller type and tighter spacing. */
@media screen {
    .similar-news-item {
        display: inline-block;
        width: 100%;
        padding-bottom: 10px;
        color: black;
        font-family: Arial, Helvetica, sans-serif;
        font-size: 14px;
        text-align: justify;
        text-decoration: underline;
    }
}

@media screen and (max-width: 800px) {
    .similar-news-item {
        padding-bottom: 8px;
        font-size: 12px;
    }
}
432
+
433
+
434
+
435
/* "Show more" / "show less" buttons share one look. */
.show-more,
.show-less {
    background-color: #E5E4E2;
    font-family: Arial, Helvetica, sans-serif;
    font-size: 12px;
    border: none;
    border-radius: 4px;
    padding: 3px;
    /* `display: box` was removed from .show-more: `box` is not a valid
       display value, so the declaration was ignored. */
}

/* The "show less" button starts hidden. */
.show-less {
    display: none;
}

.show-more:hover,
.show-less:hover {
    background-color: black;
    color: white;
}
471
+
472
.word-cloud-container {
    padding-bottom: 10px;
    word-wrap: break-word;
}

/* One clickable token; underlined while hovered. */
.wc-tokens {
    font-family: Arial, Helvetica, sans-serif;
    font-size: 13.2px;
    cursor: pointer;
}

.wc-tokens:hover {
    text-decoration: underline;
}

/* Hidden until its display value is changed. */
.word-cloud-section {
    display: none;
    padding-bottom: 10px;
    word-wrap: break-word;
}

.show-more-word-cloud {
    padding-bottom: 23px;
    text-align: center;
}

/* Ellipsis control: zero line height so the 30px glyphs add no row height. */
.three-dots {
    margin: 0;
    padding: 0;
    font-size: 30px;
    line-height: 0;
    vertical-align: top;
    cursor: pointer;
}

.three-dots:hover {
    font-size: 25px;
}
templates/index.html CHANGED
@@ -1,205 +1,205 @@
1
- <!DOCTYPE html>
2
- <html lang="en">
3
- <head>
4
-
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
- <link rel="preload" href="../static/loader.gif" as="image">
7
- <link rel="preload" href="../static/favicon_new.png" as="image">
8
- <link rel="preload" href="../static/refresh_reload_icon.png" as="image">
9
- <link rel="preload" href="../static/top-icon.png" as="image">
10
- <link rel="icon" href="../static/favicon_new.png" type="image/png">
11
-
12
- <meta charset="UTF-8">
13
- <title>Latest Indian News</title>
14
- <link rel="stylesheet" href="static/styles.css">
15
- <a id="top-loc"></a>
16
- <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
17
- <!--
18
- <script>
19
- $(window).load(function(){
20
- $('.loader').fadeOut();
21
- });
22
- </script>
23
- -->
24
-
25
- <script>
26
- function filterContent(match_case) {
27
- var keyword = document.getElementById("keywordInput").value;
28
- if (match_case == false)
29
- {
30
- /*var keyword = document.getElementById("keywordInput").value.toLowerCase(); */
31
- /*var regex = new RegExp("\\b" + keyword + "\\b", "gi"); */
32
- }
33
- var clearbtn = document.getElementById("clearBtn");
34
-
35
- if (keyword !== "")
36
- {
37
- clearbtn.style.opacity = 1;
38
- var items = document.getElementsByClassName("news-item");
39
- for (var i = 0; i < items.length; i++)
40
- {
41
- var headline = items[i].querySelector('.headline');
42
- var description = items[i].querySelector('.description');
43
- if (match_case == true)
44
- {
45
- var article_category = items[i].querySelector('.article-category');
46
- var src_time = items[i].querySelector('.time');
47
- var itemText = headline.textContent.concat(" ", description.textContent, " ", article_category.textContent, " ", src_time.textContent)
48
- }
49
- else
50
- {
51
- var itemText = headline.textContent.concat(" ", description.textContent, " ")
52
- }
53
-
54
- if (match_case == false)
55
- { var regex = new RegExp("\\b" + keyword + "\\b", "gi");
56
- itemText = itemText.toLowerCase();
57
- if (regex.test(itemText) == true)
58
- {
59
- items[i].style.display = "block";
60
- highlightKeyword(headline, keyword, match_case);
61
- highlightKeyword(description, keyword, match_case);
62
- }
63
- else
64
- {
65
- items[i].style.display = "none";
66
- }
67
- }
68
- else
69
- {
70
- if (itemText.includes(keyword))
71
- {
72
- items[i].style.display = "block";
73
- highlightKeyword(headline, keyword, match_case);
74
- highlightKeyword(description, keyword, match_case);
75
- highlightKeyword(article_category, keyword, match_case);
76
- highlightKeyword(src_time, keyword, match_case);
77
-
78
- }
79
- else
80
- {
81
- items[i].style.display = "none";
82
- }
83
- }
84
- }
85
- }
86
- else
87
- {
88
- clearFilter();
89
- }
90
- }
91
-
92
- function clearFilter() {
93
- var items = document.getElementsByClassName("news-item");
94
- var clearbtn = document.getElementById("clearBtn");
95
- clearbtn.style.opacity=0;
96
- for (var i = 0; i < items.length; i++) {
97
- var headline = items[i].querySelector('.headline');
98
- var description = items[i].querySelector('.description');
99
- var article_category = items[i].querySelector('.article-category');
100
- var src_time = items[i].querySelector('.time');
101
- items[i].style.display = "block";
102
- headline.innerHTML = headline.textContent; // Remove highlighting
103
- description.innerHTML = description.textContent; // Remove highlighting
104
- article_category.innerHTML = article_category.textContent; // Remove highlighting
105
- src_time.innerHTML = src_time.textContent; // Remove highlighting
106
- }
107
-
108
- document.getElementById("keywordInput").value = ""; // Clear input field
109
- }
110
-
111
- function highlightKeyword(element, keyword, match_case) {
112
- var regex = new RegExp(keyword);
113
- if (match_case == false)
114
- {
115
- var regex = new RegExp("\\b" + keyword + "\\b", 'gi');
116
- }
117
- element.innerHTML = element.textContent.replace(regex, function(match) {
118
- return '<span class="highlight">' + match + '</span>';
119
- });
120
- }
121
-
122
- </script>
123
-
124
- <script>
125
- document.addEventListener('DOMContentLoaded', function() {
126
- const containers = document.querySelectorAll('.container');
127
-
128
- containers.forEach(container => {
129
- const content = container.querySelector('.content');
130
- const showMoreBtn = container.querySelector('.show-more');
131
- const showLessBtn = container.querySelector('.show-less');
132
-
133
- showMoreBtn.addEventListener('click', function() {
134
- /* var similar_news_items = document.getElementsByClassName("content");
135
- var show_less_items = document.getElementsByClassName("show-less");
136
- var show_more_items = document.getElementsByClassName("show-more");
137
- for (var i = 0; i < similar_news_items.length; i++) {
138
- similar_news_items[i].style.display = 'none';
139
- show_more_items[i].style.display = 'block';
140
- show_less_items[i].style.display = 'none';
141
- }
142
- */
143
-
144
- content.style.display = 'block';
145
- content.style.opacity = 1;
146
- showMoreBtn.style.display = 'none';
147
- showLessBtn.style.display = 'block';
148
- });
149
-
150
- showLessBtn.addEventListener('click', function() {
151
- document.documentElement.style.scrollBehavior = "auto";
152
- var max_h = content.parentElement.parentElement.clientHeight;
153
- content.style.display = 'none';
154
- showMoreBtn.style.display = 'block';
155
- showLessBtn.style.display = 'none';
156
- var min_h = content.parentElement.parentElement.clientHeight;
157
- $(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
158
- document.documentElement.style.scrollBehavior = "smooth";
159
- });
160
- });
161
- });
162
- </script>
163
-
164
- <script>
165
- function wc_search(keyword)
166
- {
167
- clearFilter();
168
- document.getElementById("keywordInput").value = keyword;
169
- filterContent(false);
170
- }
171
-
172
- function word_cloud_display()
173
- {
174
- var word_cloud_section = document.getElementById("word-cloud-section-id");
175
- if (word_cloud_section.style.display == 'block')
176
- {
177
- word_cloud_section.style.display = 'none';
178
- }
179
- else
180
- {
181
- word_cloud_section.style.display = 'block';
182
- }
183
- }
184
- </script>
185
-
186
- <script>
187
- function showSearchInfo() {
188
- alert("- Search is case-sensitive.\n- Search for news category (NATION, WORLD, SPORTS, ENTERTAINMENT, BUSINESS, TECHNOLOGY, HEALTH and SCIENCE) to filter news by category.\n- Search for news source (like zeebiz.com, ndtv.com, etc.) to filter news by source.")
189
- }
190
- </script>
191
-
192
-
193
- </head>
194
- <body>
195
- <!--<div class="loader"></div>-->
196
- {{body | safe}}
197
-
198
- <a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
199
- <img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
200
- </a>
201
- <a href="javascript:window.location.reload(true)" id="theme" class="float">
202
- <img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
203
- </a>
204
- </body>
205
- </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <link rel="preload" href="../static/loader.gif" as="image">
7
+ <link rel="preload" href="../static/favicon_new.png" as="image">
8
+ <link rel="preload" href="../static/refresh_reload_icon.png" as="image">
9
+ <link rel="preload" href="../static/top-icon.png" as="image">
10
+ <link rel="icon" href="../static/favicon_new.png" type="image/png">
11
+
12
+ <meta charset="UTF-8">
13
+ <title>Latest Indian News</title>
14
+ <link rel="stylesheet" href="static/styles.css">
15
+ <a id="top-loc"></a>
16
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
17
+ <!--
18
+ <script>
19
+ $(window).load(function(){
20
+ $('.loader').fadeOut();
21
+ });
22
+ </script>
23
+ -->
24
+
25
+ <script>
26
/**
 * Show only the news cards that match the text typed into #keywordInput.
 *
 * match_case == true  : case-sensitive substring search over the headline,
 *                       description, category and time fields of a card.
 * match_case == false : case-insensitive whole-word search over the headline
 *                       and description only.
 *
 * Matching cards are displayed and their searched fields are passed to
 * highlightKeyword(); all other cards are hidden. An empty keyword hands
 * over to clearFilter().
 */
function filterContent(match_case) {
    var keyword = document.getElementById("keywordInput").value;
    if (keyword === "") {
        clearFilter();
        return;
    }

    document.getElementById("clearBtn").style.opacity = 1;

    var wholeWord = (match_case == false);
    var wordRegex = null;
    if (wholeWord) {
        // The keyword is literal user text, not a pattern: escape the regex
        // metacharacters so input such as "(" or "c++" cannot throw a
        // SyntaxError or match unintended text.
        var literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        // Built once, without the "g" flag, so test() keeps no state
        // between cards.
        wordRegex = new RegExp("\\b" + literal + "\\b", "i");
    }

    var items = document.getElementsByClassName("news-item");
    for (var i = 0; i < items.length; i++) {
        var item = items[i];

        // Fields that are both searched and highlighted in this mode.
        var fields = [item.querySelector('.headline'), item.querySelector('.description')];
        if (match_case == true) {
            fields.push(item.querySelector('.article-category'), item.querySelector('.time'));
        }

        var itemText = fields.map(function (field) { return field.textContent; }).join(" ");
        var matched = wholeWord ? wordRegex.test(itemText) : itemText.includes(keyword);

        if (!matched) {
            item.style.display = "none";
            continue;
        }

        item.style.display = "block";
        for (var j = 0; j < fields.length; j++) {
            highlightKeyword(fields[j], keyword, match_case);
        }
    }
}
91
+
92
/**
 * Undo any active filter: hide the clear button, show every news card again,
 * strip the highlight markup from its text fields and empty the input.
 */
function clearFilter() {
    document.getElementById("clearBtn").style.opacity = 0;

    var fieldSelectors = ['.headline', '.description', '.article-category', '.time'];
    var items = document.getElementsByClassName("news-item");
    for (var i = 0; i < items.length; i++) {
        items[i].style.display = "block";
        for (var j = 0; j < fieldSelectors.length; j++) {
            var field = items[i].querySelector(fieldSelectors[j]);
            if (field === null) {
                continue; // a card without this field has nothing to reset
            }
            // Writing textContent (rather than innerHTML) removes the
            // highlight spans without re-parsing the text as markup, so a
            // "<" or "&" in the text stays a literal character.
            field.textContent = field.textContent;
        }
    }

    document.getElementById("keywordInput").value = ""; // Clear input field
}
110
+
111
/**
 * Re-render `element` from its plain text with every occurrence of `keyword`
 * wrapped in <span class="highlight">.
 *
 * @param {Element} element     node whose textContent is searched; its
 *                              innerHTML is replaced
 * @param {string}  keyword     literal text to highlight (not a regex pattern)
 * @param {boolean} match_case  false: case-insensitive, whole words only;
 *                              otherwise: case-sensitive substring
 */
function highlightKeyword(element, keyword, match_case) {
    // The text is written back through innerHTML, so it must be escaped or a
    // "<" / "&" in the text would be parsed as markup.
    var escapeHtml = function (text) {
        return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
    };

    var source = element.textContent;
    if (keyword === "") {
        // Nothing to highlight; an empty pattern would match at every position.
        element.innerHTML = escapeHtml(source);
        return;
    }

    // Treat the keyword as literal text: escape regex metacharacters.
    var literal = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    var regex = (match_case == false)
        ? new RegExp("\\b" + literal + "\\b", "gi")
        : new RegExp(literal, "g"); // global: mark every hit, not only the first

    var html = "";
    var cursor = 0;
    var hit;
    while ((hit = regex.exec(source)) !== null) {
        html += escapeHtml(source.slice(cursor, hit.index));
        html += '<span class="highlight">' + escapeHtml(hit[0]) + '</span>';
        cursor = hit.index + hit[0].length;
    }
    element.innerHTML = html + escapeHtml(source.slice(cursor));
}
121
+
122
+ </script>
123
+
124
+ <script>
125
// Once the DOM is parsed, wire the "show more" / "show less" toggle of every
// .container element.
document.addEventListener('DOMContentLoaded', function() {
    document.querySelectorAll('.container').forEach(container => {
        const content = container.querySelector('.content');
        const showMoreBtn = container.querySelector('.show-more');
        const showLessBtn = container.querySelector('.show-less');

        // A container without all three parts has nothing to toggle. Skipping
        // it keeps one incomplete container from throwing and leaving every
        // later container unwired.
        if (!content || !showMoreBtn || !showLessBtn) {
            return;
        }

        // Expand: reveal the content and swap the two buttons.
        showMoreBtn.addEventListener('click', function() {
            content.style.display = 'block';
            content.style.opacity = 1;
            showMoreBtn.style.display = 'none';
            showLessBtn.style.display = 'block';
        });

        // Collapse: hide the content, swap the buttons back and scroll up by
        // the height that was lost.
        showLessBtn.addEventListener('click', function() {
            const root = document.documentElement;
            const holder = content.parentElement.parentElement;

            root.style.scrollBehavior = "auto"; // make the correction below an instant jump
            const max_h = holder.clientHeight;
            content.style.display = 'none';
            showMoreBtn.style.display = 'block';
            showLessBtn.style.display = 'none';
            const min_h = holder.clientHeight;
            // `|| 0` falls back to the top of the page if the difference is
            // NaN (or evaluates to 0).
            $(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
            root.style.scrollBehavior = "smooth";
        });
    });
});
162
+ </script>
163
+
164
+ <script>
165
/**
 * Start a fresh search for `keyword`: reset the current filter, put the
 * keyword into #keywordInput and run filterContent(false).
 */
function wc_search(keyword) {
    clearFilter(); // also empties the input, so it must run before the value is set
    var input = document.getElementById("keywordInput");
    input.value = keyword;
    filterContent(false);
}
171
+
172
/** Toggle the #word-cloud-section-id element between shown and hidden. */
function word_cloud_display() {
    var section = document.getElementById("word-cloud-section-id");
    // Only an inline display of 'block' counts as shown; any other value,
    // including the empty initial inline value, shows the section.
    var isShown = (section.style.display == 'block');
    section.style.display = isShown ? 'none' : 'block';
}
184
+ </script>
185
+
186
+ <script>
187
/** Show the search help text in an alert box. */
function showSearchInfo() {
    var helpText = "- Search is case-sensitive.\n- Search for news category (NATION, WORLD, SPORTS, ENTERTAINMENT, BUSINESS, TECHNOLOGY, HEALTH and SCIENCE) to filter news by category.\n- Search for news source (like zeebiz.com, ndtv.com, etc.) to filter news by source.";
    alert(helpText);
}
190
+ </script>
191
+
192
+
193
+ </head>
194
+ <body>
195
+ <!--<div class="loader"></div>-->
196
+ {{body | safe}}
197
+
198
+ <a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
199
+ <img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
200
+ </a>
201
+ <a href="javascript:window.location.reload(true)" id="theme" class="float">
202
+ <img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
203
+ </a>
204
+ </body>
205
+ </html>
word_cloud.py CHANGED
@@ -1,653 +1,653 @@
1
- import numpy as np
2
- import pandas as pd
3
- import string
4
- from unidecode import unidecode
5
- from collections import Counter
6
-
7
-
8
- class TextPreprocessor:
9
def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
             remove_stop_words: bool = True,
             remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
             bottom_p: float = None):
    """
    Stores the cleaning options and builds the stop-word list and the
    contraction table.

    Every argument is stored unchanged on the instance under the same name.
    NOTE(review): the code that reads these options is outside this view;
    presumably each flag switches the matching private cleaning step.

    :param remove_punct: bool, stored as self.remove_punct
    :param remove_digits: bool, stored as self.remove_digits
    :param remove_stop_words: bool, stored as self.remove_stop_words
    :param remove_short_words: bool, stored as self.remove_short_words
    :param minlen: int, stored as self.minlen
    :param maxlen: int, stored as self.maxlen
    :param top_p: float or None, stored as self.top_p
    :param bottom_p: float or None, stored as self.bottom_p
    """
    self.remove_punct = remove_punct
    self.remove_digits = remove_digits
    self.remove_stop_words = remove_stop_words
    self.remove_short_words = remove_short_words
    self.minlen = minlen
    self.maxlen = maxlen
    self.top_p = top_p
    self.bottom_p = bottom_p
    self.words_to_remove = []
    # Lower-case stop words. A few entries occur twice ("d", "say", ...);
    # that is harmless for membership tests and the list is kept as it was.
    self.stop_words = [
        "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
        "s", "t", "u", "v", "w", "x", "y", "z",
        "about", "above", "across", "after", "afterwards", "again", "against", "ain", "all", "almost",
        "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amount",
        "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
        "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming",
        "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between",
        "beyond", "both", "bottom", "but", "by", "ca", "call", "can", "cannot", "could", "couldn",
        "couldn't", "d", "did", "do", "does", "doing", "done", "down", "due", "during", "each",
        "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "even", "ever", "every",
        "everyone", "everything", "everywhere", "except", "few", "fifteen", "fifty", "first", "five",
        "for", "former", "formerly", "forty", "four", "from", "front", "full", "further", "get",
        "give", "go", "had", "has", "have", "having", "he", "hence", "her", "here", "hereafter",
        "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however",
        "hundred", "i", "if", "in", "indeed", "into", "is", "it", "it's", "its", "itself", "just",
        "keep", "last", "latter", "latterly", "least", "less", "ll", "m", "ma", "made", "make",
        "many", "say", "said", "says", "told", "tell", "may", "me", "meanwhile", "might", "mine",
        "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
        "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none",
        "noone", "nor", "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on", "once",
        "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves",
        "out", "over", "own", "part", "per", "perhaps", "please", "put", "quite", "rather", "re",
        "rs", "really", "regarding", "s", "same", "say", "see", "seem", "seemed", "seeming", "seems",
        "serious", "several", "shan", "shan't", "she", "she's", "should", "should've", "shouldn",
        "shouldn't", "show", "side", "since", "six", "sixty", "so", "some", "somehow", "someone",
        "something", "sometime", "sometimes", "somewhere", "still", "such", "t", "take", "ten",
        "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "thence",
        "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they",
        "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to",
        "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "under", "unless",
        "until", "up", "upon", "us", "used", "using", "various", "ve", "very", "via", "was", "wasn",
        "wasn't", "we", "well", "were", "weren", "weren't", "what", "whatever", "when", "whence",
        "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
        "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why",
        "will", "with", "within", "without", "won", "won't", "would", "wouldn", "wouldn't", "y",
        "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
        "yourselves", "‘d", "‘ll", "‘m", "‘re", "‘s", "‘ve", "’d", "’ll", "’m", "’re", "new", "old",
        "’s", "’ve",
    ]

    # Lower-case contraction -> expansion.
    self.contraction_to_expansion = {
        "ain't": "am not", "aren't": "are not",
        "can't": "cannot", "can't've": "cannot have",
        "'cause": "because", "could've": "could have",
        "couldn't": "could not", "couldn't've": "could not have",
        "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not",
        "hadn't've": "had not have", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would",
        "he'd've": "he would have", "he'll": "he will",
        "he'll've": "he will have", "he's": "he is",
        "how'd": "how did", "how'd'y": "how do you",
        "how'll": "how will", "how's": "how is",
        "i'd": "i would", "i'd've": "i would have",
        "i'll": "i will", "i'll've": "i will have",
        "i'm": "i am", "i've": "i have",
        "isn't": "is not", "it'd": "it had",
        "it'd've": "it would have", "it'll": "it will",
        "it'll've": "it will have", "it's": "it is",
        "let's": "let us", "ma'am": "madam",
        "mayn't": "may not", "might've": "might have",
        "mightn't": "might not", "mightn't've": "might not have",
        "must've": "must have", "mustn't": "must not",
        "mustn't've": "must not have", "needn't": "need not",
        "needn't've": "need not have", "o'clock": "of the clock",
        "oughtn't": "ought not", "oughtn't've": "ought not have",
        "shan't": "shall not", "sha'n't": "shall not",
        "shan't've": "shall not have", "she'd": "she would",
        "she'd've": "she would have", "she'll": "she will",
        "she'll've": "she will have", "she's": "she is",
        "should've": "should have", "shouldn't": "should not",
        "shouldn't've": "should not have", "so've": "so have",
        "so's": "so is", "that'd": "that would",
        "that'd've": "that would have", "that's": "that is",
        "there'd": "there had", "there'd've": "there would have",
        "there's": "there is", "they'd": "they would",
        "they'd've": "they would have", "they'll": "they will",
        "they'll've": "they will have", "they're": "they are",
        "they've": "they have", "to've": "to have",
        "wasn't": "was not", "we'd": "we had",
        "we'd've": "we would have", "we'll": "we will",
        "we'll've": "we will have", "we're": "we are",
        "we've": "we have", "weren't": "were not",
        "what'll": "what will", "what'll've": "what will have",
        "what're": "what are", "what's": "what is",
        "what've": "what have", "when's": "when is",
        "when've": "when have", "where'd": "where did",
        "where's": "where is", "where've": "where have",
        "who'll": "who will", "who'll've": "who will have",
        "who's": "who is", "who've": "who have",
        "why's": "why is", "why've": "why have",
        "will've": "will have", "won't": "will not",
        "won't've": "will not have", "would've": "would have",
        "wouldn't": "would not", "wouldn't've": "would not have",
        "y'all": "you all", "y'alls": "you alls",
        "y'all'd": "you all would", "y'all'd've": "you all would have",
        "y'all're": "you all are", "y'all've": "you all have",
        "you'd": "you had", "you'd've": "you would have",
        # These two used to expand to "you you will ..." (doubled word).
        "you'll": "you will", "you'll've": "you will have",
        "you're": "you are", "you've": "you have",
    }
501
-
502
@staticmethod
def __remove_double_whitespaces(string: str):
    """
    Collapses every run of whitespace to a single space and strips both ends.
    :param string: str, input string
    :return: str, whitespace-normalised string
    """
    tokens = string.split()
    return " ".join(tokens)
505
-
506
async def __remove_url(self, string_series: pd.Series):
    """
    Removes URLs from text.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    # Matches http(s):// and bare www. addresses up to the next whitespace.
    url_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    without_urls = string_series.str.replace(pat=url_pattern, repl=" ", regex=True)
    # Each URL leaves a space behind; squeeze the resulting gaps.
    return without_urls.map(self.__remove_double_whitespaces)
516
-
517
async def __expand(self, string_series: pd.Series):
    """
    Replaces contractions with expansions, e.g. "don't" with "do not".
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    clean_string_series = string_series.copy()
    # Longest contractions first: replacements are plain substring matches,
    # so "can't" must not fire before "can't've" (which would leave
    # "cannot've" behind). The sort is stable, so equally long keys keep
    # their table order.
    longest_first = sorted(self.contraction_to_expansion.items(),
                           key=lambda pair: len(pair[0]), reverse=True)
    for contraction, expansion in longest_first:
        clean_string_series = clean_string_series.str.replace(pat=contraction, repl=expansion, regex=False)
    return clean_string_series.map(self.__remove_double_whitespaces)
527
-
528
async def __remove_punct(self, string_series: pd.Series):
    """
    Removes punctuations from the input string.
    :param string_series: pd.Series, input string series
    :return: pd.Series, cleaned string series
    """
    # One translation pass maps every punctuation character, and the newline,
    # carriage-return and tab control characters, to a space. The earlier
    # per-character loop searched for the two-character texts "\n", "\r" and
    # "\t" (backslash + letter) and therefore deleted the letter that
    # followed a backslash.
    to_space = str.maketrans({char: " " for char in string.punctuation + "\n\r\t"})
    clean_string_series = string_series.str.translate(to_space)
    return clean_string_series.map(self.__remove_double_whitespaces)
540
-
541
- async def __remove_digits(self, string_series: pd.Series):
542
- """
543
- Removes digits from the input string.
544
- :param string_series: pd.Series, input string series
545
- :return: pd.Series, cleaned string series
546
- """
547
- clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True).copy()
548
- return clean_string_series.map(self.__remove_double_whitespaces)
549
-
550
- @staticmethod
551
- async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
552
- """
553
- Reomves words/tokens where minlen <= len <= maxlen.
554
- :param string_series: pd.Series, input string series
555
- :param minlen: int, minimum length of token to be removed.
556
- :param maxlen: int, maximum length of token to be removed.
557
- :return: pd.Series, cleaned string series
558
- """
559
- clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split() if
560
- (len(word) > maxlen) or (len(word) < minlen)]))
561
- return clean_string_series
562
-
563
- async def __remove_stop_words(self, string_series: pd.Series):
564
- """
565
- Removes stop words from the input string.
566
- :param string_series: pd.Series, input string series
567
- :return: pd.Series, cleaned string series
568
- """
569
- def str_remove_stop_words(string: str):
570
- stops = self.stop_words
571
- return " ".join([token for token in string.split() if token not in stops])
572
-
573
- return string_series.map(str_remove_stop_words)
574
-
575
- async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
576
- bottom_p: int = None, dataset: str = 'train'):
577
- """
578
- Reomoves top_p percent (frequent) words and bottom_p percent (rare) words.
579
- :param string_series: pd.Series, input string series
580
- :param top_p: float, percent of frequent words to remove.
581
- :param bottom_p: float, percent of rare words to remove.
582
- :param dataset: str, "train" for training set, "tesrt" for val/dev/test set.
583
- :return: pd.Series, cleaned string series
584
- """
585
- if dataset == 'train':
586
- if top_p is None:
587
- top_p = 0
588
- if bottom_p is None:
589
- bottom_p = 0
590
-
591
- if top_p > 0 or bottom_p > 0:
592
- word_freq = pd.Series(" ".join(string_series).split()).value_counts()
593
- n_words = len(word_freq)
594
-
595
- if top_p > 0:
596
- self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])
597
-
598
- if bottom_p > 0:
599
- self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])
600
-
601
- if len(self.words_to_remove) == 0:
602
- return string_series
603
- else:
604
- clean_string_series = string_series.map(lambda string: " ".join([word for word in string.split()
605
- if word not in self.words_to_remove]))
606
- return clean_string_series
607
-
608
async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
    """
    Entry point: runs the full cleaning pipeline over a series of strings.

    Lower-casing, unidecode transliteration, URL removal and contraction
    expansion always run; the punctuation, digit, stop-word and short-word
    steps run only when the matching instance flag is set.
    :param string_series: pd.Series, input string series
    :param dataset: str, "train" for training set, anything else for val/dev/test set.
    :return: pd.Series, cleaned string series
    """
    cleaned = string_series.str.lower()
    cleaned = cleaned.map(unidecode)
    cleaned = await self.__remove_url(string_series=cleaned)
    cleaned = await self.__expand(string_series=cleaned)

    if self.remove_punct:
        cleaned = await self.__remove_punct(string_series=cleaned)
    if self.remove_digits:
        cleaned = await self.__remove_digits(string_series=cleaned)
    if self.remove_stop_words:
        cleaned = await self.__remove_stop_words(string_series=cleaned)
    if self.remove_short_words:
        cleaned = await self.__remove_short_words(string_series=cleaned,
                                                  minlen=self.minlen,
                                                  maxlen=self.maxlen)
    cleaned = await self.__remove_top_bottom_words(string_series=cleaned,
                                                   top_p=self.top_p,
                                                   bottom_p=self.bottom_p, dataset=dataset)

    cleaned = cleaned.str.strip()
    # Rows that end up empty get a fixed placeholder sentence.
    return cleaned.replace(to_replace="", value="this is an empty message")
638
-
639
-
640
async def get_frequent_words_html(df):
    """
    Builds the HTML word cloud of the 25 most frequent tokens in df.

    The 'title' and 'description' columns are joined, cleaned with a default
    TextPreprocessor and counted. Each token becomes a clickable anchor
    followed by a random run of 3 to 6 "&nbsp;" entities. After the fifth
    token a nested "word-cloud-section" div is opened for the remaining ones.
    :param df: frame-like object with 'title' and 'description' string columns
    :return: str, HTML fragment with balanced div tags
    """
    text_preprocess = TextPreprocessor()
    preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
    counter = Counter(' '.join(preprocessed_txt).split())

    parts = ['<div class="word-cloud-container">']
    section_opened = False
    for rank, (token, _count) in enumerate(counter.most_common(25), start=1):
        gap = "&nbsp;" * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{gap}')
        if rank == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_opened = True
    # Close the nested section only if it was opened; with fewer than five
    # tokens an unconditional '</div></div>' would leave the markup unbalanced.
    parts.append('</div></div>' if section_opened else '</div>')
    return ''.join(parts)
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import string
4
+ from unidecode import unidecode
5
+ from collections import Counter
6
+
7
+
8
class TextPreprocessor:
    """
    Cleans a pandas Series of raw strings.

    The pipeline (see preprocess) lower-cases, transliterates with unidecode,
    strips URLs and expands contractions, then optionally removes punctuation,
    digits, stop words, short words and the most/least frequent words.
    """

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        :param remove_punct: bool, replace punctuation with spaces.
        :param remove_digits: bool, replace digits with spaces.
        :param remove_stop_words: bool, drop tokens listed in self.stop_words.
        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :param top_p: float, fraction of the distinct words (most frequent) to remove.
        :param bottom_p: float, fraction of the distinct words (rarest) to remove.
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Filled by __remove_top_bottom_words on the training set and reused afterwards.
        self.words_to_remove = []
        self.stop_words = [
            "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
            'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain',
            'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
            'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any',
            'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as',
            'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming',
            'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
            'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call',
            'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'did', 'do',
            'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight',
            'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever',
            'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
            'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from',
            'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has',
            'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby',
            'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how',
            'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is',
            'it', "it's", 'its', 'itself', 'just', 'keep', 'last', 'latter',
            'latterly', 'least', 'less', 'll', 'm', 'ma', 'made', 'make',
            'many', 'say', 'said', 'says', 'told', 'tell', 'may', 'me',
            'meanwhile', 'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move',
            'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
            'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor',
            'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often',
            'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others',
            'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part',
            'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'rs',
            'really', 'regarding', 's', 'same', 'say', 'see', 'seem', 'seemed',
            'seeming', 'seems', 'serious', 'several', 'shan', "shan't", 'she', "she's",
            'should', "should've", 'shouldn', "shouldn't", 'show', 'side', 'since', 'six',
            'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
            'somewhere', 'still', 'such', 't', 'take', 'ten', 'than', 'that',
            "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence',
            'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they',
            'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru',
            'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve',
            'twenty', 'two', 'under', 'unless', 'until', 'up', 'upon', 'us',
            'used', 'using', 'various', 've', 'very', 'via', 'was', 'wasn',
            "wasn't", 'we', 'well', 'were', 'weren', "weren't", 'what', 'whatever',
            'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein',
            'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
            'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
            'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you',
            "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
            '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve',
            '’d', '’ll', '’m', '’re', 'new', 'old', '’s', '’ve',
        ]

        self.contraction_to_expansion = {
            "ain't": "am not",
            "aren't": "are not",
            "can't": "cannot",
            "can't've": "cannot have",
            "'cause": "because",
            "could've": "could have",
            "couldn't": "could not",
            "couldn't've": "could not have",
            "didn't": "did not",
            "doesn't": "does not",
            "don't": "do not",
            "hadn't": "had not",
            "hadn't've": "had not have",
            "hasn't": "has not",
            "haven't": "have not",
            "he'd": "he would",
            "he'd've": "he would have",
            "he'll": "he will",
            "he'll've": "he will have",
            "he's": "he is",
            "how'd": "how did",
            "how'd'y": "how do you",
            "how'll": "how will",
            "how's": "how is",
            "i'd": "i would",
            "i'd've": "i would have",
            "i'll": "i will",
            "i'll've": "i will have",
            "i'm": "i am",
            "i've": "i have",
            "isn't": "is not",
            "it'd": "it had",
            "it'd've": "it would have",
            "it'll": "it will",
            "it'll've": "it will have",
            "it's": "it is",
            "let's": "let us",
            "ma'am": "madam",
            "mayn't": "may not",
            "might've": "might have",
            "mightn't": "might not",
            "mightn't've": "might not have",
            "must've": "must have",
            "mustn't": "must not",
            "mustn't've": "must not have",
            "needn't": "need not",
            "needn't've": "need not have",
            "o'clock": "of the clock",
            "oughtn't": "ought not",
            "oughtn't've": "ought not have",
            "shan't": "shall not",
            "sha'n't": "shall not",
            "shan't've": "shall not have",
            "she'd": "she would",
            "she'd've": "she would have",
            "she'll": "she will",
            "she'll've": "she will have",
            "she's": "she is",
            "should've": "should have",
            "shouldn't": "should not",
            "shouldn't've": "should not have",
            "so've": "so have",
            "so's": "so is",
            "that'd": "that would",
            "that'd've": "that would have",
            "that's": "that is",
            "there'd": "there had",
            "there'd've": "there would have",
            "there's": "there is",
            "they'd": "they would",
            "they'd've": "they would have",
            "they'll": "they will",
            "they'll've": "they will have",
            "they're": "they are",
            "they've": "they have",
            "to've": "to have",
            "wasn't": "was not",
            "we'd": "we had",
            "we'd've": "we would have",
            "we'll": "we will",
            "we'll've": "we will have",
            "we're": "we are",
            "we've": "we have",
            "weren't": "were not",
            "what'll": "what will",
            "what'll've": "what will have",
            "what're": "what are",
            "what's": "what is",
            "what've": "what have",
            "when's": "when is",
            "when've": "when have",
            "where'd": "where did",
            "where's": "where is",
            "where've": "where have",
            "who'll": "who will",
            "who'll've": "who will have",
            "who's": "who is",
            "who've": "who have",
            "why's": "why is",
            "why've": "why have",
            "will've": "will have",
            "won't": "will not",
            "won't've": "will not have",
            "would've": "would have",
            "wouldn't": "would not",
            "wouldn't've": "would not have",
            "y'all": "you all",
            "y'alls": "you alls",
            "y'all'd": "you all would",
            "y'all'd've": "you all would have",
            "y'all're": "you all are",
            "y'all've": "you all have",
            "you'd": "you had",
            "you'd've": "you would have",
            # Previously "you you will" / "you you will have" (duplicated word).
            "you'll": "you will",
            "you'll've": "you will have",
            "you're": "you are",
            "you've": "you have",
        }

    @staticmethod
    def __remove_double_whitespaces(string: str):
        """
        Collapses every run of whitespace into a single space and trims both ends.
        :param string: str, input text
        :return: str, normalised text
        """
        return " ".join(string.split())

    async def __remove_url(self, string_series: pd.Series):
        """
        Replaces URLs (http://, https:// or bare www. forms) with a space.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(
            pat=r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})",
            repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __expand(self, string_series: pd.Series):
        """
        Replaces contractions with their expansions, e.g. "don't" with "do not".

        Contractions are applied longest-first. Several keys are prefixes of
        longer ones ("can't" / "can't've", "y'all" / "y'all'd"); replacing in
        plain dict order would rewrite the short form first and leave the
        longer contraction half-expanded (e.g. "cannot've").
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        mapping = self.contraction_to_expansion
        clean_string_series = string_series.copy()
        for contraction in sorted(mapping, key=len, reverse=True):
            clean_string_series = clean_string_series.str.replace(
                pat=contraction, repl=mapping[contraction], regex=False)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __remove_punct(self, string_series: pd.Series):
        """
        Replaces punctuation characters with spaces.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.copy()
        # These are the literal two-character sequences backslash+n / +r / +t
        # (non-regex match), handled before the lone backslash is stripped below.
        for escaped in (r'\n', r'\r', r'\t'):
            clean_string_series = clean_string_series.str.replace(pat=escaped, repl=" ", regex=False)
        # One translate pass instead of one replace call per punctuation character.
        to_space = str.maketrans({mark: " " for mark in string.punctuation})
        clean_string_series = clean_string_series.str.translate(to_space)
        return clean_string_series.map(self.__remove_double_whitespaces)

    async def __remove_digits(self, string_series: pd.Series):
        """
        Replaces every digit with a space.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    @staticmethod
    async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """
        Removes words/tokens whose length satisfies minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        return string_series.map(
            lambda text: " ".join(word for word in text.split()
                                  if (len(word) > maxlen) or (len(word) < minlen)))

    async def __remove_stop_words(self, string_series: pd.Series):
        """
        Removes stop words (tokens listed in self.stop_words) from the input strings.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Build the lookup once per call: a set gives O(1) membership tests,
        # whereas scanning the stop-word list costs O(len(list)) per token.
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                        bottom_p: float = None, dataset: str = 'train'):
        """
        Removes the most frequent and the rarest words.

        On the training set the words are selected from the word frequencies
        of string_series and remembered in self.words_to_remove; on any other
        dataset the previously remembered words are removed unchanged.
        :param string_series: pd.Series, input string series
        :param top_p: float, fraction of the distinct words (most frequent first) to remove.
        :param bottom_p: float, fraction of the distinct words (rarest) to remove.
        :param dataset: str, "train" for training set, anything else for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            if top_p is None:
                top_p = 0
            if bottom_p is None:
                bottom_p = 0

            if top_p > 0 or bottom_p > 0:
                # value_counts() orders words from most to least frequent.
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

                selected = []
                if top_p > 0:
                    selected.extend(word_freq.index[: int(np.ceil(top_p * n_words))])
                if bottom_p > 0:
                    selected.extend(word_freq.index[-int(np.ceil(bottom_p * n_words)):])

                # Append only unseen words so repeated calls (or overlapping
                # top/bottom slices) do not pile duplicates into the list.
                known = set(self.words_to_remove)
                for word in selected:
                    if word not in known:
                        known.add(word)
                        self.words_to_remove.append(word)

        if not self.words_to_remove:
            return string_series

        blocked = set(self.words_to_remove)
        return string_series.map(
            lambda text: " ".join(word for word in text.split() if word not in blocked))

    async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point: runs the full cleaning pipeline over a series of strings.

        Lower-casing, unidecode transliteration, URL removal and contraction
        expansion always run; the punctuation, digit, stop-word and short-word
        steps run only when the matching instance flag is set.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, anything else for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        string_series = string_series.str.lower()
        string_series = string_series.map(unidecode)
        string_series = await self.__remove_url(string_series=string_series)
        string_series = await self.__expand(string_series=string_series)

        if self.remove_punct:
            string_series = await self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = await self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = await self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = await self.__remove_short_words(string_series=string_series,
                                                            minlen=self.minlen,
                                                            maxlen=self.maxlen)
        string_series = await self.__remove_top_bottom_words(string_series=string_series,
                                                             top_p=self.top_p,
                                                             bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        # Rows that end up empty get a fixed placeholder sentence.
        return string_series.replace(to_replace="", value="this is an empty message")
638
+
639
+
640
async def get_frequent_words_html(df):
    """
    Builds the HTML word cloud of the 25 most frequent tokens in df.

    The 'title' and 'description' columns are joined, cleaned with a default
    TextPreprocessor and counted. Each token becomes a clickable anchor
    followed by a random run of 3 to 6 "&nbsp;" entities. After the fifth
    token a nested "word-cloud-section" div is opened for the remaining ones.
    :param df: frame-like object with 'title' and 'description' string columns
    :return: str, HTML fragment with balanced div tags
    """
    text_preprocess = TextPreprocessor()
    preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
    counter = Counter(' '.join(preprocessed_txt).split())

    parts = ['<div class="word-cloud-container">']
    section_opened = False
    for rank, (token, _count) in enumerate(counter.most_common(25), start=1):
        gap = "&nbsp;" * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{gap}')
        if rank == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_opened = True
    # Close the nested section only if it was opened; with fewer than five
    # tokens an unconditional '</div></div>' would leave the markup unbalanced.
    parts.append('</div></div>' if section_opened else '</div>')
    return ''.join(parts)