Spaces:
Running
Running
Upload files
Browse files- Dockerfile +9 -0
- README.md +3 -3
- app.py +242 -0
- config.py +1 -0
- db_operations/__init__.py +1 -0
- db_operations/db_operations.py +59 -0
- requirements.txt +12 -0
- start.sh +3 -0
- static/favicon_new.png +0 -0
- static/loader.gif +0 -0
- static/refresh_reload_icon.png +0 -0
- static/styles.css +510 -0
- static/top-icon.png +0 -0
- templates/index.html +200 -0
- word_cloud.py +653 -0
Dockerfile
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NOTE(review): Python 3.7 is end-of-life; moving to a newer base image
# requires re-validating the version pins in requirements.txt.
FROM python:3.7-slim
WORKDIR /webapp

# System packages first: this layer only changes when the package list does.
# apt-get (not apt) is the stable CLI for scripts; the package lists are
# removed afterwards to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends redis-server \
    && rm -rf /var/lib/apt/lists/*

# Copy only the requirements before installing so that source-code changes
# do not invalidate the (slow) dependency layer.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Application sources.
COPY . .
RUN chmod +x /webapp/start.sh

# 7860: web app (gunicorn), 6379: Redis.
EXPOSE 7860 6379
CMD ["/webapp/start.sh"]
|
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
title: News Aggregator
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
|
|
1 |
---
|
2 |
title: News Aggregator
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: blue
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
app.py
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
from dateutil import parser
|
4 |
+
from flask import Flask, render_template
|
5 |
+
from flask_cors import cross_origin, CORS
|
6 |
+
from db_operations.db_operations import DBOperations
|
7 |
+
import logging
|
8 |
+
import traceback
|
9 |
+
import redis
|
10 |
+
from datetime import datetime
|
11 |
+
from functools import lru_cache
|
12 |
+
from word_cloud import get_frequent_words_html
|
13 |
+
from config import NEWS_RETENTION_SECONDS
|
14 |
+
|
15 |
+
|
16 |
+
# Flask application object; CORS is enabled for the whole app.
app = Flask(__name__)
CORS(app)
# Redis on localhost:6379 holds the cached news, the word cloud and the last
# fetch time. decode_responses=True is relied on below: the values read back
# are parsed as text (strptime / read_json).
redis_client = redis.Redis(host='localhost', port=6379, decode_responses=True)
# Reader for the news stored in MongoDB.
db = DBOperations()


# Minimum age of the cached news before the database is queried again.
REFRESH_FREQ = 300 # 300 secs = 5 mins
|
23 |
+
|
24 |
+
def is_db_fetch_reqd():
    """
    Decide whether the news has to be re-read from the database.

    The time of the last database fetch is stored in Redis under the key
    ``NEWSFETCHTIME``. A fetch is required when the key is missing or
    unparseable, when more than ``REFRESH_FREQ`` seconds have elapsed since
    the stored time, or when the check itself fails (e.g. Redis is down).
    Whenever a fetch is required the stored time is reset to "now".

    :return: int, 1 if the database must be queried, 0 if the cache is fresh.
    """
    time_format = '%Y-%m-%d %H:%M:%S.%f'
    try:
        env_news_time = redis_client.get('NEWSFETCHTIME')
        logging.warning(f'fetch_time_env_var: {env_news_time}')

        if env_news_time is not None:
            try:
                last_fetch = datetime.strptime(env_news_time, time_format)
            except ValueError:
                # Unparseable timestamp: treat the cache as stale and let the
                # value be overwritten below instead of failing forever.
                last_fetch = None
            # total_seconds() and not .seconds: .seconds drops whole days, so
            # a timestamp that is e.g. 1 day + 10 s old would look fresh.
            if (last_fetch is not None and
                    (datetime.now() - last_fetch).total_seconds() <= REFRESH_FREQ):
                return 0

        # strftime() always writes the microseconds; str(datetime) omits them
        # when they are 0, which would not match time_format on the next read.
        redis_client.set("NEWSFETCHTIME", datetime.now().strftime(time_format))
        return 1
    except Exception as e:
        # Best effort: if the freshness check fails, fall back to the database.
        logging.warning(f'is_db_fetch_reqd() error: {e}')
        return 1
|
44 |
+
|
45 |
+
|
46 |
+
def correct_date(x):
    """
    Return a date string that can be handed to the date parser.

    A value is accepted as-is when it is a string containing a ":" (i.e. it
    carries a time component). Anything else is logged and replaced by a fixed
    fallback date in the past.

    :param x: value read from the ``parsed_date`` column.
    :return: str, ``x`` itself or the fallback date string.
    """
    if isinstance(x, str) and ":" in x:
        return x
    logging.warning(f'correct_date() error: {x} is not the right date format')
    return "2020-11-07 00:36:44+05:30"
|
51 |
+
|
52 |
+
def date_time_parser(dt):
    """
    Computes the minutes elapsed since published time.

    "Now" is taken in the same timezone as ``dt`` (or as a naive local time
    when ``dt`` is naive) so that the subtraction is always valid.

    :param dt: date, a ``datetime.datetime`` or pandas ``Timestamp``.
    :return: int, minutes elapsed; 100000 when ``dt`` cannot be processed.
    """
    try:
        # .tzinfo exists on both datetime.datetime and pandas Timestamp,
        # whereas .tz is pandas-only and made plain datetimes always fail.
        now = dt.now(dt.tzinfo)
        return int(np.round((now - dt).total_seconds() / 60, 0))
    except Exception:
        logging.warning(f'date_time_parser() error: {dt} is not the right date format')
        return 100000
|
63 |
+
|
64 |
+
|
65 |
+
def elapsed_time_str(mins):
    """
    Return the time elapsed string from minutes passed as an argument.

    Examples: 0 -> 'Just in', 1 -> 'a minute ago', 45 -> '45 minutes ago',
    60 -> 'an hour ago', 135 -> '2 hours and 15 mins ago',
    1440 -> 'a day ago', 2160 -> '1.5 days ago'.

    :param mins: int, minutes elapsed.
    :return: str, time elapsed string; "-" when ``mins`` is not a number.
    """
    try:
        hours = int(mins / 60)
        days = np.round(mins / (60 * 24), 1)  # rounded to one decimal place
        remaining_mins = int(mins - (hours * 60))

        # One day or more (after rounding).
        if days >= 1:
            if days == 1:
                return 'a day ago'
            return f'{str(days)} days ago'

        # Between one hour and one day.
        if mins >= 60:
            hour_part = 'an hour' if hours == 1 else f'{str(hours)} hours'
            if remaining_mins == 0:
                return f'{hour_part} ago'
            if remaining_mins == 1:
                return f'{hour_part} and a min ago'
            return f'{hour_part} and {str(remaining_mins)} mins ago'

        # Less than an hour.
        if mins == 0:
            return 'Just in'
        if mins == 1:
            return 'a minute ago'
        return f'{str(mins)} minutes ago'
    except Exception:
        # Non-numeric input (e.g. None) cannot be formatted.
        return "-"
|
101 |
+
|
102 |
+
|
103 |
+
|
104 |
+
def fetch_from_db(fetch_flag):
    """
    Return the news and the word-cloud HTML, from MongoDB or from the cache.

    With ``fetch_flag == 1`` the news is read from the database, the word
    cloud is rebuilt and both are stored in Redis (keys ``NEWSDF`` and
    ``NEWSWORDCLOUD``). Otherwise both are read back from Redis; if either
    cached value is missing the database is used instead.

    :param fetch_flag: int, 1 to query the database, anything else to use the cache.
    :return: tuple, (news DataFrame, word-cloud HTML string).
    :raises Exception: any error from the database, Redis or the word-cloud
        builder is logged and re-raised.
    """
    try:
        logging.warning(f'fetch_flag: {fetch_flag}')

        if fetch_flag != 1:
            cached_news = redis_client.get("NEWSDF")
            cached_tokens = redis_client.get("NEWSWORDCLOUD")
            if cached_news is not None and cached_tokens is not None:
                final_df = pd.read_json(cached_news)
                logging.warning('Fetched From Cache\n\n')
                return final_df, cached_tokens
            # Redis returns None for a missing key (e.g. after a Redis
            # restart); fall through to the database instead of failing.
            logging.warning('Cache is empty, falling back to DB')

        final_df = db.read_news_from_db()
        freq_tokens = get_frequent_words_html(final_df)
        logging.warning('Fetched From DB\n\n')

        # The MongoDB ids are not JSON serialisable; store them as strings.
        final_df['_id'] = final_df['_id'].astype('str')

        redis_client.set("NEWSDF", final_df.to_json())
        redis_client.set("NEWSWORDCLOUD", freq_tokens)
        return final_df, freq_tokens
    except Exception as e:
        logging.warning(f'fetch_from_db() error: {e}')
        raise
|
127 |
+
|
128 |
+
|
129 |
+
@app.route("/")
@cross_origin()
def index():
    """
    Entry point: build the news page and render it through ``index.html``.

    The news is fetched (database or cache), filtered by age, sorted from the
    most recent and turned into an HTML fragment that is passed to the
    template as ``body``. If fetching or processing fails, or fewer than two
    news rows are left, a "temporarily unavailable" message is shown instead
    of the source list.

    :return: the rendered ``index.html`` page.
    """
    placeholder = {'title': '', 'url': '', 'description': '', 'src_time': ''}

    # Initialised before the try block so that the page can still be built
    # when fetching or processing the news fails part-way through.
    src_str = ''
    freq_tokens = ''
    news_available = False  # True only when final_df holds real news rows
    try:
        final_df, freq_tokens = fetch_from_db(is_db_fetch_reqd())
        if len(final_df) > 1:
            final_df["parsed_date"] = [parser.parse(correct_date(date_)) for date_ in final_df['parsed_date']]
            final_df["elapsed_time"] = [date_time_parser(date_) for date_ in final_df['parsed_date']]
            # NOTE(review): elapsed_time is in minutes (see date_time_parser)
            # but is compared with a constant named *_SECONDS - confirm units.
            final_df = final_df.loc[final_df["elapsed_time"] <= NEWS_RETENTION_SECONDS, :].copy()
            final_df["elapsed_time_str"] = final_df["elapsed_time"].apply(elapsed_time_str)
            final_df.sort_values(by="elapsed_time", inplace=True)
            src_str = ", ".join(sorted([*final_df['src'].unique()]))
            final_df['src_time'] = final_df['src'] + (" " * 5) + final_df["elapsed_time_str"]
            final_df.drop(columns=['_id', 'parsed_date', 'src', 'elapsed_time', 'elapsed_time_str'], inplace=True)
            final_df.drop_duplicates(subset='description', inplace=True)
            final_df = final_df.loc[(final_df["title"] != ""), :].copy()
            news_available = True
        else:
            final_df = pd.DataFrame(placeholder, index=[0])
    except Exception:
        final_df = pd.DataFrame(placeholder, index=[0])
        news_available = False
        # format_exc() returns the traceback text; print_exc() returns None.
        logging.warning(traceback.format_exc())

    parts = ['''
    <div class="box" id="main">
    <form>

    <div class="banner">
    <img src="../static/favicon_new.png" class="logo-img" alt="KSV Muralidhar" />
    <h1 style="display:inline-block; vertical-align: middle;">Latest News</h1>
    </div>
    ''']

    if len(final_df) <= 1:
        parts.append('''<div><p class="unavailable">This app is temporarily unavailable</p></div>''')
    else:
        last_update_utc = '-'
        try:
            # The key can be missing or Redis unreachable; that must not
            # turn an otherwise renderable page into a server error.
            fetch_time = redis_client.get('NEWSFETCHTIME')
            last_update_utc = datetime.strptime(
                fetch_time, '%Y-%m-%d %H:%M:%S.%f').strftime("%Y-%m-%d %H:%M:%S")
        except Exception:
            logging.warning(traceback.format_exc())
        parts.append(f'<p class="srctxt">News aggregated from <b>{src_str}</b>.<br><br>Last updated: {last_update_utc} UTC</p>')

    parts.append('''
    <div class="input-container">
    <input type="text" class="keyword-input" id="keywordInput" placeholder="Search" oninput="filterContent(true)">
    <div class="clear-btn" id="clearBtn" onclick="clearFilter()">×</div>
    </div>
    ''')

    parts.append(f"{freq_tokens} ")
    parts.append('<div class="show-more-word-cloud" onclick=word_cloud_display()><p class="three-dots">...</p></div>')

    parts.append('''<div style="padding-bottom: 10px; font-size: 12px; font-family: Arial, Helvetica, sans-serif;">
    News categories and similar news are AI-generated</div>''')

    # The placeholder frame has no 'category' / 'similar_news' columns, so
    # news items are only rendered when real news rows are present.
    if news_available:
        # NOTE(review): the values below come from the database and are
        # inserted into the HTML without escaping - confirm they are trusted
        # or sanitised upstream.
        for _, row in final_df.iterrows():  # iterating through the search results
            href = row["url"]
            category = row["category"]
            description = row["description"]
            url_txt = row["title"]
            src_time = row["src_time"]
            sim_news = row['similar_news']
            parts.append(f'''<div class="news-item"><div style="padding-top: 7px;">
            <a href="{href}" target="_blank" class="article-category">{category}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="headline">{url_txt}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="description">
            {description}
            </a>
            </div>
            <div>
            <a href="{href}" target="_blank" class="time">
            {src_time}
            </a>
            </div>


            <div class="container">
            <div class="content" style="display: none;">
            {sim_news}
            </div>
            <div class="show-similar-button-container">
            <button type="button" class="show-more">Show similar news</button>
            <button type="button" class="show-less">Hide similar news</button>
            </div>
            </div>



            <div>
            <p></p>
            </div></div>
            ''')

    parts.append('</form></div>')
    return render_template("index.html", body="".join(parts))
|
238 |
+
|
239 |
+
|
240 |
+
if __name__ == "__main__":
    # Development server only. Flask forwards unknown keyword arguments to
    # Werkzeug's run_simple(), which has no `workers` / `threads` parameters,
    # so passing them raises TypeError. Worker and thread counts belong to
    # the gunicorn command line in start.sh (--workers 5 --threads 5).
    app.run(host="0.0.0.0", port=7860)
|
242 |
+
|
config.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Maximum age of a news item that is still shown.
# NOTE(review): app.py compares this against an elapsed time computed in
# minutes, although the name says seconds - confirm the intended unit.
NEWS_RETENTION_SECONDS = 300
|
db_operations/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from db_operations.db_operations import *
|
db_operations/db_operations.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pymongo
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
|
6 |
+
class DBOperations:
    """
    Reads news from MongoDB

    The connection string is taken from the ``DB_URL`` environment variable.
    When it is not set, or when reading fails, a one-row placeholder frame
    with empty values is returned instead of the news.
    """
    def __init__(self):
        self.url = os.getenv('DB_URL')
        self.database = "rss_news_db_cat_pred_sim_news"
        self.collection = "rss_news_cat_pred_sim_news"
        self.__client = None
        self.__error = 0  # 1 when the current read attempt has failed

    @staticmethod
    def __placeholder_df():
        """
        Build the one-row frame returned when no news could be read.

        :return: DataFrame with the expected news columns and empty values.
        """
        return pd.DataFrame({'_id': '', 'title': '', 'url': '',
                             'description': '', 'parsed_date': '',
                             'src': ''}, index=[0])

    def __connect(self):
        """
        Open the MongoDB client and verify that the server is reachable.

        :raises Exception: re-raises whatever the client raised; the error
            flag is set and a half-open client is closed first.
        """
        try:
            self.__client = pymongo.MongoClient(self.url)
            # Forces a round trip so that connection problems surface here.
            _ = self.__client.list_database_names()
        except Exception:
            self.__error = 1
            # Do not leak a client whose connectivity check failed.
            self.__close_connection()
            raise

    def __read(self):
        """
        Read every document of the news collection.

        :return: DataFrame of the documents, or the placeholder frame (with
            the error flag set) when reading fails.
        """
        try:
            coll = self.__client[self.database][self.collection]
            rss_df = pd.DataFrame(list(coll.find()))
        except Exception as read_err:
            self.__error = 1
            print(f"Read Failed: {read_err}")
            rss_df = self.__placeholder_df()
        return rss_df

    def __close_connection(self):
        """
        Close the MongoDB client if one is open.
        """
        if self.__client is not None:
            self.__client.close()
            self.__client = None

    def read_news_from_db(self):
        """
        Connect, read all news and close the connection again.

        :return: DataFrame with the news, or the placeholder frame when
            ``DB_URL`` is not set or the read failed.
        :raises Exception: when the connection cannot be established.
        """
        rss_df = self.__placeholder_df()
        if self.url is None:
            return rss_df

        # Start every attempt with a clean flag; otherwise a single failure
        # would disable all later reads for the lifetime of the process.
        self.__error = 0
        try:
            self.__connect()
            rss_df = self.__read()
            if self.__error == 0:
                print("Read Successful")
        finally:
            # Always release the client, also when connecting/reading raised.
            self.__close_connection()
        return rss_df
|
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
regex==2021.8.3
|
2 |
+
lxml==4.6.3
|
3 |
+
numpy==1.21.1
|
4 |
+
python-dateutil==2.8.2
|
5 |
+
pandas==1.3.1
|
6 |
+
requests==2.26.0
|
7 |
+
bs4==0.0.1
|
8 |
+
flask==2.2.2
|
9 |
+
flask_cors==3.0.10
|
10 |
+
gunicorn==20.1.0
|
11 |
+
pymongo==4.3.3
|
12 |
+
redis
|
start.sh
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
# Start the local Redis cache, then serve the Flask app with gunicorn.

# Stop immediately if Redis cannot be started instead of serving without it.
set -e

redis-server --daemonize yes

# exec replaces the shell with gunicorn so that stop signals sent to the
# script's process reach gunicorn directly.
exec gunicorn -b 0.0.0.0:7860 --timeout 120 --workers 5 --threads 5 app:app
|
static/favicon_new.png
ADDED
![]() |
static/loader.gif
ADDED
![]() |
static/refresh_reload_icon.png
ADDED
![]() |
static/styles.css
ADDED
@@ -0,0 +1,510 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
html {
|
2 |
+
scroll-behavior: smooth;
|
3 |
+
}
|
4 |
+
|
5 |
+
/* Headline link of a news item. Shared declarations live in one rule; only
   the font size differs on narrow screens. The invalid declarations
   `width: relative` and `align: justify` (ignored by browsers) are removed. */
@media screen {
  a.headline {
    background-color: #E5E4E2;
    display: block;
    text-decoration: none;
    color: black;
    line-height: 1.2;
    border-left: 5px solid transparent;
    border-top: 5px solid transparent;
    border-bottom: 5px solid transparent;
    border-right: 0px;
    font-weight: bold;
    font-size: 18px;
    padding-right: 5px;
    font-family: Arial, Helvetica, sans-serif;
  }
}

@media screen and (max-width: 800px) {
  a.headline {
    font-size: 16.5px;
  }
}
|
44 |
+
|
45 |
+
/* Description link of a news item. Shared declarations live in one rule;
   only the font size differs on narrow screens. The invalid declarations
   `align: justify` and `width: relative` (ignored by browsers) are removed. */
@media screen {
  a.description {
    background-color: #E5E4E2;
    text-align: justify;
    display: block;
    height: 100%;
    text-decoration: none;
    border-left: 5px solid transparent;
    border-top: 0px;
    border-bottom: 7px solid transparent;
    border-right: 0px;
    font-size: 14px;
    padding-right: 5px;
    font-family: Arial, Helvetica, sans-serif;
    color: dimgrey;
  }
}

@media screen and (max-width: 800px) {
  a.description {
    font-size: 12.5px;
  }
}
|
84 |
+
|
85 |
+
/* Source / elapsed-time link of a news item. Shared declarations live in
   one rule; only the font size differs on narrow screens. The invalid
   declarations `align: justify` and `width: relative` are removed. */
@media screen {
  a.time {
    background-color: #E5E4E2;
    display: block;
    height: 100%;
    text-decoration: none;
    border-left: 5px solid transparent;
    border-top: 0px;
    border-bottom: 1px solid transparent;
    border-right: 0px;
    padding-right: 5px;
    font-size: 11px;
    padding-bottom: 5px;
    font-family: Arial, Helvetica, sans-serif;
    color: green;
  }
}

@media screen and (max-width: 800px) {
  a.time {
    font-size: 10px;
  }
}
|
124 |
+
|
125 |
+
.box {
|
126 |
+
display: flex;
|
127 |
+
justify-content: center;
|
128 |
+
align-items: center;
|
129 |
+
height: inherit;
|
130 |
+
padding: 20px;
|
131 |
+
}
|
132 |
+
@media screen and (min-width: 800px) {
|
133 |
+
form {
|
134 |
+
width: 50%;
|
135 |
+
overflow-x: hidden;
|
136 |
+
padding: 20px;
|
137 |
+
border-radius: 10px;
|
138 |
+
background: #fff;
|
139 |
+
box-shadow: 0 0 20px 0 #095484;
|
140 |
+
}}
|
141 |
+
|
142 |
+
@media screen and (max-width: 800px) {
|
143 |
+
form {
|
144 |
+
width: 100%;
|
145 |
+
overflow-x: hidden;
|
146 |
+
padding: 20px;
|
147 |
+
border-radius: 10px;
|
148 |
+
background: #fff;
|
149 |
+
box-shadow: 0 0 15px 0 #095484;
|
150 |
+
}}
|
151 |
+
.banner {
|
152 |
+
position: relative;
|
153 |
+
height: 30px;
|
154 |
+
/* background-size: cover; */
|
155 |
+
display: flex;
|
156 |
+
/* justify-content: center; */
|
157 |
+
/* align-items: center; */
|
158 |
+
/* text-align: center; */
|
159 |
+
}
|
160 |
+
@media screen and (min-width: 800px) {
|
161 |
+
h1 {
|
162 |
+
position: absolute;
|
163 |
+
margin: 0;
|
164 |
+
padding-left: 50px;
|
165 |
+
font-size: 25px;
|
166 |
+
color: black;
|
167 |
+
z-index: 2;
|
168 |
+
font-family: Arial, Helvetica, sans-serif;
|
169 |
+
}
|
170 |
+
}
|
171 |
+
|
172 |
+
@media screen and (max-width: 800px) {
|
173 |
+
h1 {
|
174 |
+
position: absolute;
|
175 |
+
margin: 0;
|
176 |
+
padding-left: 40px;
|
177 |
+
font-size: 24px;
|
178 |
+
color: black;
|
179 |
+
z-index: 2;
|
180 |
+
font-family: Arial, Helvetica, sans-serif;
|
181 |
+
}
|
182 |
+
}
|
183 |
+
|
184 |
+
p.unavailable {
|
185 |
+
background-color: #E5E4E2;
|
186 |
+
display: block;
|
187 |
+
width: 100%;
|
188 |
+
text-decoration: none;
|
189 |
+
color: black;
|
190 |
+
line-height: 1.2;
|
191 |
+
align: justify;
|
192 |
+
border-left: 5px solid transparent;
|
193 |
+
border-top: 5px solid transparent;
|
194 |
+
border-bottom: 5px solid transparent;
|
195 |
+
border-right: 0px;
|
196 |
+
font-weight: bold;
|
197 |
+
font-size: 18px;
|
198 |
+
padding-right: 5px;
|
199 |
+
font-family: Arial, Helvetica, sans-serif;
|
200 |
+
}
|
201 |
+
div.news-item{
|
202 |
+
background-color: #E5E4E2;
|
203 |
+
/*box-shadow: rgba(0, 0, 0, 0.4) -1px 0px 5px, rgba(0, 0, 0, 0.5) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -3px 0px inset;*/
|
204 |
+
box-shadow: rgba(0, 0, 0, 0.25) 0px 0px 5px 1px, rgba(0, 0, 0, 0.1) 0px 4px 5px -3px, rgba(0, 0, 0, 0.2) 0px -1px 0px inset;
|
205 |
+
|
206 |
+
|
207 |
+
}
|
208 |
+
div.news-item:hover{
|
209 |
+
box-shadow: none;
|
210 |
+
}
|
211 |
+
|
212 |
+
@media screen and (min-width: 800px) {
|
213 |
+
p.srctxt {
|
214 |
+
align:justify;
|
215 |
+
text-align: justify;
|
216 |
+
word-break: break-all;
|
217 |
+
font-size: 11px;
|
218 |
+
font-family: Arial, Helvetica, sans-serif;
|
219 |
+
}
|
220 |
+
.logo-img{
|
221 |
+
margin-right: 10px;
|
222 |
+
vertical-align: center;
|
223 |
+
/* position: relative; */
|
224 |
+
width: 34px;
|
225 |
+
height: 34px;
|
226 |
+
|
227 |
+
}
|
228 |
+
}
|
229 |
+
|
230 |
+
@media screen and (max-width: 800px) {
|
231 |
+
p.srctxt {
|
232 |
+
align:justify;
|
233 |
+
text-align: justify;
|
234 |
+
word-break: break-all;
|
235 |
+
font-size: 9px;
|
236 |
+
font-family: Arial, Helvetica, sans-serif;
|
237 |
+
}
|
238 |
+
.logo-img{
|
239 |
+
margin-right: 10px;
|
240 |
+
vertical-align: top;
|
241 |
+
/* position: absolute; */
|
242 |
+
width: 30px;
|
243 |
+
height: 30px;
|
244 |
+
}
|
245 |
+
}
|
246 |
+
|
247 |
+
/* Round floating buttons fixed to the bottom-right corner. The two buttons
   share every declaration except their vertical position. The invalid
   `vertical-align: center` (ignored by browsers) is removed. */
.float,
.top-float {
  position: fixed;
  width: 25px;
  height: 25px;
  right: 12px;
  background-color: white;
  border-radius: 50%;
  text-align: center;
  z-index: 99999998;
  font-size: 0;
  cursor: pointer;
  animation: beatan 0.8s infinite alternate;
}

.float {
  bottom: 15px;
}

.top-float {
  bottom: 52px;
}
|
279 |
+
.my-float{
|
280 |
+
margin-top:22px;
|
281 |
+
}
|
282 |
+
|
283 |
+
@keyframes beatan{
|
284 |
+
to { transform: scale(1.1); }
|
285 |
+
}
|
286 |
+
|
287 |
+
.loader {
|
288 |
+
position: fixed;
|
289 |
+
left: 0px;
|
290 |
+
top: 0px;
|
291 |
+
width: 100%;
|
292 |
+
height: 100%;
|
293 |
+
z-index: 99999999999;
|
294 |
+
background: url('../static/loader.gif') 50% 50% no-repeat rgb(255,255,255);
|
295 |
+
}
|
296 |
+
|
297 |
+
.highlight {
|
298 |
+
background-color: yellow;
|
299 |
+
font-weight: bold;
|
300 |
+
}
|
301 |
+
|
302 |
+
.input-container {
|
303 |
+
position: relative;
|
304 |
+
padding-bottom: 10px;
|
305 |
+
}
|
306 |
+
|
307 |
+
.keyword-input {
|
308 |
+
|
309 |
+
border-radius: 5px;
|
310 |
+
transition: border-color 0.3s ease;
|
311 |
+
border: 1px solid silver;
|
312 |
+
width: 10em;
|
313 |
+
height: 1.5em;
|
314 |
+
padding-left: 0.5em;
|
315 |
+
outline: none;
|
316 |
+
overflow: hidden;
|
317 |
+
|
318 |
+
}
|
319 |
+
|
320 |
+
.clear-btn {
|
321 |
+
position: absolute;
|
322 |
+
font-size: 20px;
|
323 |
+
left: 129px;
|
324 |
+
transform: translateY(-105%);
|
325 |
+
cursor: pointer;
|
326 |
+
opacity: 0;
|
327 |
+
transition: opacity 0.3s ease;
|
328 |
+
}
|
329 |
+
|
330 |
+
.clear-btn.show {
|
331 |
+
opacity: 1;
|
332 |
+
}
|
333 |
+
|
334 |
+
/* Category link of a news item. Shared declarations live in one rule; only
   the font size differs on narrow screens. The invalid declarations
   `align: justify` and `width: relative` (ignored by browsers) are removed. */
@media screen {
  a.article-category {
    background-color: #E5E4E2;
    display: block;
    height: 100%;
    text-decoration: none;
    border-left: 5px solid transparent;
    border-top: 0px;
    font-weight: bold;
    border-bottom: 1px solid transparent;
    border-right: 0px;
    padding-right: 5px;
    font-size: 11px;
    padding-bottom: 0px;
    font-family: Arial, Helvetica, sans-serif;
    color: green;
  }
}

@media screen and (max-width: 800px) {
  a.article-category {
    font-size: 10px;
  }
}
|
375 |
+
|
376 |
+
.content {
|
377 |
+
display: none;
|
378 |
+
font-family: Arial, Helvetica, sans-serif;
|
379 |
+
|
380 |
+
padding-right: 5px;
|
381 |
+
|
382 |
+
padding-top: 5px;
|
383 |
+
border-left: 5px solid transparent;
|
384 |
+
}
|
385 |
+
|
386 |
+
.container{
|
387 |
+
padding-bottom:10px;
|
388 |
+
}
|
389 |
+
|
390 |
+
.show-similar-button-container{
|
391 |
+
display: flex;
|
392 |
+
flex-direction: column;
|
393 |
+
align-items: center;
|
394 |
+
}
|
395 |
+
|
396 |
+
.similar-news-item:hover {
|
397 |
+
text-decoration: none;
|
398 |
+
}
|
399 |
+
|
400 |
+
@media screen and (min-width: 800px) {
|
401 |
+
.similar-news-item {
|
402 |
+
text-align: justify;
|
403 |
+
text-decoration: underline;
|
404 |
+
font-size: 14px;
|
405 |
+
font-family: Arial, Helvetica, sans-serif;
|
406 |
+
color: black;
|
407 |
+
display:inline-block;
|
408 |
+
padding-bottom: 10px;
|
409 |
+
width:100%;
|
410 |
+
/*white-space: nowrap;
|
411 |
+
overflow: hidden;
|
412 |
+
text-overflow: ellipsis;*/
|
413 |
+
|
414 |
+
}
|
415 |
+
}
|
416 |
+
|
417 |
+
@media screen and (max-width: 800px) {
|
418 |
+
.similar-news-item {
|
419 |
+
text-align: justify;
|
420 |
+
text-decoration: underline;
|
421 |
+
font-size: 12px;
|
422 |
+
font-family: Arial, Helvetica, sans-serif;
|
423 |
+
color: black;
|
424 |
+
display:inline-block;
|
425 |
+
padding-bottom: 8px;
|
426 |
+
width:100%;
|
427 |
+
/*white-space: nowrap;
|
428 |
+
overflow: hidden;
|
429 |
+
text-overflow: ellipsis;*/
|
430 |
+
}
|
431 |
+
}
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
/* "Show similar news" button. `display: box` is not a valid value and was
   ignored by browsers, so the button keeps its default display. */
.show-more {
  background-color: #E5E4E2;
  font-family: Arial, Helvetica, sans-serif;
  border-radius: 4px;
  padding: 3px;
  font-size: 12px;
  border: none;
}
|
448 |
+
|
449 |
+
.show-more:hover {
|
450 |
+
background-color: black;
|
451 |
+
color: white;
|
452 |
+
}
|
453 |
+
|
454 |
+
.show-less {
|
455 |
+
background-color: #E5E4E2;
|
456 |
+
font-family: Arial, Helvetica, sans-serif;
|
457 |
+
border-radius:4px;
|
458 |
+
padding-top:3px;
|
459 |
+
padding-bottom:3px;
|
460 |
+
padding-left:3px;
|
461 |
+
padding-right:3px;
|
462 |
+
font-size: 12px;
|
463 |
+
border: none;
|
464 |
+
display: none;
|
465 |
+
}
|
466 |
+
|
467 |
+
.show-less:hover {
|
468 |
+
background-color: black;
|
469 |
+
color: white;
|
470 |
+
}
|
471 |
+
|
472 |
+
.word-cloud-container{
|
473 |
+
word-wrap: break-word;
|
474 |
+
padding-bottom: 10px;
|
475 |
+
|
476 |
+
}
|
477 |
+
|
478 |
+
.wc-tokens{
|
479 |
+
font-family: Arial, Helvetica, sans-serif;
|
480 |
+
font-size: 13.2px;
|
481 |
+
cursor: pointer;
|
482 |
+
}
|
483 |
+
|
484 |
+
.wc-tokens:hover{
|
485 |
+
text-decoration: underline;
|
486 |
+
}
|
487 |
+
|
488 |
+
.word-cloud-section{
|
489 |
+
padding-bottom: 10px;
|
490 |
+
display: none;
|
491 |
+
word-wrap: break-word;
|
492 |
+
}
|
493 |
+
|
494 |
+
.show-more-word-cloud{
|
495 |
+
padding-bottom: 23px;
|
496 |
+
text-align: center;
|
497 |
+
}
|
498 |
+
|
499 |
+
.three-dots{
|
500 |
+
font-size: 30px;
|
501 |
+
margin: 0;
|
502 |
+
line-height:0;
|
503 |
+
vertical-align: top;
|
504 |
+
padding: 0;
|
505 |
+
cursor: pointer;
|
506 |
+
}
|
507 |
+
|
508 |
+
.three-dots:hover{
|
509 |
+
font-size: 25px;
|
510 |
+
}
|
static/top-icon.png
ADDED
![]() |
templates/index.html
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<link rel="preload" href="../static/loader.gif" as="image">
|
7 |
+
<link rel="preload" href="../static/favicon_new.png" as="image">
|
8 |
+
<link rel="preload" href="../static/refresh_reload_icon.png" as="image">
|
9 |
+
<link rel="preload" href="../static/top-icon.png" as="image">
|
10 |
+
<link rel="icon" href="../static/favicon_new.png" type="image/png">
|
11 |
+
|
12 |
+
<meta charset="UTF-8">
|
13 |
+
<title>Latest News</title>
|
14 |
+
<link rel="stylesheet" href="static/styles.css">
|
15 |
+
<a id="top-loc"></a>
|
16 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
|
17 |
+
<script>
|
18 |
+
// Hide the full-page loader once the page and its resources have loaded.
// .on('load', ...) replaces the deprecated .load(handler) event shorthand,
// which no longer exists in jQuery 3.
$(window).on('load', function () {
  $('.loader').fadeOut();
});
|
21 |
+
</script>
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
<script>
|
26 |
+
/**
 * Filter the visible news items by the keyword typed into #keywordInput and
 * highlight the matches. An empty keyword resets the list via clearFilter().
 *
 * @param {boolean} match_case
 *   true  - case-sensitive substring search over headline, description,
 *           article category and source/time.
 *   false - case-insensitive whole-word search over headline and
 *           description only.
 */
function filterContent(match_case) {
    var keyword = document.getElementById("keywordInput").value;
    if (keyword === "") {
        clearFilter();
        return;
    }

    document.getElementById("clearBtn").style.opacity = 1;

    var caseSensitive = (match_case == true);

    // Built once for the whole pass. The keyword is escaped so characters such
    // as "+", "(" or "?" are matched literally instead of breaking the pattern.
    // No "g" flag: test() on a global regex is stateful across calls.
    var wordRegex = null;
    if (!caseSensitive) {
        var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
        wordRegex = new RegExp("\\b" + escapedKeyword + "\\b", "i");
    }

    var items = document.getElementsByClassName("news-item");
    for (var i = 0; i < items.length; i++) {
        var item = items[i];

        // Fields that take part in the search; the case-sensitive mode also
        // looks at the category and the source/time line.
        var fields = [item.querySelector('.headline'), item.querySelector('.description')];
        if (caseSensitive) {
            fields.push(item.querySelector('.article-category'), item.querySelector('.time'));
        }

        // Items may lack some of the optional parts; ignore missing nodes.
        var present = [];
        var texts = [];
        for (var f = 0; f < fields.length; f++) {
            if (fields[f]) {
                present.push(fields[f]);
                texts.push(fields[f].textContent);
            }
        }
        var itemText = texts.join(" ");

        var matched = caseSensitive ? itemText.includes(keyword) : wordRegex.test(itemText);
        if (matched) {
            item.style.display = "block";
            for (var p = 0; p < present.length; p++) {
                highlightKeyword(present[p], keyword, match_case);
            }
        } else {
            item.style.display = "none";
        }
    }
}
|
91 |
+
|
92 |
+
/**
 * Reset the keyword filter: show every news item again, remove the highlight
 * markup from its text fields, hide the clear button and empty the input.
 */
function clearFilter() {
    var items = document.getElementsByClassName("news-item");
    document.getElementById("clearBtn").style.opacity = 0;

    var fieldSelectors = ['.headline', '.description', '.article-category', '.time'];
    for (var i = 0; i < items.length; i++) {
        items[i].style.display = "block";
        for (var s = 0; s < fieldSelectors.length; s++) {
            var field = items[i].querySelector(fieldSelectors[s]);
            if (field) {
                // Re-assigning textContent drops the highlight <span>s while
                // keeping the text as text (assigning it to innerHTML would
                // re-parse characters like "<" as markup).
                field.textContent = field.textContent;
            }
        }
    }

    document.getElementById("keywordInput").value = ""; // Clear input field
}
|
110 |
+
|
111 |
+
/**
 * Wrap every occurrence of `keyword` inside `element` in
 * <span class="highlight">...</span>.
 *
 * @param {Element} element    node whose text is re-rendered; ignored if null.
 * @param {string}  keyword    literal text to highlight (not a regex).
 * @param {boolean} match_case false: case-insensitive whole-word matching;
 *                             otherwise case-sensitive substring matching.
 */
function highlightKeyword(element, keyword, match_case) {
    if (!element || !keyword) {
        return;
    }

    // Escape regex metacharacters so the keyword is always matched literally.
    var escapedKeyword = keyword.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    var regex = (match_case == false)
        ? new RegExp("\\b" + escapedKeyword + "\\b", "gi")
        : new RegExp(escapedKeyword, "g");

    // The element's text is plain text; escape it before it is written back
    // through innerHTML so that it cannot be interpreted as markup.
    function escapeHtml(text) {
        return text
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;");
    }

    var text = element.textContent;
    var html = "";
    var cursor = 0;
    var match;
    // The pattern always contains the non-empty keyword, so matches are never
    // zero-length and exec() always advances.
    while ((match = regex.exec(text)) !== null) {
        html += escapeHtml(text.slice(cursor, match.index)) +
            '<span class="highlight">' + escapeHtml(match[0]) + '</span>';
        cursor = match.index + match[0].length;
    }
    element.innerHTML = html + escapeHtml(text.slice(cursor));
}
|
121 |
+
|
122 |
+
</script>
|
123 |
+
|
124 |
+
<script>
|
125 |
+
// Wire up the "show more" / "show less" toggles of every .container once the
// DOM has been parsed.
document.addEventListener('DOMContentLoaded', function() {
    const containers = document.querySelectorAll('.container');

    containers.forEach(container => {
        const content = container.querySelector('.content');
        const showMoreBtn = container.querySelector('.show-more');
        const showLessBtn = container.querySelector('.show-less');

        // A container without all three parts has nothing to toggle; skipping
        // it keeps one incomplete container from aborting the whole loop.
        if (!content || !showMoreBtn || !showLessBtn) {
            return;
        }

        // Expand: reveal the extra content and swap the two buttons.
        showMoreBtn.addEventListener('click', function() {
            content.style.display = 'block';
            content.style.opacity = 1;
            showMoreBtn.style.display = 'none';
            showLessBtn.style.display = 'block';
        });

        // Collapse: hide the content, then scroll up by the height that was
        // removed so the viewport stays on the same article.
        showLessBtn.addEventListener('click', function() {
            const root = document.documentElement;
            const outer = content.parentElement.parentElement;

            // Smooth scrolling is switched off while the position is corrected
            // so the adjustment is instantaneous.
            root.style.scrollBehavior = "auto";
            const max_h = outer.clientHeight;
            content.style.display = 'none';
            showMoreBtn.style.display = 'block';
            showLessBtn.style.display = 'none';
            const min_h = outer.clientHeight;
            $(window).scrollTop($(window).scrollTop() - (max_h - min_h) || 0);
            root.style.scrollBehavior = "smooth";
        });
    });
});
|
162 |
+
</script>
|
163 |
+
|
164 |
+
<script>
|
165 |
+
/**
 * Run a whole-word, case-insensitive filter for a token clicked in the word
 * cloud: reset any active filter, put the token into the search box and
 * apply the filter.
 */
function wc_search(keyword) {
    clearFilter();
    var searchBox = document.getElementById("keywordInput");
    searchBox.value = keyword;
    filterContent(false);
}
|
171 |
+
|
172 |
+
/**
 * Toggle the extended word-cloud section: hide it when its inline display is
 * 'block', otherwise show it.
 */
function word_cloud_display() {
    var section = document.getElementById("word-cloud-section-id");
    var isShown = (section.style.display == 'block');
    section.style.display = isShown ? 'none' : 'block';
}
|
184 |
+
</script>
|
185 |
+
|
186 |
+
|
187 |
+
|
188 |
+
</head>
|
189 |
+
<body>
|
190 |
+
<div class="loader"></div>
|
191 |
+
{{body | safe}}
|
192 |
+
|
193 |
+
<a id="top_theme" class="top-float" onclick="window.scrollTo(0, 0);">
|
194 |
+
<img id="top-theme-icon" alt="_" src="../static/top-icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width="25px" height="25px" border="0">
|
195 |
+
</a>
|
196 |
+
<a href="javascript:window.location.reload(true)" id="theme" class="float">
|
197 |
+
<img id="theme-icon" alt="_" src="../static/refresh_reload_icon.png" style="border-radius:50%; border: 3px solid #b9bfc4; background-color: white;" width=25px height=25px border="0" />
|
198 |
+
</a>
|
199 |
+
</body>
|
200 |
+
</html>
|
word_cloud.py
ADDED
@@ -0,0 +1,653 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import string
|
4 |
+
from unidecode import unidecode
|
5 |
+
from collections import Counter
|
6 |
+
|
7 |
+
|
8 |
+
class TextPreprocessor:
    """Normalise raw text before word-frequency analysis.

    ``preprocess`` lower-cases the text, transliterates it with ``unidecode``,
    strips URLs, expands English contractions and then, depending on the
    constructor flags, removes punctuation, digits, stop words, short words
    and the most/least frequent words.

    Attributes:
        stop_words: list of tokens dropped when ``remove_stop_words`` is set.
        contraction_to_expansion: mapping of contraction -> expansion.
        words_to_remove: frequent/rare words learned from "train" data.
    """

    # Matches http(s):// links and bare www. links.
    _URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

    # Literal (escaped) backslash sequences, i.e. the two characters "\" + "n",
    # not real control characters; real whitespace is collapsed separately.
    _ESCAPED_WHITESPACE = (r'\n', r'\r', r'\t')

    # Maps every ASCII punctuation character to a space in a single pass.
    _PUNCT_TABLE = str.maketrans(string.punctuation, " " * len(string.punctuation))

    _DEFAULT_STOP_WORDS = (
        "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        'about', 'above', 'across', 'after', 'afterwards', 'again', 'against',
        'ain', 'all', 'almost', 'alone', 'along', 'already', 'also',
        'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and',
        'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway',
        'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became',
        'because', 'become', 'becomes', 'becoming', 'been', 'before',
        'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
        'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call',
        'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'did', 'do',
        'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight',
        'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even',
        'ever', 'every', 'everyone', 'everything', 'everywhere', 'except',
        'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former',
        'formerly', 'forty', 'four', 'from', 'front', 'full', 'further',
        'get', 'give', 'go', 'had', 'has', 'have', 'having', 'he', 'hence',
        'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
        'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
        'if', 'in', 'indeed', 'into', 'is', 'it', "it's", 'its', 'itself',
        'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll',
        'm', 'ma', 'made', 'make', 'many', 'say', 'said', 'says', 'told',
        'tell', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover',
        'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name',
        'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no',
        'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere',
        'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or',
        'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
        'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite',
        'rather', 're', 'rs', 'really', 'regarding', 's', 'same', 'say',
        'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several',
        'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
        "shouldn't", 'show', 'side', 'since', 'six', 'sixty', 'so', 'some',
        'somehow', 'someone', 'something', 'sometime', 'sometimes',
        'somewhere', 'still', 'such', 't', 'take', 'ten', 'than', 'that',
        "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
        'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
        'thereupon', 'these', 'they', 'third', 'this', 'those', 'though',
        'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together',
        'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under',
        'unless', 'until', 'up', 'upon', 'us', 'used', 'using', 'various',
        've', 'very', 'via', 'was', 'wasn', "wasn't", 'we', 'well', 'were',
        'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever',
        'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
        'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
        'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
        'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you',
        "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
        'yourselves', '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve', '’d', '’ll',
        '’m', '’re', 'new', 'old', '’s', '’ve',
    )

    _DEFAULT_CONTRACTIONS = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'alls": "you alls",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have",
    }

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        """
        Stores the cleaning options.
        :param remove_punct: bool, replace punctuation with spaces.
        :param remove_digits: bool, replace digits with spaces.
        :param remove_stop_words: bool, drop tokens listed in ``stop_words``.
        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :param top_p: float, fraction of most frequent words to remove.
        :param bottom_p: float, fraction of rarest words to remove.
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        self.words_to_remove = []
        # Per-instance copies so callers can customise them without touching
        # the class-level defaults.
        self.stop_words = list(self._DEFAULT_STOP_WORDS)
        self.contraction_to_expansion = dict(self._DEFAULT_CONTRACTIONS)

    @staticmethod
    def __remove_double_whitespaces(string: str):
        """Collapses every run of whitespace into one space and trims the ends."""
        return " ".join(string.split())

    def __remove_url(self, string_series: pd.Series):
        """
        Removes URLs from text.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=self._URL_PATTERN, repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __expand(self, string_series: pd.Series):
        """
        Replaces contractions with expansions, e.g. "don't" with "do not".
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series
        # Longest contractions first: otherwise "can't" would be rewritten
        # inside "can't've" and the compound form would never match.
        for contraction in sorted(self.contraction_to_expansion, key=len, reverse=True):
            clean_string_series = clean_string_series.str.replace(
                pat=contraction, repl=self.contraction_to_expansion[contraction], regex=False)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_punct(self, string_series: pd.Series):
        """
        Removes punctuation (and literal "\\n", "\\r", "\\t" escape sequences)
        from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series
        # Escaped sequences must go first, before their backslash is
        # replaced as ordinary punctuation.
        for sequence in self._ESCAPED_WHITESPACE:
            clean_string_series = clean_string_series.str.replace(pat=sequence, repl=" ", regex=False)
        clean_string_series = clean_string_series.str.translate(self._PUNCT_TABLE)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """
        Removes digits from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    @staticmethod
    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """
        Removes words/tokens where minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        def keep_long_or_tiny(text: str):
            return " ".join(word for word in text.split()
                            if len(word) > maxlen or len(word) < minlen)

        return string_series.map(keep_long_or_tiny)

    def __remove_stop_words(self, string_series: pd.Series):
        """
        Removes stop words from the input string.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Built per call so edits to ``self.stop_words`` are honoured, while
        # membership tests stay O(1) per token.
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(str_remove_stop_words)

    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                  bottom_p: float = None, dataset: str = 'train'):
        """
        Removes top_p percent (frequent) words and bottom_p percent (rare) words.
        :param string_series: pd.Series, input string series
        :param top_p: float, percent of frequent words to remove.
        :param bottom_p: float, percent of rare words to remove.
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            top_p = 0 if top_p is None else top_p
            bottom_p = 0 if bottom_p is None else bottom_p

            if top_p > 0 or bottom_p > 0:
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

                learned = []
                if top_p > 0:
                    learned.extend(word_freq.index[: int(np.ceil(top_p * n_words))])
                if bottom_p > 0:
                    learned.extend(word_freq.index[-int(np.ceil(bottom_p * n_words)):])

                # Repeated "train" calls must not pile up duplicate entries.
                known = set(self.words_to_remove)
                for word in learned:
                    if word not in known:
                        known.add(word)
                        self.words_to_remove.append(word)

        if not self.words_to_remove:
            return string_series

        blocked = set(self.words_to_remove)
        return string_series.map(
            lambda text: " ".join(word for word in text.split() if word not in blocked))

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """
        Entry point.
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series; entries that end up empty
                 are replaced with "this is an empty message".
        """
        # Missing values become empty strings so the string operations below
        # (unidecode in particular) do not fail on NaN.
        string_series = string_series.fillna("").str.lower()
        string_series = string_series.map(unidecode)
        string_series = self.__remove_url(string_series=string_series)
        string_series = self.__expand(string_series=string_series)

        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = self.__remove_short_words(string_series=string_series,
                                                      minlen=self.minlen,
                                                      maxlen=self.maxlen)
        string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                       top_p=self.top_p,
                                                       bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        return string_series.replace(to_replace="", value="this is an empty message")
|
638 |
+
|
639 |
+
|
640 |
+
def get_frequent_words_html(df):
    """Build the word-cloud HTML for the 25 most frequent tokens in ``df``.

    The 'title' and 'description' columns are concatenated, cleaned with
    ``TextPreprocessor`` and counted. Each token becomes a clickable
    ``<a class="wc-tokens">`` that calls the page's ``wc_search``. The first
    five tokens sit directly in the container; the remaining ones go into the
    nested ``word-cloud-section`` div.

    :param df: pd.DataFrame with 'title' and 'description' columns.
    :return: str, HTML fragment.
    """
    text_preprocess = TextPreprocessor()
    # NaN + str is NaN, so fill each column first; otherwise a missing
    # description would also discard that row's title.
    combined = df['title'].fillna('') + ' ' + df['description'].fillna('')
    preprocessed_txt = text_preprocess.preprocess(combined)
    counter = Counter(' '.join(preprocessed_txt).split())

    parts = ['<div class="word-cloud-container">']
    section_open = False
    for rank, (token, _count) in enumerate(counter.most_common(25), start=1):
        # Random run of 3-6 spaces after each token, as in the original markup.
        spacing = " " * np.random.randint(3, 7, 1)[0]
        parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{spacing}')
        if rank == 5:
            parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_open = True

    # Close the nested section only if it was opened (i.e. at least five
    # tokens), so the fragment stays balanced for small inputs.
    parts.append('</div></div>' if section_open else '</div>')
    return ''.join(parts)
|