Update news_extractor/news_extractor.py
Browse files
news_extractor/news_extractor.py
CHANGED
@@ -6,6 +6,7 @@ import regex as re
|
|
6 |
from dateutil import parser
|
7 |
import logging
|
8 |
import multiprocessing
|
|
|
9 |
|
10 |
|
11 |
def date_time_parser(dt):
|
@@ -91,9 +92,8 @@ def news_agg(rss):
|
|
91 |
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
|
92 |
}
|
93 |
|
94 |
-
timeout = 5
|
95 |
|
96 |
-
resp = r.get(rss, timeout=timeout)
|
97 |
logging.warning(f'{rss}: {resp.status_code}')
|
98 |
b = BeautifulSoup(resp.content, "xml")
|
99 |
items = b.find_all("item")
|
@@ -121,19 +121,7 @@ def news_agg(rss):
|
|
121 |
|
122 |
|
123 |
# List of RSS feeds
|
124 |
-
rss = [
|
125 |
-
'https://www.thehindu.com/news/feeder/default.rss',
|
126 |
-
# 'https://telanganatoday.com/feed',
|
127 |
-
'https://www.businesstoday.in/rssfeeds/?id=225346',
|
128 |
-
'https://feeds.feedburner.com/ndtvnews-latest',
|
129 |
-
'https://www.hindustantimes.com/feeds/rss/world-news/rssfeed.xml',
|
130 |
-
'https://www.indiatoday.in/rss/1206578',
|
131 |
-
|
132 |
-
'https://www.moneycontrol.com/rss/latestnews.xml',
|
133 |
-
'https://www.livemint.com/rss/news',
|
134 |
-
|
135 |
-
'https://www.zeebiz.com/latest.xml/feed',
|
136 |
-
'https://www.timesofindia.indiatimes.com/rssfeedmostrecent.cms']
|
137 |
|
138 |
|
139 |
def get_news_rss(url):
|
|
|
6 |
from dateutil import parser
|
7 |
import logging
|
8 |
import multiprocessing
|
9 |
+
from config import NEWS_EXTRACTOR_URL_TIMEOUT, RSS_FEEDS_TO_EXTRACT
|
10 |
|
11 |
|
12 |
def date_time_parser(dt):
|
|
|
92 |
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
|
93 |
}
|
94 |
|
|
|
95 |
|
96 |
+
resp = r.get(rss, timeout=NEWS_EXTRACTOR_URL_TIMEOUT, headers=headers)
|
97 |
logging.warning(f'{rss}: {resp.status_code}')
|
98 |
b = BeautifulSoup(resp.content, "xml")
|
99 |
items = b.find_all("item")
|
|
|
121 |
|
122 |
|
123 |
# List of RSS feeds
|
124 |
+
rss = RSS_FEEDS_TO_EXTRACT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
|
127 |
def get_news_rss(url):
|