Dee-lys123 committed on
Commit 8f43ed9 · verified · 1 Parent(s): 4f0cba8

Upload 5 files

app.cpython-38.pyc ADDED
Binary file (798 Bytes)

main.cpython-310.pyc ADDED
Binary file (1.84 kB)

main.cpython-38.pyc ADDED
Binary file (3.23 kB)

main.cpython-39.pyc ADDED
Binary file (6.24 kB)
main.py ADDED
@@ -0,0 +1,261 @@
+ import warnings
+ import smtplib
+ from email.mime.multipart import MIMEMultipart
+ from email.mime.text import MIMEText
+ import feedparser
+ import re
+ import html
+ import requests
+ from datetime import datetime, timedelta
+ from bs4 import BeautifulSoup
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from urllib.parse import urlparse, parse_qs
+ import streamlit as st
+ import matplotlib.pyplot as plt
+ import pandas as pd
+
+ warnings.filterwarnings("ignore")
+
+ # Summarization using BART
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+
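+ # Summarize article text with BART; input longer than the model's
+ # 1024-token window is truncated, and decoding uses beam search.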
+ def summarize_text_with_bart(text):
+     inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+     summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40,
+                                  length_penalty=2.0, num_beams=4, early_stopping=True)
+     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+     return summary if summary else 'Summary unavailable'
+
+ # Initialize the sentiment analysis model
+ sentiment_model = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")
+
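+ # Convert the classifier's POS/NEU/NEG label into a signed score in
+ # [-1, 1], then bucket the score into five sentiment categories.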
+ def analyze_sentiment(summary):
+     result = sentiment_model(summary)[0]
+     if result['label'] == 'POS':
+         score = result['score']
+     elif result['label'] == 'NEG':
+         score = -result['score']
+     else:
+         score = 0
+
+     if score <= -0.6:
+         sentiment = "Very Negative"
+     elif -0.6 < score <= -0.2:
+         sentiment = "Negative"
+     elif -0.2 < score <= 0.2:
+         sentiment = "Neutral"
+     elif 0.2 < score <= 0.6:
+         sentiment = "Positive"
+     else:
+         sentiment = "Very Positive"
+
+     return sentiment, score
+
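+ # Pull entries from an RSS feed, keep those inside the date range whose
+ # title or summary mentions any keyword, and return them newest-first.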
+ def fetch_news_from_rss(feed_url, keywords, start_date, end_date):
+     try:
+         start_date = datetime.combine(start_date, datetime.min.time())
+         end_date = datetime.combine(end_date, datetime.max.time())
+
+         feed = feedparser.parse(feed_url)
+         if not feed.entries:
+             st.write("Failed to retrieve feed or feed is empty")
+             return []
+
+         news_items = []
+
+         for entry in feed.entries:
+             # Skip entries without a parseable publication date.
+             if not getattr(entry, 'published_parsed', None):
+                 continue
+             published_date = datetime(*entry.published_parsed[:6])
+
+             if start_date <= published_date <= end_date:
+                 title = entry.title.lower()
+                 raw_summary = getattr(entry, 'summary', '')
+                 summary = raw_summary.lower() if raw_summary != entry.title else 'no summary available.'
+
+                 if any(keyword.lower() in title or keyword.lower() in summary for keyword in keywords):
+                     news_items.append({
+                         'title': entry.title,
+                         'summary': clean_html(raw_summary) if raw_summary != entry.title else 'No summary available.',
+                         'link': entry.link,
+                         'date': published_date,
+                         'source': extract_source(entry.link)
+                     })
+         news_items.sort(key=lambda x: x['date'], reverse=True)
+         return news_items
+     except Exception as e:
+         st.write(f"An error occurred while fetching feed: {e}")
+         return []
+
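+ # Map a feed link's domain to a human-readable publication name,
+ # falling back to the raw domain.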
+ def extract_source(entry_link):
+     try:
+         parsed_url = urlparse(entry_link)
+         domain = parsed_url.netloc
+
+         known_sources = {
+             "www.themalaysianinsight.com": "The Malaysian Insight",
+             "www.themalaysianreserve.com": "The Malaysian Reserve",
+         }
+         return known_sources.get(domain, domain)
+     except Exception as e:
+         st.write(f"Failed to extract source from {entry_link}: {e}")
+         return "Unknown Source"
+
+ def clean_html(raw_html):
+     return re.sub(r'<[^>]+>', '', html.unescape(raw_html))
+
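+ # Deduplicate articles by link while preserving order.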
+ def remove_duplicates(news_items):
+     seen_links = set()
+     unique_items = []
+     for item in news_items:
+         if item['link'] not in seen_links:
+             unique_items.append(item)
+             seen_links.add(item['link'])
+     return unique_items
+
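+ # Bing News RSS links wrap the original article URL in a 'url' query
+ # parameter; unwrap it, fetch the page, and join the text of its <p> tags.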
+ def fetch_article_content(bing_url):
+     try:
+         parsed_url = urlparse(bing_url)
+         query_params = parse_qs(parsed_url.query)
+         original_url = query_params.get('url', [None])[0]
+
+         if original_url:
+             response = requests.get(original_url, headers={'User-Agent': 'Mozilla/5.0'},
+                                     allow_redirects=True, timeout=15)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.text, 'html.parser')
+             paragraphs = soup.find_all('p')
+             return ' '.join(paragraph.text for paragraph in paragraphs), extract_source(original_url)
+         else:
+             st.write(f"Original article link not found in Bing URL: {bing_url}")
+             return None, "Unknown Source"
+     except requests.RequestException as e:
+         st.write(f"An error occurred while fetching article content from {bing_url}: {e}")
+         return None, "Unknown Source"
+
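+ # Render each article as a Markdown card with its sentiment.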
+ def display_news(news_items):
+     for idx, item in enumerate(news_items, 1):
+         sentiment_category, sentiment_score = analyze_sentiment(item['summary'])
+         article_info = (
+             f"### Article {idx}:\n\n"
+             f"**Title:** {item['title']}\n\n"
+             f"**Summary:** {item['summary']}\n\n"
+             f"**Source:** {item['source']}\n\n"
+             f"[Link to article]({item['link']})\n\n"
+             f"**Date:** {item['date'].strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+             f"**Sentiment:** {sentiment_category} (Score: {sentiment_score:.2f})"
+         )
+         st.markdown(article_info, unsafe_allow_html=True)
+         st.markdown("---")
+
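+ # Score every article once, then show the five most positive and the
+ # five most negative.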
+ def display_top_5_rankings(news_items):
+     # Run the sentiment pipeline once per article instead of three times.
+     scored = [(item, *analyze_sentiment(item['summary'])) for item in news_items]
+     scored.sort(key=lambda entry: entry[2], reverse=True)
+     top_5_positive = scored[:5]
+     top_5_negative = scored[-5:][::-1]  # most negative first
+
+     st.subheader("Top 5 Most Positive Articles")
+     for idx, (item, label, score) in enumerate(top_5_positive, 1):
+         st.markdown(f"**{idx}. {item['title']}** - Sentiment: {label} (Score: {score:.2f})")
+         st.markdown(f"**Source**: {item['source']}")
+         st.markdown(f"[Read more]({item['link']})")
+
+     st.subheader("Top 5 Most Negative Articles")
+     for idx, (item, label, score) in enumerate(top_5_negative, 1):
+         st.markdown(f"**{idx}. {item['title']}** - Sentiment: {label} (Score: {score:.2f})")
+         st.markdown(f"**Source**: {item['source']}")
+         st.markdown(f"[Read more]({item['link']})")
+
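+ # Line chart of per-article sentiment scores in chronological order.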
+ def plot_sentiment_over_time(news_items):
+     df = pd.DataFrame(news_items)
+     df['sentiment_score'] = df['summary'].apply(lambda x: analyze_sentiment(x)[1])
+     df['date'] = pd.to_datetime(df['date'])
+     df = df.sort_values('date')  # plot left to right in time
+
+     fig, ax = plt.subplots(figsize=(10, 6))
+     ax.plot(df['date'], df['sentiment_score'], marker='o', linestyle='-', color='b')
+     ax.set_title('Sentiment Scores Over Time')
+     ax.set_xlabel('Date')
+     ax.set_ylabel('Sentiment Score')
+     ax.tick_params(axis='x', labelrotation=45)
+     ax.grid(True)
+     st.pyplot(fig)
+
+ def calculate_average_sentiment(news_items):
+     total_sentiment = sum(analyze_sentiment(item['summary'])[1] for item in news_items)
+     avg_sentiment = total_sentiment / len(news_items) if news_items else 0
+     st.write(f"**Average Sentiment for Selected Period**: {avg_sentiment:.2f}")
+
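+ # NOTE: credentials are hardcoded below for this demo; in a real deployment
+ # they should come from an environment variable or st.secrets.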
+ def send_email(subject, body, to_email):
+     from_email = "hankaier123@gmail.com"
+     password = "unyl ldnt cssx kfkx"
+
+     msg = MIMEMultipart()
+     msg['From'] = from_email
+     msg['To'] = to_email
+     msg['Subject'] = subject
+
+     msg.attach(MIMEText(body, 'plain'))
+
+     try:
+         with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
+             server.login(from_email, password)
+             server.sendmail(from_email, to_email, msg.as_string())
+         st.success("Email sent successfully!")
+     except Exception as e:
+         st.error(f"Failed to send email: {e}")
+
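+ # Streamlit entry point: sidebar filters, fetch, summarize, then display.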
+ def main():
+     st.title("News Summarizer")
+
+     st.sidebar.header("Filter Settings")
+     start_date = st.sidebar.date_input("Start Date", datetime.now() - timedelta(days=30))
+     end_date = st.sidebar.date_input("End Date", datetime.now())
+
+     if start_date > end_date:
+         st.sidebar.error("Error: End date must fall after start date.")
+
+     keywords_input = st.sidebar.text_input("Enter keywords (comma-separated)", "smes, malaysia")
+     keywords = [keyword.strip() for keyword in keywords_input.split(",") if keyword.strip()]
+
+     if st.sidebar.button("Fetch News"):
+         bing_news_query = "+".join(keywords)
+         bing_rss_url = f"https://www.bing.com/news/search?q={bing_news_query}&format=rss"
+
+         st.write(f"Fetching news articles related to: {', '.join(keywords)}")
+         bing_news_items = fetch_news_from_rss(bing_rss_url, keywords, start_date, end_date)
+
+         unique_news_items = remove_duplicates(bing_news_items)
+
+         # Replace each feed snippet with a BART summary of the full article.
+         for item in unique_news_items:
+             content, source = fetch_article_content(item['link'])
+             if content:
+                 summarized_text = summarize_text_with_bart(content)
+                 if summarized_text != 'Summary unavailable':
+                     item['summary'] = summarized_text
+                     item['source'] = source
+
+         if unique_news_items:
+             st.subheader("News Articles")
+             display_news(unique_news_items)
+
+             display_top_5_rankings(unique_news_items)
+             plot_sentiment_over_time(unique_news_items)
+             calculate_average_sentiment(unique_news_items)
+
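+             # Caveat: this nested "Send Email" button resets on Streamlit's
+             # rerun; persisting results across clicks needs st.session_state.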
+             st.subheader("Send News via Email")
+             email_subject = st.text_input("Email Subject", "News Summaries")
+             to_email = st.text_input("Recipient Email")
+             if st.button("Send Email"):
+                 if to_email:
+                     email_body = "\n\n".join(
+                         f"Title: {item['title']}\nSummary: {item['summary']}\nSource: {item['source']}\nLink: {item['link']}\n"
+                         for item in unique_news_items
+                     )
+                     send_email(email_subject, email_body, to_email)
+                 else:
+                     st.error("Please provide a recipient email.")
+         else:
+             st.write("No news articles found for the selected date range and keywords.")
+
+ if __name__ == '__main__':
+     main()