Upload 5 files
- app.cpython-38.pyc +0 -0
- main.cpython-310.pyc +0 -0
- main.cpython-38.pyc +0 -0
- main.cpython-39.pyc +0 -0
- main.py +261 -0
app.cpython-38.pyc (ADDED): Binary file (798 Bytes)
main.cpython-310.pyc (ADDED): Binary file (1.84 kB)
main.cpython-38.pyc (ADDED): Binary file (3.23 kB)
main.cpython-39.pyc (ADDED): Binary file (6.24 kB)
main.py (ADDED)
@@ -0,0 +1,261 @@
import os  # needed so send_email can read credentials from the environment
import warnings
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import feedparser
import re
import html
import requests
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from urllib.parse import urlparse, parse_qs
import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd

warnings.filterwarnings("ignore")

# Summarization using BART
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def summarize_text_with_bart(text):
    # BART accepts at most 1024 tokens, so longer articles are truncated.
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs.input_ids,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary if summary else 'Summary unavailable'

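# Note: input beyond BART's 1024-token window is silently truncated above,
# so only the opening of a very long article informs the summary. A
# chunk-and-merge pass would be one way around this; it is not implemented here.
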
# Initialize the sentiment analysis model
sentiment_model = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

def analyze_sentiment(summary):
    # The bertweet model returns POS / NEG / NEU labels; map them onto a
    # signed score in [-1, 1].
    result = sentiment_model(summary)[0]
    if result['label'] == 'POS':
        score = result['score']
    elif result['label'] == 'NEG':
        score = -result['score']
    else:
        score = 0

    # Bucket the signed score into five readable categories.
    if score <= -0.6:
        sentiment = "Very Negative"
    elif -0.6 < score <= -0.2:
        sentiment = "Negative"
    elif -0.2 < score <= 0.2:
        sentiment = "Neutral"
    elif 0.2 < score <= 0.6:
        sentiment = "Positive"
    else:
        sentiment = "Very Positive"

    return sentiment, score

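# Worked example of the mapping above (the label and confidence are
# hypothetical, not real model output): a NEG prediction at confidence 0.95
# becomes a signed score of -0.95, which falls in the "Very Negative" bucket,
# so analyze_sentiment would return ("Very Negative", -0.95).
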
def fetch_news_from_rss(feed_url, keywords, start_date, end_date):
    try:
        # Widen the date inputs to cover the whole start and end days.
        start_date = datetime.combine(start_date, datetime.min.time())
        end_date = datetime.combine(end_date, datetime.max.time())

        feed = feedparser.parse(feed_url)
        # feedparser results always expose an `entries` attribute, so test
        # for an empty list rather than for the attribute's presence.
        if not feed.entries:
            st.write("Failed to retrieve feed or feed is empty")
            return []

        news_items = []

        for entry in feed.entries:
            # Skip entries that carry no parseable publication date.
            if not getattr(entry, 'published_parsed', None):
                continue
            published_date = datetime(*entry.published_parsed[:6])

            if start_date <= published_date <= end_date:
                title = entry.title.lower()
                summary = entry.summary.lower() if entry.summary != entry.title else 'No summary available.'

                if any(keyword.lower() in title or keyword.lower() in summary for keyword in keywords):
                    source = extract_source(entry.link)
                    news_items.append({
                        'title': entry.title,
                        'summary': clean_html(entry.summary) if entry.summary != entry.title else 'No summary available.',
                        'link': entry.link,
                        'date': published_date,
                        'source': source
                    })
        # Newest articles first.
        news_items.sort(key=lambda x: x['date'], reverse=True)
        return news_items
    except Exception as e:
        st.write(f"An error occurred while fetching feed: {e}")
        return []

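# Standalone usage sketch for the fetcher above, outside the Streamlit UI
# (the query URL mirrors the one built in main(); the dates are plain
# date objects, as st.sidebar.date_input would produce):
#   from datetime import date
#   items = fetch_news_from_rss(
#       "https://www.bing.com/news/search?q=smes+malaysia&format=rss",
#       ["smes", "malaysia"], date(2024, 1, 1), date(2024, 1, 31))
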
def extract_source(entry_link):
    try:
        parsed_url = urlparse(entry_link)
        domain = parsed_url.netloc

        # Map known domains onto friendly publication names; fall back to
        # the raw domain for anything else.
        known_sources = {
            "www.themalaysianinsight.com": "The Malaysian Insight",
            "www.themalaysianreserve.com": "The Malaysian Reserve",
        }
        return known_sources.get(domain, domain)
    except Exception as e:
        st.write(f"Failed to extract source from {entry_link}: {e}")
        return "Unknown Source"

def clean_html(raw_html):
    # Unescape HTML entities first, then strip any remaining tags.
    return re.sub(r'<[^>]+>', '', html.unescape(raw_html))

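# For instance, clean_html("<p>Growth &amp; trade</p>") first unescapes the
# entity to "<p>Growth & trade</p>" and then strips the tags, returning
# "Growth & trade".
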
def remove_duplicates(news_items):
    # Deduplicate on the article link while preserving order.
    seen_links = set()
    unique_items = []
    for item in news_items:
        if item['link'] not in seen_links:
            unique_items.append(item)
            seen_links.add(item['link'])
    return unique_items

def fetch_article_content(bing_url):
    try:
        # Bing News RSS links are redirects; the original article URL is
        # carried in the `url` query parameter.
        parsed_url = urlparse(bing_url)
        query_params = parse_qs(parsed_url.query)
        original_url = query_params.get('url', [None])[0]

        if original_url:
            # A timeout keeps one slow publisher from hanging the whole fetch.
            original_content_response = requests.get(
                original_url,
                headers={'User-Agent': 'Mozilla/5.0'},
                allow_redirects=True,
                timeout=10,
            )
            original_content_response.raise_for_status()
            original_soup = BeautifulSoup(original_content_response.text, 'html.parser')
            paragraphs = original_soup.find_all('p')
            return ' '.join(paragraph.text for paragraph in paragraphs), extract_source(original_url)
        else:
            st.write(f"Original article link not found in Bing URL: {bing_url}")
            return None, "Unknown Source"
    except requests.RequestException as e:
        st.write(f"An error occurred while fetching article content from {bing_url}: {e}")
        return None, "Unknown Source"

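# The redirect shape this parser assumes looks roughly like the following
# (illustrative, not a real link):
#   https://www.bing.com/news/apiclick.aspx?ref=...&url=https%3A%2F%2Fpublisher.com%2Fstory
# parse_qs percent-decodes the `url` parameter back to the publisher's address.
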
def display_news(news_items):
    for idx, item in enumerate(news_items, 1):
        sentiment_category, sentiment_score = analyze_sentiment(item['summary'])
        article_info = (
            f"### Article {idx}:\n\n"
            f"**Title:** {item['title']}\n\n"
            f"**Summary:** {item['summary']}\n\n"
            f"**Source:** {item['source']}\n\n"
            f"[Link to article]({item['link']})\n\n"
            f"**Date:** {item['date'].strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            f"**Sentiment:** {sentiment_category} (Score: {sentiment_score:.2f})"
        )
        st.markdown(article_info, unsafe_allow_html=True)
        st.markdown("---")

def display_top_5_rankings(news_items):
    # Score each summary once instead of re-running the sentiment model on
    # every access.
    scored = [(item, *analyze_sentiment(item['summary'])) for item in news_items]
    scored.sort(key=lambda x: x[2], reverse=True)
    top_5_positive = scored[:5]
    # Reverse the tail slice so the most negative article is listed first.
    top_5_negative = scored[-5:][::-1]

    st.subheader("Top 5 Most Positive Articles")
    for idx, (item, sentiment, score) in enumerate(top_5_positive, 1):
        st.markdown(f"**{idx}. {item['title']}** - Sentiment: {sentiment} (Score: {score:.2f})")
        st.markdown(f"**Source**: {item['source']}")
        st.markdown(f"[Read more]({item['link']})")

    st.subheader("Top 5 Most Negative Articles")
    for idx, (item, sentiment, score) in enumerate(top_5_negative, 1):
        st.markdown(f"**{idx}. {item['title']}** - Sentiment: {sentiment} (Score: {score:.2f})")
        st.markdown(f"**Source**: {item['source']}")
        st.markdown(f"[Read more]({item['link']})")

def plot_sentiment_over_time(news_items):
    df = pd.DataFrame(news_items)
    df['sentiment_score'] = df['summary'].apply(lambda x: analyze_sentiment(x)[1])
    df['date'] = pd.to_datetime(df['date'])

    # Build an explicit figure rather than passing matplotlib's global state
    # to st.pyplot, which Streamlit discourages.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df['date'], df['sentiment_score'], marker='o', linestyle='-', color='b')
    ax.set_title('Sentiment Scores Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sentiment Score')
    fig.autofmt_xdate(rotation=45)
    ax.grid(True)
    st.pyplot(fig)

def calculate_average_sentiment(news_items):
    total_sentiment = sum(analyze_sentiment(item['summary'])[1] for item in news_items)
    avg_sentiment = total_sentiment / len(news_items) if news_items else 0
    st.write(f"**Average Sentiment for Selected Period**: {avg_sentiment:.2f}")

def send_email(subject, body, to_email):
    # Read the Gmail account and app password from the environment rather
    # than hard-coding credentials in source control (the original committed
    # a live app password, which should be revoked).
    from_email = os.environ.get("GMAIL_ADDRESS")
    password = os.environ.get("GMAIL_APP_PASSWORD")
    if not from_email or not password:
        st.error("Email credentials are not configured.")
        return

    msg = MIMEMultipart()
    msg['From'] = from_email
    msg['To'] = to_email
    msg['Subject'] = subject

    msg.attach(MIMEText(body, 'plain'))

    try:
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.login(from_email, password)
            server.sendmail(from_email, to_email, msg.as_string())
        st.success("Email sent successfully!")
    except Exception as e:
        st.error(f"Failed to send email: {e}")

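# The environment variable names above are this sketch's assumption, not a
# project convention; before running, export them with a Gmail app password:
#   export GMAIL_ADDRESS="you@gmail.com"
#   export GMAIL_APP_PASSWORD="xxxx xxxx xxxx xxxx"
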
def main():
    st.title("News Summarizer")

    st.sidebar.header("Filter Settings")
    start_date = st.sidebar.date_input("Start Date", datetime.now() - timedelta(days=30))
    end_date = st.sidebar.date_input("End Date", datetime.now())

    if start_date > end_date:
        st.sidebar.error("Error: End date must fall after start date.")

    keywords_input = st.sidebar.text_input("Enter keywords (comma-separated)", "smes, malaysia")
    keywords = [keyword.strip() for keyword in keywords_input.split(",")]

    if st.sidebar.button("Fetch News"):
        bing_news_query = "+".join(keywords)
        bing_rss_url = f"https://www.bing.com/news/search?q={bing_news_query}&format=rss"

        st.write(f"Fetching news articles related to: {', '.join(keywords)}")
        bing_news_items = fetch_news_from_rss(bing_rss_url, keywords, start_date, end_date)

        unique_news_items = remove_duplicates(bing_news_items)

        for item in unique_news_items:
            content, source = fetch_article_content(item['link'])
            if content:
                summarized_text = summarize_text_with_bart(content)
                if summarized_text != 'Summary unavailable':
                    item['summary'] = summarized_text
                    item['source'] = source

        # Persist the results across reruns; otherwise clicking "Send Email"
        # reruns the script with the Fetch News button reset and the fetched
        # articles are lost before the email can be sent.
        st.session_state['news_items'] = unique_news_items

    unique_news_items = st.session_state.get('news_items')
    if unique_news_items:
        st.subheader("News Articles")
        display_news(unique_news_items)

        display_top_5_rankings(unique_news_items)
        plot_sentiment_over_time(unique_news_items)
        calculate_average_sentiment(unique_news_items)

        st.subheader("Send News via Email")
        email_subject = st.text_input("Email Subject", "News Summaries")
        to_email = st.text_input("Recipient Email")
        if st.button("Send Email"):
            if to_email:
                email_body = "\n\n".join([f"Title: {item['title']}\nSummary: {item['summary']}\nSource: {item['source']}\nLink: {item['link']}\n" for item in unique_news_items])
                send_email(email_subject, email_body, to_email)
            else:
                st.error("Please provide a recipient email.")
    elif unique_news_items is not None:
        st.write("No news articles found for the selected date range and keywords.")

if __name__ == '__main__':
    main()
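To run this Space locally (a sketch; the package list is inferred from the imports above, and transformers needs a PyTorch backend for BART):

pip install streamlit feedparser requests beautifulsoup4 transformers torch matplotlib pandas
streamlit run main.py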