"""Streamlit news summarizer.

Fetches Bing News RSS results for user-supplied keywords, summarizes each
article with BART, scores sentiment with BERTweet, visualizes sentiment over
time, and can email the summaries to a recipient.
"""

import os
import re
import html
import smtplib
import warnings
from datetime import datetime, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from urllib.parse import urlparse, parse_qs

import feedparser
import matplotlib.pyplot as plt
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

warnings.filterwarnings("ignore")


@st.cache_resource
def _load_bart():
    """Load the BART summarization tokenizer/model once per process.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, both heavyweight models would be re-instantiated on
    every rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    return tokenizer, model


@st.cache_resource
def _load_sentiment_model():
    """Load the BERTweet sentiment-analysis pipeline once per process."""
    return pipeline(
        "sentiment-analysis",
        model="finiteautomata/bertweet-base-sentiment-analysis",
    )


def summarize_text_with_bart(text):
    """Summarize ``text`` with BART.

    Returns the decoded summary string, or ``'Summary unavailable'`` when the
    model produces an empty result.
    """
    tokenizer, model = _load_bart()
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs.input_ids,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary if summary else 'Summary unavailable'


def analyze_sentiment(summary):
    """Score ``summary`` with BERTweet and map it onto a 5-bucket scale.

    Returns ``(label, score)`` where ``score`` is in [-1, 1]: the model
    confidence, signed positive for POS, negative for NEG, and 0 for NEU.
    """
    result = _load_sentiment_model()(summary)[0]
    if result['label'] == 'POS':
        score = result['score']
    elif result['label'] == 'NEG':
        score = -result['score']
    else:
        score = 0
    # Bucket the signed confidence into five human-readable categories.
    if score <= -0.6:
        sentiment = "Very Negative"
    elif score <= -0.2:
        sentiment = "Negative"
    elif score <= 0.2:
        sentiment = "Neutral"
    elif score <= 0.6:
        sentiment = "Positive"
    else:
        sentiment = "Very Positive"
    return sentiment, score


def fetch_news_from_rss(feed_url, keywords, start_date, end_date):
    """Fetch RSS entries from ``feed_url`` filtered by date range and keywords.

    ``start_date``/``end_date`` are ``date`` objects; they are widened to the
    full day. Returns a list of item dicts sorted newest-first, or ``[]`` on
    failure.
    """
    try:
        start_date = datetime.combine(start_date, datetime.min.time())
        end_date = datetime.combine(end_date, datetime.max.time())
        feed = feedparser.parse(feed_url)
        # feedparser always exposes `.entries`; the meaningful failure signal
        # is an empty entry list (the original `'entries' not in feed` check
        # could never fire).
        if not feed.entries:
            st.write("Failed to retrieve feed or feed is empty")
            return []
        news_items = []
        for entry in feed.entries:
            # Skip entries without a parseable date instead of letting an
            # AttributeError abort the entire feed.
            parsed = getattr(entry, 'published_parsed', None)
            if parsed is None:
                continue
            published_date = datetime(*parsed[:6])
            if not (start_date <= published_date <= end_date):
                continue
            title = entry.title.lower()
            summary = (entry.summary.lower()
                       if entry.summary != entry.title
                       else 'No summary available.')
            if any(kw.lower() in title or kw.lower() in summary for kw in keywords):
                news_items.append({
                    'title': entry.title,
                    'summary': (clean_html(entry.summary)
                                if entry.summary != entry.title
                                else 'No summary available.'),
                    'link': entry.link,
                    'date': published_date,
                    'source': extract_source(entry.link),
                })
        news_items.sort(key=lambda x: x['date'], reverse=True)
        return news_items
    except Exception as e:
        st.write(f"An error occurred while fetching feed: {e}")
        return []


def extract_source(entry_link):
    """Map a URL's domain to a friendly publication name (fallback: domain)."""
    try:
        domain = urlparse(entry_link).netloc
        known_sources = {
            "www.themalaysianinsight.com": "The Malaysian Insight",
            "www.themalaysianreserve.com": "The Malaysian Reserve",
        }
        return known_sources.get(domain, domain)
    except Exception as e:
        st.write(f"Failed to extract source from {entry_link}: {e}")
        return "Unknown Source"


def clean_html(raw_html):
    """Strip HTML tags and unescape entities from ``raw_html``."""
    return re.sub(r'<[^>]+>', '', html.unescape(raw_html))


def remove_duplicates(news_items):
    """Return ``news_items`` with duplicate links removed, order preserved."""
    seen_links = set()
    unique_items = []
    for item in news_items:
        if item['link'] not in seen_links:
            unique_items.append(item)
            seen_links.add(item['link'])
    return unique_items


def fetch_article_content(bing_url):
    """Resolve a Bing News redirect URL and scrape the original article text.

    Returns ``(text, source_name)``; ``(None, "Unknown Source")`` when the
    original URL cannot be extracted or fetched.
    """
    try:
        query_params = parse_qs(urlparse(bing_url).query)
        original_url = query_params.get('url', [None])[0]
        if not original_url:
            st.write(f"Original article link not found in Bing URL: {bing_url}")
            return None, "Unknown Source"
        # timeout guards against a hung request freezing the whole app.
        response = requests.get(
            original_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            allow_redirects=True,
            timeout=15,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        text = ' '.join(p.text for p in soup.find_all('p'))
        return text, extract_source(original_url)
    except requests.RequestException as e:
        st.write(f"An error occurred while fetching article content from {bing_url}: {e}")
        return None, "Unknown Source"


def display_news(news_items):
    """Render each article with its metadata and sentiment in Streamlit."""
    for idx, item in enumerate(news_items, 1):
        sentiment_category, sentiment_score = analyze_sentiment(item['summary'])
        article_info = (
            f"### Article {idx}:\n\n"
            f"**Title:** {item['title']}\n\n"
            f"**Summary:** {item['summary']}\n\n"
            f"**Source:** {item['source']}\n\n"
            f"[Link to article]({item['link']})\n\n"
            f"**Date:** {item['date'].strftime('%Y-%m-%d %H:%M:%S')}\n\n"
            f"**Sentiment:** {sentiment_category} (Score: {sentiment_score:.2f})"
        )
        st.markdown(article_info, unsafe_allow_html=True)
        st.markdown("---")


def display_top_5_rankings(news_items):
    """Show the 5 most positive and 5 most negative articles.

    Sentiment is computed ONCE per item (the original re-ran model inference
    up to three times per item per render).
    """
    scored = [(item,) + analyze_sentiment(item['summary']) for item in news_items]
    scored.sort(key=lambda t: t[2], reverse=True)
    top_5_positive = scored[:5]
    top_5_negative = scored[-5:]

    st.subheader("Top 5 Most Positive Articles")
    for idx, (item, label, score) in enumerate(top_5_positive, 1):
        st.markdown(f"**{idx}. {item['title']}** - Sentiment: {label} (Score: {score:.2f})")
        st.markdown(f"**Source**: {item['source']}")
        st.markdown(f"[Read more]({item['link']})")

    st.subheader("Top 5 Most Negative Articles")
    for idx, (item, label, score) in enumerate(top_5_negative, 1):
        st.markdown(f"**{idx}. {item['title']}** - Sentiment: {label} (Score: {score:.2f})")
        st.markdown(f"**Source**: {item['source']}")
        st.markdown(f"[Read more]({item['link']})")


def plot_sentiment_over_time(news_items):
    """Plot per-article sentiment scores against publication date."""
    df = pd.DataFrame(news_items)
    df['sentiment_score'] = df['summary'].apply(lambda x: analyze_sentiment(x)[1])
    df['date'] = pd.to_datetime(df['date'])
    fig = plt.figure(figsize=(10, 6))
    plt.plot(df['date'], df['sentiment_score'], marker='o', linestyle='-', color='b')
    plt.title('Sentiment Scores Over Time')
    plt.xlabel('Date')
    plt.ylabel('Sentiment Score')
    plt.xticks(rotation=45)
    plt.grid(True)
    # Modern Streamlit requires an explicit figure, not the pyplot module.
    st.pyplot(fig)


def calculate_average_sentiment(news_items):
    """Write the mean sentiment score of ``news_items`` (0 when empty)."""
    total = sum(analyze_sentiment(item['summary'])[1] for item in news_items)
    avg_sentiment = total / len(news_items) if news_items else 0
    st.write(f"**Average Sentiment for Selected Period**: {avg_sentiment:.2f}")


def send_email(subject, body, to_email):
    """Send ``body`` as a plain-text email via Gmail SMTP over SSL.

    Credentials are read from the NEWS_APP_EMAIL / NEWS_APP_EMAIL_PASSWORD
    environment variables. SECURITY: the original hardcoded a Gmail address
    and app password in source — never commit credentials; that password
    should be revoked.
    """
    from_email = os.environ.get("NEWS_APP_EMAIL")
    password = os.environ.get("NEWS_APP_EMAIL_PASSWORD")
    if not from_email or not password:
        st.error("Email credentials not configured. Set NEWS_APP_EMAIL and "
                 "NEWS_APP_EMAIL_PASSWORD environment variables.")
        return

    msg = MIMEMultipart()
    msg['From'] = from_email
    msg['To'] = to_email
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))
    try:
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.login(from_email, password)
            server.sendmail(from_email, to_email, msg.as_string())
        st.success("Email sent successfully!")
    except Exception as e:
        st.error(f"Failed to send email: {e}")


def main():
    """Streamlit entry point: sidebar filters, fetch, display, email."""
    st.title("News Summarizer")
    st.sidebar.header("Filter Settings")
    start_date = st.sidebar.date_input("Start Date", datetime.now() - timedelta(days=30))
    end_date = st.sidebar.date_input("End Date", datetime.now())
    if start_date > end_date:
        st.sidebar.error("Error: End date must fall after start date.")

    keywords_input = st.sidebar.text_input("Enter keywords (comma-separated)", "smes, malaysia")
    keywords = [keyword.strip() for keyword in keywords_input.split(",") if keyword.strip()]

    if st.sidebar.button("Fetch News"):
        bing_news_query = "+".join(keywords)
        bing_rss_url = f"https://www.bing.com/news/search?q={bing_news_query}&format=rss"
        st.write(f"Fetching news articles related to: {', '.join(keywords)}")
        bing_news_items = fetch_news_from_rss(bing_rss_url, keywords, start_date, end_date)
        unique_news_items = remove_duplicates(bing_news_items)
        for item in unique_news_items:
            content, source = fetch_article_content(item['link'])
            if content:
                summarized_text = summarize_text_with_bart(content)
                if summarized_text != 'Summary unavailable':
                    item['summary'] = summarized_text
                item['source'] = source
        # Persist results in session state: every button click reruns the
        # script, so a "Send Email" click would otherwise reset "Fetch News"
        # to False and discard the articles (the original's nested-button
        # email branch was effectively unreachable).
        st.session_state['news_items'] = unique_news_items
        if not unique_news_items:
            st.write("No news articles found for the selected date range and keywords.")

    unique_news_items = st.session_state.get('news_items', [])
    if unique_news_items:
        st.subheader("News Articles")
        display_news(unique_news_items)
        display_top_5_rankings(unique_news_items)
        plot_sentiment_over_time(unique_news_items)
        calculate_average_sentiment(unique_news_items)

        st.subheader("Send News via Email")
        email_subject = st.text_input("Email Subject", "News Summaries")
        to_email = st.text_input("Recipient Email")
        if st.button("Send Email"):
            if to_email:
                email_body = "\n\n".join(
                    f"Title: {item['title']}\nSummary: {item['summary']}\n"
                    f"Source: {item['source']}\nLink: {item['link']}\n"
                    for item in unique_news_items
                )
                send_email(email_subject, email_body, to_email)
            else:
                st.error("Please provide a recipient email.")


if __name__ == '__main__':
    main()