# ANNA / app.py
import streamlit as st
import warnings
import requests
import feedparser
import re
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline # type: ignore
from urllib.parse import urlparse, parse_qs
import pandas as pd
import plotly.express as px
from io import BytesIO
# Streamlit App Interface
st.set_page_config(layout="wide", page_title="News Analysis", page_icon="testtest.png")
# Custom Header with Logo and Title
col1, col2 = st.columns([1, 8]) # Adjust column proportions as needed
with col1:
    st.image("testtest.png", width=130)  # Logo width in pixels
with col2:
st.markdown(
"""
<style>
.custom-title {
font-size: 45px; /* Adjust font size */
line-height: 2.0; /* Adjust line height (height of the text block) */
margin: 0;
color: #333; /* Optional: Adjust the color */
}
</style>
<h1 class="custom-title">ANNA: Aggregated News Network Analytics</h1>
""",
unsafe_allow_html=True,
)
# Suppress warnings
warnings.filterwarnings("ignore")
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize BART for Summarization
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
def summarize_text_with_bart(text):
    # Truncate the input to BART's 1024-token window, then generate a 40-150 token summary.
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True) or 'Summary unavailable'
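# Note: the tokenizer above silently truncates anything past BART's 1024-token
# window, so the tail of a long article never reaches the model. The helper
# below is a minimal sketch of one workaround (chunk, summarize each chunk,
# then summarize the concatenation); the character-based chunk size is a rough
# stand-in for real token counting.
def summarize_long_text(text, chunk_chars=3000):
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partial = " ".join(summarize_text_with_bart(chunk) for chunk in chunks)
    # A single chunk needs no second summarization pass.
    return summarize_text_with_bart(partial) if len(chunks) > 1 else partial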
# Initialize sentiment analysis model
sentiment_model = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
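# Note: Streamlit re-runs this entire script on every widget interaction, so the
# model loads above repeat on each rerun. Caching the loaders is the usual fix;
# the sketch below (assuming a Streamlit version with st.cache_resource) shows
# the shape. It is not wired in: to use it, replace the direct loads above with
# a single call to load_models().
@st.cache_resource
def load_models():
    # Loaded once per process; later reruns reuse the same objects.
    tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
    sentiment = pipeline("sentiment-analysis",
                         model="finiteautomata/bertweet-base-sentiment-analysis",
                         device=0 if torch.cuda.is_available() else -1)
    return tok, mdl, sentiment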
def analyze_sentiment(title, summary):
    # Score the title and summary together; truncation=True keeps long inputs
    # within the model's token limit instead of raising an error.
    combined_text = f"{title}. {summary}"
    result = sentiment_model(combined_text, truncation=True)[0]
    # Map the POS/NEG/NEU label to a signed score in [-1, 1].
    if result['label'] == 'POS':
        score = result['score']
    elif result['label'] == 'NEG':
        score = -result['score']
    else:
        score = 0.0
    label = get_sentiment_label(score)
    return score, label
def fetch_yahoo_news(keywords, start_date, end_date, num_pages=1):
    # Scrape Yahoo News search results; "b" is the 1-based index of the first
    # result on each page, so page 1 starts at b=1, page 2 at b=11, and so on.
    news_items = []
    base_url = "https://news.search.yahoo.com/search"
    for page in range(1, num_pages + 1):
        params = {"p": " OR ".join(keywords), "b": (page - 1) * 10 + 1}
        try:
            response = requests.get(base_url, params=params, timeout=10)
        except requests.RequestException:
            continue
        if response.status_code != 200:
            continue
soup = BeautifulSoup(response.content, "html.parser")
        for item in soup.find_all("div", class_="NewsArticle"):
            title_tag = item.find("h4", class_="s-title")
            title = title_tag.get_text(strip=True) if title_tag else "No Title"
            link_tag = item.find("a", href=True)
            link = link_tag["href"] if link_tag else "No Link"
            date_tag = item.find("span", class_="s-time")
            date = parse_date(date_tag.get_text(strip=True) if date_tag else "No Date")
            if date and start_date <= date <= end_date:
                summary_tag = item.find("p", class_="s-desc")
                summary = summary_tag.get_text(strip=True) if summary_tag else "No Summary"
                news_items.append({'title': title, 'summary': summary, 'link': link, 'date': date, 'source': "Yahoo News"})
return news_items
def parse_date(date_text):
    # Relative timestamps such as "5 hours ago" or "2 days ago".
    if "ago" in date_text:
        match = re.search(r"(\d+)", date_text)
        if not match:
            return None
        value = int(match.group(1))
        if "minute" in date_text:
            return datetime.now() - timedelta(minutes=value)
        if "hour" in date_text:
            return datetime.now() - timedelta(hours=value)
        return datetime.now() - timedelta(days=value)
    # Absolute timestamps such as "Oct 19, 2024".
    try:
        return datetime.strptime(date_text, "%b %d, %Y")
    except ValueError:
        return None
def fetch_bing_news(keywords, start_date, end_date, num_pages=1):
news_items = []
for page in range(1, num_pages + 1):
feed_url = f"https://www.bing.com/news/search?q={'+'.join(keywords)}&format=rss&first={(page - 1) * 10}"
feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            # Skip entries with no parseable publication date.
            if not getattr(entry, "published_parsed", None):
                continue
            published_date = datetime(*entry.published_parsed[:6])
            if start_date <= published_date <= end_date:
                content, source = fetch_article_content(entry.link)
                # Summarizing each article with BART is the slowest step here.
                summary = summarize_text_with_bart(content) if content else 'No Summary'
news_items.append({
'title': entry.title,
'summary': summary,
'link': entry.link,
'date': published_date,
'source': source
})
return news_items
def fetch_article_content(bing_url):
    # Bing News RSS links wrap the original article URL in a "url" query parameter.
    try:
        parsed_url = urlparse(bing_url)
        query_params = parse_qs(parsed_url.query)
        original_url = query_params.get('url', [None])[0]
        if original_url:
            response = requests.get(original_url, headers={'User-Agent': 'Mozilla/5.0'}, allow_redirects=True, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = soup.find_all('p')
content = ' '.join(paragraph.text for paragraph in paragraphs)
source = urlparse(original_url).netloc.replace("www.", "")
return content, source
else:
return None, "Unknown Source"
except requests.RequestException:
return None, "Unknown Source"
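# Optional hardening (sketch): fetching dozens of pages over a flaky network
# benefits from automatic retries. A shared Session built like this could stand
# in for the bare requests.get calls above; the retry counts, backoff, and
# status codes below are illustrative, not tuned.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(total_retries=3, backoff=0.5):
    retry = Retry(total=total_retries, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    return session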
def remove_duplicates(news_items):
    # De-duplicate on a normalized title so the same story from two feeds is kept once.
    seen_titles = set()
    unique_items = []
    for item in news_items:
        key = item['title'].strip().lower()
        if key not in seen_titles:
            unique_items.append(item)
            seen_titles.add(key)
    return unique_items
def filter_relevant_articles(news_items):
irrelevant_keywords = ["award", "pipes"] # Add more keywords here
return [
item for item in news_items
if not any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in irrelevant_keywords)
]
def get_sentiment_label(score):
    # Map a signed score in [-1, 1] to a five-step label.
    if score <= -0.6:
        return "Very Negative"
    elif score <= -0.01:
        return "Negative"
    elif score < 0.01:
        return "Neutral"
    elif score < 0.6:
        return "Positive"
    else:
        return "Very Positive"
def adjust_sentiment_by_keywords(news_items):
    # Force a strongly negative score for articles mentioning known-negative
    # keywords; display_top_news keeps pre-assigned scores rather than re-scoring.
    negative_keywords = ["burden"]
    for item in news_items:
        if any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in negative_keywords):
            item['sentiment_score'] = -0.6
    return news_items
def display_top_news(news_items, num_top=5):
    news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
    for idx, item in enumerate(news_items, start=1):
        item['article_number'] = f"Article {idx}"
    for item in news_items:
        # Keep scores that were pre-assigned (manual entries, keyword
        # adjustments) rather than overwriting them with the model's output.
        if 'sentiment_score' not in item:
            item['sentiment_score'], item['sentiment_label'] = analyze_sentiment(item['title'], item['summary'])
        else:
            item['sentiment_label'] = get_sentiment_label(item['sentiment_score'])
    positive_news = [item for item in news_items if item['sentiment_score'] > 0.01]
    negative_news = [item for item in news_items if item['sentiment_score'] < -0.01]
    top_positive_news = sorted(positive_news, key=lambda x: x['sentiment_score'], reverse=True)[:num_top]
    top_negative_news = sorted(negative_news, key=lambda x: x['sentiment_score'])[:num_top]
    positive_df = pd.DataFrame(top_positive_news)
    negative_df = pd.DataFrame(top_negative_news)
    return positive_df, negative_df
def export_to_excel(news_items):
df = pd.DataFrame(news_items)
output = BytesIO()
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
df.to_excel(writer, index=False, sheet_name="News Summary")
output.seek(0)
return output
def plot_sentiment_over_time(news_items):
df = pd.DataFrame(news_items)
df['date'] = pd.to_datetime(df['date']).dt.date
daily_sentiment = df.groupby('date', as_index=False)['sentiment_score'].mean()
fig = px.line(
daily_sentiment,
x='date',
y='sentiment_score',
title="Average Sentiment Over Time",
labels={'date': 'Date', 'sentiment_score': 'Average Sentiment Score'},
hover_data={'date': "|%Y-%m-%d", 'sentiment_score': ":.2f"},
)
fig.update_traces(mode='lines+markers')
fig.update_layout(xaxis_title="Date", yaxis_title="Average Sentiment Score")
st.plotly_chart(fig, use_container_width=True)
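# Optional (sketch): when coverage is sparse or bursty, a rolling mean reads
# more clearly than raw daily averages. Not wired in; pass daily_sentiment
# through this helper before plotting. The 3-day window is illustrative.
def smooth_daily_sentiment(daily_sentiment, window=3):
    # Returns a copy with the sentiment column replaced by its rolling mean.
    out = daily_sentiment.copy()
    out['sentiment_score'] = out['sentiment_score'].rolling(window, min_periods=1).mean()
    return out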
# Manually add negative news in the backend
def manually_add_negative_news():
manually_added_news = [
{
'title': "SMEs worry minimum wage hike to RM1,700 could strain businesses despite potential benefits",
'summary': "The RM1,700 minimum wage increase starting February 2025 has sparked concerns among SMEs. While it aims to help workers cope with rising costs, experts warn it could strain businesses already facing challenges. Some worry about job losses, especially in less developed states, and wage compression among the M40. Automation is seen as unfeasible for many SMEs. Critics also oppose the two percent dividend tax, calling it double taxation.",
'link': "https://www.malaymail.com/news/malaysia/2024/10/19/smes-worry-minimum-wage-hike-to-rm1700-could-strain-businesses-despite-potential-benefits/154151",
'date': datetime(2024, 10, 19),
'source': "malaymail.com",
'sentiment_score': -0.7
},
{
'title': "SMEs anticipate price hikes next year",
'summary': "SMEs in Malaysia plan to raise prices next year due to rising costs, risking business closures without government support. Key concerns include new taxes, higher fuel costs, and a minimum wage increase. Economists stress the need for policies that support business innovation and focus on growth sectors like tech and renewable energy. However, they warn that relying on subsidies is unsustainable for the government.",
'link': "https://themalaysianreserve.com/2024/10/25/smes-anticipate-price-hikes-next-year/",
'date': datetime(2024, 10, 25),
'source': "themalaysianreserve.com",
'sentiment_score': -0.8
},
{
'title': "Many desperate for help: Samenta says economic indicators mask severe challenges as rising costs threaten Malaysian SMEs, urges govt support",
'summary': "Samenta warned that SMEs are facing rising compliance costs despite positive economic indicators. New mandates like e-invoicing and ESG standards, along with increased government fees, are straining businesses. The association urged the government to focus on helping SMEs manage costs and access opportunities, rather than introducing new taxes.",
'link': "https://www.malaymail.com/news/malaysia/2024/10/14/many-desperate-for-help-samenta-says-positive-economic-indicators-mask-severe-challenges-as-rising-costs-threaten-malaysian-smes-urges-govt-support/153561",
'date': datetime(2024, 10, 14),
'source': "malaymail.com",
'sentiment_score': -0.7
},
{
'title': "Sabah, Sarawak SMEs may struggle to implement revised minimum wage policy, says Samenta president",
'summary': "Samenta president Datuk William Ng warned that the RM1,700 minimum wage hike and dividend tax could harm SMEs, especially in Sabah and Sarawak, causing job losses and financial strain.",
'link': "https://www.theborneopost.com/2024/10/19/sabah-swak-smes-may-struggle-to-implement-revised-minimum-wage-policy-says-samenta-president/",
'date': datetime(2024, 10, 19),
'source': "theborneopost.com",
'sentiment_score': -0.7
}
]
return manually_added_news
col1, col2 = st.columns([1, 3])
with col1:
st.header("🔍 Inputs")
keywords_input = st.text_input("Enter keywords (separate by '+')", "SMEs+Malaysia")
start_date = st.date_input("Start date:", datetime(2024, 10, 14))
end_date = st.date_input("End date:", datetime(2024, 11, 26))
if st.button("Fetch and Analyze News"):
# Manually add negative news directly to the backend
manually_added_news = manually_add_negative_news()
        # Keywords are '+'-separated, matching the input hint above.
        keywords = [kw.strip() for kw in keywords_input.split("+") if kw.strip()]
start_datetime = datetime.combine(start_date, datetime.min.time())
end_datetime = datetime.combine(end_date, datetime.max.time())
yahoo_news = fetch_yahoo_news(keywords, start_datetime, end_datetime, num_pages=10)
bing_news = fetch_bing_news(keywords, start_datetime, end_datetime, num_pages=10)
all_news = remove_duplicates(yahoo_news + bing_news)
# Combine the manually added negative news with the fetched news
all_news += manually_added_news
filtered_news = filter_relevant_articles(all_news)
adjusted_news = adjust_sentiment_by_keywords(filtered_news)
st.session_state['news_items'] = adjusted_news
with col2:
st.header("📈 Outputs")
if 'news_items' in st.session_state:
news_items = st.session_state['news_items']
positive_df, negative_df = display_top_news(news_items)
st.subheader("Top 5 Positive News Articles")
st.dataframe(positive_df)
st.subheader("Top 5 Negative News Articles")
st.dataframe(negative_df)
st.subheader("Sentiment Over Time")
plot_sentiment_over_time(news_items)
st.subheader("All News Articles")
sorted_news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
for idx, item in enumerate(sorted_news_items, 1):
item['article_number'] = f"Article {idx}"
st.markdown(f"""
*{item['article_number']}*
- *Title*: {item['title']}
            - *Date*: {item['date'].strftime('%Y-%m-%d')}
- *Summary*: {item['summary']}
- *Sentiment Score*: {item['sentiment_score']:.2f} ({item['sentiment_label']})
- *Source*: {item['source']}
- *Link*: [Read more]({item['link']})
---
""")
excel_data = export_to_excel(news_items)
st.download_button(
label="Download News Summary as Excel",
data=excel_data,
file_name="news_summary.xlsx",
mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
)