import streamlit as st
import warnings
import requests
import feedparser
import re
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline  # type: ignore
from urllib.parse import urlparse, parse_qs
import pandas as pd
import plotly.express as px
from io import BytesIO
# Streamlit App Interface
st.set_page_config(layout="wide", page_title="News Analysis", page_icon="testtest.png")

# Custom Header with Logo and Title
col1, col2 = st.columns([1, 8])  # Adjust column proportions as needed
with col1:
    st.image("testtest.png", width=130)  # Increase size of the logo here
with col2:
    st.markdown(
        """
        <style>
        .custom-title {
            font-size: 45px;   /* Adjust font size */
            line-height: 2.0;  /* Adjust line height (height of the text block) */
            margin: 0;
            color: #333;       /* Optional: Adjust the color */
        }
        </style>
        <h1 class="custom-title">ANNA: Aggregated News Network Analytics</h1>
        """,
        unsafe_allow_html=True,
    )
# Suppress warnings
warnings.filterwarnings("ignore")

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BART for Summarization
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
def summarize_text_with_bart(text):
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True) or 'Summary unavailable'
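# Optional sketch (an assumption, not wired into the app): the call above truncates each
# article to BART's 1024-token window, so anything beyond that is dropped. One hedged
# alternative is to split long text into token chunks and summarize each chunk separately;
# the helper name and chunk size below are illustrative only.
def summarize_long_text(text, chunk_tokens=900):
    # Tokenize once, slice into fixed-size chunks, summarize each, and join the results
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [token_ids[i:i + chunk_tokens] for i in range(0, len(token_ids), chunk_tokens)]
    partial_summaries = [
        summarize_text_with_bart(tokenizer.decode(chunk, skip_special_tokens=True))
        for chunk in chunks
    ]
    return " ".join(partial_summaries)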
# Initialize sentiment analysis model
sentiment_model = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
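# Optional sketch (not used below): Streamlit re-runs this script on every widget
# interaction, so the BART and sentiment models above are reloaded each time. Wrapping the
# loaders in st.cache_resource keeps them in memory across reruns; the function name here
# is an illustrative assumption.
@st.cache_resource
def load_models():
    tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
    sentiment = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
    return tok, mdl, sentiment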
def analyze_sentiment(title, summary):
    combined_text = f"{title}. {summary}"
    # Truncate to the classifier's max input length; map POS/NEG/NEU to a signed score
    result = sentiment_model(combined_text, truncation=True)[0]
    score = result['score'] if result['label'] == 'POS' else -result['score'] if result['label'] == 'NEG' else 0
    label = get_sentiment_label(score)
    return score, label
def fetch_yahoo_news(keywords, start_date, end_date, num_pages=1):
    news_items = []
    base_url = "https://news.search.yahoo.com/search"
    for page in range(1, num_pages + 1):
        # "b" is the 1-based offset of the first result on the page (1, 11, 21, ...)
        params = {"p": " OR ".join(keywords), "b": (page - 1) * 10 + 1}
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        for item in soup.find_all("div", class_="NewsArticle"):
            title = item.find("h4", class_="s-title").get_text(strip=True) if item.find("h4", class_="s-title") else "No Title"
            link = item.find("a", href=True)["href"] if item.find("a", href=True) else "No Link"
            date_text = item.find("span", class_="s-time").get_text(strip=True) if item.find("span", class_="s-time") else "No Date"
            date = parse_date(date_text)
            if date and start_date <= date <= end_date:
                summary = item.find("p", class_="s-desc").get_text(strip=True) if item.find("p", class_="s-desc") else "No Summary"
                news_items.append({'title': title, 'summary': summary, 'link': link, 'date': date, 'source': "Yahoo News"})
    return news_items
def parse_date(date_text):
    if "ago" in date_text:
        # Relative timestamps such as "2 days ago" or "5 hours ago"
        match = re.search(r"(\d+)", date_text)
        if not match:
            return None
        value = int(match.group(1))
        if "day" in date_text:
            return datetime.now() - timedelta(days=value)
        # Hours or minutes ago: treat as today
        return datetime.now()
    try:
        return datetime.strptime(date_text, "%b %d, %Y")
    except ValueError:
        return None
def fetch_bing_news(keywords, start_date, end_date, num_pages=1):
    news_items = []
    for page in range(1, num_pages + 1):
        feed_url = f"https://www.bing.com/news/search?q={'+'.join(keywords)}&format=rss&first={(page - 1) * 10}"
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            # Skip entries without a usable publication date
            if not getattr(entry, "published_parsed", None):
                continue
            published_date = datetime(*entry.published_parsed[:6])
            if start_date <= published_date <= end_date:
                content, source = fetch_article_content(entry.link)
                summary = summarize_text_with_bart(content) if content else 'No Summary'
                news_items.append({
                    'title': entry.title,
                    'summary': summary,
                    'link': entry.link,
                    'date': published_date,
                    'source': source
                })
    return news_items
def fetch_article_content(bing_url):
    try:
        # Bing RSS links wrap the original article URL in a "url" query parameter
        parsed_url = urlparse(bing_url)
        query_params = parse_qs(parsed_url.query)
        original_url = query_params.get('url', [None])[0]
        if original_url:
            response = requests.get(original_url, headers={'User-Agent': 'Mozilla/5.0'}, allow_redirects=True, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')
            content = ' '.join(paragraph.text for paragraph in paragraphs)
            source = urlparse(original_url).netloc.replace("www.", "")
            return content, source
        else:
            return None, "Unknown Source"
    except requests.RequestException:
        return None, "Unknown Source"
def remove_duplicates(news_items):
    seen_titles = set()
    unique_items = []
    for item in news_items:
        if item['title'] not in seen_titles:
            unique_items.append(item)
            seen_titles.add(item['title'])
    return unique_items
def filter_relevant_articles(news_items):
    irrelevant_keywords = ["award", "pipes"]  # Add more keywords here
    return [
        item for item in news_items
        if not any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in irrelevant_keywords)
    ]
def get_sentiment_label(score):
    if score <= -0.6:
        return "Very Negative"
    elif score <= -0.01:
        return "Negative"
    elif score < 0.01:
        return "Neutral"
    elif score < 0.6:
        return "Positive"
    else:
        return "Very Positive"
def adjust_sentiment_by_keywords(news_items):
    # Force a negative score for articles containing known negative keywords;
    # items with a preset score are not re-analyzed later.
    negative_keywords = ["burden"]
    for item in news_items:
        if any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in negative_keywords):
            item['sentiment_score'] = -0.6
    return news_items
def display_top_news(news_items, num_top=5):
    news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
    for idx, item in enumerate(news_items, start=1):
        item['article_number'] = f"Article {idx}"
        # Respect scores preset manually or by keyword adjustment; analyze everything else
        if 'sentiment_score' in item:
            item['sentiment_label'] = get_sentiment_label(item['sentiment_score'])
        else:
            item['sentiment_score'], item['sentiment_label'] = analyze_sentiment(item['title'], item['summary'])
    positive_news = [item for item in news_items if item['sentiment_score'] > 0.01]
    negative_news = [item for item in news_items if item['sentiment_score'] < -0.01]
    top_positive_news = sorted(positive_news, key=lambda x: x['sentiment_score'], reverse=True)[:num_top]
    top_negative_news = sorted(negative_news, key=lambda x: x['sentiment_score'])[:num_top]
    positive_df = pd.DataFrame(top_positive_news)
    negative_df = pd.DataFrame(top_negative_news)
    return positive_df, negative_df
def export_to_excel(news_items):
    df = pd.DataFrame(news_items)
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name="News Summary")
    output.seek(0)
    return output
def plot_sentiment_over_time(news_items):
    df = pd.DataFrame(news_items)
    df['date'] = pd.to_datetime(df['date']).dt.date
    daily_sentiment = df.groupby('date', as_index=False)['sentiment_score'].mean()
    fig = px.line(
        daily_sentiment,
        x='date',
        y='sentiment_score',
        title="Average Sentiment Over Time",
        labels={'date': 'Date', 'sentiment_score': 'Average Sentiment Score'},
        hover_data={'date': "|%Y-%m-%d", 'sentiment_score': ":.2f"},
    )
    fig.update_traces(mode='lines+markers')
    fig.update_layout(xaxis_title="Date", yaxis_title="Average Sentiment Score")
    st.plotly_chart(fig, use_container_width=True)
# Manually add negative news in the backend
def manually_add_negative_news():
    manually_added_news = [
        {
            'title': "SMEs worry minimum wage hike to RM1,700 could strain businesses despite potential benefits",
            'summary': "The RM1,700 minimum wage increase starting February 2025 has sparked concerns among SMEs. While it aims to help workers cope with rising costs, experts warn it could strain businesses already facing challenges. Some worry about job losses, especially in less developed states, and wage compression among the M40. Automation is seen as unfeasible for many SMEs. Critics also oppose the two percent dividend tax, calling it double taxation.",
            'link': "https://www.malaymail.com/news/malaysia/2024/10/19/smes-worry-minimum-wage-hike-to-rm1700-could-strain-businesses-despite-potential-benefits/154151",
            'date': datetime(2024, 10, 19),
            'source': "malaymail.com",
            'sentiment_score': -0.7
        },
        {
            'title': "SMEs anticipate price hikes next year",
            'summary': "SMEs in Malaysia plan to raise prices next year due to rising costs, risking business closures without government support. Key concerns include new taxes, higher fuel costs, and a minimum wage increase. Economists stress the need for policies that support business innovation and focus on growth sectors like tech and renewable energy. However, they warn that relying on subsidies is unsustainable for the government.",
            'link': "https://themalaysianreserve.com/2024/10/25/smes-anticipate-price-hikes-next-year/",
            'date': datetime(2024, 10, 25),
            'source': "themalaysianreserve.com",
            'sentiment_score': -0.8
        },
        {
            'title': "Many desperate for help: Samenta says economic indicators mask severe challenges as rising costs threaten Malaysian SMEs, urges govt support",
            'summary': "Samenta warned that SMEs are facing rising compliance costs despite positive economic indicators. New mandates like e-invoicing and ESG standards, along with increased government fees, are straining businesses. The association urged the government to focus on helping SMEs manage costs and access opportunities, rather than introducing new taxes.",
            'link': "https://www.malaymail.com/news/malaysia/2024/10/14/many-desperate-for-help-samenta-says-positive-economic-indicators-mask-severe-challenges-as-rising-costs-threaten-malaysian-smes-urges-govt-support/153561",
            'date': datetime(2024, 10, 14),
            'source': "malaymail.com",
            'sentiment_score': -0.7
        },
        {
            'title': "Sabah, Sarawak SMEs may struggle to implement revised minimum wage policy, says Samenta president",
            'summary': "Samenta president Datuk William Ng warned that the RM1,700 minimum wage hike and dividend tax could harm SMEs, especially in Sabah and Sarawak, causing job losses and financial strain.",
            'link': "https://www.theborneopost.com/2024/10/19/sabah-swak-smes-may-struggle-to-implement-revised-minimum-wage-policy-says-samenta-president/",
            'date': datetime(2024, 10, 19),
            'source': "theborneopost.com",
            'sentiment_score': -0.7
        }
    ]
    return manually_added_news
col1, col2 = st.columns([1, 3])

with col1:
    st.header("🔍 Inputs")
    keywords_input = st.text_input("Enter keywords (separate by '+')", "SMEs+Malaysia")
    start_date = st.date_input("Start date:", datetime(2024, 10, 14))
    end_date = st.date_input("End date:", datetime(2024, 11, 26))
    if st.button("Fetch and Analyze News"):
        # Manually add negative news directly to the backend
        manually_added_news = manually_add_negative_news()
        # Keywords are entered separated by '+', so split on '+'
        keywords = [kw.strip() for kw in keywords_input.split("+")]
        start_datetime = datetime.combine(start_date, datetime.min.time())
        end_datetime = datetime.combine(end_date, datetime.max.time())
        yahoo_news = fetch_yahoo_news(keywords, start_datetime, end_datetime, num_pages=10)
        bing_news = fetch_bing_news(keywords, start_datetime, end_datetime, num_pages=10)
        all_news = remove_duplicates(yahoo_news + bing_news)
        # Combine the manually added negative news with the fetched news
        all_news += manually_added_news
        filtered_news = filter_relevant_articles(all_news)
        adjusted_news = adjust_sentiment_by_keywords(filtered_news)
        st.session_state['news_items'] = adjusted_news
with col2:
    st.header("📈 Outputs")
    if 'news_items' in st.session_state:
        news_items = st.session_state['news_items']
        positive_df, negative_df = display_top_news(news_items)
        st.subheader("Top 5 Positive News Articles")
        st.dataframe(positive_df)
        st.subheader("Top 5 Negative News Articles")
        st.dataframe(negative_df)
        st.subheader("Sentiment Over Time")
        plot_sentiment_over_time(news_items)
        st.subheader("All News Articles")
        sorted_news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
        for idx, item in enumerate(sorted_news_items, 1):
            item['article_number'] = f"Article {idx}"
            st.markdown(f"""
*{item['article_number']}*
- *Title*: {item['title']}
- *Date*: {item['date']}
- *Summary*: {item['summary']}
- *Sentiment Score*: {item['sentiment_score']:.2f} ({item['sentiment_label']})
- *Source*: {item['source']}
- *Link*: [Read more]({item['link']})
---
""")
        excel_data = export_to_excel(news_items)
        st.download_button(
            label="Download News Summary as Excel",
            data=excel_data,
            file_name="news_summary.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
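# Hedged usage note: assuming this script is saved as app.py, it is launched with
#   streamlit run app.py
# (requires streamlit, requests, feedparser, beautifulsoup4, torch, transformers,
# pandas, plotly, and xlsxwriter to be installed).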