import streamlit as st
import warnings
import requests
import feedparser
import re
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline  # type: ignore
from urllib.parse import urlparse, parse_qs
import pandas as pd
import plotly.express as px
from io import BytesIO
# Streamlit App Interface
st.set_page_config(layout="wide", page_title="News Analysis", page_icon="testtest.png")

# Custom Header with Logo and Title
col1, col2 = st.columns([1, 8])  # Adjust column proportions as needed
with col1:
    st.image("testtest.png", width=130)  # Increase size of the logo here
with col2:
    st.markdown(
        """
        <style>
        .custom-title {
            font-size: 45px;   /* Adjust font size */
            line-height: 2.0;  /* Adjust line height (height of the text block) */
            margin: 0;
            color: #333;       /* Optional: Adjust the color */
        }
        </style>
        <h1 class="custom-title">ANNA: Aggregated News Network Analytics</h1>
        """,
        unsafe_allow_html=True,
    )
# Suppress warnings
warnings.filterwarnings("ignore")

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize BART for Summarization
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
def summarize_text_with_bart(text):
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True) or 'Summary unavailable'
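# Optional sketch (an assumption, not wired into the app): the call above truncates each
# article to BART's 1024-token window, so anything beyond that is dropped. One hedged
# alternative is to split long text into token chunks and summarize each chunk separately;
# the helper name and chunk size below are illustrative only.
def summarize_long_text(text, chunk_tokens=900):
    # Tokenize once, slice into fixed-size chunks, summarize each, and join the results
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = [token_ids[i:i + chunk_tokens] for i in range(0, len(token_ids), chunk_tokens)]
    partial_summaries = [
        summarize_text_with_bart(tokenizer.decode(chunk, skip_special_tokens=True))
        for chunk in chunks
    ]
    return " ".join(partial_summaries)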
# Initialize sentiment analysis model
sentiment_model = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
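# Optional sketch (not used below): Streamlit re-runs this script on every widget
# interaction, so the BART and sentiment models above are reloaded each time. Wrapping the
# loaders in st.cache_resource keeps them in memory across reruns; the function name here
# is an illustrative assumption.
@st.cache_resource
def load_models():
    tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    mdl = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
    sentiment = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis", device=0 if torch.cuda.is_available() else -1)
    return tok, mdl, sentiment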
def analyze_sentiment(title, summary):
    combined_text = f"{title}. {summary}"
    # Truncate to the classifier's max input length; map POS/NEG/NEU to a signed score
    result = sentiment_model(combined_text, truncation=True)[0]
    score = result['score'] if result['label'] == 'POS' else -result['score'] if result['label'] == 'NEG' else 0
    label = get_sentiment_label(score)
    return score, label
def fetch_yahoo_news(keywords, start_date, end_date, num_pages=1):
    news_items = []
    base_url = "https://news.search.yahoo.com/search"
    for page in range(1, num_pages + 1):
        # "b" is the 1-based offset of the first result on the page (1, 11, 21, ...)
        params = {"p": " OR ".join(keywords), "b": (page - 1) * 10 + 1}
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            continue
        soup = BeautifulSoup(response.content, "html.parser")
        for item in soup.find_all("div", class_="NewsArticle"):
            title = item.find("h4", class_="s-title").get_text(strip=True) if item.find("h4", class_="s-title") else "No Title"
            link = item.find("a", href=True)["href"] if item.find("a", href=True) else "No Link"
            date_text = item.find("span", class_="s-time").get_text(strip=True) if item.find("span", class_="s-time") else "No Date"
            date = parse_date(date_text)
            if date and start_date <= date <= end_date:
                summary = item.find("p", class_="s-desc").get_text(strip=True) if item.find("p", class_="s-desc") else "No Summary"
                news_items.append({'title': title, 'summary': summary, 'link': link, 'date': date, 'source': "Yahoo News"})
    return news_items
def parse_date(date_text):
    if "ago" in date_text:
        # Relative timestamps such as "2 days ago" or "5 hours ago"
        match = re.search(r"(\d+)", date_text)
        if not match:
            return None
        value = int(match.group(1))
        if "day" in date_text:
            return datetime.now() - timedelta(days=value)
        # Hours or minutes ago: treat as today
        return datetime.now()
    try:
        return datetime.strptime(date_text, "%b %d, %Y")
    except ValueError:
        return None
def fetch_bing_news(keywords, start_date, end_date, num_pages=1):
    news_items = []
    for page in range(1, num_pages + 1):
        feed_url = f"https://www.bing.com/news/search?q={'+'.join(keywords)}&format=rss&first={(page - 1) * 10}"
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            # Skip entries without a usable publication date
            if not getattr(entry, "published_parsed", None):
                continue
            published_date = datetime(*entry.published_parsed[:6])
            if start_date <= published_date <= end_date:
                content, source = fetch_article_content(entry.link)
                summary = summarize_text_with_bart(content) if content else 'No Summary'
                news_items.append({
                    'title': entry.title,
                    'summary': summary,
                    'link': entry.link,
                    'date': published_date,
                    'source': source
                })
    return news_items
def fetch_article_content(bing_url):
    try:
        # Bing RSS links wrap the original article URL in a "url" query parameter
        parsed_url = urlparse(bing_url)
        query_params = parse_qs(parsed_url.query)
        original_url = query_params.get('url', [None])[0]
        if original_url:
            response = requests.get(original_url, headers={'User-Agent': 'Mozilla/5.0'}, allow_redirects=True, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')
            content = ' '.join(paragraph.text for paragraph in paragraphs)
            source = urlparse(original_url).netloc.replace("www.", "")
            return content, source
        else:
            return None, "Unknown Source"
    except requests.RequestException:
        return None, "Unknown Source"
def remove_duplicates(news_items):
    seen_titles = set()
    unique_items = []
    for item in news_items:
        if item['title'] not in seen_titles:
            unique_items.append(item)
            seen_titles.add(item['title'])
    return unique_items
def filter_relevant_articles(news_items):
    irrelevant_keywords = ["award", "pipes"]  # Add more keywords here
    return [
        item for item in news_items
        if not any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in irrelevant_keywords)
    ]
def get_sentiment_label(score):
    if score <= -0.6:
        return "Very Negative"
    elif score <= -0.01:
        return "Negative"
    elif score < 0.01:
        return "Neutral"
    elif score < 0.6:
        return "Positive"
    else:
        return "Very Positive"
def adjust_sentiment_by_keywords(news_items):
    # Force a negative score for articles containing known negative keywords;
    # items with a preset score are not re-analyzed later.
    negative_keywords = ["burden"]
    for item in news_items:
        if any(keyword in item['title'].lower() or keyword in item['summary'].lower() for keyword in negative_keywords):
            item['sentiment_score'] = -0.6
    return news_items
def display_top_news(news_items, num_top=5):
    news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
    for idx, item in enumerate(news_items, start=1):
        item['article_number'] = f"Article {idx}"
        # Respect scores preset manually or by keyword adjustment; analyze everything else
        if 'sentiment_score' in item:
            item['sentiment_label'] = get_sentiment_label(item['sentiment_score'])
        else:
            item['sentiment_score'], item['sentiment_label'] = analyze_sentiment(item['title'], item['summary'])
    positive_news = [item for item in news_items if item['sentiment_score'] > 0.01]
    negative_news = [item for item in news_items if item['sentiment_score'] < -0.01]
    top_positive_news = sorted(positive_news, key=lambda x: x['sentiment_score'], reverse=True)[:num_top]
    top_negative_news = sorted(negative_news, key=lambda x: x['sentiment_score'])[:num_top]
    positive_df = pd.DataFrame(top_positive_news)
    negative_df = pd.DataFrame(top_negative_news)
    return positive_df, negative_df
def export_to_excel(news_items):
    df = pd.DataFrame(news_items)
    output = BytesIO()
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name="News Summary")
    output.seek(0)
    return output
def plot_sentiment_over_time(news_items):
    df = pd.DataFrame(news_items)
    df['date'] = pd.to_datetime(df['date']).dt.date
    daily_sentiment = df.groupby('date', as_index=False)['sentiment_score'].mean()
    fig = px.line(
        daily_sentiment,
        x='date',
        y='sentiment_score',
        title="Average Sentiment Over Time",
        labels={'date': 'Date', 'sentiment_score': 'Average Sentiment Score'},
        hover_data={'date': "|%Y-%m-%d", 'sentiment_score': ":.2f"},
    )
    fig.update_traces(mode='lines+markers')
    fig.update_layout(xaxis_title="Date", yaxis_title="Average Sentiment Score")
    st.plotly_chart(fig, use_container_width=True)
# Manually add negative news in the backend
def manually_add_negative_news():
    manually_added_news = [
        {
            'title': "SMEs worry minimum wage hike to RM1,700 could strain businesses despite potential benefits",
            'summary': "The RM1,700 minimum wage increase starting February 2025 has sparked concerns among SMEs. While it aims to help workers cope with rising costs, experts warn it could strain businesses already facing challenges. Some worry about job losses, especially in less developed states, and wage compression among the M40. Automation is seen as unfeasible for many SMEs. Critics also oppose the two percent dividend tax, calling it double taxation.",
            'link': "https://www.malaymail.com/news/malaysia/2024/10/19/smes-worry-minimum-wage-hike-to-rm1700-could-strain-businesses-despite-potential-benefits/154151",
            'date': datetime(2024, 10, 19),
            'source': "malaymail.com",
            'sentiment_score': -0.7
        },
        {
            'title': "SMEs anticipate price hikes next year",
            'summary': "SMEs in Malaysia plan to raise prices next year due to rising costs, risking business closures without government support. Key concerns include new taxes, higher fuel costs, and a minimum wage increase. Economists stress the need for policies that support business innovation and focus on growth sectors like tech and renewable energy. However, they warn that relying on subsidies is unsustainable for the government.",
            'link': "https://themalaysianreserve.com/2024/10/25/smes-anticipate-price-hikes-next-year/",
            'date': datetime(2024, 10, 25),
            'source': "themalaysianreserve.com",
            'sentiment_score': -0.8
        },
        {
            'title': "Many desperate for help: Samenta says economic indicators mask severe challenges as rising costs threaten Malaysian SMEs, urges govt support",
            'summary': "Samenta warned that SMEs are facing rising compliance costs despite positive economic indicators. New mandates like e-invoicing and ESG standards, along with increased government fees, are straining businesses. The association urged the government to focus on helping SMEs manage costs and access opportunities, rather than introducing new taxes.",
            'link': "https://www.malaymail.com/news/malaysia/2024/10/14/many-desperate-for-help-samenta-says-positive-economic-indicators-mask-severe-challenges-as-rising-costs-threaten-malaysian-smes-urges-govt-support/153561",
            'date': datetime(2024, 10, 14),
            'source': "malaymail.com",
            'sentiment_score': -0.7
        },
        {
            'title': "Sabah, Sarawak SMEs may struggle to implement revised minimum wage policy, says Samenta president",
            'summary': "Samenta president Datuk William Ng warned that the RM1,700 minimum wage hike and dividend tax could harm SMEs, especially in Sabah and Sarawak, causing job losses and financial strain.",
            'link': "https://www.theborneopost.com/2024/10/19/sabah-swak-smes-may-struggle-to-implement-revised-minimum-wage-policy-says-samenta-president/",
            'date': datetime(2024, 10, 19),
            'source': "theborneopost.com",
            'sentiment_score': -0.7
        }
    ]
    return manually_added_news
col1, col2 = st.columns([1, 3])

with col1:
    st.header("🔍 Inputs")
    keywords_input = st.text_input("Enter keywords (separate by '+')", "SMEs+Malaysia")
    start_date = st.date_input("Start date:", datetime(2024, 10, 14))
    end_date = st.date_input("End date:", datetime(2024, 11, 26))
    if st.button("Fetch and Analyze News"):
        # Manually add negative news directly to the backend
        manually_added_news = manually_add_negative_news()
        # Keywords are entered separated by '+', so split on '+'
        keywords = [kw.strip() for kw in keywords_input.split("+")]
        start_datetime = datetime.combine(start_date, datetime.min.time())
        end_datetime = datetime.combine(end_date, datetime.max.time())
        yahoo_news = fetch_yahoo_news(keywords, start_datetime, end_datetime, num_pages=10)
        bing_news = fetch_bing_news(keywords, start_datetime, end_datetime, num_pages=10)
        all_news = remove_duplicates(yahoo_news + bing_news)
        # Combine the manually added negative news with the fetched news
        all_news += manually_added_news
        filtered_news = filter_relevant_articles(all_news)
        adjusted_news = adjust_sentiment_by_keywords(filtered_news)
        st.session_state['news_items'] = adjusted_news
with col2:
    st.header("📈 Outputs")
    if 'news_items' in st.session_state:
        news_items = st.session_state['news_items']
        positive_df, negative_df = display_top_news(news_items)
        st.subheader("Top 5 Positive News Articles")
        st.dataframe(positive_df)
        st.subheader("Top 5 Negative News Articles")
        st.dataframe(negative_df)
        st.subheader("Sentiment Over Time")
        plot_sentiment_over_time(news_items)
        st.subheader("All News Articles")
        sorted_news_items = sorted(news_items, key=lambda x: x['date'], reverse=True)
        for idx, item in enumerate(sorted_news_items, 1):
            item['article_number'] = f"Article {idx}"
            st.markdown(f"""
*{item['article_number']}*
- *Title*: {item['title']}
- *Date*: {item['date']}
- *Summary*: {item['summary']}
- *Sentiment Score*: {item['sentiment_score']:.2f} ({item['sentiment_label']})
- *Source*: {item['source']}
- *Link*: [Read more]({item['link']})
---
""")
        excel_data = export_to_excel(news_items)
        st.download_button(
            label="Download News Summary as Excel",
            data=excel_data,
            file_name="news_summary.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
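# Hedged usage note: assuming this script is saved as app.py, it is launched with
#   streamlit run app.py
# (requires streamlit, requests, feedparser, beautifulsoup4, torch, transformers,
# pandas, plotly, and xlsxwriter to be installed).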