# Spaces: Running
import gradio as gr | |
from transformers import pipeline | |
import feedparser | |
from datetime import datetime, timedelta | |
import pytz | |
from bs4 import BeautifulSoup | |
import hashlib | |
import threading | |
import logging | |
# Set up logging
# INFO level so the fetch/summarize progress messages logged below are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings
# Maps the label shown in the model selector to the model id that is passed
# to the summarization pipeline.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6"
}
# Maximum number of entries held by the module-level NewsCache.
CACHE_SIZE = 500
# Despite the name, this is used as a maximum article age: fetch_rss_news
# drops entries published longer ago than this.
RSS_FETCH_INTERVAL = timedelta(hours=8)
# Number of most recent articles fetch_rss_news keeps after sorting.
ARTICLE_LIMIT = 5
# Restructured news sources with fixed categories
# NOTE(review): CATEGORIES is not referenced anywhere else in the visible
# code; the category names are repeated as the keys of NEWS_SOURCES.
CATEGORIES = ["Technology", "Business", "World News", "Science", "Sports", "Health"]
# category name -> {source label -> RSS feed URL}.
# The source labels are also used as the checkbox choices in the UI, so a
# label selected there can be looked up here directly.
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
class NewsCache:
    """Size-bounded key/value cache guarded by a lock, with FIFO eviction.

    Entries are kept in a plain dict, so "oldest" means the key that was
    first inserted; overwriting a key does not move it to the back.

    Args:
        size: Maximum number of entries. A value of zero or less disables
            storage entirely (``set`` becomes a no-op).
    """

    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored under ``key``, or ``None`` if absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``, evicting the oldest entry if full."""
        with self.lock:
            if self.size <= 0:
                # Nothing may be stored; also avoids next() on an empty dict.
                return
            # Only evict when a *new* key would push the cache over its
            # limit; overwriting an existing key does not grow the cache.
            if key not in self.cache and len(self.cache) >= self.size:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value
# Module-wide cache instance; summarize_text uses it to reuse summaries it
# has already generated.
cache = NewsCache(CACHE_SIZE)
def fetch_rss_news(
    tech_sources,
    business_sources,
    world_sources,
    science_sources=None,
    sports_sources=None,
    health_sources=None,
):
    """Fetch recent articles from the selected RSS feeds.

    Each argument is the list of source labels (keys of the matching
    ``NEWS_SOURCES`` category) selected for that category; ``None`` or an
    empty list means "nothing selected".

    The science/sports/health selections are explicit optional parameters
    rather than free names: identically named objects exist at module scope
    as Gradio components, and those are not lists of labels.

    Args:
        tech_sources: Selected "Technology" source labels.
        business_sources: Selected "Business" source labels.
        world_sources: Selected "World News" source labels.
        science_sources: Selected "Science" source labels (optional).
        sports_sources: Selected "Sports" source labels (optional).
        health_sources: Selected "Health" source labels (optional).

    Returns:
        A list of at most ``ARTICLE_LIMIT`` dicts, newest first, each with
        the keys ``title``, ``description``, ``link``, ``category``,
        ``source`` and ``published`` (a UTC-aware ``datetime``). Only
        entries published within ``RSS_FETCH_INTERVAL`` of now are included.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    # Create a mapping of selected sources
    selected_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "World News": world_sources or [],
        "Science": science_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }
    logger.info(f"Selected sources: {selected_sources}")

    for category, sources in selected_sources.items():
        if not sources:  # Skip if no sources selected for this category
            continue
        logger.info(f"Processing category: {category} with sources: {sources}")
        category_feeds = NEWS_SOURCES.get(category, {})

        for source in sources:
            url = category_feeds.get(source)
            if url is None:
                logger.warning(f"Unknown source {source!r} for category {category}; skipping")
                continue

            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)

                # 'status' is only checked when the parsed result carries it.
                if hasattr(feed, 'status') and feed.status != 200:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                    continue

                for entry in feed.entries:
                    try:
                        # published_parsed is treated as a UTC time tuple;
                        # a missing/None value lands in the except below.
                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                        if published > cutoff_time:
                            articles.append({
                                "title": entry.title,
                                # Strip any HTML markup from the feed description.
                                "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                                "link": entry.link,
                                "category": category,
                                "source": source,
                                "published": published
                            })
                    except (AttributeError, TypeError) as e:
                        # One malformed entry must not discard the rest of the feed.
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
            except Exception as e:
                # Best effort: a failing feed is logged and the others still run.
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles
# Loaded summarization pipelines, keyed by model name. Constructing a
# pipeline loads the model, so each one is built once and then reused.
_SUMMARIZERS = {}


def _get_summarizer(model_name):
    """Return the summarization pipeline for ``model_name``, loading it on first use."""
    summarizer = _SUMMARIZERS.get(model_name)
    if summarizer is None:
        summarizer = pipeline("summarization", model=model_name, device=-1)
        _SUMMARIZERS[model_name] = summarizer
    return summarizer


def summarize_text(text, model_name):
    """Summarize ``text`` with the given model, reusing cached results.

    Args:
        text: The text to summarize.
        model_name: Model id passed to the summarization pipeline.

    Returns:
        The summary string, or ``"Summary unavailable."`` when ``text`` is
        empty/blank or summarization fails for any reason.
    """
    # Nothing to summarize; skip the cache and the model entirely.
    if not text or not text.strip():
        return "Summary unavailable."

    try:
        # The model name is part of the key so that a summary produced by
        # one model is never served for a request that asked for another.
        content_hash = hashlib.md5(f"{model_name}\0{text}".encode()).hexdigest()

        # Look in the cache before touching the model at all.
        cached_summary = cache.get(content_hash)
        if cached_summary is not None:
            logger.info("Using cached summary")
            return cached_summary

        logger.info(f"Generating new summary using model: {model_name}")
        summarizer = _get_summarizer(model_name)
        result = summarizer(text, max_length=120, min_length=40, truncation=True)
        summary = result[0]['summary_text']
        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        # Top-level boundary for the UI: log and return a fallback string
        # instead of propagating the failure.
        logger.error(f"Error in summarization: {str(e)}")
        return "Summary unavailable."
def _format_article(article, summary):
    """Render one article dict plus its summary as a text block."""
    # NOTE(review): the label glyphs below are reproduced exactly as found;
    # they look like mis-decoded emoji -- confirm the file's encoding.
    return f"""
π° {article['title']}
- π Category: {article['category']}
- π‘ Source: {article['source']}
- π Read More: {article['link']}
π Summary: {summary}
"""


def summarize_articles(articles, model_name):
    """Summarize every article's description and join the rendered blocks.

    Args:
        articles: Iterable of dicts with the keys ``title``, ``description``,
            ``category``, ``source`` and ``link``.
        model_name: Model id forwarded to ``summarize_text``.

    Returns:
        One string containing a rendered block per article, separated by
        newlines; an empty string when ``articles`` is empty.
    """
    return "\n".join(
        _format_article(item, summarize_text(item["description"], model_name))
        for item in articles
    )
def generate_summary(tech_sources, business_sources, world_sources, model_name):
    """Fetch recent news for the selected sources and return the summaries.

    Only the Technology, Business and World News selections are accepted
    here; no other category reaches this function.

    Args:
        tech_sources: Selected Technology source labels (list or ``None``).
        business_sources: Selected Business source labels (list or ``None``).
        world_sources: Selected World News source labels (list or ``None``).
        model_name: Model id used for summarization.

    Returns:
        The formatted summaries, or a user-facing message when nothing is
        selected, nothing recent is found, or an error occurs.
    """
    logger.info(f"""
Generating summary with:
- Tech sources: {tech_sources}
- Business sources: {business_sources}
- World sources: {world_sources}
- Model: {model_name}
""")

    # Check if any sources are selected
    selections = (tech_sources, business_sources, world_sources)
    has_selection = any([picked is not None and len(picked) > 0 for picked in selections])
    if not has_selection:
        return "Please select at least one news source."

    try:
        recent = fetch_rss_news(tech_sources, business_sources, world_sources)
        if recent:
            return summarize_articles(recent, model_name)
        return "No recent news found from the selected sources."
    except Exception as err:
        # Boundary for the UI: report a generic message, keep details in the log.
        logger.error(f"Error in generate_summary: {str(err)}")
        return "An error occurred while generating the summary. Please try again."
# Gradio Interface
# Builds the page: a row with the per-category source checkboxes on one side
# and the model selector on the other, then the button and the output box.
# NOTE(review): the nesting of the button/output relative to the row is
# assumed (placed below the row) -- confirm against the intended layout.
demo = gr.Blocks()
with demo:
    gr.Markdown("# π° AI News Summarizer")
    with gr.Row():
        with gr.Column():
            # Create checkbox groups for each category
            # Choices are the source labels of the matching NEWS_SOURCES
            # category; every group starts with nothing selected.
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )
        with gr.Column():
            # Radio choices are the display labels; get_summary maps the
            # chosen label back to a model id via SUMMARIZER_MODELS.
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="Default (facebook/bart-large-cnn)"
            )
    summarize_button = gr.Button("Get News Summary")
    summary_output = gr.Textbox(label="News Summary", lines=20)

    def get_summary(tech_sources, business_sources, world_sources, selected_model):
        """Click handler: resolve the model label and return the summary text.

        The parameters shadow the component variables above; here they hold
        the values passed in by the click event, in the order of ``inputs``.
        Any exception is logged and turned into a generic error message.
        """
        try:
            model_name = SUMMARIZER_MODELS[selected_model]
            return generate_summary(tech_sources, business_sources, world_sources, model_name)
        except Exception as e:
            logger.error(f"Error in get_summary: {str(e)}")
            return "An error occurred while processing your request. Please try again."

    # Connect the components to the summary function
    # NOTE(review): science_sources, sports_sources and health_sources are
    # not listed in `inputs`, so selections made in those three groups are
    # never passed to get_summary.
    summarize_button.click(
        get_summary,
        inputs=[tech_sources, business_sources, world_sources, model_selector],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()