import gradio as gr
from transformers import pipeline
import feedparser
from datetime import datetime, timedelta
import pytz
from bs4 import BeautifulSoup
import hashlib
import threading
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6"
}
CACHE_SIZE = 500
RSS_FETCH_INTERVAL = timedelta(hours=8)
ARTICLE_LIMIT = 5

# Restructured news sources with fixed categories
CATEGORIES = ["Technology", "Business", "World News", "Science", "Sports", "Health"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}


class NewsCache:
    """Thread-safe FIFO cache for generated summaries."""

    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        with self.lock:
            if len(self.cache) >= self.size:
                # Evict the oldest entry; dicts preserve insertion order.
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value


cache = NewsCache(CACHE_SIZE)


def fetch_rss_news(tech_sources, business_sources, world_sources,
                   science_sources, sports_sources, health_sources):
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    # Map each category to the sources the user selected for it. All six
    # category lists must be passed in; the original referenced the last
    # three without declaring them as parameters, which raised a NameError.
    selected_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "World News": world_sources or [],
        "Science": science_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }

    logger.info(f"Selected sources: {selected_sources}")

    for category, sources in selected_sources.items():
        if not sources:  # Skip if no sources selected for this category
            continue

        logger.info(f"Processing category: {category} with sources: {sources}")

        for source in sources:
            if source in NEWS_SOURCES[category]:
                url = NEWS_SOURCES[category][source]
                try:
                    logger.info(f"Fetching from URL: {url}")
                    feed = feedparser.parse(url)

                    if hasattr(feed, 'status') and feed.status != 200:
                        logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                        continue
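                    # Note: feedparser only sets `status` for http/https fetches;
                    # malformed feeds set `feed.bozo` but may still carry entries,
                    # so each entry is guarded individually below.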
Status: {feed.status}") continue for entry in feed.entries: try: published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC) if published > cutoff_time: articles.append({ "title": entry.title, "description": BeautifulSoup(entry.description, "html.parser").get_text(), "link": entry.link, "category": category, "source": source, "published": published }) except (AttributeError, TypeError) as e: logger.error(f"Error processing entry: {str(e)}") continue except Exception as e: logger.error(f"Error fetching feed from {url}: {str(e)}") continue logger.info(f"Total articles fetched: {len(articles)}") articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT] return articles def summarize_text(text, model_name): try: summarizer = pipeline("summarization", model=model_name, device=-1) content_hash = hashlib.md5(text.encode()).hexdigest() cached_summary = cache.get(content_hash) if cached_summary: logger.info("Using cached summary") return cached_summary logger.info(f"Generating new summary using model: {model_name}") result = summarizer(text, max_length=120, min_length=40, truncation=True) summary = result[0]['summary_text'] cache.set(content_hash, summary) return summary except Exception as e: logger.error(f"Error in summarization: {str(e)}") return "Summary unavailable." def summarize_articles(articles, model_name): summaries = [] for article in articles: content = article["description"] summary = summarize_text(content, model_name) summaries.append(f""" 📰 {article['title']} - 📁 Category: {article['category']} - 💡 Source: {article['source']} - 🔗 Read More: {article['link']} 📃 Summary: {summary} """) return "\n".join(summaries) def generate_summary(tech_sources, business_sources, world_sources, model_name): logger.info(f""" Generating summary with: - Tech sources: {tech_sources} - Business sources: {business_sources} - World sources: {world_sources} - Model: {model_name} """) # Check if any sources are selected if not any([ tech_sources is not None and len(tech_sources) > 0, business_sources is not None and len(business_sources) > 0, world_sources is not None and len(world_sources) > 0 ]): return "Please select at least one news source." try: articles = fetch_rss_news(tech_sources, business_sources, world_sources) if not articles: return "No recent news found from the selected sources." return summarize_articles(articles, model_name) except Exception as e: logger.error(f"Error in generate_summary: {str(e)}") return f"An error occurred while generating the summary. Please try again." 
# Gradio Interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# 📰 AI News Summarizer")

    with gr.Row():
        with gr.Column():
            # Create checkbox groups for each category
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )

        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="Default (facebook/bart-large-cnn)"
            )
            summarize_button = gr.Button("Get News Summary")
            summary_output = gr.Textbox(label="News Summary", lines=20)

    def get_summary(tech_sources, business_sources, world_sources,
                    science_sources, sports_sources, health_sources, selected_model):
        try:
            model_name = SUMMARIZER_MODELS[selected_model]
            return generate_summary(tech_sources, business_sources, world_sources,
                                    science_sources, sports_sources, health_sources,
                                    model_name)
        except Exception as e:
            logger.error(f"Error in get_summary: {str(e)}")
            return "An error occurred while processing your request. Please try again."

    # Connect all six source groups and the model selector to the summary
    # function; the original wired up only the first three, so selections in
    # Science, Sports, and Health were silently ignored.
    summarize_button.click(
        get_summary,
        inputs=[tech_sources, business_sources, world_sources,
                science_sources, sports_sources, health_sources, model_selector],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()
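# Deployment note (an assumption, not in the original app): to serve beyond
# localhost, Gradio's launch() accepts standard kwargs such as
# server_name="0.0.0.0" and server_port=7860, e.g.
# demo.launch(server_name="0.0.0.0", server_port=7860).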