Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import pytz
|
@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
|
|
10 |
|
11 |
# Set up logging
|
12 |
-
logging.basicConfig(level=logging.INFO
|
|
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
# Global settings
|
@@ -22,36 +24,42 @@ RSS_FETCH_INTERVAL = timedelta(hours=8)
|
|
22 |
ARTICLE_LIMIT = 5
|
23 |
|
24 |
# Restructured news sources with fixed categories
|
25 |
-
CATEGORIES = ["Technology", "Business", "World News"
|
26 |
NEWS_SOURCES = {
|
27 |
"Technology": {
|
28 |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
|
29 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
|
30 |
},
|
31 |
"Business": {
|
32 |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
|
33 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
|
34 |
},
|
35 |
"World News": {
|
36 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
|
37 |
"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
|
38 |
-
|
39 |
-
"reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
|
40 |
-
},
|
41 |
-
"Science": {
|
42 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
|
43 |
-
},
|
44 |
-
"Sports": {
|
45 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
|
46 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
|
47 |
-
},
|
48 |
-
"Health": {
|
49 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
|
50 |
-
"politico": "http://rss.politico.com/healthcare.xml",
|
51 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
|
52 |
-
},
|
53 |
}
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
class NewsCache:
|
56 |
def __init__(self, size):
|
57 |
self.cache = {}
|
@@ -75,20 +83,16 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
75 |
articles = []
|
76 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
77 |
|
78 |
-
# Create a mapping of selected sources
|
79 |
selected_sources = {
|
80 |
"Technology": tech_sources if tech_sources else [],
|
81 |
"Business": business_sources if business_sources else [],
|
82 |
-
"World News": world_sources if world_sources else []
|
83 |
-
"Science": science_sources if science_sources else [],
|
84 |
-
"Sports": sports_sources if sports_sources else [],
|
85 |
-
"Health": health_sources if health_sources else [],
|
86 |
}
|
87 |
|
88 |
logger.info(f"Selected sources: {selected_sources}")
|
89 |
|
90 |
for category, sources in selected_sources.items():
|
91 |
-
if not sources:
|
92 |
continue
|
93 |
|
94 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
@@ -100,23 +104,33 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
100 |
logger.info(f"Fetching from URL: {url}")
|
101 |
feed = feedparser.parse(url)
|
102 |
|
103 |
-
if hasattr(feed, 'status') and feed.status != 200:
|
104 |
-
logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
|
105 |
-
continue
|
106 |
-
|
107 |
for entry in feed.entries:
|
108 |
try:
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
if published > cutoff_time:
|
111 |
articles.append({
|
112 |
"title": entry.title,
|
113 |
-
"description":
|
114 |
"link": entry.link,
|
115 |
"category": category,
|
116 |
"source": source,
|
117 |
"published": published
|
118 |
})
|
119 |
-
|
|
|
120 |
logger.error(f"Error processing entry: {str(e)}")
|
121 |
continue
|
122 |
|
@@ -130,7 +144,14 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
130 |
|
131 |
def summarize_text(text, model_name):
|
132 |
try:
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
content_hash = hashlib.md5(text.encode()).hexdigest()
|
135 |
cached_summary = cache.get(content_hash)
|
136 |
|
@@ -138,27 +159,44 @@ def summarize_text(text, model_name):
|
|
138 |
logger.info("Using cached summary")
|
139 |
return cached_summary
|
140 |
|
141 |
-
logger.info(
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
143 |
summary = result[0]['summary_text']
|
|
|
|
|
144 |
cache.set(content_hash, summary)
|
145 |
return summary
|
|
|
146 |
except Exception as e:
|
147 |
logger.error(f"Error in summarization: {str(e)}")
|
148 |
-
|
|
|
149 |
|
150 |
def summarize_articles(articles, model_name):
|
151 |
summaries = []
|
152 |
-
for article in articles:
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
return "\n".join(summaries)
|
163 |
|
164 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
@@ -170,7 +208,6 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
170 |
- Model: {model_name}
|
171 |
""")
|
172 |
|
173 |
-
# Check if any sources are selected
|
174 |
if not any([
|
175 |
tech_sources is not None and len(tech_sources) > 0,
|
176 |
business_sources is not None and len(business_sources) > 0,
|
@@ -185,7 +222,8 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
185 |
return summarize_articles(articles, model_name)
|
186 |
except Exception as e:
|
187 |
logger.error(f"Error in generate_summary: {str(e)}")
|
188 |
-
|
|
|
189 |
|
190 |
# Gradio Interface
|
191 |
demo = gr.Blocks()
|
@@ -195,7 +233,6 @@ with demo:
|
|
195 |
|
196 |
with gr.Row():
|
197 |
with gr.Column():
|
198 |
-
# Create checkbox groups for each category
|
199 |
tech_sources = gr.CheckboxGroup(
|
200 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
201 |
label="Technology Sources",
|
@@ -211,21 +248,6 @@ with demo:
|
|
211 |
label="World News Sources",
|
212 |
value=[]
|
213 |
)
|
214 |
-
science_sources = gr.CheckboxGroup(
|
215 |
-
choices=list(NEWS_SOURCES["Science"].keys()),
|
216 |
-
label="Science Sources",
|
217 |
-
value=[]
|
218 |
-
)
|
219 |
-
sports_sources = gr.CheckboxGroup(
|
220 |
-
choices=list(NEWS_SOURCES["Sports"].keys()),
|
221 |
-
label="Sports Sources",
|
222 |
-
value=[]
|
223 |
-
)
|
224 |
-
health_sources = gr.CheckboxGroup(
|
225 |
-
choices=list(NEWS_SOURCES["Health"].keys()),
|
226 |
-
label="Health Sources",
|
227 |
-
value=[]
|
228 |
-
)
|
229 |
|
230 |
with gr.Column():
|
231 |
model_selector = gr.Radio(
|
@@ -243,9 +265,9 @@ with demo:
|
|
243 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
244 |
except Exception as e:
|
245 |
logger.error(f"Error in get_summary: {str(e)}")
|
246 |
-
|
|
|
247 |
|
248 |
-
# Connect the components to the summary function
|
249 |
summarize_button.click(
|
250 |
get_summary,
|
251 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|
|
|
1 |
import gradio as gr
# NOTE(review): the diff added `AutoTokenizer, AutoModelForSeq2SeqGeneration`
# here, but `AutoModelForSeq2SeqGeneration` does not exist in transformers
# (the seq2seq auto-class is `AutoModelForSeq2SeqLM`), so the import raised
# ImportError at startup. Neither name is used anywhere in this file — only
# the `pipeline` factory is needed.
from transformers import pipeline
import feedparser
from datetime import datetime, timedelta
import pytz
from bs4 import BeautifulSoup
import hashlib
import threading
import logging
import traceback

# Set up logging: INFO level with timestamped, leveled messages.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
16 |
|
17 |
# Global settings
|
|
|
24 |
ARTICLE_LIMIT = 5
|
25 |
|
26 |
# Restructured news sources with fixed categories
|
27 |
+
# Restructured news sources with fixed categories.
# CATEGORIES is the display/iteration order; NEWS_SOURCES maps each category
# to {source-name: RSS feed URL}. The diff trimmed this down to one or two
# working feeds per category (the reutersagency/politico/NYT-World feeds and
# the Science/Sports/Health categories were removed).
CATEGORIES = ["Technology", "Business", "World News"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
    },
    "World News": {
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
    }
}
|
39 |
|
40 |
+
# Global summarizer instance, created lazily and reused across calls.
# summarizer_model_name records which model the cached pipeline was built
# with so a model switch in the UI triggers a rebuild.
summarizer_instance = None
summarizer_model_name = None

def initialize_summarizer(model_name):
    """Return a shared summarization pipeline for *model_name*.

    Builds the HuggingFace ``pipeline("summarization", ...)`` on first use
    (or whenever the requested model differs from the cached one) and caches
    it in module globals. Runs on CPU (``device=-1``).

    Raises:
        Exception: re-raises any pipeline construction failure after logging
        the message and full traceback.
    """
    global summarizer_instance, summarizer_model_name
    try:
        if summarizer_instance is None or summarizer_model_name != model_name:
            logger.info(f"Initializing summarizer with model: {model_name}")
            # max_length/min_length/do_sample set here become the default
            # generation parameters for every call to the pipeline.
            summarizer_instance = pipeline("summarization",
                                           model=model_name,
                                           device=-1,
                                           max_length=130,
                                           min_length=30,
                                           do_sample=False)
            summarizer_model_name = model_name
            logger.info("Summarizer initialized successfully")
        return summarizer_instance
    except Exception as e:
        logger.error(f"Error initializing summarizer: {str(e)}")
        logger.error(traceback.format_exc())
        raise
|
62 |
+
|
63 |
class NewsCache:
|
64 |
def __init__(self, size):
|
65 |
self.cache = {}
|
|
|
83 |
articles = []
|
84 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
85 |
|
|
|
86 |
selected_sources = {
|
87 |
"Technology": tech_sources if tech_sources else [],
|
88 |
"Business": business_sources if business_sources else [],
|
89 |
+
"World News": world_sources if world_sources else []
|
|
|
|
|
|
|
90 |
}
|
91 |
|
92 |
logger.info(f"Selected sources: {selected_sources}")
|
93 |
|
94 |
for category, sources in selected_sources.items():
|
95 |
+
if not sources:
|
96 |
continue
|
97 |
|
98 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
|
|
104 |
logger.info(f"Fetching from URL: {url}")
|
105 |
feed = feedparser.parse(url)
|
106 |
|
|
|
|
|
|
|
|
|
107 |
for entry in feed.entries:
|
108 |
try:
|
109 |
+
# Handle different date formats
|
110 |
+
if hasattr(entry, 'published_parsed'):
|
111 |
+
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
|
112 |
+
else:
|
113 |
+
published = datetime.now(pytz.UTC)
|
114 |
+
|
115 |
+
# Extract and clean description
|
116 |
+
description = entry.description if hasattr(entry, 'description') else ""
|
117 |
+
description = BeautifulSoup(description, "html.parser").get_text()
|
118 |
+
description = description.strip()
|
119 |
+
|
120 |
+
if not description: # Skip entries without description
|
121 |
+
continue
|
122 |
+
|
123 |
if published > cutoff_time:
|
124 |
articles.append({
|
125 |
"title": entry.title,
|
126 |
+
"description": description,
|
127 |
"link": entry.link,
|
128 |
"category": category,
|
129 |
"source": source,
|
130 |
"published": published
|
131 |
})
|
132 |
+
|
133 |
+
except Exception as e:
|
134 |
logger.error(f"Error processing entry: {str(e)}")
|
135 |
continue
|
136 |
|
|
|
144 |
|
145 |
def summarize_text(text, model_name):
    """Summarize *text* with the shared pipeline for *model_name*.

    Short inputs (< 30 whitespace-separated words) are returned unchanged.
    Results are memoized in the module-level ``cache`` keyed by the MD5 of
    the input text (md5 is used only as a cache key, not for security).

    On any failure the first 200 characters of the input plus "..." are
    returned as a best-effort fallback instead of raising.
    """
    try:
        # Get or initialize summarizer
        summarizer = initialize_summarizer(model_name)

        # Check if text is too short to be worth summarizing.
        if len(text.split()) < 30:
            logger.info("Text too short for summarization, returning original")
            return text

        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)

        # NOTE(review): this guard line was elided in the rendered diff;
        # reconstructed from the surrounding cache-hit logging — confirm.
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary

        logger.info("Generating new summary")
        # Clean and prepare text
        text = text.strip()
        text = ' '.join(text.split())  # Normalize whitespace

        # Generate summary
        result = summarizer(text, max_length=130, min_length=30, do_sample=False)
        summary = result[0]['summary_text']

        # Cache the result
        cache.set(content_hash, summary)
        return summary

    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        logger.error(traceback.format_exc())
        return text[:200] + "..."  # Return truncated text as fallback
|
179 |
|
180 |
def summarize_articles(articles, model_name):
    """Summarize every article dict in *articles* and join the results.

    Each article must carry ``title``, ``description``, ``category``,
    ``source`` and ``link`` keys (as produced by fetch_rss_news). Articles
    whose summarization raises are logged and skipped; if none succeed a
    user-facing failure message is returned instead of an empty string.

    NOTE(review): the bullet prefixes below ("π°", "π", ...) look like
    mojibake of emoji from a bad encoding round-trip; they are reproduced
    byte-for-byte here because the original characters cannot be recovered
    from this rendering — confirm against the deployed app.
    """
    summaries = []
    for i, article in enumerate(articles):
        try:
            logger.info(f"Processing article {i+1}/{len(articles)}: {article['title']}")
            content = article["description"]
            summary = summarize_text(content, model_name)
            summaries.append(f"""
π° {article['title']}
- π Category: {article['category']}
- π‘ Source: {article['source']}
- π Read More: {article['link']}
π Summary: {summary}
""")
        except Exception as e:
            logger.error(f"Error summarizing article: {str(e)}")
            continue

    if not summaries:
        return "Could not generate summaries for the selected articles."
    return "\n".join(summaries)
|
201 |
|
202 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
|
208 |
- Model: {model_name}
|
209 |
""")
|
210 |
|
|
|
211 |
if not any([
|
212 |
tech_sources is not None and len(tech_sources) > 0,
|
213 |
business_sources is not None and len(business_sources) > 0,
|
|
|
222 |
return summarize_articles(articles, model_name)
|
223 |
except Exception as e:
|
224 |
logger.error(f"Error in generate_summary: {str(e)}")
|
225 |
+
logger.error(traceback.format_exc())
|
226 |
+
return f"An error occurred: {str(e)}"
|
227 |
|
228 |
# Gradio Interface
|
229 |
demo = gr.Blocks()
|
|
|
233 |
|
234 |
with gr.Row():
|
235 |
with gr.Column():
|
|
|
236 |
tech_sources = gr.CheckboxGroup(
|
237 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
238 |
label="Technology Sources",
|
|
|
248 |
label="World News Sources",
|
249 |
value=[]
|
250 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
with gr.Column():
|
253 |
model_selector = gr.Radio(
|
|
|
265 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
266 |
except Exception as e:
|
267 |
logger.error(f"Error in get_summary: {str(e)}")
|
268 |
+
logger.error(traceback.format_exc())
|
269 |
+
return f"An error occurred: {str(e)}"
|
270 |
|
|
|
271 |
summarize_button.click(
|
272 |
get_summary,
|
273 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|