Spaces:

loayshabet
/

news-sumarry

Running

App Files Community

loayshabet committed on Dec 22, 2024

Commit

b2a401d

verified ·

1 Parent(s): ff70769

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -87

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqGeneration
 import feedparser
 from datetime import datetime, timedelta
 import pytz
@@ -7,11 +7,9 @@ from bs4 import BeautifulSoup
 import hashlib
 import threading
 import logging
-import traceback
 # Set up logging
-logging.basicConfig(level=logging.INFO,
-                   format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 # Global settings
@@ -37,29 +35,6 @@ NEWS_SOURCES = {
     }
 }
-# Global summarizer instance
-summarizer_instance = None
-summarizer_model_name = None
-def initialize_summarizer(model_name):
-    global summarizer_instance, summarizer_model_name
-    try:
-        if summarizer_instance is None or summarizer_model_name != model_name:
-            logger.info(f"Initializing summarizer with model: {model_name}")
-            summarizer_instance = pipeline("summarization",
-                                        model=model_name,
-                                        device=-1,
-                                        max_length=130,
-                                        min_length=30,
-                                        do_sample=False)
-            summarizer_model_name = model_name
-            logger.info("Summarizer initialized successfully")
-        return summarizer_instance
-    except Exception as e:
-        logger.error(f"Error initializing summarizer: {str(e)}")
-        logger.error(traceback.format_exc())
-        raise
 class NewsCache:
     def __init__(self, size):
         self.cache = {}
@@ -83,6 +58,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
     articles = []
     cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
     selected_sources = {
         "Technology": tech_sources if tech_sources else [],
         "Business": business_sources if business_sources else [],
@@ -92,7 +68,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
     logger.info(f"Selected sources: {selected_sources}")
     for category, sources in selected_sources.items():
-        if not sources:
             continue
         logger.info(f"Processing category: {category} with sources: {sources}")
@@ -104,33 +80,23 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
                     logger.info(f"Fetching from URL: {url}")
                     feed = feedparser.parse(url)
                     for entry in feed.entries:
                         try:
-                            # Handle different date formats
-                            if hasattr(entry, 'published_parsed'):
-                                published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
-                            else:
-                                published = datetime.now(pytz.UTC)
-                            # Extract and clean description
-                            description = entry.description if hasattr(entry, 'description') else ""
-                            description = BeautifulSoup(description, "html.parser").get_text()
-                            description = description.strip()
-                            if not description:  # Skip entries without description
-                                continue
                             if published > cutoff_time:
                                 articles.append({
                                     "title": entry.title,
-                                    "description": description,
                                     "link": entry.link,
                                     "category": category,
                                     "source": source,
                                     "published": published
                                 })
-                        except Exception as e:
                             logger.error(f"Error processing entry: {str(e)}")
                             continue
@@ -144,14 +110,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
 def summarize_text(text, model_name):
     try:
-        # Get or initialize summarizer
-        summarizer = initialize_summarizer(model_name)
-        # Check if text is too short
-        if len(text.split()) < 30:
-            logger.info("Text too short for summarization, returning original")
-            return text
         content_hash = hashlib.md5(text.encode()).hexdigest()
         cached_summary = cache.get(content_hash)
@@ -159,44 +118,27 @@ def summarize_text(text, model_name):
             logger.info("Using cached summary")
             return cached_summary
-        logger.info("Generating new summary")
-        # Clean and prepare text
-        text = text.strip()
-        text = ' '.join(text.split())  # Normalize whitespace
-        # Generate summary
-        result = summarizer(text, max_length=130, min_length=30, do_sample=False)
         summary = result[0]['summary_text']
-        # Cache the result
         cache.set(content_hash, summary)
         return summary
     except Exception as e:
         logger.error(f"Error in summarization: {str(e)}")
-        logger.error(traceback.format_exc())
-        return text[:200] + "..."  # Return truncated text as fallback
 def summarize_articles(articles, model_name):
     summaries = []
-    for i, article in enumerate(articles):
-        try:
-            logger.info(f"Processing article {i+1}/{len(articles)}: {article['title']}")
-            content = article["description"]
-            summary = summarize_text(content, model_name)
-            summaries.append(f"""
-            📰 {article['title']}
-            - 📁 Category: {article['category']}
-            - 💡 Source: {article['source']}
-            - 🔗 Read More: {article['link']}
-            📃 Summary: {summary}
-            """)
-        except Exception as e:
-            logger.error(f"Error summarizing article: {str(e)}")
-            continue
-    if not summaries:
-        return "Could not generate summaries for the selected articles."
     return "\n".join(summaries)
 def generate_summary(tech_sources, business_sources, world_sources, model_name):
@@ -208,6 +150,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
     - Model: {model_name}
     """)
     if not any([
         tech_sources is not None and len(tech_sources) > 0,
         business_sources is not None and len(business_sources) > 0,
@@ -222,8 +165,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
         return summarize_articles(articles, model_name)
     except Exception as e:
         logger.error(f"Error in generate_summary: {str(e)}")
-        logger.error(traceback.format_exc())
-        return f"An error occurred: {str(e)}"
 # Gradio Interface
 demo = gr.Blocks()
@@ -233,6 +175,7 @@ with demo:
     with gr.Row():
         with gr.Column():
             tech_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Technology"].keys()),
                 label="Technology Sources",
@@ -265,9 +208,9 @@ with demo:
             return generate_summary(tech_sources, business_sources, world_sources, model_name)
         except Exception as e:
             logger.error(f"Error in get_summary: {str(e)}")
-            logger.error(traceback.format_exc())
-            return f"An error occurred: {str(e)}"
     summarize_button.click(
         get_summary,
         inputs=[tech_sources, business_sources, world_sources, model_selector],

 import gradio as gr
+from transformers import pipeline
 import feedparser
 from datetime import datetime, timedelta
 import pytz
 import hashlib
 import threading
 import logging
 # Set up logging
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Global settings
     }
 }
 class NewsCache:
     def __init__(self, size):
         self.cache = {}
     articles = []
     cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
+    # Create a mapping of selected sources
     selected_sources = {
         "Technology": tech_sources if tech_sources else [],
         "Business": business_sources if business_sources else [],
     logger.info(f"Selected sources: {selected_sources}")
     for category, sources in selected_sources.items():
+        if not sources:  # Skip if no sources selected for this category
             continue
         logger.info(f"Processing category: {category} with sources: {sources}")
                     logger.info(f"Fetching from URL: {url}")
                     feed = feedparser.parse(url)
+                    if hasattr(feed, 'status') and feed.status != 200:
+                        logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
+                        continue
                     for entry in feed.entries:
                         try:
+                            published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                             if published > cutoff_time:
                                 articles.append({
                                     "title": entry.title,
+                                    "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                                     "link": entry.link,
                                     "category": category,
                                     "source": source,
                                     "published": published
                                 })
+                        except (AttributeError, TypeError) as e:
                             logger.error(f"Error processing entry: {str(e)}")
                             continue
 def summarize_text(text, model_name):
     try:
+        summarizer = pipeline("summarization", model=model_name, device=-1)
         content_hash = hashlib.md5(text.encode()).hexdigest()
         cached_summary = cache.get(content_hash)
             logger.info("Using cached summary")
             return cached_summary
+        logger.info(f"Generating new summary using model: {model_name}")
+        result = summarizer(text, max_length=120, min_length=40, truncation=True)
         summary = result[0]['summary_text']
         cache.set(content_hash, summary)
         return summary
     except Exception as e:
         logger.error(f"Error in summarization: {str(e)}")
+        return "Summary unavailable."
 def summarize_articles(articles, model_name):
     summaries = []
+    for article in articles:
+        content = article["description"]
+        summary = summarize_text(content, model_name)
+        summaries.append(f"""
+        📰 {article['title']}
+        - 📁 Category: {article['category']}
+        - 💡 Source: {article['source']}
+        - 🔗 Read More: {article['link']}
+        📃 Summary: {summary}
+        """)
     return "\n".join(summaries)
 def generate_summary(tech_sources, business_sources, world_sources, model_name):
     - Model: {model_name}
     """)
+    # Check if any sources are selected
     if not any([
         tech_sources is not None and len(tech_sources) > 0,
         business_sources is not None and len(business_sources) > 0,
         return summarize_articles(articles, model_name)
     except Exception as e:
         logger.error(f"Error in generate_summary: {str(e)}")
+        return f"An error occurred while generating the summary. Please try again."
 # Gradio Interface
 demo = gr.Blocks()
     with gr.Row():
         with gr.Column():
+            # Create checkbox groups for each category
             tech_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Technology"].keys()),
                 label="Technology Sources",
             return generate_summary(tech_sources, business_sources, world_sources, model_name)
         except Exception as e:
             logger.error(f"Error in get_summary: {str(e)}")
+            return "An error occurred while processing your request. Please try again."
+    # Connect the components to the summary function
     summarize_button.click(
         get_summary,
         inputs=[tech_sources, business_sources, world_sources, model_selector],