loayshabet committed on
Commit
6e26c5a
·
verified ·
1 Parent(s): 0e97c1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -173
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  import feedparser
4
  from datetime import datetime, timedelta
5
  import json
@@ -20,211 +20,119 @@ logging.basicConfig(
20
  format='%(asctime)s - %(levelname)s - %(message)s'
21
  )
22
 
23
- # Language codes for supported languages
24
  LANGUAGE_CODES = {
25
- "English": "en",
26
- "Spanish": "es",
27
- "French": "fr",
28
- "German": "de",
29
- "Italian": "it",
30
- "Portuguese": "pt",
31
- "Dutch": "nl",
32
- "Russian": "ru",
33
- "Chinese": "zh",
34
- "Japanese": "ja",
35
- "Arabic": "ar" # Added Arabic support
36
  }
37
 
38
- # News sources organized by category
39
- NEWS_SOURCES = {
40
- "Technology": [
41
- "https://feeds.feedburner.com/TechCrunch/",
42
- "https://www.theverge.com/rss/index.xml",
43
- "https://www.wired.com/feed/rss",
44
- "https://feeds.feedburner.com/TheNextWeb" # Added for more variety
45
- ],
46
- "Business": [
47
- "https://feeds.feedburner.com/forbes/business",
48
- "https://www.ft.com/rss/home",
49
- "https://feeds.bloomberg.com/markets/news.rss",
50
- "https://www.aljazeera.com/xml/rss/all.xml" # Added Arabic business news
51
- ],
52
- "Science": [
53
- "https://rss.sciencedaily.com/all.xml",
54
- "https://www.nature.com/nature.rss",
55
- "https://science.nasa.gov/rss.xml"
56
- ],
57
- "Health": [
58
- "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
59
- "https://www.who.int/rss-feeds/news-english.xml",
60
- "https://www.healthline.com/rss/news"
61
- ],
62
- "World News": [
63
- "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
64
- "https://feeds.bbci.co.uk/news/world/rss.xml",
65
- "https://www.reuters.com/rssFeed/world",
66
- "https://arabic.cnn.com/rss" # Added Arabic news source
67
- ]
68
- }
69
 
70
  # Initialize global variables
71
  summarizer = None
 
72
 
73
  class NewsCache:
74
  def __init__(self):
75
  self.summaries = {}
 
76
  self.max_cache_size = 1000
77
 
78
- def store_summary(self, content_hash, summary):
 
 
79
  if len(self.summaries) >= self.max_cache_size:
80
  # Remove oldest entry if cache is full
81
  self.summaries.pop(next(iter(self.summaries)))
82
- self.summaries[content_hash] = summary
 
83
 
84
- def get_summary(self, content_hash):
85
- return self.summaries.get(content_hash)
 
86
 
87
  news_cache = NewsCache()
88
 
89
- def get_content_hash(content):
90
- """Generate hash for content to use as cache key"""
91
- return hashlib.md5(content.encode()).hexdigest()
92
-
93
- def clean_text(text):
94
- """Clean and normalize text content"""
95
- if not text:
96
- return ""
97
- # Remove HTML tags and normalize whitespace
98
- text = BeautifulSoup(text, "html.parser").get_text()
99
- return " ".join(text.split())
100
-
101
- @lru_cache(maxsize=100)
102
- def fetch_feed_with_timeout(url):
103
- """Fetch RSS feed with timeout and caching"""
104
- try:
105
- response = requests.get(url, timeout=10)
106
- return feedparser.parse(response.content)
107
- except Exception as e:
108
- logging.error(f"Error fetching feed {url}: {e}")
109
- return None
110
-
111
- def initialize_summarizer():
112
- """Initialize the summarization pipeline"""
113
- global summarizer
114
  try:
 
115
  summarizer = pipeline(
116
  "summarization",
117
  model="facebook/bart-large-cnn",
118
  device=-1 # Use CPU
119
  )
 
 
 
 
 
 
 
 
 
 
 
 
120
  return True
121
  except Exception as e:
122
- logging.error(f"Error initializing summarizer: {e}")
123
  return False
124
 
125
- def parse_date(date_str):
126
- """Parse various date formats to datetime"""
127
- try:
128
- # Try parsing RSS/Atom date format
129
- return parsedate_to_datetime(date_str)
130
- except (TypeError, ValueError):
131
- try:
132
- # Try ISO format
133
- return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
134
- except (TypeError, ValueError):
135
- return None
136
-
137
- def is_recent_article(published_date, hours=8):
138
- """Check if article is within the last specified hours"""
139
- if not published_date:
140
- return False
141
 
142
  try:
143
- parsed_date = parse_date(published_date)
144
- if not parsed_date:
145
- return False
146
 
147
- # Ensure timezone awareness
148
- if parsed_date.tzinfo is None:
149
- parsed_date = pytz.UTC.localize(parsed_date)
 
 
 
 
 
 
 
 
 
150
 
151
- now = datetime.now(pytz.UTC)
152
- time_difference = now - parsed_date
153
- return time_difference <= timedelta(hours=hours)
154
  except Exception as e:
155
- logging.error(f"Error parsing date: {e}")
156
- return False
157
 
158
- def fetch_news_from_rss(interests):
159
- """Fetch recent news from RSS feeds"""
160
- articles = []
161
- max_articles_per_category = 2
162
-
163
- with ThreadPoolExecutor(max_workers=3) as executor:
164
- for interest in interests:
165
- if interest not in NEWS_SOURCES:
166
- continue
167
-
168
- future_to_url = {
169
- executor.submit(fetch_feed_with_timeout, url): url
170
- for url in NEWS_SOURCES[interest]
171
- }
172
-
173
- category_count = 0
174
- for future in future_to_url:
175
- if category_count >= max_articles_per_category:
176
- break
177
-
178
- try:
179
- feed = future.result(timeout=15)
180
- if not feed:
181
- continue
182
-
183
- for entry in feed.entries:
184
- published_date = entry.get('published', '') or entry.get('updated', '')
185
-
186
- if not is_recent_article(published_date):
187
- continue
188
-
189
- description = entry.get('description', '') or entry.get('summary', '')
190
- description = clean_text(description)
191
-
192
- if len(description) < 50:
193
- continue
194
-
195
- article = {
196
- 'title': clean_text(entry.get('title', 'Untitled')),
197
- 'description': description,
198
- 'category': interest,
199
- 'link': entry.get('link', ''),
200
- 'published': published_date
201
- }
202
- articles.append(article)
203
- category_count += 1
204
-
205
- if category_count >= max_articles_per_category:
206
- break
207
-
208
- except (TimeoutError, Exception) as e:
209
- logging.error(f"Error processing feed: {e}")
210
- continue
211
-
212
- return articles
213
-
214
- def generate_summary(text, title="", category=""):
215
- """Generate summary with enhanced prompting"""
216
  if not summarizer:
217
- if not initialize_summarizer():
218
  return None
219
 
220
  try:
221
  # Check cache first
222
  content_hash = get_content_hash(text)
223
- cached_summary = news_cache.get_summary(content_hash)
224
  if cached_summary:
225
  return cached_summary
226
 
227
- # Enhanced prompt template for better summaries
228
  prompt_template = f"""
229
  Analyze and summarize this {category} news article titled "{title}".
230
  Focus on providing:
@@ -239,7 +147,6 @@ Article text:
239
 
240
  Please provide a clear, concise summary that a general audience can understand:"""
241
 
242
- # Prepare input text
243
  prompted_text = prompt_template.format(text=text[:1024])
244
 
245
  result = summarizer(prompted_text,
@@ -251,12 +158,16 @@ Please provide a clear, concise summary that a general audience can understand:"
251
  if result and len(result) > 0:
252
  summary = result[0]['summary_text']
253
 
254
- # Post-process summary for better readability
255
  summary = summary.replace(" .", ".").replace(" ,", ",")
256
  sentences = summary.split(". ")
257
  formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
258
 
259
- news_cache.store_summary(content_hash, formatted_summary)
 
 
 
 
260
  return formatted_summary
261
 
262
  return None
@@ -266,7 +177,7 @@ Please provide a clear, concise summary that a general audience can understand:"
266
  return None
267
 
268
  def get_personalized_summary(name, progress=gr.Progress()):
269
- """Generate personalized news summary"""
270
  start_time = time.time()
271
  logging.info(f"Starting summary generation for user: {name}")
272
 
@@ -281,19 +192,21 @@ def get_personalized_summary(name, progress=gr.Progress()):
281
  except Exception as e:
282
  return f"Error loading preferences: {e}"
283
 
 
 
284
  # Fetch articles with progress
285
  progress(0.2, desc="Fetching recent news...")
286
  articles = fetch_news_from_rss(preferences["interests"])
287
 
288
  if not articles:
289
- return "No recent news articles found from the last 8 hours. Please try again later."
290
 
291
  # Process articles with timeout
292
  progress(0.4, desc="Analyzing and summarizing...")
293
  summaries = []
294
  total_articles = len(articles)
295
 
296
- max_processing_time = 60 # Maximum processing time in seconds
297
 
298
  for i, article in enumerate(articles):
299
  if time.time() - start_time > max_processing_time:
@@ -313,18 +226,24 @@ def get_personalized_summary(name, progress=gr.Progress()):
313
  if not content:
314
  continue
315
 
316
- summary = generate_summary(content, title, category)
317
  if not summary:
318
  continue
319
 
 
 
 
 
 
 
320
  formatted_summary = f"""
321
  📰 {title}
322
- 📁 Category: {category}
323
- ⏰ Published: {published_str}
324
 
325
  {summary}
326
 
327
- 🔗 Read more: {link}
328
 
329
  ---"""
330
  summaries.append(formatted_summary)
@@ -334,11 +253,13 @@ def get_personalized_summary(name, progress=gr.Progress()):
334
  continue
335
 
336
  if not summaries:
337
- return "Unable to generate summaries for recent news. Please try again."
338
 
339
  progress(1.0, desc="Done!")
340
  return "\n".join(summaries)
341
 
 
 
342
  # Gradio interface
343
  with gr.Blocks(title="Enhanced News Summarizer") as demo:
344
  gr.Markdown("# 📰 Enhanced AI News Summarizer")
 
1
import gradio as gr
import feedparser
import json
from datetime import datetime, timedelta
# Fix: transformers has no AutoModelForSeq2SeqGeneration; the auto class
# for MarianMT (seq2seq) checkpoints is AutoModelForSeq2SeqLM.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
 
20
  format='%(asctime)s - %(levelname)s - %(message)s'
21
  )
22
 
23
# Display language -> ISO 639-1 code plus the MarianMT model used to
# translate the English summary into that language (None = no translation).
LANGUAGE_CODES = {"English": {"code": "en", "model": None}}

# NOTE(review): the Hub repo suffix is not always the ISO code — Japanese
# uses "jap" — and some pairs (e.g. en-pt) may not exist as plain opus-mt
# repos; confirm each model id resolves on the Hugging Face Hub.
for _language, _iso, _suffix in (
    ("Spanish", "es", "es"),
    ("French", "fr", "fr"),
    ("German", "de", "de"),
    ("Italian", "it", "it"),
    ("Portuguese", "pt", "pt"),
    ("Dutch", "nl", "nl"),
    ("Russian", "ru", "ru"),
    ("Chinese", "zh", "zh"),
    ("Japanese", "ja", "jap"),
    ("Arabic", "ar", "ar"),
):
    LANGUAGE_CODES[_language] = {
        "code": _iso,
        "model": f"Helsinki-NLP/opus-mt-en-{_suffix}",
    }
37
 
38
# RSS feed URLs grouped by user-selectable interest category.
# Restored: the commit replaced this definition with a placeholder comment
# ("[Previous NEWS_SOURCES definition remains the same...]"), which made
# fetch_news_from_rss raise NameError at runtime.
NEWS_SOURCES = {
    "Technology": [
        "https://feeds.feedburner.com/TechCrunch/",
        "https://www.theverge.com/rss/index.xml",
        "https://www.wired.com/feed/rss",
        "https://feeds.feedburner.com/TheNextWeb",
    ],
    "Business": [
        "https://feeds.feedburner.com/forbes/business",
        "https://www.ft.com/rss/home",
        "https://feeds.bloomberg.com/markets/news.rss",
        "https://www.aljazeera.com/xml/rss/all.xml",
    ],
    "Science": [
        "https://rss.sciencedaily.com/all.xml",
        "https://www.nature.com/nature.rss",
        "https://science.nasa.gov/rss.xml",
    ],
    "Health": [
        "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
        "https://www.who.int/rss-feeds/news-english.xml",
        "https://www.healthline.com/rss/news",
    ],
    "World News": [
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "https://feeds.bbci.co.uk/news/world/rss.xml",
        "https://www.reuters.com/rssFeed/world",
        "https://arabic.cnn.com/rss",
    ],
}
39
 
40
  # Initialize global variables
41
  summarizer = None
42
+ translators = {}
43
 
44
class NewsCache:
    """In-memory cache of generated summaries, keyed by content hash and
    (optionally) target language, with simple oldest-first eviction."""

    def __init__(self):
        self.summaries = {}
        self.translations = {}
        self.max_cache_size = 1000

    @staticmethod
    def _cache_key(content_hash, language):
        # Language-specific summaries get their own slot; plain hash otherwise.
        return f"{content_hash}_{language}" if language else content_hash

    def store_summary(self, content_hash, summary, language=None):
        """Insert a summary, evicting the oldest entry when the cache is full."""
        if len(self.summaries) >= self.max_cache_size:
            # dicts preserve insertion order, so the first key is the oldest.
            oldest = next(iter(self.summaries))
            del self.summaries[oldest]
        self.summaries[self._cache_key(content_hash, language)] = summary

    def get_summary(self, content_hash, language=None):
        """Return the cached summary for this hash/language, or None."""
        return self.summaries.get(self._cache_key(content_hash, language))
62
 
63
  news_cache = NewsCache()
64
 
65
def initialize_models():
    """Load the summarization pipeline and one MarianMT translator per
    supported non-English language.

    Returns:
        bool: True if the summarizer loaded. Individual translator failures
        are logged and skipped so one bad model id does not disable the app.
    """
    global summarizer, translators

    try:
        # Summarizer runs on CPU (device=-1) to keep deployment simple.
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=-1,
        )

        # Fix: transformers has no AutoModelForSeq2SeqGeneration class; the
        # correct auto class for MarianMT checkpoints is AutoModelForSeq2SeqLM.
        for lang, info in LANGUAGE_CODES.items():
            if not info["model"]:  # English needs no translator
                continue
            try:
                model = AutoModelForSeq2SeqLM.from_pretrained(info["model"])
                tokenizer = AutoTokenizer.from_pretrained(info["model"])
                translators[lang] = (model, tokenizer)
                logging.info(f"Initialized translator for {lang}")
            except Exception as e:
                logging.error(f"Error initializing translator for {lang}: {e}")

        return True
    except Exception as e:
        logging.error(f"Error initializing models: {e}")
        return False
92
 
93
def translate_text(text, target_language):
    """Translate English *text* into *target_language* with the preloaded
    MarianMT model.

    Returns the input unchanged when no translation is needed (English or
    empty text) or when a translator is missing / translation fails.
    """
    if target_language == "English" or not text:
        return text

    try:
        if target_language not in translators:
            logging.error(f"Translator not found for {target_language}")
            return text

        model, tokenizer = translators[target_language]

        # Fix: the previous version sliced the string every 512 characters,
        # which could cut a word in half at every chunk boundary. Chunk on
        # whitespace boundaries instead, keeping chunks under the budget;
        # tokenizer truncation still guards any single oversized word.
        max_chunk_chars = 512
        chunks = []
        current_words = []
        current_len = 0
        for word in text.split(" "):
            # +1 accounts for the joining space.
            if current_words and current_len + 1 + len(word) > max_chunk_chars:
                chunks.append(" ".join(current_words))
                current_words = []
                current_len = 0
            current_words.append(word)
            current_len += len(word) + (1 if current_len else 0)
        if current_words:
            chunks.append(" ".join(current_words))

        translated_chunks = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            translated = model.generate(**inputs)
            translated_chunks.append(
                tokenizer.decode(translated[0], skip_special_tokens=True)
            )

        return " ".join(translated_chunks)

    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text
121
 
122
+ def generate_summary(text, title="", category="", language="English"):
123
+ """Generate summary with translation support"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  if not summarizer:
125
+ if not initialize_models():
126
  return None
127
 
128
  try:
129
  # Check cache first
130
  content_hash = get_content_hash(text)
131
+ cached_summary = news_cache.get_summary(content_hash, language)
132
  if cached_summary:
133
  return cached_summary
134
 
135
+ # Generate English summary first
136
  prompt_template = f"""
137
  Analyze and summarize this {category} news article titled "{title}".
138
  Focus on providing:
 
147
 
148
  Please provide a clear, concise summary that a general audience can understand:"""
149
 
 
150
  prompted_text = prompt_template.format(text=text[:1024])
151
 
152
  result = summarizer(prompted_text,
 
158
  if result and len(result) > 0:
159
  summary = result[0]['summary_text']
160
 
161
+ # Post-process summary
162
  summary = summary.replace(" .", ".").replace(" ,", ",")
163
  sentences = summary.split(". ")
164
  formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
165
 
166
+ # Translate if needed
167
+ if language != "English":
168
+ formatted_summary = translate_text(formatted_summary, language)
169
+
170
+ news_cache.store_summary(content_hash, formatted_summary, language)
171
  return formatted_summary
172
 
173
  return None
 
177
  return None
178
 
179
  def get_personalized_summary(name, progress=gr.Progress()):
180
+ """Generate personalized news summary in user's preferred language"""
181
  start_time = time.time()
182
  logging.info(f"Starting summary generation for user: {name}")
183
 
 
192
  except Exception as e:
193
  return f"Error loading preferences: {e}"
194
 
195
+ user_language = preferences.get("language", "English")
196
+
197
  # Fetch articles with progress
198
  progress(0.2, desc="Fetching recent news...")
199
  articles = fetch_news_from_rss(preferences["interests"])
200
 
201
  if not articles:
202
+ return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
203
 
204
  # Process articles with timeout
205
  progress(0.4, desc="Analyzing and summarizing...")
206
  summaries = []
207
  total_articles = len(articles)
208
 
209
+ max_processing_time = 60
210
 
211
  for i, article in enumerate(articles):
212
  if time.time() - start_time > max_processing_time:
 
226
  if not content:
227
  continue
228
 
229
+ summary = generate_summary(content, title, category, user_language)
230
  if not summary:
231
  continue
232
 
233
+ # Translate title and category if needed
234
+ if user_language != "English":
235
+ title = translate_text(title, user_language)
236
+ category = translate_text(category, user_language)
237
+ published_str = translate_text(published_str, user_language)
238
+
239
  formatted_summary = f"""
240
  📰 {title}
241
+ 📁 {translate_text("Category", user_language)}: {category}
242
+ {translate_text("Published", user_language)}: {published_str}
243
 
244
  {summary}
245
 
246
+ 🔗 {translate_text("Read more", user_language)}: {link}
247
 
248
  ---"""
249
  summaries.append(formatted_summary)
 
253
  continue
254
 
255
  if not summaries:
256
+ return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
257
 
258
  progress(1.0, desc="Done!")
259
  return "\n".join(summaries)
260
 
261
# Restored helpers: the commit replaced these definitions with a
# "[Rest of the code remains the same...]" placeholder, but the surviving
# code still calls them (get_content_hash, clean_text, fetch_news_from_rss,
# is_recent_article), so the app crashed with NameError at runtime.

def get_content_hash(content):
    """Generate an MD5 hex digest of *content* to use as a cache key."""
    return hashlib.md5(content.encode()).hexdigest()

def clean_text(text):
    """Strip HTML tags and collapse whitespace; returns "" for falsy input."""
    if not text:
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()
    return " ".join(text.split())

@lru_cache(maxsize=100)
def fetch_feed_with_timeout(url):
    """Fetch and parse an RSS feed with a 10s timeout; None on failure."""
    try:
        response = requests.get(url, timeout=10)
        return feedparser.parse(response.content)
    except Exception as e:
        logging.error(f"Error fetching feed {url}: {e}")
        return None

def parse_date(date_str):
    """Parse an RFC-2822 or ISO-8601 date string; None if unparseable."""
    try:
        return parsedate_to_datetime(date_str)
    except (TypeError, ValueError):
        try:
            return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except (TypeError, ValueError):
            return None

def is_recent_article(published_date, hours=8):
    """Return True if *published_date* falls within the last *hours* hours."""
    if not published_date:
        return False
    try:
        parsed_date = parse_date(published_date)
        if not parsed_date:
            return False
        # Naive timestamps are assumed to be UTC.
        if parsed_date.tzinfo is None:
            parsed_date = pytz.UTC.localize(parsed_date)
        now = datetime.now(pytz.UTC)
        time_difference = now - parsed_date
        return time_difference <= timedelta(hours=hours)
    except Exception as e:
        logging.error(f"Error parsing date: {e}")
        return False

def fetch_news_from_rss(interests):
    """Fetch up to 2 recent (last 8h) articles per selected category."""
    articles = []
    max_articles_per_category = 2

    with ThreadPoolExecutor(max_workers=3) as executor:
        for interest in interests:
            if interest not in NEWS_SOURCES:
                continue

            future_to_url = {
                executor.submit(fetch_feed_with_timeout, url): url
                for url in NEWS_SOURCES[interest]
            }

            category_count = 0
            for future in future_to_url:
                if category_count >= max_articles_per_category:
                    break
                try:
                    feed = future.result(timeout=15)
                    if not feed:
                        continue

                    for entry in feed.entries:
                        published_date = entry.get('published', '') or entry.get('updated', '')
                        if not is_recent_article(published_date):
                            continue

                        description = entry.get('description', '') or entry.get('summary', '')
                        description = clean_text(description)
                        # Skip stub entries with no real body text.
                        if len(description) < 50:
                            continue

                        article = {
                            'title': clean_text(entry.get('title', 'Untitled')),
                            'description': description,
                            'category': interest,
                            'link': entry.get('link', ''),
                            'published': published_date
                        }
                        articles.append(article)
                        category_count += 1

                        if category_count >= max_articles_per_category:
                            break
                except (TimeoutError, Exception) as e:
                    logging.error(f"Error processing feed: {e}")
                    continue

    return articles
263
  # Gradio interface
264
  with gr.Blocks(title="Enhanced News Summarizer") as demo:
265
  gr.Markdown("# 📰 Enhanced AI News Summarizer")