loayshabet committed on
Commit
2cc6057
·
verified ·
1 Parent(s): 9dbc598

Update app.py

Files changed (1)
  1. app.py +46 -122
app.py CHANGED
@@ -1,5 +1,5 @@
  import gradio as gr
- from transformers import pipeline
  import feedparser
  from datetime import datetime, timedelta
  import pytz
@@ -8,133 +8,57 @@ import hashlib
  import threading
  import logging

  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Global settings
- SUMMARIZER_MODELS = {
-     "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
-     "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6"
- }
- CACHE_SIZE = 500
- RSS_FETCH_INTERVAL = timedelta(hours=8)
- ARTICLE_LIMIT = 5
-
- # Updated categories and news sources
- CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
- NEWS_SOURCES = {
-     "Technology": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-         "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
-         "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
-     },
-     "Business": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
-         "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
-         "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
-     },
-     "Science": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
-     },
-     "World News": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
-         "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
-         "CNN": "http://rss.cnn.com/rss/edition_world.rss",
-         "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
-         "france24 arabic": "https://www.france24.com/ar/rss",
-         "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
-     },
-     "Sports": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
-         "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
-         "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
-     },
-     "Health": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
-         "politico": "http://rss.politico.com/healthcare.xml",
-         "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
-     },
- }
-
- class NewsCache:
-     def __init__(self, size):
-         self.cache = {}
-         self.size = size
-         self.lock = threading.Lock()
-
-     def get(self, key):
-         with self.lock:
-             return self.cache.get(key)

-     def set(self, key, value):
-         with self.lock:
-             if len(self.cache) >= self.size:
-                 oldest_key = next(iter(self.cache))
-                 del self.cache[oldest_key]
-             self.cache[key] = value
-
- cache = NewsCache(CACHE_SIZE)
-
- def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
-     articles = []
-     cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
-
-     # Create a mapping of selected sources
-     category_sources = {
-         "Technology": tech_sources if tech_sources else [],
-         "Business": business_sources if business_sources else [],
-         "Science": science_sources if science_sources else [],
-         "World News": world_sources if world_sources else [],
-         "Sports": sports_sources if sports_sources else [],
-         "Health": health_sources if health_sources else []
-     }

-     logger.info(f"Selected sources: {category_sources}")
-
-     for category, sources in category_sources.items():
-         if not sources:  # Skip if no sources selected for this category
-             continue
-
-         logger.info(f"Processing category: {category} with sources: {sources}")
-
-         for source in sources:
-             if source in NEWS_SOURCES[category]:
-                 url = NEWS_SOURCES[category][source]
-                 try:
-                     logger.info(f"Fetching from URL: {url}")
-                     feed = feedparser.parse(url)
-
-                     if hasattr(feed, 'status') and feed.status != 200:
-                         logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
-                         continue
-
-                     for entry in feed.entries:
-                         try:
-                             published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
-                             if published > cutoff_time:
-                                 articles.append({
-                                     "title": entry.title,
-                                     "description": BeautifulSoup(entry.description, "html.parser").get_text(),
-                                     "link": entry.link,
-                                     "category": category,
-                                     "source": source,
-                                     "published": published
-                                 })
-                         except (AttributeError, TypeError) as e:
-                             logger.error(f"Error processing entry: {str(e)}")
-                             continue
-
-                 except Exception as e:
-                     logger.error(f"Error fetching feed from {url}: {str(e)}")
-                     continue
-
-     logger.info(f"Total articles fetched: {len(articles)}")
-     articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
-     return articles
-
- def summarize_text(text, model_name):
      try:
          summarizer = pipeline("summarization", model=model_name, device=-1)
          content_hash = hashlib.md5(text.encode()).hexdigest()
          cached_summary = cache.get(content_hash)
@@ -156,7 +80,7 @@ def summarize_articles(articles, model_name):
      summaries = []
      for article in articles:
          content = article["description"]
-         summary = summarize_text(content, model_name)
          summaries.append(f"""
  📰 {article['title']}
  - 📍 Category: {article['category']}
 
  import gradio as gr
+ from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
  import feedparser
  from datetime import datetime, timedelta
  import pytz
 
  import threading
  import logging

+ # Add this to your imports
+ from transformers import MarianMTModel, MarianTokenizer
+
  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Add translation model configuration
+ TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-ar-en"

+ class Translator:
+     def __init__(self):
+         self.model = None
+         self.tokenizer = None

+     def load_model(self):
+         if self.model is None:
+             try:
+                 self.tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_MODEL)
+                 self.model = MarianMTModel.from_pretrained(TRANSLATION_MODEL)
+                 logger.info("Translation model loaded successfully")
+             except Exception as e:
+                 logger.error(f"Error loading translation model: {str(e)}")
+                 raise
+
+     def translate(self, text):
+         try:
+             self.load_model()
+             inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+             translated = self.model.generate(**inputs)
+             return self.tokenizer.decode(translated[0], skip_special_tokens=True)
+         except Exception as e:
+             logger.error(f"Translation error: {str(e)}")
+             return text
+
+ # Initialize translator
+ translator = Translator()
+
+ # Rest of your existing configurations...
+ [Your existing SUMMARIZER_MODELS, CACHE_SIZE, RSS_FETCH_INTERVAL, ARTICLE_LIMIT, CATEGORIES, and NEWS_SOURCES definitions]
+
+ def is_arabic_source(source_name):
+     return any(arabic_indicator in source_name.lower() for arabic_indicator in ['arabic', 'alarabiya', 'aljazeera', 'alwatanvoice'])
+
+ def summarize_text(text, model_name, source):
      try:
+         # Translate if it's an Arabic source
+         if is_arabic_source(source):
+             logger.info("Translating Arabic content before summarization")
+             text = translator.translate(text)
+
          summarizer = pipeline("summarization", model=model_name, device=-1)
          content_hash = hashlib.md5(text.encode()).hexdigest()
          cached_summary = cache.get(content_hash)
 
      summaries = []
      for article in articles:
          content = article["description"]
+         summary = summarize_text(content, model_name, article['source'])
          summaries.append(f"""
  📰 {article['title']}
  - 📍 Category: {article['category']}