# Hugging Face Space: loayshabet/news-sumarry (status: Running)
# File size: 11,738 bytes

import hashlib
import html
import logging
import os
import threading
from datetime import datetime, timedelta

import feedparser
import gradio as gr
import pytz
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings
# The OpenRouter credential is read from the environment so that no secret is
# stored in the source file (on Hugging Face Spaces, define it as a Space
# secret named OPENROUTER_API_KEY).
# NOTE: a key was previously hard-coded on this line; treat that key as
# compromised and revoke it with the provider.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not OPENROUTER_API_KEY:
    logger.warning(
        "OPENROUTER_API_KEY is not set; OpenRouter summarization requests will fail."
    )
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Labels offered by the model selector in the UI, mapped to a backend
# identifier. summarize_text() branches on the label text (the keys); the
# mapped values are not read anywhere in the visible code.
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "local_bart",
    "Free Model (distilbart-cnn-6-6)": "local_distilbart",
    "OpenRouter (Claude-3)": "anthropic/claude-3-haiku"
}

# Capacity handed to the module-level NewsCache instance.
CACHE_SIZE = 500
# Despite the name, fetch_rss_news() uses this as a maximum article age:
# entries published earlier than (now - RSS_FETCH_INTERVAL) are dropped.
RSS_FETCH_INTERVAL = timedelta(hours=8)
# Number of newest articles fetch_rss_news() keeps after sorting by date.
ARTICLE_LIMIT = 5

# Category names. NOTE(review): CATEGORIES is not referenced elsewhere in the
# visible code; fetch_rss_news() spells the category names out itself.
CATEGORIES = ["Technology", "Business", "Science", "World News", "Sports", "Health"]
# Category name -> {source label -> RSS feed URL}.
# The source labels are used as the checkbox choices in the UI, and
# fetch_rss_news() looks the selected labels up here to find the feed URL.
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best",
        "alarabiya arabic": "https://www.alarabiya.net/feed/rss2/ar/technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best",
        "alwatanvoice arabic": "https://feeds.alwatanvoice.com/ar/business.xml",
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/rss",
        "aljazera arabic": "https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9",
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best",
        "france24 arabic": "https://www.france24.com/ar/%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/rss",
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}

class NewsCache:
    """Lock-protected, size-bounded key/value store with oldest-first eviction.

    Entries are kept in insertion order (plain ``dict``); when a *new* key is
    added to a full cache, the entry that was inserted first is discarded.
    Updating the value of a key that is already present never evicts anything
    and keeps that key's original position.
    """

    def __init__(self, size):
        """Create an empty cache holding at most ``size`` entries.

        A ``size`` of zero or less disables caching: ``set`` becomes a no-op.
        """
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        """Return the value stored under ``key``, or ``None`` when absent."""
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        """Store ``value`` under ``key``, evicting the oldest entry if needed."""
        with self.lock:
            if self.size <= 0:
                # Nothing can be stored; avoids iterating an empty dict below.
                return
            if key in self.cache:
                # Overwrite in place: the entry count does not grow, so no
                # other entry has to be sacrificed.
                self.cache[key] = value
                return
            while len(self.cache) >= self.size:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value

# Module-level cache of generated summaries; summarize_text() reads and writes it.
cache = NewsCache(CACHE_SIZE)

def detect_language(text):
    """Tell whether ``text`` is predominantly written in Arabic script.

    Returns ``True`` when characters in the range U+0600..U+06FF account for
    more than half of *all* characters in ``text`` (spaces and punctuation
    count towards the total). Empty or missing text yields ``False``.
    """
    if not text:
        return False

    arabic_count = 0
    for ch in text:
        if '\u0600' <= ch <= '\u06FF':
            arabic_count += 1

    ratio = arabic_count / len(text)
    return ratio > 0.5

# Loaded summarization pipelines keyed by model path, so that each model is
# instantiated once per process instead of once per article.
_SUMMARIZER_PIPELINES = {}

# Seconds to wait for the OpenRouter HTTP call; without a timeout a stalled
# connection would block the request indefinitely.
_OPENROUTER_TIMEOUT_SECONDS = 30


def _get_local_summarizer(model_path):
    """Return the summarization pipeline for ``model_path``, loading it on first use."""
    summarizer = _SUMMARIZER_PIPELINES.get(model_path)
    if summarizer is None:
        summarizer = pipeline("summarization", model=model_path, device=-1)
        _SUMMARIZER_PIPELINES[model_path] = summarizer
    return summarizer


def _summarize_with_openrouter(text):
    """Request a summary of ``text`` from the OpenRouter chat-completions API."""
    if not OPENROUTER_API_KEY:
        raise RuntimeError("OpenRouter API key is not configured")

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "HTTP-Referer": "https://localhost:7860",  # Replace with your actual domain
        "X-Title": "News Summarizer App",
        "Content-Type": "application/json"
    }

    prompt = f"Please provide a concise summary of the following news article in the same language as the original text. Keep the summary brief and focused on key points:\n\n{text}"

    data = {
        "model": "anthropic/claude-3-haiku",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 150
    }

    response = requests.post(
        OPENROUTER_API_URL,
        headers=headers,
        json=data,
        timeout=_OPENROUTER_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


def summarize_text(text, model_name):
    """Summarize ``text`` with the selected model and cache the result.

    Args:
        text: Article text to summarize.
        model_name: One of the UI model labels. Text detected as Arabic is
            always sent to OpenRouter, whatever label is selected; otherwise
            "OpenRouter (Claude-3)" selects OpenRouter,
            "Default (facebook/bart-large-cnn)" selects the local BART model
            and any other label selects the local DistilBART model.

    Returns:
        The summary string. On any failure (including empty input) a string
        starting with "Summary unavailable. Error:" is returned instead of
        raising; such error strings are never cached.
    """
    # Nothing to summarize: avoid a paid API call or a model run on empty input.
    if not isinstance(text, str) or not text.strip():
        return "Summary unavailable. Error: no text to summarize."

    try:
        use_openrouter = detect_language(text) or model_name == "OpenRouter (Claude-3)"
        if use_openrouter:
            backend = "anthropic/claude-3-haiku"
        elif model_name == "Default (facebook/bart-large-cnn)":
            backend = "facebook/bart-large-cnn"
        else:
            backend = "sshleifer/distilbart-cnn-6-6"

        # The key includes the backend so a summary produced by one model is
        # not served when a different model is requested for the same text.
        # MD5 is used only as a cache key here, not for security.
        content_hash = hashlib.md5(f"{backend}\n{text}".encode()).hexdigest()
        cached_summary = cache.get(content_hash)
        if cached_summary is not None:
            logger.info("Using cached summary")
            return cached_summary

        if use_openrouter:
            logger.info("Using OpenRouter for summarization")
            summary = _summarize_with_openrouter(text)
        else:
            logger.info("Using local model for summarization")
            summarizer = _get_local_summarizer(backend)
            result = summarizer(text, max_length=120, min_length=40, truncation=True)
            summary = result[0]['summary_text']

        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        # Boundary handler: the UI shows the message instead of crashing.
        logger.exception("Error in summarization: %s", e)
        return f"Summary unavailable. Error: {str(e)}"

def _entry_to_article(entry, category, source, cutoff_time):
    """Build an article dict from one feed entry, or return ``None`` to skip it.

    The entry is skipped when it has no parsed timestamp or when the timestamp
    is not newer than ``cutoff_time``. A missing title, description or link
    raises ``AttributeError``, which the caller treats as "skip this entry".
    """
    # Some feeds only carry an "updated" date; use it when "published" is absent.
    time_tuple = (getattr(entry, "published_parsed", None)
                  or getattr(entry, "updated_parsed", None))
    if not time_tuple:
        return None

    # The parsed time tuple is interpreted as UTC.
    published = datetime(*time_tuple[:6], tzinfo=pytz.UTC)
    if published <= cutoff_time:
        return None

    return {
        "title": entry.title,
        "description": BeautifulSoup(entry.description, "html.parser").get_text(),
        "link": entry.link,
        "category": category,
        "source": source,
        "published": published
    }


def fetch_rss_news(tech_sources, business_sources, science_sources, world_sources, sports_sources, health_sources):
    """Fetch recent articles from the selected RSS sources.

    Each argument is the list of source labels selected for one category (or a
    falsy value when nothing is selected). Labels that are not present in
    NEWS_SOURCES for their category are ignored.

    Returns:
        Up to ARTICLE_LIMIT article dicts (keys: title, description, link,
        category, source, published), newest first, restricted to entries
        newer than ``now - RSS_FETCH_INTERVAL``. Feeds or entries that fail to
        load or parse are logged and skipped.
    """
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    category_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "Science": science_sources or [],
        "World News": world_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or []
    }

    logger.info(f"Selected sources: {category_sources}")

    articles = []
    for category, sources in category_sources.items():
        if not sources:
            continue

        logger.info(f"Processing category: {category} with sources: {sources}")

        for source in sources:
            if source not in NEWS_SOURCES[category]:
                continue
            url = NEWS_SOURCES[category][source]

            try:
                logger.info(f"Fetching from URL: {url}")
                feed = feedparser.parse(url)

                # feedparser reports 301/302 for feeds it fetched through a
                # redirect, with the entries parsed normally, so only client
                # and server errors are treated as failures.
                status = getattr(feed, "status", None)
                if status is not None and status >= 400:
                    logger.warning(f"Failed to fetch feed from {url}. Status: {status}")
                    continue

                for entry in feed.entries:
                    try:
                        article = _entry_to_article(entry, category, source, cutoff_time)
                    except (AttributeError, TypeError, ValueError) as e:
                        # One malformed entry must not discard the rest of the feed.
                        logger.error(f"Error processing entry: {str(e)}")
                        continue
                    if article is not None:
                        articles.append(article)

            except Exception as e:
                logger.error(f"Error fetching feed from {url}: {str(e)}")
                continue

    logger.info(f"Total articles fetched: {len(articles)}")
    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles

def summarize_articles(articles, model_name):
    """Summarize each article and render the results as an HTML fragment.

    Args:
        articles: Iterable of article dicts with the keys "title",
            "description", "link", "category" and "source".
        model_name: Model label forwarded to summarize_text().

    Returns:
        One HTML ``<div>`` per article, joined with newlines; an empty string
        when ``articles`` is empty.
    """
    summaries = []
    for article in articles:
        summary = summarize_text(article["description"], model_name)

        # Titles, links and summaries originate from external feeds and model
        # output, so they are escaped before being placed into markup that the
        # UI renders as HTML (quotes included, for the href attribute).
        safe = {
            key: html.escape(str(article[key]))
            for key in ("title", "category", "source", "link")
        }
        safe_summary = html.escape(str(summary))

        summaries.append(f"""
        <div style='margin-bottom: 20px; white-space: pre-wrap;'>
        📰 {safe['title']}
        📃 Summary: {safe_summary}
        - 📁 Category: {safe['category']}
        - 💡 Source: {safe['source']}
        - 🔗 Read More: <a href="{safe['link']}" target="_blank" style="text-decoration: none;">click here</a>
        </div>
        """)
    return "\n".join(summaries)

def get_summary(tech_sources, business_sources, science_sources, world_sources,
                sports_sources, health_sources, selected_model):
    """Button handler: fetch recent news for the selected sources and summarize it.

    The first six arguments are the per-category source selections and
    ``selected_model`` is the chosen model label. Returns the HTML produced by
    summarize_articles(), or a plain message when nothing is selected, nothing
    recent is found, or an error occurs.
    """
    selections = (tech_sources, business_sources, science_sources,
                  world_sources, sports_sources, health_sources)
    try:
        if not any(selections):
            return "Please select at least one news source."

        recent_articles = fetch_rss_news(*selections)
        if recent_articles:
            return summarize_articles(recent_articles, selected_model)

        return "No recent news found from the selected sources."
    except Exception as exc:
        logger.error(f"Error in get_summary: {str(exc)}")
        return f"An error occurred while processing your request: {str(exc)}"

# Gradio Interface
# Layout: one row with three columns (source checkboxes for three categories
# each in the first two columns, model selector in the third), followed by the
# action button and the HTML output area.
demo = gr.Blocks()

with demo:
    gr.Markdown("# 📰 AI News Summarizer")

    with gr.Row():
        with gr.Column():
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )

            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )

            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )

        with gr.Column():
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )

            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )

            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )

        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="OpenRouter (Claude-3)"  # OpenRouter is the default model
            )

    # The button and output sit directly under the Blocks container, below the
    # row. (The button line was previously indented to a column that matched
    # no enclosing block, which is an IndentationError.)
    summarize_button = gr.Button("Get News Summary")
    # gr.HTML is used because summarize_articles() returns HTML markup.
    summary_output = gr.HTML(label="News Summary")

    # The order of inputs must match the parameter order of get_summary().
    summarize_button.click(
        get_summary,
        inputs=[
            tech_sources,
            business_sources,
            science_sources,
            world_sources,
            sports_sources,
            health_sources,
            model_selector
        ],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()