import gradio as gr
from transformers import pipeline
import feedparser
from datetime import datetime, timedelta
import pytz
from bs4 import BeautifulSoup
import hashlib
import threading
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global settings
SUMMARIZER_MODELS = {
    "Default (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
    "Free Model (distilbart-cnn-6-6)": "sshleifer/distilbart-cnn-6-6"
}
CACHE_SIZE = 500
RSS_FETCH_INTERVAL = timedelta(hours=8)
ARTICLE_LIMIT = 5

# News sources, grouped by fixed category
CATEGORIES = ["Technology", "Business", "World News", "Science", "Sports", "Health"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "World News": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "politico": "http://rss.politico.com/healthcare.xml",
        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
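
# Adding a feed is a one-line change under its category, e.g. a hypothetical
# extra Science source (URL not verified):
#   NEWS_SOURCES["Science"]["BBC"] = "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml"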

class NewsCache:
    """Thread-safe FIFO cache: dicts preserve insertion order (Python 3.7+),
    so the first key is always the oldest entry."""

    def __init__(self, size):
        self.cache = {}
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        with self.lock:
            # Evict the oldest entry once the cache is full
            if len(self.cache) >= self.size:
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value

cache = NewsCache(CACHE_SIZE)
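
# Illustrative cache usage (not executed): summaries are keyed by an MD5 hash
# of the article text, so identical text across refreshes reuses the summary:
#   key = hashlib.md5(b"article text").hexdigest()
#   cache.set(key, "summary"); cache.get(key)  # -> "summary"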

def fetch_rss_news(tech_sources, business_sources, world_sources,
                   science_sources, sports_sources, health_sources):
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
    
    # Create a mapping of selected sources (all six categories are passed in
    # explicitly; previously science/sports/health were undefined here)
    selected_sources = {
        "Technology": tech_sources or [],
        "Business": business_sources or [],
        "World News": world_sources or [],
        "Science": science_sources or [],
        "Sports": sports_sources or [],
        "Health": health_sources or [],
    }
    
    logger.info(f"Selected sources: {selected_sources}")
    
    for category, sources in selected_sources.items():
        if not sources:  # Skip if no sources selected for this category
            continue
            
        logger.info(f"Processing category: {category} with sources: {sources}")
        
        for source in sources:
            if source in NEWS_SOURCES[category]:
                url = NEWS_SOURCES[category][source]
                try:
                    logger.info(f"Fetching from URL: {url}")
                    feed = feedparser.parse(url)
                    
                    if hasattr(feed, 'status') and feed.status != 200:
                        logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
                        continue
                        
                    for entry in feed.entries:
                        try:
                            # published_parsed is a time.struct_time; entries
                            # without it raise and are skipped by the handler below
                            published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                            if published > cutoff_time:
                                articles.append({
                                    "title": entry.title,
                                    "description": BeautifulSoup(entry.description, "html.parser").get_text(),
                                    "link": entry.link,
                                    "category": category,
                                    "source": source,
                                    "published": published
                                })
                        except (AttributeError, TypeError) as e:
                            logger.error(f"Error processing entry: {str(e)}")
                            continue
                            
                except Exception as e:
                    logger.error(f"Error fetching feed from {url}: {str(e)}")
                    continue
    
    logger.info(f"Total articles fetched: {len(articles)}")
    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles
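
# Example call (illustrative): fetch only BBC world news, with every other
# category deselected:
#   fetch_rss_news([], [], ["BBC"], [], [], [])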

def summarize_text(text, model_name):
    try:
        # Check the cache (keyed by an MD5 hash of the text) before loading
        # the model, so cache hits skip the expensive pipeline construction
        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)
        
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary
            
        logger.info(f"Generating new summary using model: {model_name}")
        summarizer = pipeline("summarization", model=model_name, device=-1)  # device=-1 runs on CPU
        result = summarizer(text, max_length=120, min_length=40, truncation=True)
        summary = result[0]['summary_text']
        cache.set(content_hash, summary)
        return summary
    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        return "Summary unavailable."

def summarize_articles(articles, model_name):
    summaries = []
    for article in articles:
        content = article["description"]
        summary = summarize_text(content, model_name)
        summaries.append(f"""
        πŸ“° {article['title']}
        - πŸ“ Category: {article['category']}
        - πŸ’‘ Source: {article['source']}
        - πŸ”— Read More: {article['link']}
        πŸ“ƒ Summary: {summary}
        """)
    return "\n".join(summaries)

def generate_summary(tech_sources, business_sources, world_sources,
                     science_sources, sports_sources, health_sources, model_name):
    logger.info(f"""
    Generating summary with:
    - Tech sources: {tech_sources}
    - Business sources: {business_sources}
    - World sources: {world_sources}
    - Science sources: {science_sources}
    - Sports sources: {sports_sources}
    - Health sources: {health_sources}
    - Model: {model_name}
    """)
    
    # Require at least one selected source across all categories
    all_selected = [tech_sources, business_sources, world_sources,
                    science_sources, sports_sources, health_sources]
    if not any(all_selected):
        return "Please select at least one news source."
    
    try:
        articles = fetch_rss_news(tech_sources, business_sources, world_sources,
                                  science_sources, sports_sources, health_sources)
        if not articles:
            return "No recent news found from the selected sources."
        return summarize_articles(articles, model_name)
    except Exception as e:
        logger.error(f"Error in generate_summary: {str(e)}")
        return "An error occurred while generating the summary. Please try again."

# Gradio Interface
demo = gr.Blocks()

with demo:
    gr.Markdown("# πŸ“° AI News Summarizer")
    
    with gr.Row():
        with gr.Column():
            # Create checkbox groups for each category
            tech_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Technology"].keys()),
                label="Technology Sources",
                value=[]
            )
            business_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Business"].keys()),
                label="Business Sources",
                value=[]
            )
            world_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["World News"].keys()),
                label="World News Sources",
                value=[]
            )
            science_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Science"].keys()),
                label="Science Sources",
                value=[]
            )
            sports_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Sports"].keys()),
                label="Sports Sources",
                value=[]
            )
            health_sources = gr.CheckboxGroup(
                choices=list(NEWS_SOURCES["Health"].keys()),
                label="Health Sources",
                value=[]
            )
        
        with gr.Column():
            model_selector = gr.Radio(
                choices=list(SUMMARIZER_MODELS.keys()),
                label="Choose Summarization Model",
                value="Default (facebook/bart-large-cnn)"
            )
    
    summarize_button = gr.Button("Get News Summary")
    summary_output = gr.Textbox(label="News Summary", lines=20)

    def get_summary(tech_sources, business_sources, world_sources,
                    science_sources, sports_sources, health_sources, selected_model):
        try:
            model_name = SUMMARIZER_MODELS[selected_model]
            return generate_summary(tech_sources, business_sources, world_sources,
                                    science_sources, sports_sources, health_sources, model_name)
        except Exception as e:
            logger.error(f"Error in get_summary: {str(e)}")
            return "An error occurred while processing your request. Please try again."

    # Wire all six source groups plus the model selector to the button
    summarize_button.click(
        get_summary,
        inputs=[tech_sources, business_sources, world_sources,
                science_sources, sports_sources, health_sources, model_selector],
        outputs=summary_output
    )

if __name__ == "__main__":
    demo.launch()
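
# Run this file directly (e.g. `python app.py`; filename assumed) to start the
# Gradio UI; launch() serves on http://127.0.0.1:7860 by default.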