loayshabet committed on
Commit
5c1b01d
·
verified ·
1 Parent(s): db799a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -129
app.py CHANGED
@@ -1,33 +1,25 @@
 
1
  import gradio as gr
2
  from transformers import pipeline
3
  import feedparser
4
  from datetime import datetime, timedelta
5
- import json
6
- import os
7
- import logging
8
  import pytz
9
  from bs4 import BeautifulSoup
10
- import hashlib
11
- import threading
12
-
13
- # Logging setup
14
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
 
16
  # Global settings
17
- SUMMARIZER_MODEL = "facebook/bart-large-cnn" # You can replace this with other summarization models
18
- CACHE_SIZE = 500 # Maximum number of cached summaries
19
- RSS_FETCH_INTERVAL = timedelta(hours=8) # Fetch recent news within the last 8 hours
20
- ARTICLE_LIMIT = 5 # Limit to the last 5 articles
21
 
22
  # News sources
23
  NEWS_SOURCES = {
24
  "Technology": {
25
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
26
- "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
27
  },
28
  "Business": {
29
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
30
- "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
31
  },
32
  "Science": {
33
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
@@ -36,46 +28,23 @@ NEWS_SOURCES = {
36
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
37
  "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
38
  "CNN": "http://rss.cnn.com/rss/edition_world.rss",
39
- "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
40
  },
41
  "Sports": {
42
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
43
- "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
44
  },
45
  "Health": {
46
  "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
47
- "politico": "http://rss.politico.com/healthcare.xml",
48
- "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
49
  },
50
  }
51
 
52
- # Initialize cache
53
- class NewsCache:
54
- def __init__(self, size):
55
- self.cache = {}
56
- self.size = size
57
- self.lock = threading.Lock()
58
-
59
- def get(self, key):
60
- with self.lock:
61
- return self.cache.get(key)
62
-
63
- def set(self, key, value):
64
- with self.lock:
65
- if len(self.cache) >= self.size:
66
- # Remove oldest cached item
67
- oldest_key = next(iter(self.cache))
68
- del self.cache[oldest_key]
69
- self.cache[key] = value
70
-
71
- cache = NewsCache(CACHE_SIZE)
72
-
73
- # Initialize summarizer
74
  summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
75
 
76
- # Utility functions
77
  def fetch_rss_news(categories):
78
- """Fetch news articles from RSS feeds based on selected categories."""
79
  articles = []
80
  cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
81
 
@@ -84,7 +53,6 @@ def fetch_rss_news(categories):
84
  try:
85
  feed = feedparser.parse(url)
86
  for entry in feed.entries:
87
- # Parse publication time
88
  published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
89
  if published > cutoff_time:
90
  articles.append({
@@ -96,115 +64,73 @@ def fetch_rss_news(categories):
96
  "published": published
97
  })
98
  except Exception as e:
99
- logging.error(f"Failed to fetch from {url}: {e}")
100
-
101
- # Sort articles by published date (most recent first) and limit to ARTICLE_LIMIT
102
  articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
103
  return articles
104
 
105
- def summarize_text(text):
106
- """Summarize the text using the AI model."""
107
- content_hash = hashlib.md5(text.encode()).hexdigest()
108
- cached_summary = cache.get(content_hash)
109
- if cached_summary:
110
- return cached_summary
111
-
112
  try:
113
  result = summarizer(text, max_length=120, min_length=40, truncation=True)
114
- summary = result[0]['summary_text']
115
- cache.set(content_hash, summary)
116
- return summary
117
  except Exception as e:
118
- logging.error(f"Summarization failed: {e}")
119
  return "Summary unavailable."
120
 
121
- def summarize_articles(articles, template_choice):
122
- """Summarize the last 5 fetched articles using the selected template."""
 
 
 
123
  summaries = []
 
 
124
  for article in articles:
125
  try:
126
- content = article["description"]
127
- title = article["title"]
128
- category = article["category"]
129
- source = article["source"]
130
- link = article["link"]
131
- published = article["published"].strftime('%Y-%m-%d %H:%M')
132
-
133
- # Summarize article content
134
- summary = summarize_text(content)
135
-
136
- if summary:
137
- if template_choice == "Default":
138
- summaries.append(f"""
139
- 📰 {title}
140
-
141
- - 📁 Category: {category}
142
- - 💡 Source: {source}
143
- - ⏰ Published: {published}
144
-
145
- 📃 Summary:
146
- {summary}
147
- 🔗 Read the full article here ({link})
148
-
149
- ---
150
- """)
151
- elif template_choice == "Free":
152
- summaries.append(f"""
153
- **{title}**
154
-
155
- *Category:* {category}
156
- *Source:* {source}
157
- *Published:* {published}
158
-
159
- *Summary:*
160
- {summary}
161
- *Link:* [{link}]({link})
162
-
163
- ---
164
- """)
165
  except Exception as e:
166
- logging.error(f"Error summarizing article: {e}")
167
- continue
168
  return summaries
169
 
170
- def generate_summary(selected_interests, template_choice):
171
- """Generate a news summary based on user preferences."""
172
- categories = selected_interests
173
  if not categories:
174
- return "No categories selected. Please update your preferences."
175
-
176
- # Fetch news
177
  articles = fetch_rss_news(categories)
178
  if not articles:
179
- return "No recent news found in your selected categories."
180
-
181
- # Summarize all articles
182
- summaries = summarize_articles(articles, template_choice)
183
 
184
- # Combine and return summaries
185
- return "\n\n".join(summaries) if summaries else "No summaries available."
186
 
187
- # Gradio interface
188
  demo = gr.Blocks()
189
 
190
  with demo:
191
- gr.Markdown("# 📰 AI News Summarizer")
192
-
193
  with gr.Row():
194
- interests = gr.CheckboxGroup(
195
- choices=list(NEWS_SOURCES.keys()),
196
- label="Select Your Interests"
197
- )
198
- template_choice = gr.Radio(
199
- choices=["Default", "Free"],
200
- label="Choose Summary Template",
201
- value="Default"
202
- )
203
-
204
- fetch_button = gr.Button("Get Summary")
205
- summary_output = gr.Textbox(label="News Summary", lines=20)
206
-
207
- fetch_button.click(generate_summary, inputs=[interests, template_choice], outputs=summary_output)
208
 
209
  if __name__ == "__main__":
210
  demo.launch()
 
1
+
2
  import gradio as gr
3
  from transformers import pipeline
4
  import feedparser
5
  from datetime import datetime, timedelta
 
 
 
6
  import pytz
7
  from bs4 import BeautifulSoup
 
 
 
 
 
8
 
9
# Global settings
SUMMARIZER_MODEL = "facebook/bart-large-cnn"  # HF model id used by the summarization pipeline
RSS_FETCH_INTERVAL = timedelta(hours=8)       # only keep articles published within this window
ARTICLE_LIMIT = 5                             # max number of articles summarized per request
 
13
 
14
# News sources: category name -> {display source name: RSS feed URL}.
# Category names double as the Gradio checkbox choices.
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
        "Reuters": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
        "Reuters": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
    },
    "Science": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
    },
    # NOTE(review): the '"World": {' line fell inside an elided diff hunk;
    # reconstructed from the entries visible on both sides of the gap.
    "World": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
        "Reuters": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
    },
    "Sports": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
        "Reuters": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
    },
    "Health": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
        "Politico": "http://rss.politico.com/healthcare.xml",
        "Reuters": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
    },
}
43
 
44
# Summarizer initialization: build the transformers summarization pipeline
# once at import time (model from SUMMARIZER_MODEL); device=-1 forces CPU.
summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
46
 
 
47
def fetch_rss_news(categories):
    """Fetch recent articles from the RSS feeds of the selected categories.

    Only entries published within RSS_FETCH_INTERVAL of now (UTC) are kept;
    the newest ARTICLE_LIMIT articles are returned, most recent first.
    Feed-level failures are printed and skipped so one bad feed cannot
    break the whole request.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL

    # NOTE(review): the loop headers and the appended-dict fields fell inside
    # elided diff hunks; reconstructed from the keys read downstream
    # (title / description / link / category / source / published) — confirm
    # against the full file.
    for category in categories:
        for source, url in NEWS_SOURCES.get(category, {}).items():
            try:
                feed = feedparser.parse(url)
                for entry in feed.entries:
                    # Some feeds omit the parsed publication time entirely or
                    # set it to None; skip those entries instead of crashing.
                    if not getattr(entry, "published_parsed", None):
                        continue
                    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
                    if published > cutoff_time:
                        articles.append({
                            "title": entry.get("title", ""),
                            # Strip HTML markup from the description so the
                            # summarizer sees plain text.
                            "description": BeautifulSoup(entry.get("description", ""), "html.parser").get_text(),
                            "link": entry.get("link", ""),
                            "category": category,
                            "source": source,
                            "published": published,
                        })
            except Exception as e:
                print(f"Error fetching from {url}: {e}")

    articles = sorted(articles, key=lambda x: x["published"], reverse=True)[:ARTICLE_LIMIT]
    return articles
71
 
72
def summarize_with_ai(text):
    """Summarize *text* with the transformer pipeline.

    Returns the generated summary, or the fallback string
    "Summary unavailable." if the pipeline (or its output indexing)
    raises for any reason.
    """
    try:
        outputs = summarizer(text, max_length=120, min_length=40, truncation=True)
        return outputs[0]['summary_text']
    except Exception as exc:
        print(f"AI summarization error: {exc}")
        return "Summary unavailable."
79
 
80
def summarize_with_free_module(text):
    """Heuristic summarizer: keep roughly the first three sentences.

    Fix over the original: the ellipsis is appended only when the text was
    actually truncated; short texts (including empty strings) are returned
    unchanged instead of becoming "..." fragments.
    """
    sentences = text.split('. ')
    if len(sentences) <= 3:
        return text
    return '. '.join(sentences[:3]) + '...'
83
+
84
def summarize_articles(articles, method="AI Model"):
    """Summarize each article dict with the chosen backend.

    method: "AI Model" routes to the transformer pipeline
    (summarize_with_ai); any other value uses the free heuristic
    (summarize_with_free_module). Articles that raise during formatting or
    summarization are printed and skipped.

    Returns a list of formatted summary strings.

    Fix: the original built the template as a plain string literal spanning
    multiple physical lines (a SyntaxError); rebuilt with explicit \n
    f-strings.
    """
    summaries = []
    summarize = summarize_with_ai if method == "AI Model" else summarize_with_free_module

    for article in articles:
        try:
            summary = summarize(article["description"])
            published = article["published"].strftime('%Y-%m-%d %H:%M')
            summaries.append(
                f"Title: {article['title']}\n"
                f"- Category: {article['category']}\n"
                f"- Source: {article['source']}\n"
                f"- Published: {published}\n"
                f"Summary: {summary}\n"
                f"Read more: {article['link']}"
            )
        except Exception as e:
            print(f"Error summarizing article: {e}")

    return summaries
110
 
111
# Gradio Interface
def generate_summary(categories, method):
    """Fetch and summarize recent news for the selected categories.

    categories: list of category names (keys of NEWS_SOURCES).
    method: summarization backend label, forwarded to summarize_articles.
    Returns a single display string (summaries joined by blank lines, or a
    user-facing message when there is nothing to show).

    Fix: the original's return statement was a string literal broken across
    physical lines (a SyntaxError); restored as an explicit "\n\n" join.
    """
    if not categories:
        return "Please select at least one category."
    articles = fetch_rss_news(categories)
    if not articles:
        return "No recent articles found."
    summaries = summarize_articles(articles, method)
    return "\n\n".join(summaries)
 
122
 
 
123
# Gradio UI wiring: category checkboxes + summarization-method dropdown,
# a button that triggers generate_summary, and a textbox for the result.
demo = gr.Blocks()

with demo:
    gr.Markdown("# AI News Summarizer")
    with gr.Row():
        categories = gr.CheckboxGroup(choices=list(NEWS_SOURCES.keys()), label="Select Categories")
        method = gr.Dropdown(choices=["AI Model", "Free Module"], label="Summarization Method", value="AI Model")
    # NOTE(review): exact nesting of the button/textbox relative to the Row
    # is ambiguous in the diff rendering; placed after the Row — confirm
    # against the full file.
    summarize_button = gr.Button("Generate Summary")
    output = gr.Textbox(label="Summarized News", lines=15)

    # Click handler: the two controls feed generate_summary; its return
    # string fills the output textbox.
    summarize_button.click(generate_summary, inputs=[categories, method], outputs=output)

if __name__ == "__main__":
    demo.launch()