loayshabet committed on
Commit
6e26c5a
·
verified ·
1 Parent(s): 0e97c1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -173
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import pipeline
3
  import feedparser
4
  from datetime import datetime, timedelta
5
  import json
@@ -20,211 +20,119 @@ logging.basicConfig(
20
  format='%(asctime)s - %(levelname)s - %(message)s'
21
  )
22
 
23
- # Language codes for supported languages
24
  LANGUAGE_CODES = {
25
- "English": "en",
26
- "Spanish": "es",
27
- "French": "fr",
28
- "German": "de",
29
- "Italian": "it",
30
- "Portuguese": "pt",
31
- "Dutch": "nl",
32
- "Russian": "ru",
33
- "Chinese": "zh",
34
- "Japanese": "ja",
35
- "Arabic": "ar" # Added Arabic support
36
  }
37
 
38
- # News sources organized by category
39
- NEWS_SOURCES = {
40
- "Technology": [
41
- "https://feeds.feedburner.com/TechCrunch/",
42
- "https://www.theverge.com/rss/index.xml",
43
- "https://www.wired.com/feed/rss",
44
- "https://feeds.feedburner.com/TheNextWeb" # Added for more variety
45
- ],
46
- "Business": [
47
- "https://feeds.feedburner.com/forbes/business",
48
- "https://www.ft.com/rss/home",
49
- "https://feeds.bloomberg.com/markets/news.rss",
50
- "https://www.aljazeera.com/xml/rss/all.xml" # Added Arabic business news
51
- ],
52
- "Science": [
53
- "https://rss.sciencedaily.com/all.xml",
54
- "https://www.nature.com/nature.rss",
55
- "https://science.nasa.gov/rss.xml"
56
- ],
57
- "Health": [
58
- "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
59
- "https://www.who.int/rss-feeds/news-english.xml",
60
- "https://www.healthline.com/rss/news"
61
- ],
62
- "World News": [
63
- "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
64
- "https://feeds.bbci.co.uk/news/world/rss.xml",
65
- "https://www.reuters.com/rssFeed/world",
66
- "https://arabic.cnn.com/rss" # Added Arabic news source
67
- ]
68
- }
69
 
70
  # Initialize global variables
71
  summarizer = None
 
72
 
73
  class NewsCache:
74
  def __init__(self):
75
  self.summaries = {}
 
76
  self.max_cache_size = 1000
77
 
78
- def store_summary(self, content_hash, summary):
 
 
79
  if len(self.summaries) >= self.max_cache_size:
80
  # Remove oldest entry if cache is full
81
  self.summaries.pop(next(iter(self.summaries)))
82
- self.summaries[content_hash] = summary
 
83
 
84
- def get_summary(self, content_hash):
85
- return self.summaries.get(content_hash)
 
86
 
87
  news_cache = NewsCache()
88
 
89
- def get_content_hash(content):
90
- """Generate hash for content to use as cache key"""
91
- return hashlib.md5(content.encode()).hexdigest()
92
-
93
- def clean_text(text):
94
- """Clean and normalize text content"""
95
- if not text:
96
- return ""
97
- # Remove HTML tags and normalize whitespace
98
- text = BeautifulSoup(text, "html.parser").get_text()
99
- return " ".join(text.split())
100
-
101
- @lru_cache(maxsize=100)
102
- def fetch_feed_with_timeout(url):
103
- """Fetch RSS feed with timeout and caching"""
104
- try:
105
- response = requests.get(url, timeout=10)
106
- return feedparser.parse(response.content)
107
- except Exception as e:
108
- logging.error(f"Error fetching feed {url}: {e}")
109
- return None
110
-
111
- def initialize_summarizer():
112
- """Initialize the summarization pipeline"""
113
- global summarizer
114
  try:
 
115
  summarizer = pipeline(
116
  "summarization",
117
  model="facebook/bart-large-cnn",
118
  device=-1 # Use CPU
119
  )
 
 
 
 
 
 
 
 
 
 
 
 
120
  return True
121
  except Exception as e:
122
- logging.error(f"Error initializing summarizer: {e}")
123
  return False
124
 
125
- def parse_date(date_str):
126
- """Parse various date formats to datetime"""
127
- try:
128
- # Try parsing RSS/Atom date format
129
- return parsedate_to_datetime(date_str)
130
- except (TypeError, ValueError):
131
- try:
132
- # Try ISO format
133
- return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
134
- except (TypeError, ValueError):
135
- return None
136
-
137
- def is_recent_article(published_date, hours=8):
138
- """Check if article is within the last specified hours"""
139
- if not published_date:
140
- return False
141
 
142
  try:
143
- parsed_date = parse_date(published_date)
144
- if not parsed_date:
145
- return False
146
 
147
- # Ensure timezone awareness
148
- if parsed_date.tzinfo is None:
149
- parsed_date = pytz.UTC.localize(parsed_date)
 
 
 
 
 
 
 
 
 
150
 
151
- now = datetime.now(pytz.UTC)
152
- time_difference = now - parsed_date
153
- return time_difference <= timedelta(hours=hours)
154
  except Exception as e:
155
- logging.error(f"Error parsing date: {e}")
156
- return False
157
 
158
- def fetch_news_from_rss(interests):
159
- """Fetch recent news from RSS feeds"""
160
- articles = []
161
- max_articles_per_category = 2
162
-
163
- with ThreadPoolExecutor(max_workers=3) as executor:
164
- for interest in interests:
165
- if interest not in NEWS_SOURCES:
166
- continue
167
-
168
- future_to_url = {
169
- executor.submit(fetch_feed_with_timeout, url): url
170
- for url in NEWS_SOURCES[interest]
171
- }
172
-
173
- category_count = 0
174
- for future in future_to_url:
175
- if category_count >= max_articles_per_category:
176
- break
177
-
178
- try:
179
- feed = future.result(timeout=15)
180
- if not feed:
181
- continue
182
-
183
- for entry in feed.entries:
184
- published_date = entry.get('published', '') or entry.get('updated', '')
185
-
186
- if not is_recent_article(published_date):
187
- continue
188
-
189
- description = entry.get('description', '') or entry.get('summary', '')
190
- description = clean_text(description)
191
-
192
- if len(description) < 50:
193
- continue
194
-
195
- article = {
196
- 'title': clean_text(entry.get('title', 'Untitled')),
197
- 'description': description,
198
- 'category': interest,
199
- 'link': entry.get('link', ''),
200
- 'published': published_date
201
- }
202
- articles.append(article)
203
- category_count += 1
204
-
205
- if category_count >= max_articles_per_category:
206
- break
207
-
208
- except (TimeoutError, Exception) as e:
209
- logging.error(f"Error processing feed: {e}")
210
- continue
211
-
212
- return articles
213
-
214
- def generate_summary(text, title="", category=""):
215
- """Generate summary with enhanced prompting"""
216
  if not summarizer:
217
- if not initialize_summarizer():
218
  return None
219
 
220
  try:
221
  # Check cache first
222
  content_hash = get_content_hash(text)
223
- cached_summary = news_cache.get_summary(content_hash)
224
  if cached_summary:
225
  return cached_summary
226
 
227
- # Enhanced prompt template for better summaries
228
  prompt_template = f"""
229
  Analyze and summarize this {category} news article titled "{title}".
230
  Focus on providing:
@@ -239,7 +147,6 @@ Article text:
239
 
240
  Please provide a clear, concise summary that a general audience can understand:"""
241
 
242
- # Prepare input text
243
  prompted_text = prompt_template.format(text=text[:1024])
244
 
245
  result = summarizer(prompted_text,
@@ -251,12 +158,16 @@ Please provide a clear, concise summary that a general audience can understand:"
251
  if result and len(result) > 0:
252
  summary = result[0]['summary_text']
253
 
254
- # Post-process summary for better readability
255
  summary = summary.replace(" .", ".").replace(" ,", ",")
256
  sentences = summary.split(". ")
257
  formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
258
 
259
- news_cache.store_summary(content_hash, formatted_summary)
 
 
 
 
260
  return formatted_summary
261
 
262
  return None
@@ -266,7 +177,7 @@ Please provide a clear, concise summary that a general audience can understand:"
266
  return None
267
 
268
  def get_personalized_summary(name, progress=gr.Progress()):
269
- """Generate personalized news summary"""
270
  start_time = time.time()
271
  logging.info(f"Starting summary generation for user: {name}")
272
 
@@ -281,19 +192,21 @@ def get_personalized_summary(name, progress=gr.Progress()):
281
  except Exception as e:
282
  return f"Error loading preferences: {e}"
283
 
 
 
284
  # Fetch articles with progress
285
  progress(0.2, desc="Fetching recent news...")
286
  articles = fetch_news_from_rss(preferences["interests"])
287
 
288
  if not articles:
289
- return "No recent news articles found from the last 8 hours. Please try again later."
290
 
291
  # Process articles with timeout
292
  progress(0.4, desc="Analyzing and summarizing...")
293
  summaries = []
294
  total_articles = len(articles)
295
 
296
- max_processing_time = 60 # Maximum processing time in seconds
297
 
298
  for i, article in enumerate(articles):
299
  if time.time() - start_time > max_processing_time:
@@ -313,18 +226,24 @@ def get_personalized_summary(name, progress=gr.Progress()):
313
  if not content:
314
  continue
315
 
316
- summary = generate_summary(content, title, category)
317
  if not summary:
318
  continue
319
 
 
 
 
 
 
 
320
  formatted_summary = f"""
321
  📰 {title}
322
- 📁 Category: {category}
323
- ⏰ Published: {published_str}
324
 
325
  {summary}
326
 
327
- 🔗 Read more: {link}
328
 
329
  ---"""
330
  summaries.append(formatted_summary)
@@ -334,11 +253,13 @@ def get_personalized_summary(name, progress=gr.Progress()):
334
  continue
335
 
336
  if not summaries:
337
- return "Unable to generate summaries for recent news. Please try again."
338
 
339
  progress(1.0, desc="Done!")
340
  return "\n".join(summaries)
341
 
 
 
342
  # Gradio interface
343
  with gr.Blocks(title="Enhanced News Summarizer") as demo:
344
  gr.Markdown("# 📰 Enhanced AI News Summarizer")
 
1
import gradio as gr
import feedparser
import json
from datetime import datetime, timedelta
# Fix: transformers has no AutoModelForSeq2SeqGeneration; the auto class
# for MarianMT (seq2seq) checkpoints is AutoModelForSeq2SeqLM.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
 
20
  format='%(asctime)s - %(levelname)s - %(message)s'
21
  )
22
 
23
# Display language -> ISO 639-1 code plus the MarianMT model used to
# translate the English summary into that language (None = no translation).
LANGUAGE_CODES = {"English": {"code": "en", "model": None}}

# NOTE(review): the Hub repo suffix is not always the ISO code — Japanese
# uses "jap" — and some pairs (e.g. en-pt) may not exist as plain opus-mt
# repos; confirm each model id resolves on the Hugging Face Hub.
for _language, _iso, _suffix in (
    ("Spanish", "es", "es"),
    ("French", "fr", "fr"),
    ("German", "de", "de"),
    ("Italian", "it", "it"),
    ("Portuguese", "pt", "pt"),
    ("Dutch", "nl", "nl"),
    ("Russian", "ru", "ru"),
    ("Chinese", "zh", "zh"),
    ("Japanese", "ja", "jap"),
    ("Arabic", "ar", "ar"),
):
    LANGUAGE_CODES[_language] = {
        "code": _iso,
        "model": f"Helsinki-NLP/opus-mt-en-{_suffix}",
    }
37
 
38
# RSS feed URLs grouped by user-selectable interest category.
# Restored: the commit replaced this definition with a placeholder comment
# ("[Previous NEWS_SOURCES definition remains the same...]"), which made
# fetch_news_from_rss raise NameError at runtime.
NEWS_SOURCES = {
    "Technology": [
        "https://feeds.feedburner.com/TechCrunch/",
        "https://www.theverge.com/rss/index.xml",
        "https://www.wired.com/feed/rss",
        "https://feeds.feedburner.com/TheNextWeb",
    ],
    "Business": [
        "https://feeds.feedburner.com/forbes/business",
        "https://www.ft.com/rss/home",
        "https://feeds.bloomberg.com/markets/news.rss",
        "https://www.aljazeera.com/xml/rss/all.xml",
    ],
    "Science": [
        "https://rss.sciencedaily.com/all.xml",
        "https://www.nature.com/nature.rss",
        "https://science.nasa.gov/rss.xml",
    ],
    "Health": [
        "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
        "https://www.who.int/rss-feeds/news-english.xml",
        "https://www.healthline.com/rss/news",
    ],
    "World News": [
        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
        "https://feeds.bbci.co.uk/news/world/rss.xml",
        "https://www.reuters.com/rssFeed/world",
        "https://arabic.cnn.com/rss",
    ],
}
39
 
40
  # Initialize global variables
41
  summarizer = None
42
+ translators = {}
43
 
44
class NewsCache:
    """In-memory cache of generated summaries, keyed by content hash and
    (optionally) target language, with simple oldest-first eviction."""

    def __init__(self):
        self.summaries = {}
        self.translations = {}
        self.max_cache_size = 1000

    @staticmethod
    def _cache_key(content_hash, language):
        # Language-specific summaries get their own slot; plain hash otherwise.
        return f"{content_hash}_{language}" if language else content_hash

    def store_summary(self, content_hash, summary, language=None):
        """Insert a summary, evicting the oldest entry when the cache is full."""
        if len(self.summaries) >= self.max_cache_size:
            # dicts preserve insertion order, so the first key is the oldest.
            oldest = next(iter(self.summaries))
            del self.summaries[oldest]
        self.summaries[self._cache_key(content_hash, language)] = summary

    def get_summary(self, content_hash, language=None):
        """Return the cached summary for this hash/language, or None."""
        return self.summaries.get(self._cache_key(content_hash, language))
62
 
63
  news_cache = NewsCache()
64
 
65
def initialize_models():
    """Load the summarization pipeline and one MarianMT translator per
    supported non-English language.

    Returns:
        bool: True if the summarizer loaded. Individual translator failures
        are logged and skipped so one bad model id does not disable the app.
    """
    global summarizer, translators

    try:
        # Summarizer runs on CPU (device=-1) to keep deployment simple.
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=-1,
        )

        # Fix: transformers has no AutoModelForSeq2SeqGeneration class; the
        # correct auto class for MarianMT checkpoints is AutoModelForSeq2SeqLM.
        for lang, info in LANGUAGE_CODES.items():
            if not info["model"]:  # English needs no translator
                continue
            try:
                model = AutoModelForSeq2SeqLM.from_pretrained(info["model"])
                tokenizer = AutoTokenizer.from_pretrained(info["model"])
                translators[lang] = (model, tokenizer)
                logging.info(f"Initialized translator for {lang}")
            except Exception as e:
                logging.error(f"Error initializing translator for {lang}: {e}")

        return True
    except Exception as e:
        logging.error(f"Error initializing models: {e}")
        return False
92
 
93
def translate_text(text, target_language):
    """Translate English *text* into *target_language* with the preloaded
    MarianMT model.

    Returns the input unchanged when no translation is needed (English or
    empty text) or when a translator is missing / translation fails.
    """
    if target_language == "English" or not text:
        return text

    try:
        if target_language not in translators:
            logging.error(f"Translator not found for {target_language}")
            return text

        model, tokenizer = translators[target_language]

        # Fix: the previous version sliced the string every 512 characters,
        # which could cut a word in half at every chunk boundary. Chunk on
        # whitespace boundaries instead, keeping chunks under the budget;
        # tokenizer truncation still guards any single oversized word.
        max_chunk_chars = 512
        chunks = []
        current_words = []
        current_len = 0
        for word in text.split(" "):
            # +1 accounts for the joining space.
            if current_words and current_len + 1 + len(word) > max_chunk_chars:
                chunks.append(" ".join(current_words))
                current_words = []
                current_len = 0
            current_words.append(word)
            current_len += len(word) + (1 if current_len else 0)
        if current_words:
            chunks.append(" ".join(current_words))

        translated_chunks = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            translated = model.generate(**inputs)
            translated_chunks.append(
                tokenizer.decode(translated[0], skip_special_tokens=True)
            )

        return " ".join(translated_chunks)

    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text
121
 
122
+ def generate_summary(text, title="", category="", language="English"):
123
+ """Generate summary with translation support"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  if not summarizer:
125
+ if not initialize_models():
126
  return None
127
 
128
  try:
129
  # Check cache first
130
  content_hash = get_content_hash(text)
131
+ cached_summary = news_cache.get_summary(content_hash, language)
132
  if cached_summary:
133
  return cached_summary
134
 
135
+ # Generate English summary first
136
  prompt_template = f"""
137
  Analyze and summarize this {category} news article titled "{title}".
138
  Focus on providing:
 
147
 
148
  Please provide a clear, concise summary that a general audience can understand:"""
149
 
 
150
  prompted_text = prompt_template.format(text=text[:1024])
151
 
152
  result = summarizer(prompted_text,
 
158
  if result and len(result) > 0:
159
  summary = result[0]['summary_text']
160
 
161
+ # Post-process summary
162
  summary = summary.replace(" .", ".").replace(" ,", ",")
163
  sentences = summary.split(". ")
164
  formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
165
 
166
+ # Translate if needed
167
+ if language != "English":
168
+ formatted_summary = translate_text(formatted_summary, language)
169
+
170
+ news_cache.store_summary(content_hash, formatted_summary, language)
171
  return formatted_summary
172
 
173
  return None
 
177
  return None
178
 
179
  def get_personalized_summary(name, progress=gr.Progress()):
180
+ """Generate personalized news summary in user's preferred language"""
181
  start_time = time.time()
182
  logging.info(f"Starting summary generation for user: {name}")
183
 
 
192
  except Exception as e:
193
  return f"Error loading preferences: {e}"
194
 
195
+ user_language = preferences.get("language", "English")
196
+
197
  # Fetch articles with progress
198
  progress(0.2, desc="Fetching recent news...")
199
  articles = fetch_news_from_rss(preferences["interests"])
200
 
201
  if not articles:
202
+ return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
203
 
204
  # Process articles with timeout
205
  progress(0.4, desc="Analyzing and summarizing...")
206
  summaries = []
207
  total_articles = len(articles)
208
 
209
+ max_processing_time = 60
210
 
211
  for i, article in enumerate(articles):
212
  if time.time() - start_time > max_processing_time:
 
226
  if not content:
227
  continue
228
 
229
+ summary = generate_summary(content, title, category, user_language)
230
  if not summary:
231
  continue
232
 
233
+ # Translate title and category if needed
234
+ if user_language != "English":
235
+ title = translate_text(title, user_language)
236
+ category = translate_text(category, user_language)
237
+ published_str = translate_text(published_str, user_language)
238
+
239
  formatted_summary = f"""
240
  📰 {title}
241
+ 📁 {translate_text("Category", user_language)}: {category}
242
+ {translate_text("Published", user_language)}: {published_str}
243
 
244
  {summary}
245
 
246
+ 🔗 {translate_text("Read more", user_language)}: {link}
247
 
248
  ---"""
249
  summaries.append(formatted_summary)
 
253
  continue
254
 
255
  if not summaries:
256
+ return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
257
 
258
  progress(1.0, desc="Done!")
259
  return "\n".join(summaries)
260
 
261
# Restored helpers: the commit replaced these definitions with a
# "[Rest of the code remains the same...]" placeholder, but the surviving
# code still calls them (get_content_hash, clean_text, fetch_news_from_rss,
# is_recent_article), so the app crashed with NameError at runtime.

def get_content_hash(content):
    """Generate an MD5 hex digest of *content* to use as a cache key."""
    return hashlib.md5(content.encode()).hexdigest()

def clean_text(text):
    """Strip HTML tags and collapse whitespace; returns "" for falsy input."""
    if not text:
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()
    return " ".join(text.split())

@lru_cache(maxsize=100)
def fetch_feed_with_timeout(url):
    """Fetch and parse an RSS feed with a 10s timeout; None on failure."""
    try:
        response = requests.get(url, timeout=10)
        return feedparser.parse(response.content)
    except Exception as e:
        logging.error(f"Error fetching feed {url}: {e}")
        return None

def parse_date(date_str):
    """Parse an RFC-2822 or ISO-8601 date string; None if unparseable."""
    try:
        return parsedate_to_datetime(date_str)
    except (TypeError, ValueError):
        try:
            return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except (TypeError, ValueError):
            return None

def is_recent_article(published_date, hours=8):
    """Return True if *published_date* falls within the last *hours* hours."""
    if not published_date:
        return False
    try:
        parsed_date = parse_date(published_date)
        if not parsed_date:
            return False
        # Naive timestamps are assumed to be UTC.
        if parsed_date.tzinfo is None:
            parsed_date = pytz.UTC.localize(parsed_date)
        now = datetime.now(pytz.UTC)
        time_difference = now - parsed_date
        return time_difference <= timedelta(hours=hours)
    except Exception as e:
        logging.error(f"Error parsing date: {e}")
        return False

def fetch_news_from_rss(interests):
    """Fetch up to 2 recent (last 8h) articles per selected category."""
    articles = []
    max_articles_per_category = 2

    with ThreadPoolExecutor(max_workers=3) as executor:
        for interest in interests:
            if interest not in NEWS_SOURCES:
                continue

            future_to_url = {
                executor.submit(fetch_feed_with_timeout, url): url
                for url in NEWS_SOURCES[interest]
            }

            category_count = 0
            for future in future_to_url:
                if category_count >= max_articles_per_category:
                    break
                try:
                    feed = future.result(timeout=15)
                    if not feed:
                        continue

                    for entry in feed.entries:
                        published_date = entry.get('published', '') or entry.get('updated', '')
                        if not is_recent_article(published_date):
                            continue

                        description = entry.get('description', '') or entry.get('summary', '')
                        description = clean_text(description)
                        # Skip stub entries with no real body text.
                        if len(description) < 50:
                            continue

                        article = {
                            'title': clean_text(entry.get('title', 'Untitled')),
                            'description': description,
                            'category': interest,
                            'link': entry.get('link', ''),
                            'published': published_date
                        }
                        articles.append(article)
                        category_count += 1

                        if category_count >= max_articles_per_category:
                            break
                except (TimeoutError, Exception) as e:
                    logging.error(f"Error processing feed: {e}")
                    continue

    return articles
263
  # Gradio interface
264
  with gr.Blocks(title="Enhanced News Summarizer") as demo:
265
  gr.Markdown("# 📰 Enhanced AI News Summarizer")