Update app.py

app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
 import feedparser
 from datetime import datetime, timedelta
 import json
@@ -20,211 +20,119 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 
-# Language codes
+# Language codes and their corresponding MarianMT model names
 LANGUAGE_CODES = {
-    "English": "en",
-    "Spanish": "es",
-    "French": "fr",
-    "German": "de",
-    "Italian": "it",
-    "Portuguese": "pt",
-    "Dutch": "nl",
-    "Russian": "ru",
-    "Chinese": "zh",
-    "Japanese": "ja",
-    "Arabic": "ar"
+    "English": {"code": "en", "model": None},  # No translation needed for English
+    "Spanish": {"code": "es", "model": "Helsinki-NLP/opus-mt-en-es"},
+    "French": {"code": "fr", "model": "Helsinki-NLP/opus-mt-en-fr"},
+    "German": {"code": "de", "model": "Helsinki-NLP/opus-mt-en-de"},
+    "Italian": {"code": "it", "model": "Helsinki-NLP/opus-mt-en-it"},
+    "Portuguese": {"code": "pt", "model": "Helsinki-NLP/opus-mt-en-pt"},
+    "Dutch": {"code": "nl", "model": "Helsinki-NLP/opus-mt-en-nl"},
+    "Russian": {"code": "ru", "model": "Helsinki-NLP/opus-mt-en-ru"},
+    "Chinese": {"code": "zh", "model": "Helsinki-NLP/opus-mt-en-zh"},
+    "Japanese": {"code": "ja", "model": "Helsinki-NLP/opus-mt-en-jap"},
+    "Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
 }
 
-#
-NEWS_SOURCES = {
-    "Technology": [
-        "https://feeds.feedburner.com/TechCrunch/",
-        "https://www.theverge.com/rss/index.xml",
-        "https://www.wired.com/feed/rss",
-        "https://feeds.feedburner.com/TheNextWeb"  # Added for more variety
-    ],
-    "Business": [
-        "https://feeds.feedburner.com/forbes/business",
-        "https://www.ft.com/rss/home",
-        "https://feeds.bloomberg.com/markets/news.rss",
-        "https://www.aljazeera.com/xml/rss/all.xml"  # Added Arabic business news
-    ],
-    "Science": [
-        "https://rss.sciencedaily.com/all.xml",
-        "https://www.nature.com/nature.rss",
-        "https://science.nasa.gov/rss.xml"
-    ],
-    "Health": [
-        "https://rss.medicalnewstoday.com/newsfeeds/medical_all.xml",
-        "https://www.who.int/rss-feeds/news-english.xml",
-        "https://www.healthline.com/rss/news"
-    ],
-    "World News": [
-        "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
-        "https://feeds.bbci.co.uk/news/world/rss.xml",
-        "https://www.reuters.com/rssFeed/world",
-        "https://arabic.cnn.com/rss"  # Added Arabic news source
-    ]
-}
+# [Previous NEWS_SOURCES definition remains the same...]
 
 # Initialize global variables
 summarizer = None
+translators = {}
 
 class NewsCache:
     def __init__(self):
         self.summaries = {}
+        self.translations = {}
         self.max_cache_size = 1000
 
-    def store_summary(self, content_hash, summary):
+    def store_summary(self, content_hash, summary, language=None):
+        cache_key = f"{content_hash}_{language}" if language else content_hash
+
         if len(self.summaries) >= self.max_cache_size:
             # Remove oldest entry if cache is full
             self.summaries.pop(next(iter(self.summaries)))
-        self.summaries[content_hash] = summary
+
+        self.summaries[cache_key] = summary
 
-    def get_summary(self, content_hash):
-        return self.summaries.get(content_hash)
+    def get_summary(self, content_hash, language=None):
+        cache_key = f"{content_hash}_{language}" if language else content_hash
+        return self.summaries.get(cache_key)
 
 news_cache = NewsCache()
 
-def get_content_hash(text):
-    """Generate a hash of the content for caching"""
-    return hashlib.md5(text.encode()).hexdigest()
-
-def clean_text(text):
-    """Clean and normalize text content"""
-    if not text:
-        return ""
-    # Remove HTML tags and normalize whitespace
-    text = BeautifulSoup(text, "html.parser").get_text()
-    return " ".join(text.split())
-
-@lru_cache(maxsize=100)
-def fetch_feed_with_timeout(url):
-    """Fetch RSS feed with timeout and caching"""
-    try:
-        response = requests.get(url, timeout=10)
-        return feedparser.parse(response.content)
-    except Exception as e:
-        logging.error(f"Error fetching feed {url}: {e}")
-        return None
-
-def initialize_summarizer():
-    """Initialize the summarization pipeline"""
-    global summarizer
+def initialize_models():
+    """Initialize the summarization and translation models"""
+    global summarizer, translators
+
     try:
+        # Initialize summarizer
         summarizer = pipeline(
             "summarization",
             model="facebook/bart-large-cnn",
             device=-1  # Use CPU
         )
+
+        # Initialize translators for each language
+        for lang, info in LANGUAGE_CODES.items():
+            if info["model"]:  # Skip English as it doesn't need translation
+                try:
+                    model = AutoModelForSeq2SeqLM.from_pretrained(info["model"])
+                    tokenizer = AutoTokenizer.from_pretrained(info["model"])
+                    translators[lang] = (model, tokenizer)
+                    logging.info(f"Initialized translator for {lang}")
+                except Exception as e:
+                    logging.error(f"Error initializing translator for {lang}: {e}")
+
         return True
     except Exception as e:
-        logging.error(f"Error initializing summarizer: {e}")
+        logging.error(f"Error initializing models: {e}")
         return False
 
-def parse_date(date_str):
-    """Parse a date string into a datetime object"""
-    try:
-        return parsedate_to_datetime(date_str)
-    except (TypeError, ValueError):
-        try:
-            # Try ISO format
-            return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
-        except (TypeError, ValueError):
-            return None
-
-def is_recent_article(published_date, hours=8):
-    """Check if article is within the last specified hours"""
-    if not published_date:
-        return False
-
-    try:
-        pub_date = parse_date(published_date)
-        if not pub_date:
-            return False
-
-        # Compare against the current time in the article's timezone
-        current_time = datetime.now(pub_date.tzinfo)
-        time_difference = current_time - pub_date
-
-        return time_difference <= timedelta(hours=hours)
-    except Exception as e:
-        logging.error(f"Error checking article date: {e}")
-        return False
-
-def fetch_news_from_rss(interests):
-    """Fetch recent articles from the RSS feeds for the given interests"""
-    articles = []
-    max_articles_per_category = 2
-
-    with ThreadPoolExecutor(max_workers=3) as executor:
-        for interest in interests:
-            if interest not in NEWS_SOURCES:
-                continue
-
-            future_to_url = {
-                executor.submit(fetch_feed_with_timeout, url): url
-                for url in NEWS_SOURCES[interest]
-            }
-
-            category_count = 0
-            for future in future_to_url:
-                if category_count >= max_articles_per_category:
-                    break
-
-                try:
-                    feed = future.result(timeout=15)
-                    if not feed:
-                        continue
-
-                    for entry in feed.entries:
-                        published_date = entry.get('published', '') or entry.get('updated', '')
-
-                        if not is_recent_article(published_date):
-                            continue
-
-                        description = entry.get('description', '') or entry.get('summary', '')
-                        description = clean_text(description)
-
-                        if len(description) < 50:
-                            continue
-
-                        article = {
-                            'title': clean_text(entry.get('title', 'Untitled')),
-                            'description': description,
-                            'category': interest,
-                            'link': entry.get('link', ''),
-                            'published': published_date
-                        }
-                        articles.append(article)
-                        category_count += 1
-
-                        if category_count >= max_articles_per_category:
-                            break
-
-                except (TimeoutError, Exception) as e:
-                    logging.error(f"Error processing feed: {e}")
-                    continue
-
-    return articles
-
-def generate_summary(text, title="", category=""):
-    """Generate summary with enhanced prompting"""
+def translate_text(text, target_language):
+    """Translate text to target language"""
+    if target_language == "English" or not text:
+        return text
+
+    try:
+        if target_language not in translators:
+            logging.error(f"Translator not found for {target_language}")
+            return text
+
+        model, tokenizer = translators[target_language]
+
+        # Split text into chunks to handle long text
+        max_length = 512
+        chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+        translated_chunks = []
+
+        for chunk in chunks:
+            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
+            translated = model.generate(**inputs)
+            translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
+            translated_chunks.append(translated_text)
+
+        return " ".join(translated_chunks)
+
+    except Exception as e:
+        logging.error(f"Translation error: {e}")
+        return text
+
+def generate_summary(text, title="", category="", language="English"):
+    """Generate summary with translation support"""
     if not summarizer:
-        if not initialize_summarizer():
+        if not initialize_models():
             return None
 
     try:
         # Check cache first
         content_hash = get_content_hash(text)
-        cached_summary = news_cache.get_summary(content_hash)
+        cached_summary = news_cache.get_summary(content_hash, language)
        if cached_summary:
             return cached_summary
 
-        #
+        # Generate English summary first
         prompt_template = f"""
 Analyze and summarize this {category} news article titled "{title}".
 Focus on providing:
@@ -239,7 +147,6 @@ Article text:
 
 Please provide a clear, concise summary that a general audience can understand:"""
 
-        # Prepare input text
         prompted_text = prompt_template.format(text=text[:1024])
 
         result = summarizer(prompted_text,
@@ -251,12 +158,16 @@ Please provide a clear, concise summary that a general audience can understand:"
         if result and len(result) > 0:
             summary = result[0]['summary_text']
 
-            # Post-process summary
+            # Post-process summary
             summary = summary.replace(" .", ".").replace(" ,", ",")
             sentences = summary.split(". ")
             formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
 
-            news_cache.store_summary(content_hash, formatted_summary)
+            # Translate if needed
+            if language != "English":
+                formatted_summary = translate_text(formatted_summary, language)
+
+            news_cache.store_summary(content_hash, formatted_summary, language)
             return formatted_summary
 
         return None
@@ -266,7 +177,7 @@ Please provide a clear, concise summary that a general audience can understand:"
         return None
 
 def get_personalized_summary(name, progress=gr.Progress()):
-    """Generate personalized news summary"""
+    """Generate personalized news summary in user's preferred language"""
     start_time = time.time()
     logging.info(f"Starting summary generation for user: {name}")
 
@@ -281,19 +192,21 @@ def get_personalized_summary(name, progress=gr.Progress()):
     except Exception as e:
         return f"Error loading preferences: {e}"
 
+    user_language = preferences.get("language", "English")
+
     # Fetch articles with progress
     progress(0.2, desc="Fetching recent news...")
     articles = fetch_news_from_rss(preferences["interests"])
 
     if not articles:
-        return "No recent news articles found from the last 8 hours. Please try again later."
+        return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
 
     # Process articles with timeout
     progress(0.4, desc="Analyzing and summarizing...")
     summaries = []
     total_articles = len(articles)
 
-    max_processing_time = 60
+    max_processing_time = 60
 
     for i, article in enumerate(articles):
         if time.time() - start_time > max_processing_time:
@@ -313,18 +226,24 @@ def get_personalized_summary(name, progress=gr.Progress()):
             if not content:
                 continue
 
-            summary = generate_summary(content, title, category)
+            summary = generate_summary(content, title, category, user_language)
             if not summary:
                 continue
 
+            # Translate title and category if needed
+            if user_language != "English":
+                title = translate_text(title, user_language)
+                category = translate_text(category, user_language)
+                published_str = translate_text(published_str, user_language)
+
             formatted_summary = f"""
 📰 {title}
-📁 Category: {category}
-⏰ Published: {published_str}
+📁 {translate_text("Category", user_language)}: {category}
+⏰ {translate_text("Published", user_language)}: {published_str}
 
 {summary}
 
-🔗 Read more: {link}
+🔗 {translate_text("Read more", user_language)}: {link}
 
 ---"""
             summaries.append(formatted_summary)
@@ -334,11 +253,13 @@ def get_personalized_summary(name, progress=gr.Progress()):
                 continue
 
     if not summaries:
-        return "Unable to generate summaries for recent news. Please try again."
+        return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
 
     progress(1.0, desc="Done!")
     return "\n".join(summaries)
 
+# [Rest of the code remains the same...]
+
 # Gradio interface
 with gr.Blocks(title="Enhanced News Summarizer") as demo:
     gr.Markdown("# 📰 Enhanced AI News Summarizer")
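
Review notes:

1. The MarianMT checkpoints named in LANGUAGE_CODES load through AutoModelForSeq2SeqLM, the generic seq2seq auto class in transformers (AutoTokenizer resolves the matching Marian tokenizer). A minimal sketch of how one entry is exercised, assuming the named checkpoint exists on the Hub and can be downloaded:

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    model_name = "Helsinki-NLP/opus-mt-en-es"  # LANGUAGE_CODES["Spanish"]["model"]
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    inputs = tokenizer("Breaking technology news from this morning.", return_tensors="pt")
    outputs = model.generate(**inputs)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))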
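
2. initialize_models() eagerly downloads and holds all ten translation models in memory at startup, even if a user only ever requests one language. A lazy variant along these lines would load each model on first use (get_translator is a hypothetical helper, not part of this diff):

    def get_translator(lang):
        # Return a cached (model, tokenizer) pair, loading it on first request.
        if lang in translators:
            return translators[lang]
        info = LANGUAGE_CODES.get(lang)
        if not info or not info["model"]:
            return None  # English or unknown language: nothing to load
        model = AutoModelForSeq2SeqLM.from_pretrained(info["model"])
        tokenizer = AutoTokenizer.from_pretrained(info["model"])
        translators[lang] = (model, tokenizer)
        return translators[lang]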
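
3. translate_text() slices the input into fixed 512-character chunks, which can cut a word or sentence at every boundary and degrade the translation there. A sentence-aware chunker is a common alternative; this sketch greedily packs whole sentences into chunks (splitting on ". " is an approximation):

    def chunk_by_sentence(text, max_chars=512):
        # Pack whole sentences into chunks of at most max_chars characters.
        chunks, current = [], ""
        for sentence in text.split(". "):
            if current and len(current) + len(sentence) + 2 > max_chars:
                chunks.append(current)
                current = sentence
            else:
                current = f"{current}. {sentence}" if current else sentence
        if current:
            chunks.append(current)
        return chunks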
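
4. facebook/bart-large-cnn is a plain seq2seq summarizer, not an instruction-following model, so the "Analyze and summarize..." prompt is treated as part of the article and can leak into the output. The conventional call passes the article text alone; the generation parameters below are illustrative, not taken from this diff:

    result = summarizer(text[:1024], max_length=150, min_length=50, do_sample=False)
    summary = result[0]["summary_text"]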
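
5. NewsCache now keys entries as f"{content_hash}_{language}", so the same article can hold one cached summary per language. Eviction pops the oldest inserted key (dicts preserve insertion order in Python 3.7+), which is FIFO rather than true LRU. Usage, assuming the class as defined above:

    cache = NewsCache()
    cache.store_summary("abc123", "• Resumen en español...", language="Spanish")
    cache.store_summary("abc123", "• English summary...")
    cache.get_summary("abc123", language="Spanish")  # returns the Spanish entry
    cache.get_summary("abc123")                      # returns the English entry

The self.translations dict added to __init__ is never written to or read anywhere in this diff.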
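
6. get_personalized_summary() re-translates the fixed labels "Category", "Published", and "Read more" inside the per-article loop, costing one MarianMT generate() call per label per article. Memoizing the static labels is cheap (translate_label is a hypothetical helper, not part of this diff):

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def translate_label(label, language):
        # Cache translations of short, static UI strings only;
        # article text should not go through lru_cache.
        return translate_text(label, language)

Note also that the committed file replaces NEWS_SOURCES and the helper functions (clean_text, fetch_feed_with_timeout, get_content_hash, parse_date, is_recent_article, fetch_news_from_rss) with placeholder comments; as written, app.py will raise NameError at runtime unless those definitions are kept rather than summarized away.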