Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import pytz
|
@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
|
|
10 |
|
11 |
# Set up logging
|
12 |
-
logging.basicConfig(level=logging.INFO
|
|
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
# Global settings
|
@@ -22,36 +24,42 @@ RSS_FETCH_INTERVAL = timedelta(hours=8)
|
|
22 |
ARTICLE_LIMIT = 5
|
23 |
|
24 |
# Restructured news sources with fixed categories
|
25 |
-
CATEGORIES = ["Technology", "Business", "World News"
|
26 |
NEWS_SOURCES = {
|
27 |
"Technology": {
|
28 |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
|
29 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
|
30 |
},
|
31 |
"Business": {
|
32 |
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
|
33 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
|
34 |
},
|
35 |
"World News": {
|
36 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
|
37 |
"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
|
38 |
-
|
39 |
-
"reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
|
40 |
-
},
|
41 |
-
"Science": {
|
42 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
|
43 |
-
},
|
44 |
-
"Sports": {
|
45 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
|
46 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
|
47 |
-
},
|
48 |
-
"Health": {
|
49 |
-
"TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
|
50 |
-
"politico": "http://rss.politico.com/healthcare.xml",
|
51 |
-
"reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
|
52 |
-
},
|
53 |
}
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
class NewsCache:
|
56 |
def __init__(self, size):
|
57 |
self.cache = {}
|
@@ -75,20 +83,16 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
75 |
articles = []
|
76 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
77 |
|
78 |
-
# Create a mapping of selected sources
|
79 |
selected_sources = {
|
80 |
"Technology": tech_sources if tech_sources else [],
|
81 |
"Business": business_sources if business_sources else [],
|
82 |
-
"World News": world_sources if world_sources else []
|
83 |
-
"Science": science_sources if science_sources else [],
|
84 |
-
"Sports": sports_sources if sports_sources else [],
|
85 |
-
"Health": health_sources if health_sources else [],
|
86 |
}
|
87 |
|
88 |
logger.info(f"Selected sources: {selected_sources}")
|
89 |
|
90 |
for category, sources in selected_sources.items():
|
91 |
-
if not sources:
|
92 |
continue
|
93 |
|
94 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
@@ -100,23 +104,33 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
100 |
logger.info(f"Fetching from URL: {url}")
|
101 |
feed = feedparser.parse(url)
|
102 |
|
103 |
-
if hasattr(feed, 'status') and feed.status != 200:
|
104 |
-
logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
|
105 |
-
continue
|
106 |
-
|
107 |
for entry in feed.entries:
|
108 |
try:
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
if published > cutoff_time:
|
111 |
articles.append({
|
112 |
"title": entry.title,
|
113 |
-
"description":
|
114 |
"link": entry.link,
|
115 |
"category": category,
|
116 |
"source": source,
|
117 |
"published": published
|
118 |
})
|
119 |
-
|
|
|
120 |
logger.error(f"Error processing entry: {str(e)}")
|
121 |
continue
|
122 |
|
@@ -130,7 +144,14 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
130 |
|
131 |
def summarize_text(text, model_name):
|
132 |
try:
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
content_hash = hashlib.md5(text.encode()).hexdigest()
|
135 |
cached_summary = cache.get(content_hash)
|
136 |
|
@@ -138,27 +159,44 @@ def summarize_text(text, model_name):
|
|
138 |
logger.info("Using cached summary")
|
139 |
return cached_summary
|
140 |
|
141 |
-
logger.info(
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
143 |
summary = result[0]['summary_text']
|
|
|
|
|
144 |
cache.set(content_hash, summary)
|
145 |
return summary
|
|
|
146 |
except Exception as e:
|
147 |
logger.error(f"Error in summarization: {str(e)}")
|
148 |
-
|
|
|
149 |
|
150 |
def summarize_articles(articles, model_name):
|
151 |
summaries = []
|
152 |
-
for article in articles:
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
return "\n".join(summaries)
|
163 |
|
164 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
@@ -170,7 +208,6 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
170 |
- Model: {model_name}
|
171 |
""")
|
172 |
|
173 |
-
# Check if any sources are selected
|
174 |
if not any([
|
175 |
tech_sources is not None and len(tech_sources) > 0,
|
176 |
business_sources is not None and len(business_sources) > 0,
|
@@ -185,7 +222,8 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
185 |
return summarize_articles(articles, model_name)
|
186 |
except Exception as e:
|
187 |
logger.error(f"Error in generate_summary: {str(e)}")
|
188 |
-
|
|
|
189 |
|
190 |
# Gradio Interface
|
191 |
demo = gr.Blocks()
|
@@ -195,7 +233,6 @@ with demo:
|
|
195 |
|
196 |
with gr.Row():
|
197 |
with gr.Column():
|
198 |
-
# Create checkbox groups for each category
|
199 |
tech_sources = gr.CheckboxGroup(
|
200 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
201 |
label="Technology Sources",
|
@@ -211,21 +248,6 @@ with demo:
|
|
211 |
label="World News Sources",
|
212 |
value=[]
|
213 |
)
|
214 |
-
science_sources = gr.CheckboxGroup(
|
215 |
-
choices=list(NEWS_SOURCES["Science"].keys()),
|
216 |
-
label="Science Sources",
|
217 |
-
value=[]
|
218 |
-
)
|
219 |
-
sports_sources = gr.CheckboxGroup(
|
220 |
-
choices=list(NEWS_SOURCES["Sports"].keys()),
|
221 |
-
label="Sports Sources",
|
222 |
-
value=[]
|
223 |
-
)
|
224 |
-
health_sources = gr.CheckboxGroup(
|
225 |
-
choices=list(NEWS_SOURCES["Health"].keys()),
|
226 |
-
label="Health Sources",
|
227 |
-
value=[]
|
228 |
-
)
|
229 |
|
230 |
with gr.Column():
|
231 |
model_selector = gr.Radio(
|
@@ -243,9 +265,9 @@ with demo:
|
|
243 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
244 |
except Exception as e:
|
245 |
logger.error(f"Error in get_summary: {str(e)}")
|
246 |
-
|
|
|
247 |
|
248 |
-
# Connect the components to the summary function
|
249 |
summarize_button.click(
|
250 |
get_summary,
|
251 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|
|
|
1 |
import gradio as gr
# NOTE(review): the diff added `AutoTokenizer, AutoModelForSeq2SeqGeneration`
# here, but `AutoModelForSeq2SeqGeneration` does not exist in transformers
# (the seq2seq auto-class is `AutoModelForSeq2SeqLM`), so the import raised
# ImportError at startup. Neither name is used anywhere in this file — only
# the `pipeline` factory is needed.
from transformers import pipeline
import feedparser
from datetime import datetime, timedelta
import pytz
from bs4 import BeautifulSoup
import hashlib
import threading
import logging
import traceback

# Set up logging: INFO level with timestamped, leveled messages.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
16 |
|
17 |
# Global settings
|
|
|
24 |
ARTICLE_LIMIT = 5
|
25 |
|
26 |
# Restructured news sources with fixed categories
|
27 |
+
# Restructured news sources with fixed categories.
# CATEGORIES is the display/iteration order; NEWS_SOURCES maps each category
# to {source-name: RSS feed URL}. The diff trimmed this down to one or two
# working feeds per category (the reutersagency/politico/NYT-World feeds and
# the Science/Sports/Health categories were removed).
CATEGORIES = ["Technology", "Business", "World News"]
NEWS_SOURCES = {
    "Technology": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    },
    "Business": {
        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
    },
    "World News": {
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
    }
}
|
39 |
|
40 |
+
# Global summarizer instance, created lazily and reused across calls.
# summarizer_model_name records which model the cached pipeline was built
# with so a model switch in the UI triggers a rebuild.
summarizer_instance = None
summarizer_model_name = None

def initialize_summarizer(model_name):
    """Return a shared summarization pipeline for *model_name*.

    Builds the HuggingFace ``pipeline("summarization", ...)`` on first use
    (or whenever the requested model differs from the cached one) and caches
    it in module globals. Runs on CPU (``device=-1``).

    Raises:
        Exception: re-raises any pipeline construction failure after logging
        the message and full traceback.
    """
    global summarizer_instance, summarizer_model_name
    try:
        if summarizer_instance is None or summarizer_model_name != model_name:
            logger.info(f"Initializing summarizer with model: {model_name}")
            # max_length/min_length/do_sample set here become the default
            # generation parameters for every call to the pipeline.
            summarizer_instance = pipeline("summarization",
                                           model=model_name,
                                           device=-1,
                                           max_length=130,
                                           min_length=30,
                                           do_sample=False)
            summarizer_model_name = model_name
            logger.info("Summarizer initialized successfully")
        return summarizer_instance
    except Exception as e:
        logger.error(f"Error initializing summarizer: {str(e)}")
        logger.error(traceback.format_exc())
        raise
|
62 |
+
|
63 |
class NewsCache:
|
64 |
def __init__(self, size):
|
65 |
self.cache = {}
|
|
|
83 |
articles = []
|
84 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
85 |
|
|
|
86 |
selected_sources = {
|
87 |
"Technology": tech_sources if tech_sources else [],
|
88 |
"Business": business_sources if business_sources else [],
|
89 |
+
"World News": world_sources if world_sources else []
|
|
|
|
|
|
|
90 |
}
|
91 |
|
92 |
logger.info(f"Selected sources: {selected_sources}")
|
93 |
|
94 |
for category, sources in selected_sources.items():
|
95 |
+
if not sources:
|
96 |
continue
|
97 |
|
98 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
|
|
104 |
logger.info(f"Fetching from URL: {url}")
|
105 |
feed = feedparser.parse(url)
|
106 |
|
|
|
|
|
|
|
|
|
107 |
for entry in feed.entries:
|
108 |
try:
|
109 |
+
# Handle different date formats
|
110 |
+
if hasattr(entry, 'published_parsed'):
|
111 |
+
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
|
112 |
+
else:
|
113 |
+
published = datetime.now(pytz.UTC)
|
114 |
+
|
115 |
+
# Extract and clean description
|
116 |
+
description = entry.description if hasattr(entry, 'description') else ""
|
117 |
+
description = BeautifulSoup(description, "html.parser").get_text()
|
118 |
+
description = description.strip()
|
119 |
+
|
120 |
+
if not description: # Skip entries without description
|
121 |
+
continue
|
122 |
+
|
123 |
if published > cutoff_time:
|
124 |
articles.append({
|
125 |
"title": entry.title,
|
126 |
+
"description": description,
|
127 |
"link": entry.link,
|
128 |
"category": category,
|
129 |
"source": source,
|
130 |
"published": published
|
131 |
})
|
132 |
+
|
133 |
+
except Exception as e:
|
134 |
logger.error(f"Error processing entry: {str(e)}")
|
135 |
continue
|
136 |
|
|
|
144 |
|
145 |
def summarize_text(text, model_name):
    """Summarize *text* with the shared pipeline for *model_name*.

    Short inputs (< 30 whitespace-separated words) are returned unchanged.
    Results are memoized in the module-level ``cache`` keyed by the MD5 of
    the input text (md5 is used only as a cache key, not for security).

    On any failure the first 200 characters of the input plus "..." are
    returned as a best-effort fallback instead of raising.
    """
    try:
        # Get or initialize summarizer
        summarizer = initialize_summarizer(model_name)

        # Check if text is too short to be worth summarizing.
        if len(text.split()) < 30:
            logger.info("Text too short for summarization, returning original")
            return text

        content_hash = hashlib.md5(text.encode()).hexdigest()
        cached_summary = cache.get(content_hash)

        # NOTE(review): this guard line was elided in the rendered diff;
        # reconstructed from the surrounding cache-hit logging — confirm.
        if cached_summary:
            logger.info("Using cached summary")
            return cached_summary

        logger.info("Generating new summary")
        # Clean and prepare text
        text = text.strip()
        text = ' '.join(text.split())  # Normalize whitespace

        # Generate summary
        result = summarizer(text, max_length=130, min_length=30, do_sample=False)
        summary = result[0]['summary_text']

        # Cache the result
        cache.set(content_hash, summary)
        return summary

    except Exception as e:
        logger.error(f"Error in summarization: {str(e)}")
        logger.error(traceback.format_exc())
        return text[:200] + "..."  # Return truncated text as fallback
|
179 |
|
180 |
def summarize_articles(articles, model_name):
    """Summarize every article dict in *articles* and join the results.

    Each article must carry ``title``, ``description``, ``category``,
    ``source`` and ``link`` keys (as produced by fetch_rss_news). Articles
    whose summarization raises are logged and skipped; if none succeed a
    user-facing failure message is returned instead of an empty string.

    NOTE(review): the bullet prefixes below ("π°", "π", ...) look like
    mojibake of emoji from a bad encoding round-trip; they are reproduced
    byte-for-byte here because the original characters cannot be recovered
    from this rendering — confirm against the deployed app.
    """
    summaries = []
    for i, article in enumerate(articles):
        try:
            logger.info(f"Processing article {i+1}/{len(articles)}: {article['title']}")
            content = article["description"]
            summary = summarize_text(content, model_name)
            summaries.append(f"""
π° {article['title']}
- π Category: {article['category']}
- π‘ Source: {article['source']}
- π Read More: {article['link']}
π Summary: {summary}
""")
        except Exception as e:
            logger.error(f"Error summarizing article: {str(e)}")
            continue

    if not summaries:
        return "Could not generate summaries for the selected articles."
    return "\n".join(summaries)
|
201 |
|
202 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
|
208 |
- Model: {model_name}
|
209 |
""")
|
210 |
|
|
|
211 |
if not any([
|
212 |
tech_sources is not None and len(tech_sources) > 0,
|
213 |
business_sources is not None and len(business_sources) > 0,
|
|
|
222 |
return summarize_articles(articles, model_name)
|
223 |
except Exception as e:
|
224 |
logger.error(f"Error in generate_summary: {str(e)}")
|
225 |
+
logger.error(traceback.format_exc())
|
226 |
+
return f"An error occurred: {str(e)}"
|
227 |
|
228 |
# Gradio Interface
|
229 |
demo = gr.Blocks()
|
|
|
233 |
|
234 |
with gr.Row():
|
235 |
with gr.Column():
|
|
|
236 |
tech_sources = gr.CheckboxGroup(
|
237 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
238 |
label="Technology Sources",
|
|
|
248 |
label="World News Sources",
|
249 |
value=[]
|
250 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
251 |
|
252 |
with gr.Column():
|
253 |
model_selector = gr.Radio(
|
|
|
265 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
266 |
except Exception as e:
|
267 |
logger.error(f"Error in get_summary: {str(e)}")
|
268 |
+
logger.error(traceback.format_exc())
|
269 |
+
return f"An error occurred: {str(e)}"
|
270 |
|
|
|
271 |
summarize_button.click(
|
272 |
get_summary,
|
273 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|