Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import pytz
|
@@ -7,11 +7,9 @@ from bs4 import BeautifulSoup
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
10 |
-
import traceback
|
11 |
|
12 |
# Set up logging
|
13 |
-
logging.basicConfig(level=logging.INFO
|
14 |
-
format='%(asctime)s - %(levelname)s - %(message)s')
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
17 |
# Global settings
|
@@ -37,29 +35,6 @@ NEWS_SOURCES = {
|
|
37 |
}
|
38 |
}
|
39 |
|
40 |
-
# Global summarizer instance
|
41 |
-
summarizer_instance = None
|
42 |
-
summarizer_model_name = None
|
43 |
-
|
44 |
-
def initialize_summarizer(model_name):
    """Return the shared summarization pipeline for ``model_name``.

    The pipeline is kept in the module-level ``summarizer_instance`` and is
    rebuilt only when nothing has been loaded yet or when a different model
    name is requested than the one recorded in ``summarizer_model_name``.

    Args:
        model_name: Model identifier forwarded to ``pipeline(...)``.

    Returns:
        The cached (or freshly created) summarization pipeline.

    Raises:
        Exception: Any error raised while building the pipeline is logged
            together with its traceback and then re-raised unchanged.
    """
    global summarizer_instance, summarizer_model_name
    try:
        already_loaded = (
            summarizer_instance is not None
            and summarizer_model_name == model_name
        )
        if already_loaded:
            # Same model as last time: reuse the existing pipeline.
            return summarizer_instance

        logger.info(f"Initializing summarizer with model: {model_name}")
        pipeline_options = {
            "model": model_name,
            "device": -1,
            "max_length": 130,
            "min_length": 30,
            "do_sample": False,
        }
        summarizer_instance = pipeline("summarization", **pipeline_options)
        # Record the name only after the pipeline was built successfully.
        summarizer_model_name = model_name
        logger.info("Summarizer initialized successfully")
        return summarizer_instance
    except Exception as load_error:
        logger.error(f"Error initializing summarizer: {str(load_error)}")
        logger.error(traceback.format_exc())
        raise
|
62 |
-
|
63 |
class NewsCache:
|
64 |
def __init__(self, size):
|
65 |
self.cache = {}
|
@@ -83,6 +58,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
83 |
articles = []
|
84 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
85 |
|
|
|
86 |
selected_sources = {
|
87 |
"Technology": tech_sources if tech_sources else [],
|
88 |
"Business": business_sources if business_sources else [],
|
@@ -92,7 +68,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
92 |
logger.info(f"Selected sources: {selected_sources}")
|
93 |
|
94 |
for category, sources in selected_sources.items():
|
95 |
-
if not sources:
|
96 |
continue
|
97 |
|
98 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
@@ -104,33 +80,23 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
104 |
logger.info(f"Fetching from URL: {url}")
|
105 |
feed = feedparser.parse(url)
|
106 |
|
|
|
|
|
|
|
|
|
107 |
for entry in feed.entries:
|
108 |
try:
|
109 |
-
|
110 |
-
if hasattr(entry, 'published_parsed'):
|
111 |
-
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
|
112 |
-
else:
|
113 |
-
published = datetime.now(pytz.UTC)
|
114 |
-
|
115 |
-
# Extract and clean description
|
116 |
-
description = entry.description if hasattr(entry, 'description') else ""
|
117 |
-
description = BeautifulSoup(description, "html.parser").get_text()
|
118 |
-
description = description.strip()
|
119 |
-
|
120 |
-
if not description: # Skip entries without description
|
121 |
-
continue
|
122 |
-
|
123 |
if published > cutoff_time:
|
124 |
articles.append({
|
125 |
"title": entry.title,
|
126 |
-
"description": description,
|
127 |
"link": entry.link,
|
128 |
"category": category,
|
129 |
"source": source,
|
130 |
"published": published
|
131 |
})
|
132 |
-
|
133 |
-
except Exception as e:
|
134 |
logger.error(f"Error processing entry: {str(e)}")
|
135 |
continue
|
136 |
|
@@ -144,14 +110,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
|
|
144 |
|
145 |
def summarize_text(text, model_name):
|
146 |
try:
|
147 |
-
|
148 |
-
summarizer = initialize_summarizer(model_name)
|
149 |
-
|
150 |
-
# Check if text is too short
|
151 |
-
if len(text.split()) < 30:
|
152 |
-
logger.info("Text too short for summarization, returning original")
|
153 |
-
return text
|
154 |
-
|
155 |
content_hash = hashlib.md5(text.encode()).hexdigest()
|
156 |
cached_summary = cache.get(content_hash)
|
157 |
|
@@ -159,44 +118,27 @@ def summarize_text(text, model_name):
|
|
159 |
logger.info("Using cached summary")
|
160 |
return cached_summary
|
161 |
|
162 |
-
logger.info("Generating new summary")
|
163 |
-
|
164 |
-
text = text.strip()
|
165 |
-
text = ' '.join(text.split()) # Normalize whitespace
|
166 |
-
|
167 |
-
# Generate summary
|
168 |
-
result = summarizer(text, max_length=130, min_length=30, do_sample=False)
|
169 |
summary = result[0]['summary_text']
|
170 |
-
|
171 |
-
# Cache the result
|
172 |
cache.set(content_hash, summary)
|
173 |
return summary
|
174 |
-
|
175 |
except Exception as e:
|
176 |
logger.error(f"Error in summarization: {str(e)}")
|
177 |
-
|
178 |
-
return text[:200] + "..." # Return truncated text as fallback
|
179 |
|
180 |
def summarize_articles(articles, model_name):
|
181 |
summaries = []
|
182 |
-
for
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
π Summary: {summary}
|
193 |
-
""")
|
194 |
-
except Exception as e:
|
195 |
-
logger.error(f"Error summarizing article: {str(e)}")
|
196 |
-
continue
|
197 |
-
|
198 |
-
if not summaries:
|
199 |
-
return "Could not generate summaries for the selected articles."
|
200 |
return "\n".join(summaries)
|
201 |
|
202 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
@@ -208,6 +150,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
208 |
- Model: {model_name}
|
209 |
""")
|
210 |
|
|
|
211 |
if not any([
|
212 |
tech_sources is not None and len(tech_sources) > 0,
|
213 |
business_sources is not None and len(business_sources) > 0,
|
@@ -222,8 +165,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
222 |
return summarize_articles(articles, model_name)
|
223 |
except Exception as e:
|
224 |
logger.error(f"Error in generate_summary: {str(e)}")
|
225 |
-
|
226 |
-
return f"An error occurred: {str(e)}"
|
227 |
|
228 |
# Gradio Interface
|
229 |
demo = gr.Blocks()
|
@@ -233,6 +175,7 @@ with demo:
|
|
233 |
|
234 |
with gr.Row():
|
235 |
with gr.Column():
|
|
|
236 |
tech_sources = gr.CheckboxGroup(
|
237 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
238 |
label="Technology Sources",
|
@@ -265,9 +208,9 @@ with demo:
|
|
265 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
266 |
except Exception as e:
|
267 |
logger.error(f"Error in get_summary: {str(e)}")
|
268 |
-
|
269 |
-
return f"An error occurred: {str(e)}"
|
270 |
|
|
|
271 |
summarize_button.click(
|
272 |
get_summary,
|
273 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import pipeline
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import pytz
|
|
|
7 |
import hashlib
|
8 |
import threading
|
9 |
import logging
|
|
|
10 |
|
11 |
# Set up logging
|
12 |
+
logging.basicConfig(level=logging.INFO)
|
|
|
13 |
logger = logging.getLogger(__name__)
|
14 |
|
15 |
# Global settings
|
|
|
35 |
}
|
36 |
}
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
class NewsCache:
|
39 |
def __init__(self, size):
|
40 |
self.cache = {}
|
|
|
58 |
articles = []
|
59 |
cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
|
60 |
|
61 |
+
# Create a mapping of selected sources
|
62 |
selected_sources = {
|
63 |
"Technology": tech_sources if tech_sources else [],
|
64 |
"Business": business_sources if business_sources else [],
|
|
|
68 |
logger.info(f"Selected sources: {selected_sources}")
|
69 |
|
70 |
for category, sources in selected_sources.items():
|
71 |
+
if not sources: # Skip if no sources selected for this category
|
72 |
continue
|
73 |
|
74 |
logger.info(f"Processing category: {category} with sources: {sources}")
|
|
|
80 |
logger.info(f"Fetching from URL: {url}")
|
81 |
feed = feedparser.parse(url)
|
82 |
|
83 |
+
if hasattr(feed, 'status') and feed.status != 200:
|
84 |
+
logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
|
85 |
+
continue
|
86 |
+
|
87 |
for entry in feed.entries:
|
88 |
try:
|
89 |
+
published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
if published > cutoff_time:
|
91 |
articles.append({
|
92 |
"title": entry.title,
|
93 |
+
"description": BeautifulSoup(entry.description, "html.parser").get_text(),
|
94 |
"link": entry.link,
|
95 |
"category": category,
|
96 |
"source": source,
|
97 |
"published": published
|
98 |
})
|
99 |
+
except (AttributeError, TypeError) as e:
|
|
|
100 |
logger.error(f"Error processing entry: {str(e)}")
|
101 |
continue
|
102 |
|
|
|
110 |
|
111 |
def summarize_text(text, model_name):
|
112 |
try:
|
113 |
+
summarizer = pipeline("summarization", model=model_name, device=-1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
content_hash = hashlib.md5(text.encode()).hexdigest()
|
115 |
cached_summary = cache.get(content_hash)
|
116 |
|
|
|
118 |
logger.info("Using cached summary")
|
119 |
return cached_summary
|
120 |
|
121 |
+
logger.info(f"Generating new summary using model: {model_name}")
|
122 |
+
result = summarizer(text, max_length=120, min_length=40, truncation=True)
|
|
|
|
|
|
|
|
|
|
|
123 |
summary = result[0]['summary_text']
|
|
|
|
|
124 |
cache.set(content_hash, summary)
|
125 |
return summary
|
|
|
126 |
except Exception as e:
|
127 |
logger.error(f"Error in summarization: {str(e)}")
|
128 |
+
return "Summary unavailable."
|
|
|
129 |
|
130 |
def summarize_articles(articles, model_name):
    """Build one text report containing a summary block per article.

    Args:
        articles: Iterable of dicts; each is read for the keys
            ``"description"``, ``"title"``, ``"category"``, ``"source"``
            and ``"link"``.
        model_name: Passed through to ``summarize_text`` for every article.

    Returns:
        The per-article blocks joined with newlines; an empty string when
        ``articles`` is empty.
    """

    def format_entry(article):
        # Summarize the description first, then render the block.
        summary = summarize_text(article["description"], model_name)
        # NOTE(review): the label glyphs below look like mis-decoded emoji;
        # confirm against the real file before changing them.
        return f"""
π° {article['title']}
- π Category: {article['category']}
- π‘ Source: {article['source']}
- π Read More: {article['link']}
π Summary: {summary}
"""

    return "\n".join(format_entry(article) for article in articles)
|
143 |
|
144 |
def generate_summary(tech_sources, business_sources, world_sources, model_name):
|
|
|
150 |
- Model: {model_name}
|
151 |
""")
|
152 |
|
153 |
+
# Check if any sources are selected
|
154 |
if not any([
|
155 |
tech_sources is not None and len(tech_sources) > 0,
|
156 |
business_sources is not None and len(business_sources) > 0,
|
|
|
165 |
return summarize_articles(articles, model_name)
|
166 |
except Exception as e:
|
167 |
logger.error(f"Error in generate_summary: {str(e)}")
|
168 |
+
return f"An error occurred while generating the summary. Please try again."
|
|
|
169 |
|
170 |
# Gradio Interface
|
171 |
demo = gr.Blocks()
|
|
|
175 |
|
176 |
with gr.Row():
|
177 |
with gr.Column():
|
178 |
+
# Create checkbox groups for each category
|
179 |
tech_sources = gr.CheckboxGroup(
|
180 |
choices=list(NEWS_SOURCES["Technology"].keys()),
|
181 |
label="Technology Sources",
|
|
|
208 |
return generate_summary(tech_sources, business_sources, world_sources, model_name)
|
209 |
except Exception as e:
|
210 |
logger.error(f"Error in get_summary: {str(e)}")
|
211 |
+
return "An error occurred while processing your request. Please try again."
|
|
|
212 |
|
213 |
+
# Connect the components to the summary function
|
214 |
summarize_button.click(
|
215 |
get_summary,
|
216 |
inputs=[tech_sources, business_sources, world_sources, model_selector],
|