loayshabet committed on
Commit
b2a401d
·
verified ·
1 Parent(s): ff70769

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -87
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqGeneration
3
  import feedparser
4
  from datetime import datetime, timedelta
5
  import pytz
@@ -7,11 +7,9 @@ from bs4 import BeautifulSoup
7
  import hashlib
8
  import threading
9
  import logging
10
- import traceback
11
 
12
  # Set up logging
13
- logging.basicConfig(level=logging.INFO,
14
- format='%(asctime)s - %(levelname)s - %(message)s')
15
  logger = logging.getLogger(__name__)
16
 
17
  # Global settings
@@ -37,29 +35,6 @@ NEWS_SOURCES = {
37
  }
38
  }
39
 
40
- # Global summarizer instance
41
- summarizer_instance = None
42
- summarizer_model_name = None
43
-
44
- def initialize_summarizer(model_name):
45
- global summarizer_instance, summarizer_model_name
46
- try:
47
- if summarizer_instance is None or summarizer_model_name != model_name:
48
- logger.info(f"Initializing summarizer with model: {model_name}")
49
- summarizer_instance = pipeline("summarization",
50
- model=model_name,
51
- device=-1,
52
- max_length=130,
53
- min_length=30,
54
- do_sample=False)
55
- summarizer_model_name = model_name
56
- logger.info("Summarizer initialized successfully")
57
- return summarizer_instance
58
- except Exception as e:
59
- logger.error(f"Error initializing summarizer: {str(e)}")
60
- logger.error(traceback.format_exc())
61
- raise
62
-
63
  class NewsCache:
64
  def __init__(self, size):
65
  self.cache = {}
@@ -83,6 +58,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
83
  articles = []
84
  cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
85
 
 
86
  selected_sources = {
87
  "Technology": tech_sources if tech_sources else [],
88
  "Business": business_sources if business_sources else [],
@@ -92,7 +68,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
92
  logger.info(f"Selected sources: {selected_sources}")
93
 
94
  for category, sources in selected_sources.items():
95
- if not sources:
96
  continue
97
 
98
  logger.info(f"Processing category: {category} with sources: {sources}")
@@ -104,33 +80,23 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
104
  logger.info(f"Fetching from URL: {url}")
105
  feed = feedparser.parse(url)
106
 
 
 
 
 
107
  for entry in feed.entries:
108
  try:
109
- # Handle different date formats
110
- if hasattr(entry, 'published_parsed'):
111
- published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
112
- else:
113
- published = datetime.now(pytz.UTC)
114
-
115
- # Extract and clean description
116
- description = entry.description if hasattr(entry, 'description') else ""
117
- description = BeautifulSoup(description, "html.parser").get_text()
118
- description = description.strip()
119
-
120
- if not description: # Skip entries without description
121
- continue
122
-
123
  if published > cutoff_time:
124
  articles.append({
125
  "title": entry.title,
126
- "description": description,
127
  "link": entry.link,
128
  "category": category,
129
  "source": source,
130
  "published": published
131
  })
132
-
133
- except Exception as e:
134
  logger.error(f"Error processing entry: {str(e)}")
135
  continue
136
 
@@ -144,14 +110,7 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
144
 
145
  def summarize_text(text, model_name):
146
  try:
147
- # Get or initialize summarizer
148
- summarizer = initialize_summarizer(model_name)
149
-
150
- # Check if text is too short
151
- if len(text.split()) < 30:
152
- logger.info("Text too short for summarization, returning original")
153
- return text
154
-
155
  content_hash = hashlib.md5(text.encode()).hexdigest()
156
  cached_summary = cache.get(content_hash)
157
 
@@ -159,44 +118,27 @@ def summarize_text(text, model_name):
159
  logger.info("Using cached summary")
160
  return cached_summary
161
 
162
- logger.info("Generating new summary")
163
- # Clean and prepare text
164
- text = text.strip()
165
- text = ' '.join(text.split()) # Normalize whitespace
166
-
167
- # Generate summary
168
- result = summarizer(text, max_length=130, min_length=30, do_sample=False)
169
  summary = result[0]['summary_text']
170
-
171
- # Cache the result
172
  cache.set(content_hash, summary)
173
  return summary
174
-
175
  except Exception as e:
176
  logger.error(f"Error in summarization: {str(e)}")
177
- logger.error(traceback.format_exc())
178
- return text[:200] + "..." # Return truncated text as fallback
179
 
180
  def summarize_articles(articles, model_name):
181
  summaries = []
182
- for i, article in enumerate(articles):
183
- try:
184
- logger.info(f"Processing article {i+1}/{len(articles)}: {article['title']}")
185
- content = article["description"]
186
- summary = summarize_text(content, model_name)
187
- summaries.append(f"""
188
- 📰 {article['title']}
189
- - πŸ“ Category: {article['category']}
190
- - 💡 Source: {article['source']}
191
- - 🔗 Read More: {article['link']}
192
- 📃 Summary: {summary}
193
- """)
194
- except Exception as e:
195
- logger.error(f"Error summarizing article: {str(e)}")
196
- continue
197
-
198
- if not summaries:
199
- return "Could not generate summaries for the selected articles."
200
  return "\n".join(summaries)
201
 
202
  def generate_summary(tech_sources, business_sources, world_sources, model_name):
@@ -208,6 +150,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
208
  - Model: {model_name}
209
  """)
210
 
 
211
  if not any([
212
  tech_sources is not None and len(tech_sources) > 0,
213
  business_sources is not None and len(business_sources) > 0,
@@ -222,8 +165,7 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
222
  return summarize_articles(articles, model_name)
223
  except Exception as e:
224
  logger.error(f"Error in generate_summary: {str(e)}")
225
- logger.error(traceback.format_exc())
226
- return f"An error occurred: {str(e)}"
227
 
228
  # Gradio Interface
229
  demo = gr.Blocks()
@@ -233,6 +175,7 @@ with demo:
233
 
234
  with gr.Row():
235
  with gr.Column():
 
236
  tech_sources = gr.CheckboxGroup(
237
  choices=list(NEWS_SOURCES["Technology"].keys()),
238
  label="Technology Sources",
@@ -265,9 +208,9 @@ with demo:
265
  return generate_summary(tech_sources, business_sources, world_sources, model_name)
266
  except Exception as e:
267
  logger.error(f"Error in get_summary: {str(e)}")
268
- logger.error(traceback.format_exc())
269
- return f"An error occurred: {str(e)}"
270
 
 
271
  summarize_button.click(
272
  get_summary,
273
  inputs=[tech_sources, business_sources, world_sources, model_selector],
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
  import feedparser
4
  from datetime import datetime, timedelta
5
  import pytz
 
7
  import hashlib
8
  import threading
9
  import logging
 
10
 
11
  # Set up logging
12
+ logging.basicConfig(level=logging.INFO)
 
13
  logger = logging.getLogger(__name__)
14
 
15
  # Global settings
 
35
  }
36
  }
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  class NewsCache:
39
  def __init__(self, size):
40
  self.cache = {}
 
58
  articles = []
59
  cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
60
 
61
+ # Create a mapping of selected sources
62
  selected_sources = {
63
  "Technology": tech_sources if tech_sources else [],
64
  "Business": business_sources if business_sources else [],
 
68
  logger.info(f"Selected sources: {selected_sources}")
69
 
70
  for category, sources in selected_sources.items():
71
+ if not sources: # Skip if no sources selected for this category
72
  continue
73
 
74
  logger.info(f"Processing category: {category} with sources: {sources}")
 
80
  logger.info(f"Fetching from URL: {url}")
81
  feed = feedparser.parse(url)
82
 
83
+ if hasattr(feed, 'status') and feed.status != 200:
84
+ logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
85
+ continue
86
+
87
  for entry in feed.entries:
88
  try:
89
+ published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  if published > cutoff_time:
91
  articles.append({
92
  "title": entry.title,
93
+ "description": BeautifulSoup(entry.description, "html.parser").get_text(),
94
  "link": entry.link,
95
  "category": category,
96
  "source": source,
97
  "published": published
98
  })
99
+ except (AttributeError, TypeError) as e:
 
100
  logger.error(f"Error processing entry: {str(e)}")
101
  continue
102
 
 
110
 
111
  def summarize_text(text, model_name):
112
  try:
113
+ summarizer = pipeline("summarization", model=model_name, device=-1)
 
 
 
 
 
 
 
114
  content_hash = hashlib.md5(text.encode()).hexdigest()
115
  cached_summary = cache.get(content_hash)
116
 
 
118
  logger.info("Using cached summary")
119
  return cached_summary
120
 
121
+ logger.info(f"Generating new summary using model: {model_name}")
122
+ result = summarizer(text, max_length=120, min_length=40, truncation=True)
 
 
 
 
 
123
  summary = result[0]['summary_text']
 
 
124
  cache.set(content_hash, summary)
125
  return summary
 
126
  except Exception as e:
127
  logger.error(f"Error in summarization: {str(e)}")
128
+ return "Summary unavailable."
 
129
 
130
  def summarize_articles(articles, model_name):
131
  summaries = []
132
+ for article in articles:
133
+ content = article["description"]
134
+ summary = summarize_text(content, model_name)
135
+ summaries.append(f"""
136
+ 📰 {article['title']}
137
+ - πŸ“ Category: {article['category']}
138
+ - 💡 Source: {article['source']}
139
+ - 🔗 Read More: {article['link']}
140
+ 📃 Summary: {summary}
141
+ """)
 
 
 
 
 
 
 
 
142
  return "\n".join(summaries)
143
 
144
  def generate_summary(tech_sources, business_sources, world_sources, model_name):
 
150
  - Model: {model_name}
151
  """)
152
 
153
+ # Check if any sources are selected
154
  if not any([
155
  tech_sources is not None and len(tech_sources) > 0,
156
  business_sources is not None and len(business_sources) > 0,
 
165
  return summarize_articles(articles, model_name)
166
  except Exception as e:
167
  logger.error(f"Error in generate_summary: {str(e)}")
168
+ return f"An error occurred while generating the summary. Please try again."
 
169
 
170
  # Gradio Interface
171
  demo = gr.Blocks()
 
175
 
176
  with gr.Row():
177
  with gr.Column():
178
+ # Create checkbox groups for each category
179
  tech_sources = gr.CheckboxGroup(
180
  choices=list(NEWS_SOURCES["Technology"].keys()),
181
  label="Technology Sources",
 
208
  return generate_summary(tech_sources, business_sources, world_sources, model_name)
209
  except Exception as e:
210
  logger.error(f"Error in get_summary: {str(e)}")
211
+ return "An error occurred while processing your request. Please try again."
 
212
 
213
+ # Connect the components to the summary function
214
  summarize_button.click(
215
  get_summary,
216
  inputs=[tech_sources, business_sources, world_sources, model_selector],