loayshabet committed (verified)
Commit ff70769 · Parent: 7620715

Update app.py

Files changed (1): app.py (+90 -68)
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
 import feedparser
 from datetime import datetime, timedelta
 import pytz
@@ -7,9 +7,11 @@ from bs4 import BeautifulSoup
 import hashlib
 import threading
 import logging
+import traceback
 
 # Set up logging
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 # Global settings
@@ -22,36 +24,42 @@ RSS_FETCH_INTERVAL = timedelta(hours=8)
 ARTICLE_LIMIT = 5
 
 # Restructured news sources with fixed categories
-CATEGORIES = ["Technology", "Business", "World News", "Science", "Sports", "Health"]
+CATEGORIES = ["Technology", "Business", "World News"]
 NEWS_SOURCES = {
     "Technology": {
         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
     },
     "Business": {
         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
-        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
     },
     "World News": {
-        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
         "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
-        "CNN": "http://rss.cnn.com/rss/edition_world.rss",
-        "reutersagency": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
-    },
-    "Science": {
-        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
-    },
-    "Sports": {
-        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
-        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
-    },
-    "Health": {
-        "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
-        "politico": "http://rss.politico.com/healthcare.xml",
-        "reutersagency": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
-    },
+    }
 }
 
+# Global summarizer instance
+summarizer_instance = None
+summarizer_model_name = None
+
+def initialize_summarizer(model_name):
+    global summarizer_instance, summarizer_model_name
+    try:
+        if summarizer_instance is None or summarizer_model_name != model_name:
+            logger.info(f"Initializing summarizer with model: {model_name}")
+            summarizer_instance = pipeline("summarization",
+                                           model=model_name,
+                                           device=-1,
+                                           max_length=130,
+                                           min_length=30,
+                                           do_sample=False)
+            summarizer_model_name = model_name
+            logger.info("Summarizer initialized successfully")
+        return summarizer_instance
+    except Exception as e:
+        logger.error(f"Error initializing summarizer: {str(e)}")
+        logger.error(traceback.format_exc())
+        raise
+
 class NewsCache:
     def __init__(self, size):
         self.cache = {}
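
The main change in this hunk is moving pipeline construction out of summarize_text into a lazily initialized module-level instance, so switching models reloads weights once instead of on every call. A minimal standalone sketch of that pattern (the model name below is a stand-in for illustration, not necessarily one of the app's radio choices):

# Sketch: build one summarization pipeline per model name and reuse it.
from transformers import pipeline

_summarizer = None
_summarizer_name = None

def get_summarizer(model_name):
    global _summarizer, _summarizer_name
    if _summarizer is None or _summarizer_name != model_name:
        # device=-1 keeps inference on CPU, as in the commit above
        _summarizer = pipeline("summarization", model=model_name, device=-1)
        _summarizer_name = model_name
    return _summarizer

summarizer = get_summarizer("sshleifer/distilbart-cnn-12-6")  # stand-in model name
print(summarizer("The quick brown fox jumps over the lazy dog. " * 20,
                 max_length=40, min_length=10)[0]["summary_text"])
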
@@ -75,20 +83,16 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
     articles = []
     cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
 
-    # Create a mapping of selected sources
     selected_sources = {
         "Technology": tech_sources if tech_sources else [],
         "Business": business_sources if business_sources else [],
-        "World News": world_sources if world_sources else [],
-        "Science": science_sources if science_sources else [],
-        "Sports": sports_sources if sports_sources else [],
-        "Health": health_sources if health_sources else [],
+        "World News": world_sources if world_sources else []
     }
 
     logger.info(f"Selected sources: {selected_sources}")
 
     for category, sources in selected_sources.items():
-        if not sources:  # Skip if no sources selected for this category
+        if not sources:
             continue
 
         logger.info(f"Processing category: {category} with sources: {sources}")
@@ -100,23 +104,33 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
             logger.info(f"Fetching from URL: {url}")
             feed = feedparser.parse(url)
 
-            if hasattr(feed, 'status') and feed.status != 200:
-                logger.warning(f"Failed to fetch feed from {url}. Status: {feed.status}")
-                continue
-
             for entry in feed.entries:
                 try:
-                    published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
+                    # Handle different date formats
+                    if hasattr(entry, 'published_parsed'):
+                        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
+                    else:
+                        published = datetime.now(pytz.UTC)
+
+                    # Extract and clean description
+                    description = entry.description if hasattr(entry, 'description') else ""
+                    description = BeautifulSoup(description, "html.parser").get_text()
+                    description = description.strip()
+
+                    if not description:  # Skip entries without description
+                        continue
+
                     if published > cutoff_time:
                         articles.append({
                             "title": entry.title,
-                            "description": BeautifulSoup(entry.description, "html.parser").get_text(),
+                            "description": description,
                             "link": entry.link,
                             "category": category,
                             "source": source,
                             "published": published
                         })
-                except (AttributeError, TypeError) as e:
+
+                except Exception as e:
                     logger.error(f"Error processing entry: {str(e)}")
                     continue
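
The rewritten entry loop no longer assumes every item carries published_parsed and description; feedparser simply omits attributes a feed does not provide. A self-contained sketch of the same guards, run against an inline RSS string (the feed content here is invented for illustration):

# Sketch: guard optional feedparser fields before using them.
from datetime import datetime
import feedparser
import pytz
from bs4 import BeautifulSoup

SAMPLE_RSS = """<?xml version="1.0"?>
<rss version="2.0"><channel><title>Demo</title>
<item>
  <title>Entry without a date</title>
  <link>https://example.com/a</link>
  <description>&lt;p&gt;Some &lt;b&gt;HTML&lt;/b&gt; body.&lt;/p&gt;</description>
</item>
</channel></rss>"""

feed = feedparser.parse(SAMPLE_RSS)
for entry in feed.entries:
    # Fall back to "now" when the item has no parseable date
    if getattr(entry, "published_parsed", None):
        published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
    else:
        published = datetime.now(pytz.UTC)
    # Strip markup and skip entries with an empty description
    description = BeautifulSoup(getattr(entry, "description", ""), "html.parser").get_text().strip()
    if description:
        print(entry.title, "|", published.isoformat(), "|", description)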
 
@@ -130,7 +144,14 @@ def fetch_rss_news(tech_sources, business_sources, world_sources):
 
 def summarize_text(text, model_name):
     try:
-        summarizer = pipeline("summarization", model=model_name, device=-1)
+        # Get or initialize summarizer
+        summarizer = initialize_summarizer(model_name)
+
+        # Check if text is too short
+        if len(text.split()) < 30:
+            logger.info("Text too short for summarization, returning original")
+            return text
+
         content_hash = hashlib.md5(text.encode()).hexdigest()
         cached_summary = cache.get(content_hash)
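
The new word-count guard sidesteps a common summarization failure mode: asking the model to produce a summary with a 30-token floor from text that is already shorter than that. The check costs one split; a tiny sketch (the callables stand in for a real summarizer):

# Sketch: return short inputs unchanged instead of "summarizing" them.
def maybe_summarize(text, summarize, min_words=30):
    if len(text.split()) < min_words:
        return text  # already shorter than the summary floor
    return summarize(text)

print(maybe_summarize("Too short to summarize.", str.upper))
print(maybe_summarize("word " * 40, lambda t: t[:20] + "..."))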
 
@@ -138,27 +159,44 @@ def summarize_text(text, model_name):
             logger.info("Using cached summary")
             return cached_summary
 
-        logger.info(f"Generating new summary using model: {model_name}")
-        result = summarizer(text, max_length=120, min_length=40, truncation=True)
+        logger.info("Generating new summary")
+        # Clean and prepare text
+        text = text.strip()
+        text = ' '.join(text.split())  # Normalize whitespace
+
+        # Generate summary
+        result = summarizer(text, max_length=130, min_length=30, do_sample=False)
         summary = result[0]['summary_text']
+
+        # Cache the result
         cache.set(content_hash, summary)
         return summary
+
     except Exception as e:
         logger.error(f"Error in summarization: {str(e)}")
-        return "Summary unavailable."
+        logger.error(traceback.format_exc())
+        return text[:200] + "..."  # Return truncated text as fallback
 
 def summarize_articles(articles, model_name):
     summaries = []
-    for article in articles:
-        content = article["description"]
-        summary = summarize_text(content, model_name)
-        summaries.append(f"""
-        📰 {article['title']}
-        - 📁 Category: {article['category']}
-        - 💡 Source: {article['source']}
-        - 🔗 Read More: {article['link']}
-        📃 Summary: {summary}
-        """)
+    for i, article in enumerate(articles):
+        try:
+            logger.info(f"Processing article {i+1}/{len(articles)}: {article['title']}")
+            content = article["description"]
+            summary = summarize_text(content, model_name)
+            summaries.append(f"""
+            📰 {article['title']}
+            - 📁 Category: {article['category']}
+            - 💡 Source: {article['source']}
+            - 🔗 Read More: {article['link']}
+            📃 Summary: {summary}
+            """)
+        except Exception as e:
+            logger.error(f"Error summarizing article: {str(e)}")
+            continue
+
+    if not summaries:
+        return "Could not generate summaries for the selected articles."
     return "\n".join(summaries)
 
 def generate_summary(tech_sources, business_sources, world_sources, model_name):
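
summarize_text keys its cache on an MD5 of the article text, which is memoization rather than a security use of the hash. A minimal sketch of the same idea, with a lock that the app's NewsCache (whose body is not shown in these hunks) may also hold given the threading import; the eviction size and FIFO policy here are assumptions:

# Sketch: memoize an expensive text transform by hashing its input.
import hashlib
import threading

class TextCache:
    def __init__(self, size=500):
        self.size = size
        self._store = {}
        self._lock = threading.Lock()  # summarization may run on worker threads

    def get_or_compute(self, text, compute):
        key = hashlib.md5(text.encode()).hexdigest()
        with self._lock:
            if key in self._store:
                return self._store[key]
        value = compute(text)  # run the slow path outside the lock
        with self._lock:
            if len(self._store) >= self.size:
                self._store.pop(next(iter(self._store)))  # drop oldest insertion
            self._store[key] = value
        return value

cache = TextCache()
print(cache.get_or_compute("some article body", lambda t: t[:10]))
print(cache.get_or_compute("some article body", lambda t: t[:10]))  # served from cache
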
@@ -170,7 +208,6 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
     - Model: {model_name}
     """)
 
-    # Check if any sources are selected
     if not any([
         tech_sources is not None and len(tech_sources) > 0,
         business_sources is not None and len(business_sources) > 0,
@@ -185,7 +222,8 @@ def generate_summary(tech_sources, business_sources, world_sources, model_name):
         return summarize_articles(articles, model_name)
     except Exception as e:
         logger.error(f"Error in generate_summary: {str(e)}")
-        return f"An error occurred while generating the summary. Please try again."
+        logger.error(traceback.format_exc())
+        return f"An error occurred: {str(e)}"
 
 # Gradio Interface
 demo = gr.Blocks()
@@ -195,7 +233,6 @@ with demo:
 
     with gr.Row():
         with gr.Column():
-            # Create checkbox groups for each category
            tech_sources = gr.CheckboxGroup(
                 choices=list(NEWS_SOURCES["Technology"].keys()),
                 label="Technology Sources",
@@ -211,21 +248,6 @@ with demo:
                 label="World News Sources",
                 value=[]
             )
-            science_sources = gr.CheckboxGroup(
-                choices=list(NEWS_SOURCES["Science"].keys()),
-                label="Science Sources",
-                value=[]
-            )
-            sports_sources = gr.CheckboxGroup(
-                choices=list(NEWS_SOURCES["Sports"].keys()),
-                label="Sports Sources",
-                value=[]
-            )
-            health_sources = gr.CheckboxGroup(
-                choices=list(NEWS_SOURCES["Health"].keys()),
-                label="Health Sources",
-                value=[]
-            )
 
         with gr.Column():
             model_selector = gr.Radio(
@@ -243,9 +265,9 @@ with demo:
             return generate_summary(tech_sources, business_sources, world_sources, model_name)
         except Exception as e:
             logger.error(f"Error in get_summary: {str(e)}")
-            return "An error occurred while processing your request. Please try again."
+            logger.error(traceback.format_exc())
+            return f"An error occurred: {str(e)}"
 
-    # Connect the components to the summary function
     summarize_button.click(
         get_summary,
         inputs=[tech_sources, business_sources, world_sources, model_selector],
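
For context, a stripped-down sketch of the Blocks wiring these hunks adjust: CheckboxGroup values reach the click handler as lists of selected labels, in the order given to inputs=. The handler below just echoes its inputs; it is a placeholder, not the app's summarization logic:

# Sketch: CheckboxGroup selections arrive in the handler as lists of labels.
import gradio as gr

with gr.Blocks() as sketch:
    tech = gr.CheckboxGroup(choices=["TheNewYorkTimes"], label="Technology Sources", value=[])
    world = gr.CheckboxGroup(choices=["BBC"], label="World News Sources", value=[])
    out = gr.Textbox(label="Summary")
    btn = gr.Button("Summarize")

    def handler(tech_sources, world_sources):
        # A real handler would fetch and summarize; here we echo the selection.
        return f"Technology: {tech_sources}\nWorld News: {world_sources}"

    btn.click(handler, inputs=[tech, world], outputs=out)

if __name__ == "__main__":
    sketch.launch()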
 