loayshabet committed
Commit 4e3626e · verified · 1 Parent(s): 29adf37

Update app.py

Files changed (1)
  1. app.py +116 -177
app.py CHANGED
@@ -5,223 +5,161 @@ from datetime import datetime, timedelta
  import json
  import os
  import logging
- import hashlib
  import pytz
  from bs4 import BeautifulSoup
+ import hashlib
+ import threading

- # Logging setup
+ # Logging setup
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

- # News sources and RSS feeds
+ # Global settings
+ SUMMARIZER_MODEL = "facebook/bart-large-cnn"  # You can replace this with other summarization models
+ CACHE_SIZE = 500  # Maximum number of cached summaries
+ RSS_FETCH_INTERVAL = timedelta(hours=8)  # Fetch recent news within the last 8 hours
+ TIMEOUT_LIMIT = 30  # Maximum time in seconds to process summaries
+
+ # News sources
  NEWS_SOURCES = {
-     "Technology": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-         "Reuters": "https://www.reutersagency.com/feed/?best-topics=tech&post_type=best"
-     },
-     "Business": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Business.xml",
-         "Reuters": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"
-     },
-     "Science": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Science.xml"
-     },
-     "World News": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/World.xml",
-         "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
-         "CNN": "http://rss.cnn.com/rss/edition_world.rss",
-         "Reuters": "https://www.reutersagency.com/feed/?taxonomy=best-regions&post_type=best"
-     },
-     "Sports": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml",
-         "Reuters": "https://www.reutersagency.com/feed/?best-topics=sports&post_type=best"
-     },
-     "Health": {
-         "TheNewYorkTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Health.xml",
-         "Politico": "http://rss.politico.com/healthcare.xml",
-         "Reuters": "https://www.reutersagency.com/feed/?best-topics=health&post_type=best"
-     },
+     "Technology": {"NYTimes": "https://rss.nytimes.com/services/xml/rss/nyt/Technology.xml"},
+     "Business": {"Reuters": "https://www.reutersagency.com/feed/?best-topics=business-finance&post_type=best"},
+     "World": {"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml"},
  }

- # News summarization model
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)  # use the CPU
-
+ # Initialize cache
  class NewsCache:
-     """Class to manage cached summaries for articles."""
-     def __init__(self):
-         self.summaries = {}
-         self.max_cache_size = 1000
-
-     def store_summary(self, content_hash, summary):
-         if len(self.summaries) >= self.max_cache_size:
-             self.summaries.pop(next(iter(self.summaries)))
-         self.summaries[content_hash] = summary
-
-     def get_summary(self, content_hash):
-         return self.summaries.get(content_hash)
-
- news_cache = NewsCache()
-
- def get_content_hash(content):
-     """Generate a hash for the content."""
-     return hashlib.md5(content.encode()).hexdigest()
-
- def parse_date(date_str):
-     """Parse date string to datetime object."""
-     try:
-         return datetime.fromisoformat(date_str).astimezone(pytz.UTC)
-     except Exception as e:
-         logging.warning(f"Date parsing error: {e}")
-         return None
-
- def fetch_news_from_rss(categories):
-     """Fetch news from RSS feeds based on user interests."""
+     def __init__(self, size):
+         self.cache = {}
+         self.size = size
+         self.lock = threading.Lock()
+
+     def get(self, key):
+         with self.lock:
+             return self.cache.get(key)
+
+     def set(self, key, value):
+         with self.lock:
+             if len(self.cache) >= self.size:
+                 # Remove oldest cached item
+                 oldest_key = next(iter(self.cache))
+                 del self.cache[oldest_key]
+             self.cache[key] = value
+
+ cache = NewsCache(CACHE_SIZE)
+
+ # Initialize summarizer
+ summarizer = pipeline("summarization", model=SUMMARIZER_MODEL, device=-1)
+
+ # Utility functions
+ def fetch_rss_news(categories):
+     """Fetch news articles from RSS feeds based on selected categories."""
      articles = []
-     cutoff_time = datetime.now(pytz.UTC) - timedelta(hours=24)  # make sure 24 hours is used here
-
+     cutoff_time = datetime.now(pytz.UTC) - RSS_FETCH_INTERVAL
+
      for category in categories:
-         if category in NEWS_SOURCES:
-             for source, feed_url in NEWS_SOURCES[category].items():
-                 try:
-                     feed = feedparser.parse(feed_url)
-                     logging.info(f"Fetched {len(feed.entries)} entries from {source} for category {category}")  # log the number of entries
-                     for entry in feed.entries:
-                         published = parse_date(entry.get('published', entry.get('updated')))  # use "updated" when "published" is missing
-                         if published and published > cutoff_time:
-                             articles.append({
-                                 'title': entry.get('title', ''),
-                                 'description': BeautifulSoup(entry.get('description', ''), 'html.parser').get_text(),
-                                 'link': entry.get('link', ''),
-                                 'published': entry.get('published', entry.get('updated', '')),
-                                 'category': category,
-                                 'source': source
-                             })
-                 except Exception as e:
-                     logging.error(f"Error fetching from {feed_url}: {e}")
-                     continue
-
-     logging.info(f"Total articles fetched in last 24 hours: {len(articles)}")  # log the total number of articles
-
-     if not articles:  # if no articles were found
-         logging.warning("No articles found within the specified timeframe.")
-
+         for source, url in NEWS_SOURCES.get(category, {}).items():
+             try:
+                 feed = feedparser.parse(url)
+                 for entry in feed.entries:
+                     published = datetime(*entry.published_parsed[:6], tzinfo=pytz.UTC)
+                     if published > cutoff_time:
+                         articles.append({
+                             "title": entry.title,
+                             "description": BeautifulSoup(entry.description, "html.parser").get_text(),
+                             "link": entry.link,
+                             "category": category,
+                             "source": source,
+                             "published": published
+                         })
+             except Exception as e:
+                 logging.error(f"Failed to fetch from {url}: {e}")
      return articles

- def generate_summary(text):
-     """Generate AI-powered summary for the article."""
-     content_hash = get_content_hash(text)
-     cached_summary = news_cache.get_summary(content_hash)
+ def summarize_text(text):
+     """Summarize the text using the AI model."""
+     content_hash = hashlib.md5(text.encode()).hexdigest()
+     cached_summary = cache.get(content_hash)
      if cached_summary:
          return cached_summary
-
+
      try:
-         result = summarizer(
-             text,
-             max_length=200,
-             min_length=50,
-             do_sample=False,
-             truncation=True
-         )
-
-         if result and len(result) > 0:
-             summary = result[0]['summary_text'].strip()
-             if summary:
-                 formatted_summary = "\n• " + "\n• ".join([s.strip() for s in summary.split('.') if s.strip()])
-                 news_cache.store_summary(content_hash, formatted_summary)
-                 return formatted_summary
-
+         result = summarizer(text, max_length=120, min_length=40, truncation=True)
+         summary = result[0]['summary_text']
+         cache.set(content_hash, summary)
+         return summary
      except Exception as e:
-         logging.error(f"Summarization error: {e}")
-
-     return "Unable to generate summary."
+         logging.error(f"Summarization failed: {e}")
+         return "Summary unavailable."

- def get_personalized_summary(name):
-     """Generate personalized news summary."""
-     logging.info(f"Starting summary generation for user: {name}")
-
-     if not name:
-         return "Please enter your name!"
-
-     preferences_file = f"user_preferences/preferences_{name}.json"
-
+ def generate_user_summary(name):
+     """Generate a personalized news summary based on user preferences."""
+     # Load preferences
      try:
-         with open(preferences_file, "r") as f:
+         with open(f"user_preferences/preferences_{name}.json") as f:
              preferences = json.load(f)
      except FileNotFoundError:
-         return "Please set your preferences first!"
+         return "Preferences not found. Please set your preferences first."
      except Exception as e:
-         return f"Error loading preferences: {e}"
+         logging.error(f"Error loading preferences: {e}")
+         return "Failed to load preferences."

-     articles = fetch_news_from_rss(preferences["interests"])
-
+     categories = preferences.get("interests", [])
+     if not categories:
+         return "No categories selected. Please update your preferences."
+
+     # Fetch news
+     articles = fetch_rss_news(categories)
      if not articles:
-         return "No recent news articles found from the last 12 hours."
+         return "No recent news found in your selected categories."

+     # Summarize articles
      summaries = []
-
      for article in articles:
-         content = article['description']
-         if not content:
-             continue
-
-         summary = generate_summary(content)
-         if not summary:
-             continue
-
-         formatted_summary = f"""📰 {article['title']}
- 📁 Category: {article['category']}
- ⏰ Published: {article['published']}
-
- {summary}
-
- 🔗 [Read more]({article['link']})
+         summary = summarize_text(article["description"])
+         summaries.append(f"""**{article['title']}**
+ **Category:** {article['category']} | **Source:** {article['source']} | **Published:** {article['published'].strftime('%Y-%m-%d %H:%M')}
+ {summary}
+ [Read more]({article['link']})
+ ---""")

- ---"""
-         summaries.append(formatted_summary)
+     return "\n\n".join(summaries) if summaries else "No summaries available."

-     if not summaries:
-         return "Unable to generate summaries for recent news."
-
-     return "\n".join(summaries)
-
- # Gradio interface
- demo = gr.Blocks(title="AI News Summarizer")
+ # Gradio interface
+ demo = gr.Blocks()

  with demo:
-     gr.Markdown("# 📰 AI News Summarizer")
+     gr.Markdown("# 📰 Personalized AI News Summarizer")

      with gr.Tab("Set Preferences"):
          name_input = gr.Textbox(label="Your Name")
-         interests_checkboxes = gr.CheckboxGroup(choices=list(NEWS_SOURCES.keys()), label="News Interests (Select multiple)")
+         interests = gr.CheckboxGroup(
+             choices=list(NEWS_SOURCES.keys()),
+             label="Select Your Interests"
+         )
          save_button = gr.Button("Save Preferences")
-         preferences_output = gr.Textbox(label="Status")
-
-         def save_preferences(name, interests):
-             if not name or not interests:
-                 return "Please fill in all required fields!"
-
-             preferences = {
-                 "name": name,
-                 "interests": interests,
-                 "last_updated": datetime.now().isoformat()
-             }
-
+         save_status = gr.Textbox(label="Status")
+
+         def save_preferences(name, selected_interests):
+             if not name or not selected_interests:
+                 return "Name and interests are required!"
+             preferences = {"name": name, "interests": selected_interests}
          try:
-             os.makedirs('user_preferences', exist_ok=True)
+             os.makedirs("user_preferences", exist_ok=True)
              with open(f"user_preferences/preferences_{name}.json", "w") as f:
                  json.dump(preferences, f)
-             return f"Preferences saved for {name}!"
+             return "Preferences saved successfully!"
          except Exception as e:
-             logging.error(f"Error saving preferences: {e}")
-             return f"Error saving preferences: {e}"
-
-         save_button.click(save_preferences, inputs=[name_input, interests_checkboxes], outputs=[preferences_output])
-
-     with gr.Tab("Get News Summary"):
-         name_check = gr.Textbox(label="Enter your name to get summary")
-         get_summary_button = gr.Button("Get Summary")
-         summary_output = gr.Markdown(value="Waiting for summary...")
+             logging.error(f"Failed to save preferences: {e}")
+             return "Failed to save preferences."

-         get_summary_button.click(get_personalized_summary, inputs=[name_check], outputs=[summary_output])
+         save_button.click(save_preferences, inputs=[name_input, interests], outputs=save_status)
+
+     with gr.Tab("Get News Summary"):
+         name_input_summary = gr.Textbox(label="Your Name")
+         fetch_button = gr.Button("Get Summary")
+         summary_output = gr.Textbox(label="News Summary", lines=20)
+
+         fetch_button.click(generate_user_summary, inputs=[name_input_summary], outputs=summary_output)

  if __name__ == "__main__":
      demo.launch()
@@ -241,3 +179,4 @@ if __name__ == "__main__":



+
 
 
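A note on the rewritten `NewsCache`: eviction drops the oldest entry by relying on Python 3.7+ dicts preserving insertion order, and every access is guarded by a `threading.Lock`. A minimal standalone sketch of that FIFO behavior, using the class as committed (the three-entry capacity is illustrative only):

```python
import threading

class NewsCache:
    """Thread-safe FIFO cache, as committed in app.py."""
    def __init__(self, size):
        self.cache = {}  # dicts preserve insertion order (Python 3.7+)
        self.size = size
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return self.cache.get(key)

    def set(self, key, value):
        with self.lock:
            if len(self.cache) >= self.size:
                # next(iter(...)) yields the oldest inserted key
                oldest_key = next(iter(self.cache))
                del self.cache[oldest_key]
            self.cache[key] = value

cache = NewsCache(3)           # illustrative capacity
for i in range(4):
    cache.set(f"k{i}", i)      # the fourth insert evicts "k0"
assert cache.get("k0") is None and cache.get("k3") == 3
```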
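One behavioral change worth flagging: the old `parse_date` returned `None` for missing or unparsable dates and the caller skipped those entries, while the new `fetch_rss_news` assumes every entry exposes `published_parsed` and will raise `AttributeError` on feeds that omit it. A hedged sketch of a fallback helper (`safe_published` is hypothetical, not part of this commit):

```python
from datetime import datetime
import pytz

def safe_published(entry):
    """Hypothetical fallback: prefer published_parsed, then updated_parsed.

    feedparser entries behave like dicts, and both fields are
    time.struct_time values when present. Returns None when neither
    exists, so the caller can skip the entry instead of crashing.
    """
    parsed = entry.get("published_parsed") or entry.get("updated_parsed")
    if not parsed:
        return None
    return datetime(*parsed[:6], tzinfo=pytz.UTC)
```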
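For reference, the summarizer stays pinned to CPU (`device=-1`) while the length bounds tighten from 200/50 to 120/40 tokens and the old bullet formatting is dropped. A self-contained sketch of the new call as committed (the sample text is a stand-in; model weights download on first use):

```python
from transformers import pipeline

# device=-1 selects CPU inference, matching the committed setting.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

article_text = "The quick brown fox jumped over the lazy dog. " * 30  # stand-in article body
result = summarizer(article_text, max_length=120, min_length=40, truncation=True)
print(result[0]["summary_text"])
```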