acecalisto3 committed on
Commit
cce82a9
·
verified ·
1 Parent(s): d840901

Update 1app.py

Browse files
Files changed (1) hide show
  1. 1app.py +59 -70
1app.py CHANGED
@@ -56,7 +56,7 @@ class SmartWebScraper:
56
  self.text_generator = DialoGPTModel()
57
  self.lemmatizer = WordNetLemmatizer()
58
  self.stop_words = set(stopwords.words('english'))
59
-
60
  def process_query(self, query: str) -> Tuple[str, List[str]]:
61
  tokens = word_tokenize(query.lower())
62
  tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
@@ -111,90 +111,79 @@ class SmartWebScraper:
111
  return {"error": str(e)}
112
 
113
  def format_response(self, data: dict, query: str) -> str:
114
- if "error" in data:
115
- return f"I encountered an error while processing your request: {data['error']}"
116
-
117
- # Create a structured summary of the data
118
- summary = []
119
- query_lower = query.lower()
120
-
121
- # First, collect summary information
122
- if "images" in data:
123
- summary.append(f"Found {len(data['images'])} images")
124
- if "links" in data:
125
- summary.append(f"Found {len(data['links'])} links")
126
- if "text" in data:
127
- summary.append(f"Found {len(data['text'])} text blocks")
128
- if "prices" in data:
129
- summary.append(f"Found {len(data['prices'])} price mentions")
130
-
131
- # Handle specific query types
132
- if "how many" in query_lower:
133
- if "image" in query_lower and "images" in data:
134
- return f"There are {len(data['images'])} images on the webpage."
135
- elif "link" in query_lower and "links" in data:
136
- return f"There are {len(data['links'])} links on the webpage."
137
- elif "price" in query_lower and "prices" in data:
138
- return f"There are {len(data['prices'])} prices mentioned on the webpage."
139
- elif "text" in query_lower and "text" in data:
140
- return f"There are {len(data['text'])} text blocks on the webpage."
141
-
142
- if "show" in query_lower or "list" in query_lower:
143
- if "image" in query_lower and "images" in data:
144
- images = data['images'][:5] # Limit to 5 images
145
- return "Here are up to 5 images found:\n" + "\n".join([f"- {img['alt'] or 'No description'} ({img['src']})" for img in images])
146
-
147
- elif "link" in query_lower and "links" in data:
148
- links = data['links'][:5] # Limit to 5 links
149
- return "Here are up to 5 links found:\n" + "\n".join([f"- {link['text'] or 'No text'} ({link['href']})" for link in links])
150
-
151
- elif "text" in query_lower and "text" in data:
152
- texts = data['text'][:3] # Limit to 3 text blocks
153
- return "Here are up to 3 text blocks found:\n" + "\n".join([f"- {text[:100]}..." for text in texts])
154
-
155
- # If no specific handling matched, return general summary
156
- if summary:
157
- return "Here's what I found on the webpage:\n" + "\n".join(summary)
158
-
159
- return "I couldn't find any relevant information based on your query."
160
-
161
- elif "show" in query_lower or "list" in query_lower:
162
- if "image" in query_lower and "images" in data:
163
- return f"Images found:\n" + "\n".join([f"- {img.get('alt', 'No description')} ({img.get('src', 'No source')})" for img in data['images'][:5]])
164
- elif "link" in query_lower and "links" in data:
165
- return f"Links found:\n" + "\n".join([f"- {link.get('text', 'No text')} ({link.get('href', 'No URL')})" for link in data['links'][:5]])
166
-
167
- # If no specific pattern matches, create a general summary
168
- summary_text = "Here's what I found on the webpage:\n" + "\n".join(summary)
169
-
170
- if len(summary) == 0:
171
  return "I couldn't find any relevant information based on your query."
172
-
173
- return summary_text
174
 
175
- def create_interface():
176
- scraper = SmartWebScraper()
177
- def process_request(query: str, url: str) -> str:
178
  if not url:
179
  return "Please provide a URL to analyze."
180
  try:
181
  parsed_url = urlparse(url)
182
  if not all([parsed_url.scheme, parsed_url.netloc]):
183
  return "Please provide a valid URL (including http:// or https://)."
184
-
185
  # Add timeout to prevent hanging
186
- data = scraper.extract_data(url, query)
187
- response = scraper.format_response(data, query)
188
-
189
  # Validate response
190
  if not response or response.isspace():
191
  return "I couldn't generate a meaningful response based on the available data."
192
-
193
  return response
194
  except Exception as e:
195
  logging.error(f"Error processing request: {str(e)}")
196
  return f"An error occurred while processing your request: {str(e)}"
197
-
 
 
 
 
 
198
  with gr.Blocks() as demo:
199
  gr.Markdown("# Smart Web Scraper")
200
  gr.Markdown("Ask me anything about a webpage, and I'll try to find the information you need!")
@@ -217,7 +206,7 @@ def create_interface():
217
  - "List all forms"
218
  """)
219
  return demo
220
-
221
  if __name__ == "__main__":
222
  demo = create_interface() # Assign the returned Gradio interface to 'demo'
223
  demo.launch(debug=True)
 
56
  self.text_generator = DialoGPTModel()
57
  self.lemmatizer = WordNetLemmatizer()
58
  self.stop_words = set(stopwords.words('english'))
59
+
60
  def process_query(self, query: str) -> Tuple[str, List[str]]:
61
  tokens = word_tokenize(query.lower())
62
  tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
 
111
  return {"error": str(e)}
112
 
113
  def format_response(self, data: dict, query: str) -> str:
114
+ if "error" in data:
115
+ return f"I encountered an error while processing your request: {data['error']}"
116
+
117
+ # Create a structured summary of the data
118
+ summary = []
119
+ query_lower = query.lower()
120
+
121
+ # First, collect summary information
122
+ if "images" in data:
123
+ summary.append(f"Found {len(data['images'])} images")
124
+ if "links" in data:
125
+ summary.append(f"Found {len(data['links'])} links")
126
+ if "text" in data:
127
+ summary.append(f"Found {len(data['text'])} text blocks")
128
+ if "prices" in data:
129
+ summary.append(f"Found {len(data['prices'])} price mentions")
130
+
131
+ # Handle specific query types
132
+ if "how many" in query_lower:
133
+ if "image" in query_lower and "images" in data:
134
+ return f"There are {len(data['images'])} images on the webpage."
135
+ elif "link" in query_lower and "links" in data:
136
+ return f"There are {len(data['links'])} links on the webpage."
137
+ elif "price" in query_lower and "prices" in data:
138
+ return f"There are {len(data['prices'])} prices mentioned on the webpage."
139
+ elif "text" in query_lower and "text" in data:
140
+ return f"There are {len(data['text'])} text blocks on the webpage."
141
+
142
+ if "show" in query_lower or "list" in query_lower:
143
+ if "image" in query_lower and "images" in data:
144
+ images = data['images'][:5] # Limit to 5 images
145
+ return "Here are up to 5 images found:\n" + "\n".join([f"- {img['alt'] or 'No description'} ({img['src']})" for img in images])
146
+
147
+ elif "link" in query_lower and "links" in data:
148
+ links = data['links'][:5] # Limit to 5 links
149
+ return "Here are up to 5 links found:\n" + "\n".join([f"- {link['text'] or 'No text'} ({link['href']})" for link in links])
150
+
151
+ elif "text" in query_lower and "text" in data:
152
+ texts = data['text'][:3] # Limit to 3 text blocks
153
+ return "Here are up to 3 text blocks found:\n" + "\n".join([f"- {text[:100]}..." for text in texts])
154
+
155
+ # If no specific handling matched, return general summary
156
+ if summary:
157
+ return "Here's what I found on the webpage:\n" + "\n".join(summary)
158
+
 
 
 
 
 
 
 
 
 
 
 
 
159
  return "I couldn't find any relevant information based on your query."
 
 
160
 
161
+ def handle_query(self, query: str, url: str) -> str:
 
 
162
  if not url:
163
  return "Please provide a URL to analyze."
164
  try:
165
  parsed_url = urlparse(url)
166
  if not all([parsed_url.scheme, parsed_url.netloc]):
167
  return "Please provide a valid URL (including http:// or https://)."
168
+
169
  # Add timeout to prevent hanging
170
+ data = self.extract_data(url, query)
171
+ response = self.format_response(data, query)
172
+
173
  # Validate response
174
  if not response or response.isspace():
175
  return "I couldn't generate a meaningful response based on the available data."
176
+
177
  return response
178
  except Exception as e:
179
  logging.error(f"Error processing request: {str(e)}")
180
  return f"An error occurred while processing your request: {str(e)}"
181
+
182
+ def create_interface():
183
+ scraper = SmartWebScraper()
184
+ def process_request(query: str, url: str) -> str:
185
+ return scraper.handle_query(query, url)
186
+
187
  with gr.Blocks() as demo:
188
  gr.Markdown("# Smart Web Scraper")
189
  gr.Markdown("Ask me anything about a webpage, and I'll try to find the information you need!")
 
206
  - "List all forms"
207
  """)
208
  return demo
209
+
210
if __name__ == "__main__":
    # Build the Gradio interface; keep the result bound to the module-level
    # name `demo` (as before) and launch it with debug logging enabled.
    demo = create_interface()
    demo.launch(debug=True)