Spaces: Runtime error
Update 1app.py
1app.py CHANGED
@@ -56,7 +56,7 @@ class SmartWebScraper:
         self.text_generator = DialoGPTModel()
         self.lemmatizer = WordNetLemmatizer()
         self.stop_words = set(stopwords.words('english'))
-
+
     def process_query(self, query: str) -> Tuple[str, List[str]]:
         tokens = word_tokenize(query.lower())
         tokens = [self.lemmatizer.lemmatize(word) for word in tokens if word not in self.stop_words]
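
For context, process_query (unchanged here) only lowercases, tokenizes, lemmatizes and strips English stopwords before any matching happens. A minimal standalone sketch of that preprocessing, assuming the NLTK punkt, wordnet and stopwords resources are already downloaded; the sample query is made up:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Illustrative query, not taken from the app
query = "How many images are on the page?"

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

tokens = word_tokenize(query.lower())
tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
print(tokens)  # roughly ['many', 'image', 'page', '?'] with NLTK's default English stopword list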
@@ -111,90 +111,79 @@ class SmartWebScraper:
             return {"error": str(e)}

     def format_response(self, data: dict, query: str) -> str:
-            return "I couldn't find any relevant information based on your query."
-
-        elif "show" in query_lower or "list" in query_lower:
-            if "image" in query_lower and "images" in data:
-                return f"Images found:\n" + "\n".join([f"- {img.get('alt', 'No description')} ({img.get('src', 'No source')})" for img in data['images'][:5]])
-            elif "link" in query_lower and "links" in data:
-                return f"Links found:\n" + "\n".join([f"- {link.get('text', 'No text')} ({link.get('href', 'No URL')})" for link in data['links'][:5]])
-
-        # If no specific pattern matches, create a general summary
-        summary_text = "Here's what I found on the webpage:\n" + "\n".join(summary)
-
-        if len(summary) == 0:
-            return "I couldn't find any relevant information based on your query."
-
-        return summary_text
+        if "error" in data:
+            return f"I encountered an error while processing your request: {data['error']}"
+
+        # Create a structured summary of the data
+        summary = []
+        query_lower = query.lower()
+
+        # First, collect summary information
+        if "images" in data:
+            summary.append(f"Found {len(data['images'])} images")
+        if "links" in data:
+            summary.append(f"Found {len(data['links'])} links")
+        if "text" in data:
+            summary.append(f"Found {len(data['text'])} text blocks")
+        if "prices" in data:
+            summary.append(f"Found {len(data['prices'])} price mentions")
+
+        # Handle specific query types
+        if "how many" in query_lower:
+            if "image" in query_lower and "images" in data:
+                return f"There are {len(data['images'])} images on the webpage."
+            elif "link" in query_lower and "links" in data:
+                return f"There are {len(data['links'])} links on the webpage."
+            elif "price" in query_lower and "prices" in data:
+                return f"There are {len(data['prices'])} prices mentioned on the webpage."
+            elif "text" in query_lower and "text" in data:
+                return f"There are {len(data['text'])} text blocks on the webpage."
+
+        if "show" in query_lower or "list" in query_lower:
+            if "image" in query_lower and "images" in data:
+                images = data['images'][:5]  # Limit to 5 images
+                return "Here are up to 5 images found:\n" + "\n".join([f"- {img['alt'] or 'No description'} ({img['src']})" for img in images])
+
+            elif "link" in query_lower and "links" in data:
+                links = data['links'][:5]  # Limit to 5 links
+                return "Here are up to 5 links found:\n" + "\n".join([f"- {link['text'] or 'No text'} ({link['href']})" for link in links])
+
+            elif "text" in query_lower and "text" in data:
+                texts = data['text'][:3]  # Limit to 3 text blocks
+                return "Here are up to 3 text blocks found:\n" + "\n".join([f"- {text[:100]}..." for text in texts])
+
+        # If no specific handling matched, return general summary
+        if summary:
+            return "Here's what I found on the webpage:\n" + "\n".join(summary)
+
         return "I couldn't find any relevant information based on your query."

-    def
-    scraper = SmartWebScraper()
-    def process_request(query: str, url: str) -> str:
+    def handle_query(self, query: str, url: str) -> str:
         if not url:
             return "Please provide a URL to analyze."
         try:
             parsed_url = urlparse(url)
             if not all([parsed_url.scheme, parsed_url.netloc]):
                 return "Please provide a valid URL (including http:// or https://)."
-
+
             # Add timeout to prevent hanging
-            data =
-            response =
-
+            data = self.extract_data(url, query)
+            response = self.format_response(data, query)
+
             # Validate response
             if not response or response.isspace():
                 return "I couldn't generate a meaningful response based on the available data."
-
+
             return response
         except Exception as e:
             logging.error(f"Error processing request: {str(e)}")
             return f"An error occurred while processing your request: {str(e)}"
-
+
+def create_interface():
+    scraper = SmartWebScraper()
+    def process_request(query: str, url: str) -> str:
+        return scraper.handle_query(query, url)
+
     with gr.Blocks() as demo:
         gr.Markdown("# Smart Web Scraper")
         gr.Markdown("Ask me anything about a webpage, and I'll try to find the information you need!")
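
The new format_response can be sanity-checked in isolation by handing it a dict in the shape it expects. The snippet below is only a sketch: the data dict is hand-made, importlib is used because the module name 1app starts with a digit, and constructing SmartWebScraper assumes the app's dependencies (the DialoGPT model and the NLTK data) are available locally:

import importlib

app = importlib.import_module("1app")  # "import 1app" would be a syntax error
scraper = app.SmartWebScraper()

# Hand-made sample data; in the app, this dict comes from the scraper's extraction step.
data = {
    "images": [{"alt": "Logo", "src": "https://example.com/logo.png"}],
    "links": [{"text": "Docs", "href": "https://example.com/docs"}],
    "text": ["Welcome to the example page."],
}

print(scraper.format_response(data, "How many images are there?"))
# -> There are 1 images on the webpage.
print(scraper.format_response(data, "Show me the links"))
# -> "Here are up to 5 links found:" followed by "- Docs (https://example.com/docs)"
print(scraper.format_response({"error": "timeout"}, "anything"))
# -> I encountered an error while processing your request: timeout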
@@ -217,7 +206,7 @@ def create_interface():
         - "List all forms"
         """)
     return demo
-
+
 if __name__ == "__main__":
     demo = create_interface()  # Assign the returned Gradio interface to 'demo'
     demo.launch(debug=True)
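
handle_query also calls self.extract_data(url, query), which is outside this diff. Judging purely from how format_response consumes its result, that method returns a dict with optional "images", "links", "text" and "prices" keys, or an "error" key on failure. The stub below only illustrates that inferred shape; it is not the implementation in 1app.py:

from typing import Any, Dict

def extract_data_stub(url: str, query: str) -> Dict[str, Any]:
    """Illustrates the shape format_response expects; not the real extract_data.

    On failure, the real method appears to return {"error": "<message>"} instead.
    """
    return {
        "images": [{"alt": "Logo", "src": "https://example.com/logo.png"}],  # list of dicts with alt/src
        "links": [{"text": "Docs", "href": "https://example.com/docs"}],     # list of dicts with text/href
        "text": ["First text block...", "Second text block..."],             # list of plain strings
        "prices": ["$9.99", "$19.99"],                                       # list of price mentions
    }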
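
The middle of create_interface (the input components and the click handler) is unchanged and therefore not shown in this diff. Purely as a sketch of how the visible pieces typically fit together in Gradio, with invented component names and labels; only the two Markdown headers, process_request and return demo come from the diff:

import gradio as gr

def create_interface_sketch(process_request):
    # Hypothetical wiring; not the actual body of create_interface in 1app.py.
    with gr.Blocks() as demo:
        gr.Markdown("# Smart Web Scraper")
        gr.Markdown("Ask me anything about a webpage, and I'll try to find the information you need!")
        query_box = gr.Textbox(label="Your question")   # invented component
        url_box = gr.Textbox(label="URL to analyze")    # invented component
        answer_box = gr.Textbox(label="Answer")         # invented component
        ask = gr.Button("Ask")                          # invented component
        # process_request takes (query, url) in that order, matching the inner function in the diff
        ask.click(fn=process_request, inputs=[query_box, url_box], outputs=answer_box)
    return demo

Launching the returned Blocks is then just demo.launch(debug=True), as the __main__ block above does.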