KingNish committed (verified)
Commit: 7de02ad
Parent: 413592b

Update app.py

Files changed (1):
  1. app.py  +120 -44
app.py CHANGED
@@ -10,6 +10,8 @@ import filetype
 import requests
 import os
 import mimetypes
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin
 
 # Constants
 CHUNK_SIZE = 32000
@@ -32,7 +34,6 @@ def clean_text(content):
     content = re.sub(r'\s+', ' ', content)
     return content
 
-
 def split_content(content, chunk_size=CHUNK_SIZE):
     """Splits content into chunks of a specified size."""
     chunks = []
@@ -92,7 +93,6 @@ def extract_text_from_pptx(pptx_data, clean=True):
     return text, len(text)
 
 def read_document(file_path, clean=True):
-
     with open(file_path, "rb") as f:
         file_content = f.read()
 
@@ -154,7 +154,17 @@ def read_document(file_path, clean=True):
             return extract_text_from_pptx(file_content, clean)
         except Exception as e:
             return f"Error reading PPTX: {e}", 0
-
+    elif mime == "text/html":  # Handle HTML content
+        try:
+            soup = BeautifulSoup(file_content, 'html.parser')
+            structured_data = {
+                "Texts": extract_texts(soup),
+                "Links": extract_links(soup, ""),
+                "Images": extract_images(soup, "")
+            }
+            return format_detailed_output(structured_data), 0
+        except Exception as e:
+            return f"Error parsing HTML content: {e}", 0
     else:
         try:
             content = file_content.decode('utf-8')
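Note on the new text/html branch above: read_document() passes an empty base URL to extract_links() and extract_images(), so relative hrefs and srcs in downloaded HTML come back unchanged, while extract_page_content() (added further down in this diff) passes the page URL and resolves them. A minimal illustration of the difference, using urljoin from the standard library on a made-up path:

    from urllib.parse import urljoin

    # With the page URL as base, a relative path resolves to an absolute URL.
    print(urljoin("https://example.com/docs/", "img/logo.png"))  # https://example.com/docs/img/logo.png

    # With the empty base URL used in the text/html branch, it stays relative.
    print(urljoin("", "img/logo.png"))  # img/logo.png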
@@ -165,43 +175,111 @@ def read_document(file_path, clean=True):
             return f"Error reading file: {e}", 0
 
 def download_and_process_file(url, clean=True):
-    """Downloads a file from a URL and returns the local file path."""
-    if not url.startswith("http://") and not url.startswith("https://"):
-        url = "http://" + url  # Prepend "http://" if not present
-
-    try:
-        response = requests.get(url, stream=True)
-        response.raise_for_status()  # Raise an exception for bad status codes
-
-        # Generate a safe and unique temporary filename
-        original_filename = os.path.basename(url)
-        # Remove invalid characters from filename
-        safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
-        temp_filename = f"{safe_filename}"
-
-        # Infer file extension from content type
-        content_type = response.headers['content-type']
-        ext = mimetypes.guess_extension(content_type)
-        if ext and not temp_filename.endswith(ext):  # Append extension if not already present
-            temp_filename += ext
-
-        with open(temp_filename, 'wb') as f:
-            for chunk in response.iter_content(chunk_size=8192000):
-                f.write(chunk)
-
-        # Check if it's an image type
-        kind = filetype.guess(temp_filename)
-        if kind and kind.mime.startswith('image/'):
-            return f"![]({url})", 0  # Return markdown image syntax if it's an image
-        else:
-            return read_document(temp_filename, clean)  # Otherwise, process as a document
-
-    except requests.exceptions.MissingSchema:
-        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
-    except requests.exceptions.ConnectionError:
-        return "Error: Could not connect to the server. Please check your internet connection.", 0
-    except requests.exceptions.RequestException as e:
-        return f"Error downloading file: {e}", 0
+    """Downloads a file from a URL and returns the local file path."""
+    if not url.startswith("http://") and not url.startswith("https://"):
+        url = "http://" + url  # Prepend "http://" if not present
+
+    try:
+        response = requests.get(url, stream=True, timeout=10)
+        response.raise_for_status()  # Raise an exception for bad status codes
+
+        # Generate a safe and unique temporary filename
+        original_filename = os.path.basename(url)
+        # Remove invalid characters from filename
+        safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
+        temp_filename = f"{safe_filename}"
+
+        # Infer file extension from content type
+        content_type = response.headers['content-type']
+        ext = mimetypes.guess_extension(content_type)
+        if ext and not temp_filename.endswith(ext):  # Append extension if not already present
+            temp_filename += ext
+
+        with open(temp_filename, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=8192000):
+                f.write(chunk)
+
+        # Check if it's an image type
+        kind = filetype.guess(temp_filename)
+        if kind and kind.mime.startswith('image/'):
+            return f"![]({url})", 0  # Return markdown image syntax if it's an image
+        else:
+            return read_document(temp_filename, clean)  # Otherwise, process as a document
+
+    except requests.exceptions.MissingSchema:
+        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
+    except requests.exceptions.ConnectionError:
+        return "Error: Could not connect to the server. Please check your internet connection.", 0
+    except requests.exceptions.Timeout:
+        return "Error: Connection timed out while trying to fetch the URL.", 0
+    except requests.exceptions.RequestException as e:
+        return f"Error downloading file: {e}", 0
+
+# --- Web Page Content Extraction Functions (from previous code) ---
+
+def extract_texts(soup):
+    """Extracts all text content from the soup."""
+    return [text for text in soup.stripped_strings]
+
+def extract_links(soup, base_url):
+    """Extracts all valid links from the soup."""
+    links = []
+    for link in soup.find_all('a', href=True):
+        href = link['href']
+        # Use urljoin to create an absolute URL
+        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
+        link_text = link.get_text(strip=True) or "No Text"
+        links.append({"Text": link_text, "URL": full_url})
+    return links
+
+def extract_images(soup, base_url):
+    """Extracts all valid image URLs and their alt text from the soup."""
+    images = []
+    for img in soup.find_all('img', src=True):
+        img_url = img['src']
+        # Use urljoin to create an absolute URL
+        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
+        alt_text = img.get('alt', 'No Alt Text')
+        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
+    return images
+
+def fetch_page_content(url):
+    """Fetches the content of the page at the given URL."""
+    try:
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.exceptions.RequestException as e:
+        return f"Error fetching the URL: {e}"
+
+def format_detailed_output(structured_data):
+    """Formats the structured data into a Markdown string."""
+    result = "### Structured Page Content\n\n"
+    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
+    result += "**Links:**\n"
+    if structured_data["Links"]:
+        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
+    else:
+        result += "No links found.\n"
+    result += "**Images:**\n"
+    if structured_data["Images"]:
+        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
+    else:
+        result += "No images found.\n"
+    return result
+
+def extract_page_content(url):
+    """Extracts and formats the content of the page at the given URL."""
+    page_content = fetch_page_content(url)
+    if "Error" in page_content:
+        return page_content
+    soup = BeautifulSoup(page_content, 'html.parser')
+    structured_data = {
+        "Texts": extract_texts(soup),
+        "Links": extract_links(soup, url),  # Pass the base URL
+        "Images": extract_images(soup, url)  # Pass the base URL
+    }
+    return format_detailed_output(structured_data)
 
 # --- Gradio Interface ---
 
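The extraction helpers added above are thin wrappers over BeautifulSoup. As a rough standalone sketch of what they compute, here is the same logic applied to a made-up snippet and base URL (illustrative values, not taken from the app):

    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    html = '<p>Hello</p><a href="/about">About</a><img src="logo.png" alt="Logo">'
    soup = BeautifulSoup(html, 'html.parser')
    base = "https://example.com/"

    # extract_texts: every stripped text node on the page.
    print(list(soup.stripped_strings))  # ['Hello', 'About']

    # extract_links: link text plus href, resolved against the base URL when relative.
    print([(a.get_text(strip=True) or "No Text", urljoin(base, a['href']))
           for a in soup.find_all('a', href=True)])

    # extract_images: alt text plus src, resolved against the base URL when relative.
    print([(img.get('alt', 'No Alt Text'), urljoin(base, img['src']))
           for img in soup.find_all('img', src=True)])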
 
@@ -212,13 +290,11 @@ iface = gr.Interface(
         gr.Checkbox(label="Clean Text", value=True),
     ],
     outputs=[
-        gr.Textbox(label="Document Content/Image Markdown"),
+        gr.Textbox(label="Document Content/Image Markdown/Web Page Content"),
         gr.Number(label="Document Length (characters)"),
     ],
-    title="Enhanced File Processor for Hugging Face Chat Tools",
-    description="Enter the URL of site and extract its content"
-                "This tool is designed for use with Hugging Face Chat Tools: "
-                "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
+    title="Enhanced File and Web Page Processor for Hugging Face Chat Tools",
+    description="Enter the URL of an image, video, document, or web page. The tool will handle it accordingly: images will be displayed as Markdown, documents will have their text extracted, and web pages will have their content structured and displayed. This tool is designed for use with Hugging Face Chat Tools. \n [https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
     concurrency_limit=None
 )
 
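For a quick local check without the Gradio UI, the functions can also be called directly. This is a hypothetical smoke test, assuming app.py is importable as a module and does not launch the interface at import time (no launch call is visible in this diff); the URLs are placeholders:

    from app import download_and_process_file, extract_page_content

    # An image URL is returned as Markdown image syntax with length 0.
    print(download_and_process_file("https://example.com/picture.png"))

    # An HTML page is downloaded, detected as text/html, and should be routed
    # through the new BeautifulSoup branch of read_document().
    content, length = download_and_process_file("https://example.com")
    print(length, content[:300])

    # Direct page extraction resolves relative links and images against the page URL.
    print(extract_page_content("https://example.com")[:300])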