KingNish committed on
Commit
413592b
·
verified ·
1 Parent(s): b79a457

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -11
app.py CHANGED
@@ -1,10 +1,170 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
2
  import requests
3
  import os
4
- import re
5
  import mimetypes
6
 
7
- def download_file(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  """Downloads a file from a URL and returns the local file path."""
9
  if not url.startswith("http://") and not url.startswith("https://"):
10
  url = "http://" + url # Prepend "http://" if not present
@@ -28,20 +188,35 @@ def download_file(url):
28
  with open(temp_filename, 'wb') as f:
29
  for chunk in response.iter_content(chunk_size=8192000):
30
  f.write(chunk)
31
- return temp_filename
 
 
 
 
 
 
 
32
  except requests.exceptions.MissingSchema:
33
- return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid."
34
  except requests.exceptions.ConnectionError:
35
- return "Error: Could not connect to the server. Please check your internet connection."
36
  except requests.exceptions.RequestException as e:
37
- return f"Error downloading file: {e}"
 
 
38
 
39
  iface = gr.Interface(
40
- fn=download_file,
41
- inputs=gr.Textbox(lines=1, placeholder="Enter URL of the file"),
42
- outputs=gr.File(),
43
- title="File Downloader for Hugging Face Chat Tools",
44
- description="Enter the URL of an image, video, document, etc. to download it. "
 
 
 
 
 
 
45
  "This tool is designed for use with Hugging Face Chat Tools: "
46
  "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
47
  concurrency_limit=None
 
1
+ import PyPDF2
2
+ from openpyxl import load_workbook
3
+ from pptx import Presentation
4
  import gradio as gr
5
+ import io
6
+ import re
7
+ import zipfile
8
+ import xml.etree.ElementTree as ET
9
+ import filetype
10
  import requests
11
  import os
 
12
  import mimetypes
13
 
14
+ # Constants
15
+ CHUNK_SIZE = 32000
16
+
17
+ # --- Utility Functions ---
18
+
19
def xml2text(xml):
    """Return the text of every element in an XML document, space-separated.

    Args:
        xml: The XML document to parse, as ``str`` or ``bytes``.

    Returns:
        The ``.text`` of each element in document order, each followed by a
        single space. Elements without text contribute nothing, and tail text
        (text that follows a closing tag) is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    # NOTE(security): ElementTree is not hardened against maliciously
    # constructed XML (e.g. entity-expansion bombs). The callers in this file
    # feed it members of downloaded archives, so treat the input as untrusted.
    root = ET.fromstring(xml)
    # A single join avoids repeatedly re-copying a growing string.
    return ''.join(
        f"{node.text} " for node in root.iter() if node.text is not None
    )
26
+
27
def clean_text(content):
    """Collapse every run of whitespace in ``content`` into a single space.

    Newlines, carriage returns and tabs are whitespace, so they are replaced
    along with ordinary spaces. Leading and trailing whitespace is collapsed
    to one space, not stripped.

    Args:
        content: The text to normalise.

    Returns:
        The normalised text.
    """
    # ``\s+`` already matches '\n', '\r' and '\t', so one pass is enough.
    return re.sub(r'\s+', ' ', content)
34
+
35
+
36
def split_content(content, chunk_size=CHUNK_SIZE):
    """Split ``content`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        content: A sliceable sequence (for example a string).
        chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices in their original order. The last slice may be
        shorter than ``chunk_size``; empty ``content`` gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop all content,
    # and a zero step fails with an unrelated-looking range() error.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]
42
+
43
+ # --- Document Reading Functions ---
44
+
45
def extract_text_from_docx(docx_data, clean=True):
    """Extract the text of a DOCX file held in memory.

    Text is gathered in this order: every ``word/header*.xml`` part, then
    ``word/document.xml``, then every ``word/footer*.xml`` part. Header and
    footer parts are visited in the order the archive lists them.

    Args:
        docx_data: Raw bytes of the .docx file (a ZIP archive).
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Raw strings with an escaped dot, matched against the whole member name,
    # so only the intended parts are selected.
    header_pattern = r'word/header[0-9]*\.xml'
    footer_pattern = r'word/footer[0-9]*\.xml'

    parts = []
    # The context manager closes the archive even if a read or parse fails.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        filelist = zipf.namelist()

        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if re.fullmatch(header_pattern, fname)
        )
        parts.append(xml2text(zipf.read('word/document.xml')))
        parts.extend(
            xml2text(zipf.read(fname))
            for fname in filelist
            if re.fullmatch(footer_pattern, fname)
        )

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
69
+
70
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract the text of a PPTX file held in memory.

    Text is gathered from the speaker-notes parts
    (``ppt/notesSlides/notesSlide*.xml``) first, then from the slide parts
    (``ppt/slides/slide*.xml``). Within each group, parts are visited in the
    order the archive lists them.

    Args:
        pptx_data: Raw bytes of the .pptx file (a ZIP archive).
        clean: When true, pass the result through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a ZIP archive.
    """
    # Raw strings with an escaped dot, matched against the whole member name,
    # so only the intended parts are selected.
    notes_pattern = r'ppt/notesSlides/notesSlide[0-9]*\.xml'
    slide_pattern = r'ppt/slides/slide[0-9]*\.xml'

    parts = []
    # The context manager closes the archive even if a read or parse fails.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()

        # Speaker notes first, then slide content (shapes and text boxes).
        for pattern in (notes_pattern, slide_pattern):
            parts.extend(
                xml2text(zipf.read(fname))
                for fname in filelist
                if re.fullmatch(pattern, fname)
            )

    text = ''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)
93
+
94
def read_document(file_path, clean=True):
    """Read a local file and return its text content and character count.

    The file type is detected from the file's bytes with ``filetype.guess``.
    PDF, XLSX, DOCX and PPTX each have a dedicated extractor; anything else,
    including content ``filetype`` does not recognise, is decoded as UTF-8.

    Args:
        file_path: Path of the file to read.
        clean: When true, pass the extracted text through ``clean_text``.

    Returns:
        A ``(content, length)`` tuple. On success ``length`` is
        ``len(content)``. If extraction fails, ``content`` is an
        ``"Error reading ...: <reason>"`` message and ``length`` is 0.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    # An unrecognised type falls through to the UTF-8 branch at the bottom.
    mime = "text" if kind is None else kind.mime

    def finish(content):
        """Apply optional cleaning and pair the text with its length."""
        if clean:
            content = clean_text(content)
        # Report the real character count. len(repr(...)) also counted the
        # surrounding quotes and escape sequences, and disagreed with the
        # DOCX/PPTX extractors, which return len(text).
        return content, len(content)

    if mime == "application/pdf":
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ''`` is a defensive guard in case a page yields no text.
            content = ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )
            return finish(content)
        except Exception as e:
            return f"Error reading PDF: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            wb = load_workbook(io.BytesIO(file_content))
            cells = []
            for sheet in wb.worksheets:
                for row in sheet.rows:
                    for cell in row:
                        if cell.value is not None:
                            cells.append(f"{cell.value} ")
            return finish(''.join(cells))
        except Exception as e:
            return f"Error reading XLSX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0

    # Everything else is decoded as UTF-8. Only the wording of the error
    # message depends on the detected type.
    # NOTE(review): filetype detects by magic bytes, so "text/plain" and
    # "text/csv" are presumably never reported - confirm before removing.
    error_labels = {"text/plain": "TXT file", "text/csv": "CSV file"}
    label = error_labels.get(mime, "file")
    try:
        return finish(file_content.decode('utf-8'))
    except Exception as e:
        return f"Error reading {label}: {e}", 0
166
+
167
+ def download_and_process_file(url, clean=True):
168
  """Downloads a file from a URL and returns the local file path."""
169
  if not url.startswith("http://") and not url.startswith("https://"):
170
  url = "http://" + url # Prepend "http://" if not present
 
188
  with open(temp_filename, 'wb') as f:
189
  for chunk in response.iter_content(chunk_size=8192000):
190
  f.write(chunk)
191
+
192
+ # Check if it's an image type
193
+ kind = filetype.guess(temp_filename)
194
+ if kind and kind.mime.startswith('image/'):
195
+ return f"![]({url})", 0 # Return markdown image syntax if it's an image
196
+ else:
197
+ return read_document(temp_filename, clean) # Otherwise, process as a document
198
+
199
  except requests.exceptions.MissingSchema:
200
+ return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
201
  except requests.exceptions.ConnectionError:
202
+ return "Error: Could not connect to the server. Please check your internet connection.", 0
203
  except requests.exceptions.RequestException as e:
204
+ return f"Error downloading file: {e}", 0
205
+
206
+ # --- Gradio Interface ---
207
 
208
  iface = gr.Interface(
209
+ fn=download_and_process_file,
210
+ inputs=[
211
+ gr.Textbox(lines=1, placeholder="Enter URL of the file"),
212
+ gr.Checkbox(label="Clean Text", value=True),
213
+ ],
214
+ outputs=[
215
+ gr.Textbox(label="Document Content/Image Markdown"),
216
+ gr.Number(label="Document Length (characters)"),
217
+ ],
218
+ title="Enhanced File Processor for Hugging Face Chat Tools",
219
+ description="Enter the URL of site and extract its content"
220
  "This tool is designed for use with Hugging Face Chat Tools: "
221
  "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)",
222
  concurrency_limit=None