KingNish committed · verified
Commit a74d94b · 1 Parent(s): 3e87e84

Update app.py

Files changed (1): app.py (+135 −94)

app.py CHANGED
@@ -10,16 +10,13 @@ import xml.etree.ElementTree as ET
 
 # Constants
 CHUNK_SIZE = 32000
-SYSTEM_PROMPT = """
-You are a helpful and informative assistant that can answer questions based on the content of documents.
-You will receive the content of a document and a question about it.
-Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-"""
 
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 
 def xml2text(xml):
     """Extracts text from XML data."""
     text = u''
@@ -28,37 +25,54 @@ def xml2text(xml):
         text += child.text + " " if child.text is not None else ''
     return text
 
-def extract_text_from_docx(docx_data, strip_content):
-    """Extracts text from a DOCX file."""
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
     filelist = zipf.namelist()
 
     for fname in filelist:
-        if re.match('word/header[0-9]*.xml', fname):
-            text += xml2text(zipf.read(fname))
-        elif re.match('word/footer[0-9]*.xml', fname):
             text += xml2text(zipf.read(fname))
-
-    text += xml2text(zipf.read('word/document.xml'))
-    zipf.close()
 
-    if strip_content:
-        text = strip_text(text)
-
-    return f"{text}\n\n**Document Length:** {len(text)} characters"
 
 
-def strip_text(text):
-    """Strips unnecessary characters from text."""
-    content = text.replace('\n', ' ')
-    content = content.replace('\r', ' ')
-    content = content.replace('\t', ' ')
-    content = content.replace(' ', '')
-    return content.strip()
 
-def read_document(file, strip_content):
-    """Reads the content of a document based on its file type."""
     file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
 
@@ -71,11 +85,11 @@ def read_document(file, strip_content):
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading PDF: {e}"
 
     elif file_extension == 'xlsx':
         try:
@@ -84,13 +98,13 @@
             for sheet in wb.worksheets:
                 for row in sheet.rows:
                     for cell in row:
-                        if cell.value is not None:
                             content += str(cell.value) + ' '
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading XLSX: {e}"
 
     elif file_extension == 'pptx':
         try:
@@ -100,74 +114,90 @@
             for shape in slide.shapes:
                 if hasattr(shape, "text"):
                     content += shape.text + ' '
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading PPTX: {e}"
 
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
-            return extract_text_from_docx(file_content, strip_content)
         except Exception as e:
-            return f"Error reading DOC/DOCX: {e}"
 
     else:
         try:
-            content = file_content.decode('utf-8')
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading file: {e}"
 
 
-def split_content(content):
-    """Splits content into chunks for processing."""
-    chunks = []
-    for i in range(0, len(content), CHUNK_SIZE):
-        chunks.append(content[i:i + CHUNK_SIZE])
-    return chunks
 
 
-def chat_document(file, question, strip_content):
-    """Handles chat with a document using Mistral."""
-    content = str(read_document(file, strip_content))
 
-    if len(content) > CHUNK_SIZE:
-        content = content[:CHUNK_SIZE]
 
-    message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
 Document Content: {content}
 Question: {question}
 Answer:"""
 
-    stream = client.text_generation(message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        if not response.token.text == "</s>":
-            output += response.token.text
-            yield output
 
 
-def chat_document_v2(file, question, strip_content):
-    """Handles chat with a document using Mistral and chunk-based approach."""
-    content = str(read_document(file, strip_content))
     chunks = split_content(content)
-
     all_answers = []
     for chunk in chunks:
-        message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
-Document Content: {chunk[:CHUNK_SIZE]}
 Question: {question}
 Answer:"""
 
-        stream = client.text_generation(message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-        output = ""
-        for response in stream:
-            if not response.token.text == "</s>":
-                output += response.token.text
-        all_answers.append(output)
 
     # Summarize all answers using Mistral
     summary_prompt = """
@@ -177,45 +207,56 @@ def chat_document_v2(file, question, strip_content):
 
     Answers:
     """
-
     all_answers_str = "\n".join(all_answers)
-    print(all_answers_str)
     summary_message = f"""[INST] [SYSTEM] {summary_prompt}
-{all_answers_str[:30000]}
 Summary:"""
-
-    stream = client.text_generation(summary_message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        if not response.token.text == "</s>":
-            output += response.token.text
-            yield output
 
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
-                inputs=[gr.File(label="Upload a Document"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Textbox(label="Document Content"),
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
             )
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
            )
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
 
-demo.launch()
 
 
 # Constants
 CHUNK_SIZE = 32000
+MAX_NEW_TOKENS = 4096
 
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 
+# --- Utility Functions ---
+
 def xml2text(xml):
     """Extracts text from XML data."""
     text = u''
 
         text += child.text + " " if child.text is not None else ''
     return text
 
+def clean_text(content):
+    """Cleans and compresses whitespace in text content."""
+    content = content.replace('\n', ' ')
+    content = content.replace('\r', ' ')
+    content = content.replace('\t', ' ')
+    content = content.replace('  ', ' ')  # Replace double spaces with single
+    content = content.strip()
+    return content
+
+
+def split_content(content, chunk_size=CHUNK_SIZE):
+    """Splits content into chunks of a specified size."""
+    chunks = []
+    for i in range(0, len(content), chunk_size):
+        chunks.append(content[i:i + chunk_size])
+    return chunks
+
+# --- Document Reading Functions ---
+
+def extract_text_from_docx(docx_data, clean=True):
+    """Extracts text from DOCX files."""
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
+
     filelist = zipf.namelist()
 
+    header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
+        if re.match(header_xmls, fname):
             text += xml2text(zipf.read(fname))
 
+    doc_xml = 'word/document.xml'
+    text += xml2text(zipf.read(doc_xml))
+
+    footer_xmls = 'word/footer[0-9]*.xml'
+    for fname in filelist:
+        if re.match(footer_xmls, fname):
+            text += xml2text(zipf.read(fname))
 
+    zipf.close()
+    if clean:
+        text = clean_text(text)
+    return text, len(text)
 
+def read_document(file, clean=True):
+    """Reads content from various document formats."""
     file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
 
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading PDF: {e}", 0
 
     elif file_extension == 'xlsx':
         try:
 
             for sheet in wb.worksheets:
                 for row in sheet.rows:
                     for cell in row:
+                        if cell.value is not None:
                             content += str(cell.value) + ' '
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading XLSX: {e}", 0
 
     elif file_extension == 'pptx':
         try:
 
             for shape in slide.shapes:
                 if hasattr(shape, "text"):
                     content += shape.text + ' '
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading PPTX: {e}", 0
 
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
+            return extract_text_from_docx(file_content, clean)
         except Exception as e:
+            return f"Error reading DOC/DOCX: {e}", 0
 
     else:
         try:
+            content = file_content.decode('utf-8')
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading file: {e}", 0
 
 
+# --- Chat Functions ---
+
+def generate_mistral_response(message):
+    """Generates a response from the Mistral API."""
+    stream = client.text_generation(
+        message,
+        max_new_tokens=MAX_NEW_TOKENS,
+        stream=True,
+        details=True,
+        return_full_text=False
+    )
+    output = ""
+    for response in stream:
+        if not response.token.text == "</s>":
+            output += response.token.text
+            yield output
 
 
+def chat_document(file, question, clean=True):
+    """Chats with a document using a single Mistral API call."""
+    content, length = read_document(file, clean)
+    if length > CHUNK_SIZE:
+        content = content[:CHUNK_SIZE]  # Limit to max chunk size
 
+    system_prompt = """
+    You are a helpful and informative assistant that can answer questions based on the content of documents.
+    You will receive the content of a document and a question about it.
+    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+    """
 
+    message = f"""[INST] [SYSTEM] {system_prompt}
 Document Content: {content}
 Question: {question}
 Answer:"""
 
+    yield from generate_mistral_response(message)
 
 
 
+def chat_document_v2(file, question, clean=True):
+    """Chats with a document using chunk-based Mistral API calls and summarizes the answers."""
+    content, length = read_document(file, clean)
     chunks = split_content(content)
+
+    system_prompt = """
+    You are a helpful and informative assistant that can answer questions based on the content of documents.
+    You will receive the content of a document and a question about it.
+    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+    """
+
     all_answers = []
     for chunk in chunks:
+        message = f"""[INST] [SYSTEM] {system_prompt}
+Document Content: {chunk[:CHUNK_SIZE]}
 Question: {question}
 Answer:"""
 
+        response = ""
+        for stream_response in generate_mistral_response(message):
+            response = stream_response  # Update with latest response
+        all_answers.append(response)
 
     # Summarize all answers using Mistral
     summary_prompt = """
 
     Answers:
     """
+
     all_answers_str = "\n".join(all_answers)
     summary_message = f"""[INST] [SYSTEM] {summary_prompt}
+{all_answers_str[:30000]}
 Summary:"""
+
+    yield from generate_mistral_response(summary_message)
+
+
+# --- Gradio Interface ---
 
 
 
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Checkbox(label="Clean Text", value=True),
+                ],
+                outputs=[
+                    gr.Textbox(label="Document Content"),
+                    gr.Number(label="Document Length (characters)"),
+                ],
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
             )
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Textbox(label="Question"),
+                    gr.Checkbox(label="Clean and Compress Text", value=True),
+                ],
+                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
             )
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Textbox(label="Question"),
+                    gr.Checkbox(label="Clean Text", value=True),
+                ],
+                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
 
+demo.launch()
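
For reference, a minimal sketch of how the refactored functions from this commit might be exercised outside Gradio. It is not part of the commit: the `Upload` wrapper class and the `example.txt` path are illustrative assumptions, `read_document` is assumed to open `file.name` itself in its unshown prelude, and the chat call requires working Hugging Face Inference API access.

```python
# Minimal usage sketch (assumptions: app.py is importable, its unshown prelude
# reads file.name into file_content, and HF Inference API credentials are set up).
from app import read_document, chat_document

class Upload:
    """Hypothetical stand-in for the object gr.File passes in; only .name is used."""
    def __init__(self, path):
        self.name = path

doc = Upload("example.txt")

# read_document now returns (content, length) instead of a single string.
content, length = read_document(doc, clean=True)
print(f"Read {length} characters")

# chat_document is a generator; each yield is the answer streamed so far.
answer = ""
for partial in chat_document(doc, "What is this document about?", clean=True):
    answer = partial
print(answer)
```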