KingNish committed on
Commit
a1654f3
·
verified ·
1 Parent(s): 0259009

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -4
app.py CHANGED
@@ -3,8 +3,40 @@ from openpyxl import load_workbook
3
  from pptx import Presentation
4
  import gradio as gr
5
  import io
6
- from docx2python import docx2python
7
  from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  # Initialize the Mistral chat model
10
  client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
@@ -52,9 +84,7 @@ def read_document(file):
52
 
53
  elif file_extension == 'doc' or file_extension == 'docx':
54
  try:
55
- # extract docx content
56
- with docx2python(io.BytesIO(file_content)) as content:
57
- return content.text
58
  except Exception as e:
59
  return f"Error reading DOC/DOCX: {e}"
60
 
 
3
  from pptx import Presentation
4
  import gradio as gr
5
  import io
 
6
  from huggingface_hub import InferenceClient
7
+ import re
8
+ import zipfile
9
+ import xml.etree.ElementTree as ET
10
+
11
def xml2text(xml):
    """Return the concatenated element text of an XML document.

    The document is parsed and every element is visited in document
    order. For each element whose ``text`` is not ``None``, that text
    followed by a single space is appended to the result. Tail text
    (text that follows a closing tag) is not included.

    Args:
        xml: The XML document as ``str`` or ``bytes``.

    Returns:
        The collected text; an empty string when no element carries text.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    # NOTE(review): the input presumably originates from user-supplied
    # documents; see the XML security notes in the Python documentation
    # before exposing this to untrusted data -- confirm against callers.
    root = ET.fromstring(xml)
    # Build the result in one pass instead of repeated ``+=`` on a string.
    return "".join(
        f"{node.text} " for node in root.iter() if node.text is not None
    )
17
+
18
+
19
def extract_text_from_docx(docx_data):
    """Extract the text of a DOCX archive supplied as raw bytes.

    The archive is opened in memory and text is collected, in this order,
    from every ``word/header<N>.xml`` part, from ``word/document.xml``,
    and from every ``word/footer<N>.xml`` part. Each part is converted
    with ``xml2text`` and the pieces are concatenated.

    Args:
        docx_data: The complete contents of the ``.docx`` file as bytes.

    Returns:
        The combined text with leading and trailing whitespace stripped.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError: If a part is not well-formed XML.
    """
    # Raw patterns with an escaped dot, applied with ``fullmatch`` so that
    # only exact part names such as ``word/header1.xml`` are accepted.
    header_re = re.compile(r'word/header[0-9]*\.xml')
    footer_re = re.compile(r'word/footer[0-9]*\.xml')

    # The context manager closes the archive even if a part is missing or
    # fails to parse.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        names = zipf.namelist()

        parts = [
            xml2text(zipf.read(name))
            for name in names
            if header_re.fullmatch(name)
        ]
        parts.append(xml2text(zipf.read('word/document.xml')))
        parts.extend(
            xml2text(zipf.read(name))
            for name in names
            if footer_re.fullmatch(name)
        )

    return "".join(parts).strip()
40
 
41
  # Initialize the Mistral chat model
42
  client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 
84
 
85
  elif file_extension == 'doc' or file_extension == 'docx':
86
  try:
87
+ return extract_text_from_docx(file_content)
 
 
88
  except Exception as e:
89
  return f"Error reading DOC/DOCX: {e}"
90