Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

KingNish committed on Sep 19, 2024

Commit

a1654f3

verified ·

1 Parent(s): 0259009

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,8 +3,40 @@ from openpyxl import load_workbook
 from pptx import Presentation
 import gradio as gr
 import io
-from docx2python import docx2python
 from huggingface_hub import InferenceClient
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
@@ -52,9 +84,7 @@ def read_document(file):
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
-            # extract docx content
-            with docx2python(io.BytesIO(file_content)) as content:
-                return content.text
         except Exception as e:
             return f"Error reading DOC/DOCX: {e}"

 from pptx import Presentation
 import gradio as gr
 import io
 from huggingface_hub import InferenceClient
+import re
+import zipfile
+import xml.etree.ElementTree as ET
def xml2text(xml):
    """Return the text content of every element in an XML document.

    Args:
        xml: The XML document, as ``str`` or ``bytes``.

    Returns:
        A string made of each element's ``.text`` in document order, each
        followed by a single space. Elements without text contribute
        nothing, and tail text (text after a closing tag) is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    # NOTE(review): the stdlib ElementTree parser is not hardened against
    # maliciously constructed XML. If ``xml`` can come from untrusted
    # documents, confirm the deployed Expat version mitigates entity
    # expansion attacks or switch to a hardened parser.
    root = ET.fromstring(xml)
    # Build the result in one pass instead of repeated ``+=`` concatenation.
    return ''.join(
        element.text + ' '
        for element in root.iter()
        if element.text is not None
    )
def extract_text_from_docx(docx_data):
    """Extract the text of a .docx file supplied as raw bytes.

    The archive's header parts are read first, then the main document
    body, then the footer parts. Each part is converted with ``xml2text``
    and the results are concatenated in that order.

    Args:
        docx_data: The bytes of the .docx (ZIP) file.

    Returns:
        The concatenated text with leading and trailing whitespace removed.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Raw strings with an escaped dot, matched against the whole member
    # name, so only ``word/headerN.xml`` / ``word/footerN.xml`` qualify.
    header_pattern = re.compile(r'word/header[0-9]*\.xml')
    footer_pattern = re.compile(r'word/footer[0-9]*\.xml')
    doc_xml = 'word/document.xml'

    # The context manager closes the archive even if a read or parse fails.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        names = zipf.namelist()
        parts = [name for name in names if header_pattern.fullmatch(name)]
        parts.append(doc_xml)
        parts.extend(name for name in names if footer_pattern.fullmatch(name))
        # The join runs inside the ``with`` block, while the archive is open.
        return ''.join(xml2text(zipf.read(name)) for name in parts).strip()
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
+            return extract_text_from_docx(file_content)
         except Exception as e:
             return f"Error reading DOC/DOCX: {e}"