Spaces:

ndurner
/

oai_chat

Running

App Files Files Community

Nils Durner committed on Jan 14, 2024

Commit

e6ad240

1 Parent(s): 679fce2

docx support

Browse files

Files changed (3) hide show

app.py +11 -6
doc2json.py +181 -0
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ import os
 from openai import OpenAI
 import json
 dump_controls = False
 log_to_console = False
@@ -48,13 +50,16 @@ def add_text(history, text):
     return history, gr.Textbox(value="", interactive=False)
 def add_file(history, file):
-    with open(file.name, mode="rb") as f:
-        content = f.read()
-        if isinstance(content, bytes):
-            content = content.decode('utf-8', 'replace')
-        else:
-            content = str(content)
     fn = os.path.basename(file.name)
     history = history + [(f'```{fn}\n{content}\n```', None)]

 from openai import OpenAI
 import json
+from doc2json import process_docx
 dump_controls = False
 log_to_console = False
     return history, gr.Textbox(value="", interactive=False)
 def add_file(history, file):
+    if file.name.endswith(".docx"):
+        content = process_docx(file.name)
+    else:
+        with open(file.name, mode="rb") as f:
+            content = f.read()
+            if isinstance(content, bytes):
+                content = content.decode('utf-8', 'replace')
+            else:
+                content = str(content)
     fn = os.path.basename(file.name)
     history = history + [(f'```{fn}\n{content}\n```', None)]

doc2json.py ADDED Viewed

	@@ -0,0 +1,181 @@

+from collections import defaultdict
+import json
+import zipfile
+from lxml import etree
+# Font names treated as defaults: etree_to_dict omits ascii/hAnsi/cs/eastAsia attributes whose value is listed here
+common_fonts = {
+    'Times New Roman',
+    'Arial',
+    'Calibri',
+    # Extend with any further font names that should be left out of the output
+}
+# Local (namespace-stripped) tag names that remove_ignored_elements deletes from the tree and etree_to_dict skips
+ignored_elements = {
+    'proofErr',
+    'bookmarkStart',
+    'bookmarkEnd',
+    'lastRenderedPageBreak',
+    'webHidden',
+    'numPr',
+    'pBdr',
+    'ind',
+    'spacing',
+    'jc',
+    'tabs',
+    'sectPr',
+    'pgMar'
+    # Extend with further local tag names to drop; a new entry after 'pgMar' needs a comma added there first
+}
+# Define attributes to ignore
+ignored_attributes = {
+    'rsidR',
+    'rsidRPr',
+    'rsidRDefault',
+    'rsidP',
+    'paraId',
+    'textId',
+    'rsidR',
+    'rsidRPr',
+    'rsidDel',
+    'rsidP',
+    'rsidTr',
+    # Add any other attributes to ignore here
+}
+# Local tag names that extract_metadata leaves out of the dict it builds from docProps/core.xml
+ignored_metadata_elements = {
+    'application',
+    'docSecurity',
+    'scaleCrop',
+    'linksUpToDate',
+    'charactersWithSpaces',
+    'hiddenSlides',
+    'mmClips',
+    'notes',
+    'words',
+    'characters',
+    'pages',
+    'lines',
+    'paragraphs',
+    'company',
+    'template',
+    # NOTE(review): these look like docProps/app.xml property names rather than core.xml ones - confirm they can ever match
+}
+def remove_ignored_elements(tree):
+    """Remove all ignored elements from the XML tree, except highlights."""
+    for elem in tree.xpath(".//*"):
+        tag_without_ns = elem.tag.split('}')[-1]
+        if tag_without_ns in ignored_elements:
+            elem.getparent().remove(elem)
+        elif elem.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr':  # Check for highlights in rPr
+            if not any(child.tag.endswith('highlight') for child in elem.getchildren()):
+                elem.getparent().remove(elem)
+        else:
+            # Remove ignored attributes
+            for attr in list(elem.attrib):
+                attr_without_ns = attr.split('}')[-1]
+                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
+                    del elem.attrib[attr]
+    return tree
+def etree_to_dict(t):
+    """Convert an lxml etree to a nested dictionary, excluding ignored namespaces and attributes."""
+    tag = t.tag.split('}')[-1]  # Remove namespace URI
+    if tag in ignored_elements:
+        return None
+    d = {tag: {} if t.attrib else None}
+    children = list(t)
+    if children:
+        dd = defaultdict(list)
+        for dc in filter(None, map(etree_to_dict, children)):
+            for k, v in dc.items():
+                dd[k].append(v)
+        d = {tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}}
+    if t.attrib:
+        # Filter out common fonts and ignored attributes
+        filtered_attribs = {}
+        for k, v in t.attrib.items():
+            k = k.split('}')[-1]  # Remove namespace URI
+            if k in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
+                if v not in common_fonts:
+                    filtered_attribs[k] = v
+            elif k not in ignored_attributes and not k.startswith('rsid'):
+                filtered_attribs[k] = v
+        d[tag].update(filtered_attribs)
+    if t.text:
+        text = t.text.strip()
+        # Here we ensure that the text encoding is correctly handled
+        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
+        if children or t.attrib:
+            if text:
+                d[tag]['#text'] = text
+        else:
+            d[tag] = text
+    if not t.attrib and not children and not t.text:
+        return None
+    return d
+# Additionally, update the 'remove_ignored_elements' function to fix encoding
+def remove_ignored_elements(tree):
+    """Remove all ignored elements from the XML tree, except highlights."""
+    for elem in tree.xpath(".//*"):
+        tag_without_ns = elem.tag.split('}')[-1]
+        if tag_without_ns in ignored_elements:
+            elem.getparent().remove(elem)
+        elif elem.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr':  # Check for highlights in rPr
+            if not any(child.tag.endswith('highlight') for child in elem.getchildren()):
+                elem.getparent().remove(elem)
+        else:
+            # Remove ignored attributes
+            for attr in list(elem.attrib):
+                attr_without_ns = attr.split('}')[-1]
+                if attr_without_ns in ignored_attributes or attr_without_ns.startswith('rsid'):
+                    del elem.attrib[attr]
+    # Decode the text correctly for each XML element
+    for elem in tree.xpath(".//text()"):
+        elem_text = elem.strip()
+        encoded_text = bytes(elem_text, 'utf-8').decode('utf-8', 'ignore')
+        parent = elem.getparent()
+        if parent is not None:
+            parent.text = encoded_text
+    return tree
+def extract_metadata(docx):
+    """Extract metadata from the document properties, ignoring specified elements."""
+    metadata = {}
+    with docx.open('docProps/core.xml') as core_xml:
+        xml_content = core_xml.read()
+        core_tree = etree.XML(xml_content)
+        for child in core_tree.getchildren():
+            tag = child.tag.split('}')[-1]  # Get tag without namespace
+            if tag not in ignored_metadata_elements:
+                metadata[tag] = child.text
+    return metadata
+def process_docx(file_path):
+    # Load the document with zipfile and lxml
+    with zipfile.ZipFile(file_path) as docx:
+        metadata = extract_metadata(docx)
+        with docx.open('word/document.xml') as document_xml:
+            xml_content = document_xml.read()
+            document_tree = etree.XML(xml_content)
+            # Remove the ignored elements
+            document_tree = remove_ignored_elements(document_tree)
+            # Convert the rest of the XML tree to a dictionary
+            document_dict = etree_to_dict(document_tree)
+            document_dict['metadata'] = metadata  # Add metadata to the document dictionary
+            docx_json = json.dumps(document_dict, ensure_ascii=False, indent=2)
+            return docx_json

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 gradio
-openai >= 1.0.0

 gradio
+openai >= 1.0.0
+lxml