|
from collections import defaultdict |
|
import json |
|
import zipfile |
|
from lxml import etree |
|
|
|
|
|
# Typeface names treated as unremarkable: etree_to_dict leaves out an
# ascii / hAnsi / cs / eastAsia attribute whose value appears in this set.
common_fonts = {'Times New Roman', 'Arial', 'Calibri'}
|
|
|
|
|
# Namespace-stripped tag names that are discarded: remove_ignored_elements
# deletes matching elements from the tree and etree_to_dict returns None
# for them.
ignored_elements = {
    'proofErr', 'bookmarkStart', 'bookmarkEnd', 'lastRenderedPageBreak',
    'webHidden', 'numPr', 'pBdr', 'ind', 'spacing', 'jc', 'tabs',
    'sectPr', 'pgMar',
}
|
|
|
|
|
# Namespace-stripped attribute names that are deleted from elements by
# remove_ignored_elements and left out of the output of etree_to_dict.
# Both use sites also drop every attribute whose local name starts with
# 'rsid', so the rsid* entries below are listed for explicitness only.
# (Each name appears once; the set's contents are unchanged.)
ignored_attributes = {
    'rsidR',
    'rsidRPr',
    'rsidRDefault',
    'rsidP',
    'rsidDel',
    'rsidTr',
    'paraId',
    'textId',
}
|
|
|
|
|
# Namespace-stripped tag names that extract_metadata skips when it copies
# the children of docProps/core.xml into the metadata dict.
# NOTE(review): the membership test in extract_metadata is case-sensitive
# and only core.xml is read; several of these names look like extended
# (app.xml-style) properties with a lowercased first letter -- confirm that
# they can actually match.
ignored_metadata_elements = {
    'application', 'docSecurity', 'scaleCrop', 'linksUpToDate',
    'charactersWithSpaces', 'hiddenSlides', 'mmClips', 'notes',
    'words', 'characters', 'pages', 'lines', 'paragraphs',
    'company', 'template',
}
|
|
|
def remove_ignored_elements(tree):
    """Prune ignored elements and attributes from an lxml tree, in place.

    NOTE: this file defines ``remove_ignored_elements`` a second time further
    down; that later definition rebinds the name once the module has loaded.

    For every descendant element of *tree* (the root itself is not visited):

    * an element whose namespace-stripped tag is in ``ignored_elements`` is
      removed from its parent;
    * a WordprocessingML ``rPr`` element is removed unless one of its direct
      children has a tag ending in ``highlight``;
    * any other element stays, but loses every attribute whose local name is
      in ``ignored_attributes`` or starts with ``rsid``.

    Args:
        tree: lxml element to clean; it must provide ``xpath()``.

    Returns:
        The same *tree* object that was passed in.
    """
    rpr_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() hands back a list, so detaching nodes inside the loop does not
    # disturb the iteration.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == rpr_tag:
            # Iterate the element directly (getchildren() is deprecated) and
            # skip comment / processing-instruction children, whose .tag is
            # not a string.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the keys first: attributes are deleted while looping.
            for attr in list(elem.attrib):
                attr_local = attr.split('}')[-1]
                if attr_local in ignored_attributes or attr_local.startswith('rsid'):
                    del elem.attrib[attr]

    return tree
|
|
|
def etree_to_dict(t):
    """Convert an element tree node into a nested ``{tag: content}`` dict.

    Tags and attribute names are used without their namespace part.

    The content for a node is built as follows:

    * child elements are converted recursively and grouped by tag; a tag
      that occurs once maps to its single value, a repeated tag maps to a
      list of values;
    * attributes are merged into the same dict, except that font attributes
      (``ascii``, ``hAnsi``, ``cs``, ``eastAsia``) whose value is in
      ``common_fonts`` are left out, as are attributes listed in
      ``ignored_attributes`` or whose name starts with ``rsid``;
    * stripped, non-empty text is stored under ``'#text'`` when the node has
      children or attributes; a node with text only maps directly to its
      stripped text.

    Args:
        t: element to convert (``tag``, ``attrib``, ``text`` and iteration
            over children are used).

    Returns:
        A one-key dict ``{tag: content}``, or ``None`` when the node is not
        an element (comment, processing instruction), when its tag is in
        ``ignored_elements``, or when it has no attributes, children or text.
    """
    # Comments and processing instructions show up among an element's
    # children but carry a non-string tag; they have nothing to contribute.
    if not isinstance(t.tag, str):
        return None

    tag = t.tag.split('}')[-1]
    if tag in ignored_elements:
        return None

    children = list(t)
    if not t.attrib and not children and not t.text:
        return None

    if children:
        grouped = defaultdict(list)
        # filter(None, ...) discards children that converted to None.
        for child_dict in filter(None, map(etree_to_dict, children)):
            for key, value in child_dict.items():
                grouped[key].append(value)
        content = {
            key: values[0] if len(values) == 1 else values
            for key, values in grouped.items()
        }
    elif t.attrib:
        content = {}
    else:
        content = None

    # When t.attrib is non-empty, content is always a dict at this point.
    for name, value in t.attrib.items():
        local = name.split('}')[-1]
        if local in ('ascii', 'hAnsi', 'cs', 'eastAsia'):
            # Font attribute: keep it only for fonts outside the common set.
            if value not in common_fonts:
                content[local] = value
        elif local not in ignored_attributes and not local.startswith('rsid'):
            content[local] = value

    if t.text:
        # The text is already a str, so no encode/decode round trip is needed.
        text = t.text.strip()
        if children or t.attrib:
            if text:
                content['#text'] = text
        else:
            content = text

    return {tag: content}
|
|
|
|
|
def remove_ignored_elements(tree):
    """Prune ignored elements/attributes and trim text in an lxml tree.

    The tree is modified in place in two passes.

    Pass 1 visits every descendant element of *tree* (not the root itself):

    * an element whose namespace-stripped tag is in ``ignored_elements`` is
      removed from its parent;
    * a WordprocessingML ``rPr`` element is removed unless one of its direct
      children has a tag ending in ``highlight``;
    * any other element stays, but loses every attribute whose local name is
      in ``ignored_attributes`` or starts with ``rsid``.

    Pass 2 strips leading/trailing whitespace from every text node that is
    still in the tree. Text found in an element's ``.text`` is written back
    to ``.text`` and text found in its ``.tail`` is written back to
    ``.tail``, so trailing whitespace after an element can no longer
    overwrite that element's own text.

    Args:
        tree: lxml element to clean; it must provide ``xpath()``.

    Returns:
        The same *tree* object that was passed in.
    """
    rpr_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}rPr'

    # xpath() hands back a list, so detaching nodes inside the loop does not
    # disturb the iteration.
    for elem in tree.xpath(".//*"):
        local_name = elem.tag.split('}')[-1]

        if local_name in ignored_elements:
            elem.getparent().remove(elem)
        elif elem.tag == rpr_tag:
            # Iterate the element directly (getchildren() is deprecated) and
            # skip comment / processing-instruction children, whose .tag is
            # not a string.
            has_highlight = any(
                isinstance(child.tag, str) and child.tag.endswith('highlight')
                for child in elem
            )
            if not has_highlight:
                elem.getparent().remove(elem)
        else:
            # Copy the keys first: attributes are deleted while looping.
            for attr in list(elem.attrib):
                attr_local = attr.split('}')[-1]
                if attr_local in ignored_attributes or attr_local.startswith('rsid'):
                    del elem.attrib[attr]

    # ".//text()" yields both .text and .tail strings; for a tail string,
    # getparent() is the element the tail follows, so the two cases must be
    # written back to different slots.
    for text_node in tree.xpath(".//text()"):
        owner = text_node.getparent()
        if owner is None:
            continue
        cleaned = text_node.strip()
        if text_node.is_tail:
            owner.tail = cleaned
        else:
            owner.text = cleaned

    return tree
|
|
|
def extract_metadata(docx):
    """Collect document properties from ``docProps/core.xml``.

    Each direct child element of the core-properties root contributes one
    entry, keyed by its namespace-stripped tag and holding the element's
    text (``None`` when the element has no text). Tags listed in
    ``ignored_metadata_elements`` are skipped. If the same tag occurs more
    than once, the last occurrence wins.

    Args:
        docx: opened archive exposing ``open(name)``; process_docx passes a
            ``zipfile.ZipFile``.

    Returns:
        dict mapping property tag to its text.

    Raises:
        KeyError: from ``zipfile.ZipFile.open`` when the archive has no
            ``docProps/core.xml`` member.
    """
    with docx.open('docProps/core.xml') as core_xml:
        core_tree = etree.XML(core_xml.read())

    metadata = {}
    # Iterate the root directly; getchildren() is deprecated in lxml.
    for child in core_tree:
        # Comments / processing instructions have a non-string tag.
        if not isinstance(child.tag, str):
            continue
        tag = child.tag.split('}')[-1]
        if tag not in ignored_metadata_elements:
            metadata[tag] = child.text
    return metadata
|
|
|
def process_docx(file_path):
    """Convert a .docx file into a JSON string.

    The archive's ``word/document.xml`` is parsed, cleaned with
    ``remove_ignored_elements`` and converted with ``etree_to_dict``. The
    dict returned by ``extract_metadata`` is then stored under the
    top-level ``'metadata'`` key next to the document's root tag.

    Args:
        file_path: path (or file-like object) accepted by
            ``zipfile.ZipFile``.

    Returns:
        The JSON text, produced with ``ensure_ascii=False`` and a 2-space
        indent.

    Raises:
        KeyError: from ``zipfile`` when a required archive member
            (``docProps/core.xml`` or ``word/document.xml``) is missing.
    """
    with zipfile.ZipFile(file_path) as docx:
        metadata = extract_metadata(docx)
        document_tree = etree.XML(docx.read('word/document.xml'))

    document_tree = remove_ignored_elements(document_tree)

    # etree_to_dict returns None for an ignored or completely empty root;
    # fall back to an empty dict so the metadata can still be attached.
    document_dict = etree_to_dict(document_tree) or {}
    document_dict['metadata'] = metadata

    return json.dumps(document_dict, ensure_ascii=False, indent=2)
|
|