import gradio as gr from pdf2docx import Converter from docx import Document import os import glob import base64 from docx.shared import Inches, Pt from docx.oxml import OxmlElement from docx.enum.text import WD_ALIGN_PARAGRAPH import xml.etree.ElementTree as ET def find_ttf_fonts(): files = glob.glob('**/*.ttf', recursive=True) return files def embed_font_in_html(font_path, font_name, html_content): with open(font_path, "rb") as font_file: font_data = font_file.read() encoded_font = base64.b64encode(font_data).decode('utf-8') font_style = f""" """ return font_style + html_content def extract_images_from_doc(doc): images = {} for rel in doc.part.rels.values(): if "image" in rel.reltype: try: image_data = rel.target_part.blob image_type = rel.target_part.content_type.split('/')[-1] if image_type.lower() not in ['jpeg', 'jpg', 'png', 'gif']: image_type = 'png' encoded_image = base64.b64encode(image_data).decode('utf-8') images[rel.rId] = f"data:image/{image_type};base64,{encoded_image}" except Exception as e: print(f"Error processing image: {str(e)}") continue return images def get_image_position(element): try: anchor = element.find('.//wp:anchor', {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) if anchor is not None: pos_h = anchor.find('.//wp:positionH', {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) pos_v = anchor.find('.//wp:positionV', {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) if pos_h is not None and pos_v is not None: x = pos_h.find('.//wp:posOffset', {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) y = pos_v.find('.//wp:posOffset', {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) if x is not None and y is not None: return { 'x': int(x.text) / 914400, 'y': int(y.text) / 914400 } except Exception: pass return None def process_paragraph(paragraph, images_dict): html_content = '
'
for paragraph in cell.paragraphs:
for run in paragraph.runs:
style = []
if run.bold: style.append('font-weight: bold')
if run.italic: style.append('font-style: italic')
if run.underline: style.append('text-decoration: underline')
if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
drawing_elements = run._element.findall('.//w:drawing',
{'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
for drawing in drawing_elements:
blip = drawing.find('.//a:blip',
{'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
if blip is not None:
image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
if image_rel_id in images_dict:
html_content += f' '
html_content += f''
html_content += ' '
style_str = '; '.join(style)
if run.text.strip():
html_content += f'{run.text}'
html_content += ' | '
html_content += '