Converted Document

' if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER: html_content += '

' elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT: html_content += '

' else: html_content += '

' for run in paragraph.runs: style = [] if run.bold: style.append('font-weight: bold') if run.italic: style.append('font-style: italic') if run.underline: style.append('text-decoration: underline') if run.font.size: style.append(f'font-size: {run.font.size.pt}pt') drawing_elements = run._element.findall('.//w:drawing', {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) for drawing in drawing_elements: blip = drawing.find('.//a:blip', {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}) if blip is not None: image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') if image_rel_id in images_dict: position = get_image_position(drawing) if position: style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;" html_content += f'

' html_content += f' Document Image

' html_content += '

' else: html_content += f'

' html_content += f' Document Image

' html_content += '

' style_str = '; '.join(style) if run.text.strip(): html_content += f'{run.text}' html_content += '

' return html_content def process_table(table, images_dict): html_content = '' for row in table.rows: html_content += '' for cell in row.cells: html_content += '' html_content += '' html_content += '

' for paragraph in cell.paragraphs: for run in paragraph.runs: style = [] if run.bold: style.append('font-weight: bold') if run.italic: style.append('font-style: italic') if run.underline: style.append('text-decoration: underline') if run.font.size: style.append(f'font-size: {run.font.size.pt}pt') drawing_elements = run._element.findall('.//w:drawing', {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) for drawing in drawing_elements: blip = drawing.find('.//a:blip', {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}) if blip is not None: image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') if image_rel_id in images_dict: html_content += f'

' html_content += f' Table Cell Image

' html_content += '

' style_str = '; '.join(style) if run.text.strip(): html_content += f'{run.text}' html_content += '

' return html_content def pdf_to_html(pdf_file, font_name): if not pdf_file: return None try: docx_filename = pdf_file.name.replace('.pdf', '.docx') cv = Converter(pdf_file.name) cv.convert(docx_filename) cv.close() doc = Document(docx_filename) images_dict = extract_images_from_doc(doc) html_content = """ Converted Document

""" paragraph_map = {} current_paragraph_index = 0 for para in doc.paragraphs: paragraph_map[para._element] = current_paragraph_index current_paragraph_index += 1 for element in doc.element.body: if element.tag.endswith('p'): if element in paragraph_map: paragraph = doc.paragraphs[paragraph_map[element]] html_content += process_paragraph(paragraph, images_dict) elif element.tag.endswith('tbl'): table_index = len([e for e in doc.element.body[:doc.element.body.index(element)] if e.tag.endswith('tbl')]) html_content += process_table(doc.tables[table_index], images_dict) html_content += "

" ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()} if font_name in ttf_files: font_path = ttf_files[font_name] font_name_clean = os.path.splitext(font_name)[0] html_content = embed_font_in_html(font_path, font_name_clean, html_content) html_filename = "output_with_font.html" with open(html_filename, "w", encoding="utf-8") as html_file: html_file.write(html_content) os.remove(docx_filename) return html_filename except Exception as e: print(f"Error in pdf_to_html: {str(e)}") return None # Gradio Interface with gr.Blocks(theme=gr.themes.Soft()) as app: gr.Markdown("# Bionic Reading PDF Converter") gr.Markdown("### https://github.com/SanshruthR/Bionic_Reading_Hub") with gr.Row(): gr.Image("image.jpeg", label="Bionic Reading Example", show_label=False, width=400, height=300) with gr.Row(): with gr.Column(scale=2): pdf_input = gr.File( label="Upload Your PDF", file_types=[".pdf"], file_count="single" ) ttf_files = find_ttf_fonts() font_dropdown = gr.Dropdown( [os.path.basename(font) for font in ttf_files], label="Select Font Style", value=os.path.basename(ttf_files[0]) if ttf_files else None, info="Choose your preferred reading font" ) convert_pdf_to_html = gr.Button( "Convert to Bionic Format", variant="primary", size="lg" ) font_output = gr.File( label="Download Enhanced HTML File", type="filepath" ) with gr.Row(): example_files = [ os.path.join("examples", f) for f in os.listdir("examples") if f.endswith('.pdf') ] if os.path.exists("examples") else [] if example_files: gr.Examples( example_files, pdf_input, label="Sample PDFs" ) with gr.Row(): gr.Markdown( """ --- 📝 Best results with text-based PDFs (not scanned documents) """ ) convert_pdf_to_html.click( pdf_to_html, inputs=[pdf_input, font_dropdown], outputs=[font_output] ) app.launch()