Spaces:

Sanshruth
/

Bionic_Reading_Hub

Running

App Files Files Community

Sanshruth commited on Oct 27, 2024

Commit

5ff5737

verified ·

1 Parent(s): 41c4568

initial_commit

Browse files

Files changed (7) hide show

.gitattributes +2 -0
Fast_Sans_Bionic.ttf +3 -0
Fast_Sans_Dotted.ttf +3 -0
Fast_Serif.ttf +0 -0
app.py +316 -0
image.jpeg +0 -0
requirements.txt +4 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Fast_Sans_Bionic.ttf filter=lfs diff=lfs merge=lfs -text
+Fast_Sans_Dotted.ttf filter=lfs diff=lfs merge=lfs -text

Fast_Sans_Bionic.ttf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e721ed03e58b0a7a964d425d4e851d421fb30ac1bf4c5f6501f339a121b97b6e
+size 1868664

Fast_Sans_Dotted.ttf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8299839ea5626ca69924d9e60d11a6bcb22eee97621b11f9b9005c8745631454
+size 1738812

Fast_Serif.ttf ADDED Viewed

Binary file (930 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,316 @@

+import gradio as gr
+from pdf2docx import Converter
+from docx import Document
+import os
+import glob
+import base64
+from docx.shared import Inches, Pt
+from docx.oxml import OxmlElement
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+import xml.etree.ElementTree as ET
+def find_ttf_fonts():
+    files = glob.glob('**/*.ttf', recursive=True)
+def embed_font_in_html(font_path, font_name, html_content):
+    with open(font_path, "rb") as font_file:
+        font_data = font_file.read()
+    encoded_font = base64.b64encode(font_data).decode('utf-8')
+    font_style = f"""
+    <style>
+    @font-face {{
+        font-family: '{font_name}';
+        src: url(data:font/ttf;base64,{encoded_font}) format('truetype');
+    }}
+    body {{
+        font-family: '{font_name}', Arial, sans-serif;
+        margin: 0;
+        padding: 0;
+        background-color: white;
+    }}
+    .page {{
+        position: relative;
+        width: 8.5in;
+        margin: 20px auto;
+        padding: 20px;
+        box-sizing: border-box;
+        background-color: white;
+        box-shadow: 0 0 10px rgba(0,0,0,0.1);
+    }}
+    .paragraph {{
+        margin: 0;
+        padding: 0;
+        position: relative;
+    }}
+    .image-container {{
+        display: inline-block;
+        position: relative;
+        vertical-align: middle;
+    }}
+    img {{
+        max-width: 100%;
+        height: auto;
+        display: inline-block;
+        vertical-align: middle;
+    }}
+    table {{
+        border-collapse: collapse;
+        width: 100%;
+        margin: 10px 0;
+    }}
+    td, th {{
+        border: 1px solid black;
+        padding: 8px;
+        position: relative;
+    }}
+    </style>
+    """
+    return font_style + html_content
+def extract_images_from_doc(doc):
+    images = {}
+    for rel in doc.part.rels.values():
+        if "image" in rel.reltype:
+            try:
+                image_data = rel.target_part.blob
+                image_type = rel.target_part.content_type.split('/')[-1]
+                if image_type.lower() not in ['jpeg', 'jpg', 'png', 'gif']:
+                    image_type = 'png'
+                encoded_image = base64.b64encode(image_data).decode('utf-8')
+                images[rel.rId] = f"data:image/{image_type};base64,{encoded_image}"
+            except Exception as e:
+                print(f"Error processing image: {str(e)}")
+                continue
+    return images
+def get_image_position(element):
+    try:
+        anchor = element.find('.//wp:anchor',
+            {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+        if anchor is not None:
+            pos_h = anchor.find('.//wp:positionH',
+                {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+            pos_v = anchor.find('.//wp:positionV',
+                {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+            if pos_h is not None and pos_v is not None:
+                x = pos_h.find('.//wp:posOffset',
+                    {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+                y = pos_v.find('.//wp:posOffset',
+                    {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'})
+                if x is not None and y is not None:
+                    return {
+                        'x': int(x.text) / 914400,
+                        'y': int(y.text) / 914400
+                    }
+    except Exception:
+        pass
+    return None
+def process_paragraph(paragraph, images_dict):
+    html_content = '<div class="paragraph">'
+    if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER:
+        html_content += '<div style="text-align: center;">'
+    elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT:
+        html_content += '<div style="text-align: right;">'
+    else:
+        html_content += '<div>'
+    for run in paragraph.runs:
+        style = []
+        if run.bold: style.append('font-weight: bold')
+        if run.italic: style.append('font-style: italic')
+        if run.underline: style.append('text-decoration: underline')
+        if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
+        drawing_elements = run._element.findall('.//w:drawing',
+            {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+        for drawing in drawing_elements:
+            blip = drawing.find('.//a:blip',
+                {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
+            if blip is not None:
+                image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                if image_rel_id in images_dict:
+                    position = get_image_position(drawing)
+                    if position:
+                        style_pos = f"position: absolute; left: {position['x']}in; top: {position['y']}in;"
+                        html_content += f'<div class="image-container" style="{style_pos}">'
+                        html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
+                        html_content += '</div>'
+                    else:
+                        html_content += f'<div class="image-container">'
+                        html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
+                        html_content += '</div>'
+        style_str = '; '.join(style)
+        if run.text.strip():
+            html_content += f'<span style="{style_str}">{run.text}</span>'
+    html_content += '</div></div>'
+    return html_content
+def process_table(table, images_dict):
+    html_content = '<table>'
+    for row in table.rows:
+        html_content += '<tr>'
+        for cell in row.cells:
+            html_content += '<td>'
+            for paragraph in cell.paragraphs:
+                for run in paragraph.runs:
+                    style = []
+                    if run.bold: style.append('font-weight: bold')
+                    if run.italic: style.append('font-style: italic')
+                    if run.underline: style.append('text-decoration: underline')
+                    if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')
+                    drawing_elements = run._element.findall('.//w:drawing',
+                        {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
+                    for drawing in drawing_elements:
+                        blip = drawing.find('.//a:blip',
+                            {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
+                        if blip is not None:
+                            image_rel_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                            if image_rel_id in images_dict:
+                                html_content += f'<div class="image-container">'
+                                html_content += f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>'
+                                html_content += '</div>'
+                    style_str = '; '.join(style)
+                    if run.text.strip():
+                        html_content += f'<span style="{style_str}">{run.text}</span>'
+            html_content += '</td>'
+        html_content += '</tr>'
+    html_content += '</table>'
+    return html_content
+def pdf_to_html(pdf_file, font_name):
+    if not pdf_file:
+        return None
+    try:
+        docx_filename = pdf_file.name.replace('.pdf', '.docx')
+        cv = Converter(pdf_file.name)
+        cv.convert(docx_filename)
+        cv.close()
+        doc = Document(docx_filename)
+        images_dict = extract_images_from_doc(doc)
+        html_content = """<!DOCTYPE html>
+        <html>
+        <head>
+            <meta charset='utf-8'>
+            <title>Converted Document</title>
+        </head>
+        <body>
+        <div class="page">
+        """
+        paragraph_map = {}
+        current_paragraph_index = 0
+        for para in doc.paragraphs:
+            paragraph_map[para._element] = current_paragraph_index
+            current_paragraph_index += 1
+        for element in doc.element.body:
+            if element.tag.endswith('p'):
+                if element in paragraph_map:
+                    paragraph = doc.paragraphs[paragraph_map[element]]
+                    html_content += process_paragraph(paragraph, images_dict)
+            elif element.tag.endswith('tbl'):
+                table_index = len([e for e in doc.element.body[:doc.element.body.index(element)]
+                                 if e.tag.endswith('tbl')])
+                html_content += process_table(doc.tables[table_index], images_dict)
+        html_content += "</div></body></html>"
+        ttf_files = {os.path.basename(f): f for f in find_ttf_fonts()}
+        if font_name in ttf_files:
+            font_path = ttf_files[font_name]
+            font_name_clean = os.path.splitext(font_name)[0]
+            html_content = embed_font_in_html(font_path, font_name_clean, html_content)
+        html_filename = "output_with_font.html"
+        with open(html_filename, "w", encoding="utf-8") as html_file:
+            html_file.write(html_content)
+        os.remove(docx_filename)
+        return html_filename
+    except Exception as e:
+        print(f"Error in pdf_to_html: {str(e)}")
+        return None
+# Gradio Interface
+with gr.Blocks(theme=gr.themes.Soft()) as app:
+    gr.Markdown("# Bionic Reading PDF Converter")
+    with gr.Row():
+        gr.Image("image.jpeg",
+                label="Bionic Reading Example",
+                show_label=False,
+                width=400,
+                height=300)
+    with gr.Row():
+        with gr.Column(scale=2):
+            pdf_input = gr.File(
+                label="Upload Your PDF",
+                file_types=[".pdf"],
+                file_count="single"
+            )
+            ttf_files = find_ttf_fonts()
+            font_dropdown = gr.Dropdown(
+                [os.path.basename(font) for font in ttf_files],
+                label="Select Font Style",
+                value=os.path.basename(ttf_files[0]) if ttf_files else None,
+                info="Choose your preferred reading font"
+            )
+            convert_pdf_to_html = gr.Button(
+                "Convert to Bionic Format",
+                variant="primary",
+                size="lg"
+            )
+            font_output = gr.File(
+                label="Download Enhanced HTML File",
+                type="filepath"
+            )
+    with gr.Row():
+        example_files = [
+            os.path.join("examples", f)
+            for f in os.listdir("examples")
+            if f.endswith('.pdf')
+        ] if os.path.exists("examples") else []
+        if example_files:
+            gr.Examples(
+                example_files,
+                pdf_input,
+                label="Sample PDFs"
+            )
+    with gr.Row():
+        gr.Markdown(
+            """
+            ---
+            📝 Best results with text-based PDFs (not scanned documents)
+            """
+        )
+    convert_pdf_to_html.click(
+        pdf_to_html,
+        inputs=[pdf_input, font_dropdown],
+        outputs=[font_output]
+    )
+app.launch(debug=True)

image.jpeg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pdf2docx
+python-docx
+fpdf
+gradio