pdf-toweb / app.py
Corran's picture
Update app.py
d697159 verified
raw
history blame
6.63 kB
import sys
from pathlib import Path

# Directory that contains this file; sibling modules and assets live next to it.
HERE = Path(__file__).parent
# sys.path entries must be strings: the import system ignores any other type,
# so appending the bare Path object would not make `utils` importable.
sys.path.append(str(HERE))

from utils import GParse_Paper, Get_Bibliography
from bs4 import BeautifulSoup
import solara
from solara.components.file_drop import FileInfo

# CSS text loaded once at import time from the stylesheet next to this file.
app_style = (HERE / "style.css").read_text()
def Get_HTMLTop(title):
    """Return the top part of the article HTML: the title heading and author span.

    Args:
        title: Document title. It is converted with ``str()`` and HTML-escaped
            so characters such as ``&`` and ``<`` cannot corrupt the markup.

    Returns:
        The HTML fragment as a string.
    """
    # Imported locally because this file rebinds the module-level name `html`
    # to a reactive value further down.
    from html import escape

    safe_title = escape(str(title), quote=False)
    html_top = f"""
<h1>{safe_title}</h1>
<span typeof="schema:Person" resource="http://orcid.org/0000-0003-1279-3709">
</span>
"""
    return html_top
def Get_Controls():
    """Return the static HTML/JavaScript snippet for the reader controls.

    The snippet contains a text-size ``<select>`` together with the
    ``adjustTextSize``, ``openDialog`` and ``closeDialog`` script functions
    that the generated citation spans and dialogs call.

    Returns:
        The controls markup as a string.
    """
    controls = """
<label for="textSize">Text Size: </label>
<select id="textSize" name="textSize" onchange="adjustTextSize(this.value)">
<option value="10">10px</option>
<option value="12">12px</option>
<option value="14">14px</option>
<option value="16" selected>16px</option>
<option value="18">18px</option>
<option value="20">20px</option>
<option value="24">24px</option>
<option value="28">28px</option>
<option value="32">32px</option>
<option value="36">36px</option>
<option value="40">40px</option>
<option value="44">44px</option>
<option value="48">48px</option>
<option value="50">50px</option>
</select>
<script>
function adjustTextSize(size) {
const baseSize = parseInt(size);
document.body.style.fontSize = baseSize + 'px';
}
</script>
<script>
function openDialog(event, dialogId) {
var dialog = document.getElementById(dialogId);
var rect = event.target.getBoundingClientRect();
dialog.style.top = rect.top + window.scrollY + 'px';
dialog.style.left = rect.left + window.scrollX + 'px';
dialog.style.display = 'block';
// Add an event listener to close the dialog when clicking outside of it
document.addEventListener('click', function(event) {
var isClickInside = dialog.contains(event.target);
var isClickOnText = event.target.classList.contains('text-area');
if (!isClickInside && !isClickOnText) {
closeDialog(dialogId);
}
}, { once: true });
}
function closeDialog(dialogId) {
document.getElementById(dialogId).style.display = 'none';
}
</script>
"""
    # The original returned the undefined name `Controls` (NameError).
    return controls
def _escape_html(value, quote=False):
    """Return ``str(value)`` escaped for embedding in HTML text or attributes."""
    # Imported locally because this file rebinds the module-level name `html`
    # to a reactive value further down.
    from html import escape

    return escape(str(value), quote=quote)


def _citation_modal(ref_id, label, cit_info):
    """Render the hidden dialog ``<div>`` describing one bibliography entry.

    ``label`` must already be HTML-escaped; the fields read from ``cit_info``
    ('title', 'authors', 'year', 'journal', 'doi') are escaped here.
    """
    authors = ", ".join(_escape_html(author) for author in cit_info['authors'])
    doi_href = _escape_html(cit_info['doi'], quote=True)
    return f"""<div id="{ref_id}" class="dialog">
<b>{label}</b><br>
<b>Title:</b> {_escape_html(cit_info['title'])}<br>
<b>Authors:</b> {authors}<br>
<b>Year:</b> {_escape_html(cit_info['year'])}<br>
<b>Journal:</b> {_escape_html(cit_info['journal'])}<br>
<b>DOI:</b> <a href="https://doi.org/{doi_href}">{_escape_html(cit_info['doi'])} </a><br>
<button class="close-button" onclick="closeDialog('{ref_id}')">Close</button>
</div>"""


def _render_paragraph(paragraph, bib, modals):
    """Return the inner HTML for one ``<p>`` element.

    Plain text is escaped. A ``<ref>`` whose target is a key of ``bib`` becomes
    a clickable span, and its dialog is stored in ``modals`` (keyed by the
    reference id) the first time it is seen. Any other child contributes its
    text only.
    """
    # The original used these names without importing them (NameError).
    from bs4.element import NavigableString, Tag

    pieces = []
    for element in paragraph.contents:
        if isinstance(element, NavigableString):
            pieces.append(_escape_html(element))
            continue

        ref_id = None
        if isinstance(element, Tag) and element.name == "ref":
            target = element.get("target")
            if target is not None:
                ref_id = target.lstrip("#")

        if ref_id is not None and ref_id in bib:
            label = _escape_html(element.text)
            pieces.append(
                f"""<span class="text-area" onclick="openDialog(event, '{ref_id}')">{label}</span>"""
            )
            # One dialog per reference: repeated citations would otherwise
            # emit several elements sharing the same id.
            if ref_id not in modals:
                modals[ref_id] = _citation_modal(ref_id, label, bib[ref_id])
        else:
            # Keep the visible text of anything that is not a known citation.
            pieces.append(_escape_html(element.text))
    return "".join(pieces)


def _build_sections(soup):
    """Walk every ``<div>`` of ``soup`` and build section HTML plus metadata.

    Returns:
        A 3-tuple ``(sections_list, sections_content, citation_modals)``:
        the list of ``{'num', 'text'}`` dicts used for navigation, the
        concatenated ``<section>`` markup, and the list of citation dialogs.
    """
    bib = Get_Bibliography(soup)
    sections_list = []
    html_parts = []
    modals = {}  # ref_id -> dialog HTML, in first-seen order

    for div in soup.find_all("div"):
        header = div.find("head")
        if header is not None:
            section_number = header.get('n', "")
            section_id = header.text.replace(" ", "_")
            # NOTE(review): normalize_section is neither defined nor imported
            # in the visible part of this file - confirm where it comes from.
            sections_list.append({'num': normalize_section(section_number), 'text': section_id})
            html_parts.append(f"<section id='{_escape_html(section_id, quote=True)}'>")
            html_parts.append(
                f"<h2>{_escape_html(section_number)} {_escape_html(header.text)}</h2>"
            )
        else:
            html_parts.append("<section id=''>")

        for paragraph in div.find_all("p"):
            html_parts.append(f"<p>{_render_paragraph(paragraph, bib, modals)}</p>")
        html_parts.append("</section>")

    return sections_list, "".join(html_parts), list(modals.values())


def Get_Sections(soup):
    """Generate the article sections from the ``<div>`` elements of ``soup``.

    Args:
        soup: Parsed document; every ``<div>`` becomes one ``<section>``.

    Returns:
        ``(sections_list, sections_content)`` - the navigation entries and the
        section markup. The citation dialogs are available from
        ``_build_sections``; this wrapper keeps the original two-value return.
    """
    sections_list, sections_content, _ = _build_sections(soup)
    return sections_list, sections_content


def Get_Navigation(controls, sections_list=()):
    """Build the sticky navigation panel.

    Args:
        controls: Markup placed at the top of the panel.
        sections_list: Iterable of dicts with string values under 'num' and
            'text'. Each entry is indented 20px per "." found in 'num'.

    Returns:
        The navigation ``<div>`` as a string.
    """
    parts = [
        "<div class='sticky-content' style='max-height: 100%; overflow-y: auto;'>"
        + controls
        + " <h2> Navigation </h2>"
    ]
    for section in sections_list:
        depth = section['num'].count(".")
        left = f"{20 * depth}px"  # Adjust the multiplier for desired tab width
        anchor = _escape_html(section['text'], quote=True)
        label = f"{_escape_html(section['num'])} {_escape_html(section['text'])}"
        parts.append(
            f'<p style="margin-left: {left}; font-size: 10px;"><a href="#{anchor}">{label}</a></p>'
        )
    parts.append("</div>")
    # The original built this string but never returned it.
    return "".join(parts)


def Get_Article_HTML(pdf):
    """Convert an article PDF into a single HTML string.

    The result is also written to ``article_demo.html`` in the current
    working directory.

    Args:
        pdf: The PDF payload, passed unchanged to ``GParse_Paper``.

    Returns:
        The combined HTML document as a string.
    """
    article = GParse_Paper(pdf)
    soup = BeautifulSoup(article, "xml")
    try:
        document_title = soup.find("fileDesc").find("title").text
    except AttributeError:
        # find() returned None: the document has no fileDesc/title element.
        document_title = ""
    html_top = Get_HTMLTop(document_title)
    sections_list, sections_content, citation_modals = _build_sections(soup)
    controls = Get_Controls()
    navigation = Get_Navigation(controls, sections_list)
    # NOTE(review): the original referenced an undefined name `style`;
    # embedding the app stylesheet in a <style> tag is the presumed intent.
    style = f"<style>{app_style}</style>"
    # Combine all parts into final HTML
    html = (
        style
        + "<body><article>"
        + html_top
        + sections_content
        + navigation
        + "\n".join(citation_modals)
        + "</article></body>"
    )
    with open("article_demo.html", "w", encoding="utf-8") as f:
        f.write(html)
    return html
# Reactive holder for the HTML shown on the page; starts as a heading banner.
html = solara.reactive("<h1> Article PDF to HTML converter </h1>")


@solara.component
def Page():
    """Render the converter page.

    Applies the app stylesheet, shows a file-drop area, and displays the
    current value of the module-level ``html`` reactive. Dropping a file
    replaces that value with the output of ``Get_Article_HTML``.
    """
    solara.Style(app_style)

    def convert_dropped_file(dropped: FileInfo):
        # Read the whole dropped file and publish the converted article.
        pdf_bytes = dropped["file_obj"].read()
        html.value = Get_Article_HTML(pdf_bytes)

    solara.FileDrop(
        label="Drag and drop article pdf",
        on_file=convert_dropped_file,
        lazy=True,
    )
    solara.HTML(unsafe_innerHTML=html.value)