"""Solara page that turns a dropped article PDF into one HTML string.

The uploaded bytes are handed to ``GParse_Paper``; its result is parsed as XML
and rendered section by section, followed by a navigation block and one
details block per resolved citation.
"""
from html import escape
from pathlib import Path
import sys

HERE = Path(__file__).parent
# Only strings are honoured on sys.path; a Path object would be ignored.
sys.path.append(str(HERE))

from bs4 import BeautifulSoup, NavigableString, Tag
import solara
from solara.components.file_drop import FileInfo

from utils import GParse_Paper, Get_Bibliography

try:
    from utils import normalize_section
except ImportError:
    # NOTE(review): normalize_section was called but never imported or defined
    # in this module. This fallback only trims whitespace and trailing dots so
    # that counting "." in Get_Navigation reflects nesting depth -- confirm the
    # intended behaviour against the project's own implementation.
    def normalize_section(section_number):
        """Return *section_number* without surrounding blanks or trailing dots."""
        return str(section_number).strip().rstrip(".")


app_style = (HERE / "style.css").read_text()

# NOTE(review): every template below consists of text and newlines only. If
# markup is expected around the interpolated values, it has to be restored.


def _esc(value):
    """Return ``str(value)`` with HTML-special characters escaped."""
    return escape(str(value))


def Get_HTMLTop(title):
    """Return the top part of the page, which carries the document title.

    Args:
        title: Document title; escaped before being inserted.
    """
    return f"\n\n{_esc(title)}\n\n"


def Get_Controls():
    """Return the controls fragment that Get_Navigation places first."""
    controls = " "
    return controls


def _render_citation_modal(label, cit_info):
    """Return the details block for one citation.

    Args:
        label: Already-escaped in-text label of the reference.
        cit_info: Bibliography entry with the keys ``title``, ``authors``
            (joined with ", "), ``year``, ``journal`` and ``doi``.
    """
    authors = ", ".join(cit_info["authors"])
    return f"""
{label}
Title: {_esc(cit_info['title'])}
Authors: {_esc(authors)}
Year: {_esc(cit_info['year'])}
Journal: {_esc(cit_info['journal'])}
DOI: {_esc(cit_info['doi'])}
"""


def _render_paragraph(paragraph, bib, citation_modals):
    """Return the escaped text of one ``<p>`` and collect its citations.

    Every child contributes its text. A ``<ref>`` whose ``target`` (minus the
    leading "#") is a key of *bib* additionally appends a details block to
    *citation_modals*.
    """
    pieces = []
    for element in paragraph.contents:
        if not isinstance(element, Tag):
            # Plain text node (NavigableString or a subclass of it).
            pieces.append(_esc(element))
            continue

        label = _esc(element.text)
        # Tags that are not resolvable citations still keep their text, so no
        # part of the paragraph is silently dropped.
        pieces.append(label)

        target = element.get("target") if element.name == "ref" else None
        if target is None:
            continue
        ref_id = target.lstrip("#")
        if ref_id in bib:
            citation_modals.append(_render_citation_modal(label, bib[ref_id]))
    return "".join(pieces)


def _build_sections(soup):
    """Render every ``<div>`` of *soup*.

    Returns:
        A tuple ``(sections_list, sections_content, citation_modals)``:
        the navigation entries (dicts with ``num`` and ``text``), the rendered
        sections as one string, and the list of citation details blocks.
    """
    bib = Get_Bibliography(soup)
    sections_list = []
    citation_modals = []
    chunks = []

    for div in soup.find_all("div"):
        header = div.find("head")
        if header is not None:
            section_number = header.get("n", "")
            section_id = header.text.replace(" ", "_")
            sections_list.append(
                {"num": normalize_section(section_number), "text": section_id}
            )
            chunks.append("\n")
            chunks.append(f"\n\n{_esc(section_number)} {_esc(header.text)}\n\n")
        else:
            chunks.append("\n")

        for paragraph in div.find_all("p"):
            body = _render_paragraph(paragraph, bib, citation_modals)
            chunks.append(f"\n\n{body}\n\n")

        chunks.append("\n")

    return sections_list, "".join(chunks), citation_modals


def Get_Sections(soup):
    """Generate sections from the ``<div>`` elements of *soup*.

    Returns:
        ``(sections_list, sections_content)`` -- the navigation entries and
        the rendered sections. Text taken from the document is escaped.
    """
    sections_list, sections_content, _ = _build_sections(soup)
    return sections_list, sections_content


def Get_Navigation(controls, sections_list=()):
    """Generate the navigation block for the given sections.

    Args:
        controls: Fragment placed before the "Navigation" heading.
        sections_list: Entries as produced by Get_Sections, each a dict with
            the keys ``num`` and ``text``.

    Returns:
        The navigation block as one string.
    """
    parts = ["\n" + controls + "\n\nNavigation\n\n"]
    for section in sections_list:
        depth = section["num"].count(".")
        # 20px per nesting level; adjust the multiplier for desired tab width.
        # NOTE(review): this offset is computed but not interpolated into the
        # entry below -- confirm where it is meant to be applied.
        left = f"{20 * depth}px"
        parts.append(f'\n\n{_esc(section["num"])} {_esc(section["text"])}\n\n')
    parts.append("\n")
    return "".join(parts)


def Get_Article_HTML(pdf):
    """Convert an article PDF into the page string and save a copy of it.

    Args:
        pdf: Content of the uploaded file, passed on to ``GParse_Paper``.

    Returns:
        The assembled page. The same text is also written to
        ``article_demo.html`` in the current working directory.
    """
    article = GParse_Paper(pdf)
    soup = BeautifulSoup(article, "xml")

    try:
        document_title = soup.find("fileDesc").find("title").text
    except AttributeError:
        # find() returned None: the document carries no fileDesc/title.
        document_title = ""

    html_top = Get_HTMLTop(document_title)
    sections_list, sections_content, citation_modals = _build_sections(soup)
    controls = Get_Controls()
    navigation = Get_Navigation(controls, sections_list)

    # Combine all parts into the final page.
    # NOTE(review): the original referenced an undefined name `style` here; the
    # stylesheet text loaded above is used instead -- confirm whether it needs
    # to be wrapped in a style element.
    page = (
        app_style
        + "\n"
        + html_top
        + sections_content
        + navigation
        + "\n".join(citation_modals)
        + "\n"
    )

    with open("article_demo.html", "w", encoding="utf-8") as f:
        f.write(page)
    return page


# Page content shown by the app; replaced once a file has been converted.
html = solara.reactive("\n\nArticle PDF to HTML converter\n\n")


@solara.component
def Page():
    """Render the drop zone and the most recently converted article."""
    solara.Style(app_style)

    def on_file(f: FileInfo):
        # lazy=True below means the content is read from the file object here.
        html.value = Get_Article_HTML(f["file_obj"].read())

    solara.FileDrop(label="Drag and drop article pdf", on_file=on_file, lazy=True)
    solara.HTML(unsafe_innerHTML=html.value)