from fasthtml.common import *
from fasthtml.components import *


def web_data():
    """Build the introductory "web data" section of the page.

    The section is a single ``Div`` with two children:

    1. A green-styled box holding an unordered list of two resource links
       (the raw documentation folder and the pipeline's Git repository).
    2. A paragraph that introduces the processing pipeline, with an inline
       link to Common Crawl.

    Returns:
        The outer ``Div`` element wrapping both children.
    """
    # (label, URL) pairs rendered as one list item each, in this order.
    resources = (
        (
            "Raw Documentation",
            "https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
        ),
        (
            "Github link of Web Data Pipeline",
            "https://github.com/CIAI-LLM/WebDataProcessing.git",
        ),
    )
    link_items = [Li(A(label, href=url)) for label, url in resources]

    # Inline CSS for the link box. The adjacent literals concatenate to the
    # same single-line string, including its leading and trailing space.
    box_style = (
        " background-color: #d4edda; /* Light green background */"
        " padding: 15px;"
        " border: 1px solid #c3e6cb; /* Green border */"
        " border-radius: 5px;"
        " margin-bottom: 20px; "
    )
    link_box = Div(Ul(*link_items), style=box_style)

    # Intro paragraph: leading text, the Common Crawl link, trailing text.
    lead_text = (
        "To generate a high-quality dataset from large-scale webpages, "
        "we have investigated the processing steps used by the community "
        "and made our choices based on careful manual inspection. "
        "Starting from "
    )
    crawl_link = A("Common Crawl", href="https://commoncrawl.org/")
    tail_text = (
        ", our process can be summarized as five main steps: "
        "document preparation, line-level removal, document-level filtering, "
        "deduplication and PII removal."
    )
    overview = Div(
        P(lead_text, crawl_link, tail_text),
        style="margin-top: 20px;",
    )

    return Div(link_box, overview)