Spaces:
Sleeping
Sleeping
from fasthtml.common import * | |
from fasthtml.components import * | |
def web_data():
    """Build the introductory section of the web-data page.

    The returned tree has two children, in order:

    1. A green highlighted box holding a bullet list of two external links
       (the raw documentation folder and the pipeline's GitHub repository).
    2. An overview paragraph naming the five processing steps applied to
       Common Crawl data.

    Returns:
        The outer ``Div`` component wrapping both children. ``Div``, ``Ul``,
        ``Li``, ``A`` and ``P`` are expected to come from the file's
        ``fasthtml`` star imports.
    """
    resource_links = Ul(
        Li(
            A(
                "Raw Documentation",
                href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
            )
        ),
        Li(
            A(
                "Github link of Web Data Pipeline",
                href="https://github.com/CIAI-LLM/WebDataProcessing.git",
            )
        ),
    )

    # Inline CSS for the highlighted links box. The value must contain only
    # CSS declarations; stray non-CSS characters here would invalidate them.
    links_box_style = """
        background-color: #d4edda; /* Light green background */
        padding: 15px;
        border: 1px solid #c3e6cb; /* Green border */
        border-radius: 5px;
        margin-bottom: 20px;
    """
    links_box = Div(resource_links, style=links_box_style)

    # The paragraph mixes plain text with an inline link, so the pieces are
    # passed as separate positional children in reading order.
    overview = Div(
        P(
            "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
            A("Common Crawl", href="https://commoncrawl.org/"),
            ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
        ),
        style="margin-top: 20px;",
    )

    return Div(links_box, overview)