from fasthtml.common import *
from fasthtml.components import *
def web_data():
    """Build the introductory section for the web data page.

    The returned outer ``Div`` has two children, in this order:

    1. A box styled with a light green background and green border that
       holds an unordered list of two links (the raw documentation folder
       and the web data pipeline repository).
    2. A ``Div`` wrapping a paragraph that names Common Crawl as the starting
       point and lists the five main processing steps.

    Takes no arguments; all content is static.
    """
    # --- Resource links shown inside the highlighted box ------------------
    raw_docs_item = Li(
        A(
            "Raw Documentation",
            href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
        )
    )
    pipeline_repo_item = Li(
        A(
            "Github link of Web Data Pipeline",
            href="https://github.com/CIAI-LLM/WebDataProcessing.git",
        )
    )

    # Inline CSS for the box. The value begins and ends with a newline and
    # keeps the CSS comments; it is passed verbatim as the style attribute.
    links_box_css = (
        "\n"
        "background-color: #d4edda; /* Light green background */\n"
        "padding: 15px;\n"
        "border: 1px solid #c3e6cb; /* Green border */\n"
        "border-radius: 5px;\n"
        "margin-bottom: 20px;\n"
    )
    links_box = Div(
        Ul(raw_docs_item, pipeline_repo_item),
        style=links_box_css,
    )

    # --- Overview paragraph ------------------------------------------------
    # The link is a positional child between two text fragments, so the
    # trailing space before it and the leading comma after it are significant.
    common_crawl_link = A("Common Crawl", href="https://commoncrawl.org/")
    overview_paragraph = P(
        "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
        common_crawl_link,
        ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
    )
    overview = Div(overview_paragraph, style="margin-top: 20px;")

    return Div(links_box, overview)