File size: 1,435 Bytes
34ecf31
 
 
 
 
600ab03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from fasthtml.common import *
from fasthtml.components import *


def web_data():
    """Build the introductory section for the web-data page.

    The section consists of a green highlighted box holding two reference
    links (the raw documentation folder and the pipeline's GitHub
    repository), followed by a paragraph summarizing the five processing
    steps applied to Common Crawl data.

    Returns:
        The outer ``Div`` wrapping the link box and the summary paragraph.
    """
    # (label, target) pairs rendered as list entries, in display order.
    reference_links = (
        (
            "Raw Documentation",
            "https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
        ),
        (
            "Github link of Web Data Pipeline",
            "https://github.com/CIAI-LLM/WebDataProcessing.git",
        ),
    )

    # Inline CSS for the link box. The literal's whitespace is kept exactly
    # as it is passed into the style attribute.
    link_box_css = """
            background-color: #d4edda; /* Light green background */
            padding: 15px;
            border: 1px solid #c3e6cb; /* Green border */
            border-radius: 5px;
            margin-bottom: 20px;
        """

    link_items = [Li(A(label, href=target)) for label, target in reference_links]
    link_box = Div(Ul(*link_items), style=link_box_css)

    summary = P(
        "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
        A("Common Crawl", href="https://commoncrawl.org/"),
        ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
    )
    summary_box = Div(summary, style="margin-top: 20px;")

    return Div(link_box, summary_box)