Spaces:
Sleeping
Sleeping
omkarenator
committed on
Commit
•
600ab03
1
Parent(s):
5a0e586
start web data port
Browse files
style.css
CHANGED
@@ -264,3 +264,7 @@ d-contents nav > div > a:hover,
|
|
264 |
d-contents nav > ul > li > a:hover {
|
265 |
text-decoration: none;
|
266 |
}
|
|
|
|
|
|
|
|
|
|
264 |
d-contents nav > ul > li > a:hover {
|
265 |
text-decoration: none;
|
266 |
}
|
267 |
+
|
268 |
+
.hljs {
|
269 |
+
background: rgb(255, 255, 255) !important;
|
270 |
+
}
|
web.py
CHANGED
@@ -3,5 +3,36 @@ from fasthtml.components import *
|
|
3 |
|
4 |
|
5 |
def web_data():
|
6 |
-
return Div(
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
|
5 |
def web_data():
    """Build the web-data section of the page.

    The section is a wrapper ``Div`` with two children:

    1. a green-tinted box holding a bullet list of resource links
       (the raw documentation folder and the pipeline repository), and
    2. an introductory paragraph summarizing the processing steps,
       with an inline link to Common Crawl.

    Returns:
        The wrapper ``Div`` component containing both children.
    """
    # Resource links shown in the highlighted box at the top of the section.
    raw_docs_item = Li(
        A(
            "Raw Documentation",
            href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
        )
    )
    pipeline_repo_item = Li(
        A(
            "Github link of Web Data Pipeline",
            href="https://github.com/CIAI-LLM/WebDataProcessing.git",
        )
    )
    resource_box = Div(
        Ul(
            raw_docs_item,
            pipeline_repo_item,
        ),
        style="""
        background-color: #d4edda; /* Light green background */
        padding: 15px;
        border: 1px solid #c3e6cb; /* Green border */
        border-radius: 5px;
        margin-bottom: 20px;
        """,
    )

    # Overview paragraph: text, inline Common Crawl link, then the remaining text.
    overview_paragraph = P(
        "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
        A("Common Crawl", href="https://commoncrawl.org/"),
        ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
    )
    overview_box = Div(
        overview_paragraph,
        style="margin-top: 20px;",
    )

    return Div(resource_box, overview_box)
|