omkarenator committed on
Commit
600ab03
1 Parent(s): 5a0e586

start web data port

Browse files
Files changed (2) hide show
  1. style.css +4 -0
  2. web.py +33 -2
style.css CHANGED
@@ -264,3 +264,7 @@ d-contents nav > div > a:hover,
264
  d-contents nav > ul > li > a:hover {
265
  text-decoration: none;
266
  }
 
 
 
 
 
264
  d-contents nav > ul > li > a:hover {
265
  text-decoration: none;
266
  }
267
+
268
+ .hljs {
269
+ background: rgb(255, 255, 255) !important;
270
+ }
web.py CHANGED
@@ -3,5 +3,36 @@ from fasthtml.components import *
3
 
4
 
5
def web_data():
    """Render the (placeholder) Web Data section as a single wrapper Div."""
    section = Section(H2(P("Web Data")), id="inner-text")
    return Div(section)
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
def web_data():
    """Render the Web Data section.

    Returns a Div containing two child Divs:
      1. a green call-out box listing external resource links
         (raw documentation and the pipeline's GitHub repository);
      2. an introductory paragraph describing the web-data
         processing pipeline, linking to Common Crawl.
    """
    # Call-out box with the two external resource links.
    resource_links = Div(
        Ul(
            Li(
                A(
                    "Raw Documentation",
                    href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
                )
            ),
            Li(
                A(
                    "Github link of Web Data Pipeline",
                    href="https://github.com/CIAI-LLM/WebDataProcessing.git",
                )
            ),
        ),
        style="""
        background-color: #d4edda; /* Light green background */
        padding: 15px;
        border: 1px solid #c3e6cb; /* Green border */
        border-radius: 5px;
        margin-bottom: 20px;
        """,
    )

    # Introductory paragraph summarizing the pipeline steps.
    intro = Div(
        P(
            "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
            A("Common Crawl", href="https://commoncrawl.org/"),
            ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
        ),
        style="margin-top: 20px;",
    )

    return Div(resource_links, intro)