omkarenator committed on
Commit
600ab03
1 Parent(s): 5a0e586

start web data port

Browse files
Files changed (2) hide show
  1. style.css +4 -0
  2. web.py +33 -2
style.css CHANGED
@@ -264,3 +264,7 @@ d-contents nav > div > a:hover,
264
  d-contents nav > ul > li > a:hover {
265
  text-decoration: none;
266
  }
 
 
 
 
 
264
  d-contents nav > ul > li > a:hover {
265
  text-decoration: none;
266
  }
267
+
268
+ .hljs {
269
+ background: rgb(255, 255, 255) !important;
270
+ }
web.py CHANGED
@@ -3,5 +3,36 @@ from fasthtml.components import *
3
 
4
 
5
def web_data():
    """Render the (placeholder) Web Data section as a single wrapper Div."""
    section = Section(H2(P("Web Data")), id="inner-text")
    return Div(section)
7
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
def web_data():
    """Render the Web Data section.

    Returns a Div containing two child Divs:
      1. a green call-out box listing external resource links
         (raw documentation and the pipeline's GitHub repository);
      2. an introductory paragraph describing the web-data
         processing pipeline, linking to Common Crawl.
    """
    # Call-out box with the two external resource links.
    resource_links = Div(
        Ul(
            Li(
                A(
                    "Raw Documentation",
                    href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
                )
            ),
            Li(
                A(
                    "Github link of Web Data Pipeline",
                    href="https://github.com/CIAI-LLM/WebDataProcessing.git",
                )
            ),
        ),
        style="""
        background-color: #d4edda; /* Light green background */
        padding: 15px;
        border: 1px solid #c3e6cb; /* Green border */
        border-radius: 5px;
        margin-bottom: 20px;
        """,
    )

    # Introductory paragraph summarizing the pipeline steps.
    intro = Div(
        P(
            "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
            A("Common Crawl", href="https://commoncrawl.org/"),
            ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
        ),
        style="margin-top: 20px;",
    )

    return Div(resource_links, intro)