omwdataset

Runtime error

App Files Files Community

victormiller committed on Sep 27, 2024

Commit

7e7a96b

verified ·

1 Parent(s): 26832b9

Update web.py

Browse files

Files changed (1) hide show

web.py +34 -19

web.py CHANGED Viewed

@@ -731,23 +731,24 @@ def web_data():
             ),
         ),
         H4("3.3 Statistics-based Heuristics"),
-        P("""
-        We summarize other statistics-based rules originating from Gopher [2] in this section, which include:
-        - Word count in the document,
-        - Mean word length,
-        - Number of sentences,
-        - Symbol-to-word ratio,
-        - Fraction of alphabetic words,
-        - Number of stop words.
-        Specifically, we remove any document which meets any of the following criteria:
-        - Contains fewer than 50 words or more than 100,000 words
-        - Has a mean word length outside the range of 3 to 10 characters
-        - Contains fewer than 3 sentences
-        - Has a symbol-to-word ratio greater than 0.1
-        - Contains less than 80% alphabetic words
-        - Contains fewer than two of the following stop words: "the," "be," "to," "of," "and," "that," "have," "with"
-        """),
         H5("Word Count"),
         P("""
         Implementations from Dolma
@@ -809,7 +810,14 @@ def web_data():
         P("""
         The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
         """),
-        Img(),
         H5("Our Implementations"),
         Details(
             Summary("Sample documents that are filtered out by statistics-based heuristics"),
@@ -830,7 +838,14 @@ def web_data():
             DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
         ),
         H3("4. Deduplication"),
-        P("..."),  # Add detailed content and images as needed
         H3("5. PII Removal"),
         P("..."),  # Add detailed content and images as needed
         H2("Reference"),

             ),
         ),
         H4("3.3 Statistics-based Heuristics"),
+        P("We summarize other statistics-based rules originating from Gopher [7] in this section. The statistics that can be used include:"),
+        Ul(
+            Li("the word count in the document", style = "margin-bottom: 5px"),
+            Li("the mean word length", style = "margin-bottom: 5px"),
+            Li("the number of sentences", style = "margin-bottom: 5px"),
+            Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
+            Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
+            Li("and the number of stop words", style = "margin-bottom: 5px"),
+        ),
+        P("Specifically, we remove any document which satisfies any of the following criteria:"),
+        Ul(
+            Li("it contains fewer than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
+            Li("its mean word length is outside the range of 3 to 10 characters", style = "margin-bottom: 5px"),
+            Li("it contains fewer than 3 sentences", style = "margin-bottom: 5px"),
+            Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
+            Li("fewer than 80% of its words contain at least one alphabetic character", style = "margin-bottom: 5px"),
+            Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
+        ),
         H5("Word Count"),
         P("""
         Implementations from Dolma
         P("""
         The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
         """),
+        D_code("""
+        STOP_WORDS = ('the', 'be', 'to', 'of', 'and', 'that', 'have', 'with')
+        ...
+        stop_words_pattern = re.compile("|".join(re.escape(symbol) for symbol in STOP_WORDS))
+        ...
+        attrs.num_of_stop_words = sum(1 for word in words if stop_words_pattern.search(word))
+        """, block="block", language="python"),
         H5("Our Implementations"),
         Details(
             Summary("Sample documents that are filtered out by statistics-based heuristics"),
             DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
         ),
         H3("4. Deduplication"),
+        P("""
+        After careful filtering, although data quality has improved, a large fraction of the content is repeated across documents. This may be due to the crawler indirectly hitting the same page multiple times, to boilerplate content being repeated (e.g., licences), or even to plagiarism. These duplicates can strongly impact models, favoring memorization instead of generalization.
+        """),  # Add detailed content and images as needed
+        P("We perform two-level deduplication: local exact deduplication and global fuzzy deduplication"),
+        P(B("Local Exact Deduplication")),
+        P("To reduce the expensive cost of global deduplication, we apply a local exact deduplication before it. Specifically, each dump is split into 70 splits. A bloom filter is applied within each split."),
+        P(B("Global Fuzzy Deduplication")),
+        P("NEED TO UPDATE"),
         H3("5. PII Removal"),
         P("..."),  # Add detailed content and images as needed
         H2("Reference"),