omwdataset

Runtime error

App Files Files Community

hunterhector committed on Oct 5, 2024

Commit

ddc7526

1 Parent(s): a1ddc25

fix sankey

Browse files

Files changed (2) hide show

main.py +17 -17
web.py +72 -19

main.py CHANGED Viewed

@@ -52,7 +52,7 @@ front_matter = {
         },
         {
             "author": "Nikhil Ranjan",
-            "authorURL": "https://huggingface.co/NikhilRanjan",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
@@ -64,56 +64,56 @@ front_matter = {
         },
         {
             "author": "Zhen Wang",
-            "authorURL": "https://huggingface.co/ZhenWang",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "An Li",
-            "authorURL": "https://huggingface.co/AnLi",
-            "affiliation": "",
             "affiliationURL": "",
         },
         {
             "author": "Zhoujun Cheng",
-            "authorURL": "https://huggingface.co/ZhoujunCheng",
-            "affiliation": "",
             "affiliationURL": "",
         },
         {
             "author": "Suqi Sun",
-            "authorURL": "https://huggingface.co/SuqiSun",
             "affiliation": "Petuum, Inc.",
             "affiliationURL": "",
         },
         {
             "author": "Cun Mu",
-            "authorURL": "https://huggingface.co/CunMu",
-            "affiliation": "",
             "affiliationURL": "",
         },
         {
             "author": "Victor Miller",
-            "authorURL": "https://huggingface.co/VictorMiller",
-            "affiliation": "",
             "affiliationURL": "",
         },
         {
             "author": "Yue Peng",
-            "authorURL": "https://huggingface.co/YuePeng",
-            "affiliation": "",
             "affiliationURL": "",
         },
         {
             "author": "Eric P. Xing",
-            "authorURL": "https://huggingface.co/EricXing",
-            "affiliation": "MBZUAI & CMU",
             "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
         },
         {
             "author": "Zhengzhong Liu",
-            "authorURL": "https://huggingface.co/ZhengzhongLiu",
-            "affiliation": "",
             "affiliationURL": "",
         },
     ],

         },
         {
             "author": "Nikhil Ranjan",
+            "authorURL": "https://huggingface.co/nikhilranjan",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         },
         {
             "author": "Zhen Wang",
+            "authorURL": "",
             "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "An Li",
+            "authorURL": "https://huggingface.co/an1118",
+            "affiliation": "UCSD",
             "affiliationURL": "",
         },
         {
             "author": "Zhoujun Cheng",
+            "authorURL": "https://huggingface.co/zhoujun",
+            "affiliation": "UCSD",
             "affiliationURL": "",
         },
         {
             "author": "Suqi Sun",
+            "authorURL": "https://huggingface.co/mylibrar",
             "affiliation": "Petuum, Inc.",
             "affiliationURL": "",
         },
         {
             "author": "Cun Mu",
+            "authorURL": "https://huggingface.co/CarisMu",
+            "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "Victor Miller",
+            "authorURL": "https://huggingface.co/vamiller12",
+            "affiliation": "Petuum, Inc.",
             "affiliationURL": "",
         },
         {
             "author": "Yue Peng",
+            "authorURL": "https://huggingface.co/Dreamever",
+            "affiliation": "MBZUAI",
             "affiliationURL": "",
         },
         {
             "author": "Eric P. Xing",
+            "authorURL": "",
+            "affiliation": "MBZUAI",
             "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
         },
         {
             "author": "Zhengzhong Liu",
+            "authorURL": "https://huggingface.co/hunterhector",
+            "affiliation": "Petuum, Inc. / MBZUAI ",
             "affiliationURL": "",
         },
     ],

web.py CHANGED Viewed

@@ -248,7 +248,7 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
 # Plot the distribution sankey.
 # The filtering percentages
-web_filtering_percentages = [
     100,
     96.98,
     43.84,
@@ -264,13 +264,13 @@ web_filtering_percentages = [
 web_filtering_steps = [
     "Common Crawl",
     "Text Extraction",
-    "Language Identification",
     "URL Filtering",
     "Repetition Removal",
-    "Document-wise Filtering",
-    "Line-wise Corrections",
-    "Local Exact Deduplication",
-    "Global Fuzzy Deduplication",
 ]
 step_colors = [
@@ -285,6 +285,8 @@ step_colors = [
     '#1f773c',   # Lightest green added at the end
 ]
 def add_opacity(hex_color, opacity):
     # Remove '#' if present
     hex_color = hex_color.lstrip('#')
@@ -293,29 +295,80 @@ def add_opacity(hex_color, opacity):
     # Add the opacity value
     return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
-# Concatenate the percentage to each label
-labels_with_percentages = [f"{label} ({percentage}%)" for label, percentage in zip(web_filtering_steps, web_filtering_percentages)]
 filtering_sankey_fig = go.Figure(go.Sankey(
     node=dict(
-        label=labels_with_percentages,
-        color=[add_opacity(c, 0.8) for c in step_colors[:9]] ,
         pad=15,  # Adjust padding between nodes
         thickness=30,
     ),
     link=dict(
-        source=list(range(0,8)),  # Each source is the previous step
-        target=list(range(1,9)),  # Each target is the next step
-        value=web_filtering_percentages,
-        color=[add_opacity(c, 0.5) for c in step_colors[:8]]  # Match the link colors to the source node
     )
 ))
 filtering_sankey_fig.update_layout(
-    title_text="Web Data Filtering Process",
-    font_size=10,
     margin=dict(l=0, r=0, t=40, b=0)
 )
@@ -345,10 +398,10 @@ def web_data():
         P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering processing that should be consider for any filtering project. However, we are leaving this to future work."),
         table_div_qf_filter_data,
         P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
-        Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
         # The sankey diagram of the filtering percentage
         plotly2fasthtml(filtering_sankey_fig),
-        P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
         id="section2",),
         Section(
         H2("Document Preparation"),

 # Plot the distribution sankey.
 # The filtering percentages
+web_remaining_percent = [
     100,
     96.98,
     43.84,
 web_filtering_steps = [
     "Common Crawl",
     "Text Extraction",
+    "Language ID",
     "URL Filtering",
     "Repetition Removal",
+    "Document Filtering",
+    "Line Corrections",
+    "Local Exact Dedup",
+    "Global Fuzzy Dedup",
 ]
 step_colors = [
     '#1f773c',   # Lightest green added at the end
 ]
+grey_color = "#d3d3d3"
 def add_opacity(hex_color, opacity):
     # Remove '#' if present
     hex_color = hex_color.lstrip('#')
     # Add the opacity value
     return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
+# Create a list for all the node labels, colors, and values
+node_labels = []
+node_colors = []
+# Create source and target for links
+source = []
+target = []
+link_colors = []
+link_values = []
+# For each step, we have two nodes: remaining and filtered
+for i, label in enumerate(web_filtering_steps):
+    node_labels.append(f"{label} ({web_remaining_percent[i]}%)")
+    node_colors.append(add_opacity(step_colors[i], 0.85))
+    if i > 0:
+        # Nothing is filtered at step 0; every later step also gets a grey node labelled with the cumulative filtered-out percentage.
+        node_labels.append(f"{100 - web_remaining_percent[i]:.2f}%")
+        node_colors.append(grey_color)
+        # From the previous remaining part to the current remaining part.
+        if i == 1:
+            # Nothing got filtered before step 1.
+            prev_remain_idx = 0
+            curr_remain_idx = 1
+            curr_filtered_idx = 2
+        else:
+            prev_remain_idx = 2 * i - 3
+            prev_filtered_idx = 2 * i - 2
+            curr_remain_idx = 2 * i - 1
+            curr_filtered_idx = 2 * i
+        # Previous remaining -> current remaining
+        source.append(prev_remain_idx)
+        target.append(curr_remain_idx)
+        link_colors.append(add_opacity(step_colors[i-1], 0.7))
+        link_values.append(web_remaining_percent[i])
+        # Previous remaining -> current filtered
+        source.append(prev_remain_idx)
+        target.append(curr_filtered_idx)
+        link_colors.append(add_opacity(step_colors[i-1], 0.5))
+        link_values.append(web_remaining_percent[i-1] - web_remaining_percent[i])
+        if i > 1:
+            # From step 2 onward, carry the already-filtered share forward: previous filtered -> current filtered
+            source.append(prev_filtered_idx)
+            target.append(curr_filtered_idx)
+            link_colors.append(grey_color)
+            link_values.append(100 - web_remaining_percent[i - 1])
 filtering_sankey_fig = go.Figure(go.Sankey(
     node=dict(
+        label=node_labels,
+        color=node_colors,
         pad=15,  # Adjust padding between nodes
         thickness=30,
     ),
     link=dict(
+        source=source,  # Source from remaining
+        target=target,  # Target to filtered
+        value=link_values,  # Interleaved remaining and filtered values
+        color=link_colors
     )
 ))
 filtering_sankey_fig.update_layout(
+    title_text="Web Data Filtering Percentage",
+    title_x=0.5,  # Centers the title
+    title_font=dict(
+        family="Arial, sans-serif",  # Font family
+        size=18,  # Font size
+    ),
+    font_size=8,
     margin=dict(l=0, r=0, t=40, b=0)
 )
         P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering processing that should be consider for any filtering project. However, we are leaving this to future work."),
         table_div_qf_filter_data,
         P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
+        # Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
         # The sankey diagram of the filtering percentage
         plotly2fasthtml(filtering_sankey_fig),
+        P("A significant portion of the documents is filtered after the whole process. This figure illustrates the percentage of documents filtered at each step. The grey bars represent the filtered documents. The statistics are largely consistent with prior work (e.g., RefinedWeb) across most steps, though we have incorporated some custom filtering steps."),
         id="section2",),
         Section(
         H2("Document Preparation"),