lhoestq HF staff committed on
Commit
e417e74
1 Parent(s): 905f549

stream on full warc

Browse files
app.py CHANGED
@@ -26,8 +26,6 @@ from datatrove.utils.typeshelper import Languages
26
 
27
  nltk.download('punkt_tab')
28
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
29
- default_output_docs_2k = pd.read_json(f"output_all-2k/base_processing/output/{DUMP_TO_PROCESS}/00000.jsonl.gz", compression="gzip", lines=True).to_dict(orient="records")
30
- default_output_docs_200 = pd.read_json(f"output_all-200/base_processing/output/{DUMP_TO_PROCESS}/00000.jsonl.gz", compression="gzip", lines=True).to_dict(orient="records")
31
 
32
  make_gallery_image_buttons_js = """
33
  function load() {
@@ -81,12 +79,6 @@ function load() {
81
  }
82
  """
83
  css = """
84
- tr:has(> td div span span div.diffInsertion) {
85
- background: darkgreen;
86
- }
87
- tr:has(> td div span span div.diffDeletion) {
88
- background: darkred;
89
- }
90
  tr td {
91
  border-top: 1px solid black;
92
  }
@@ -399,9 +391,10 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
399
 
400
  pipeline_executor = LocalPipelineExecutor(
401
  pipeline=[
402
- JsonlReader(data_folder=f"output_text_extraction-2k/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz"),
403
  partial(increment_num_warc_samples, num_warc_samples_per_doc=2000 / 1687)
404
  ] + steps_to_run[2:] + [
 
405
  lambda data, rank, world_size: map(output_docs.append, data)
406
  ],
407
  logging_dir="logs",
@@ -411,8 +404,9 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
411
  pipeline_executor = LocalPipelineExecutor(
412
  pipeline=[
413
  WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
414
- lambda data, rank, world_size: islice(data, num_warc_samples),
415
  ] + steps_to_run + [
 
416
  lambda data, rank, world_size: map(output_docs.append, data)
417
  ],
418
  logging_dir="logs",
@@ -445,12 +439,12 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
445
  output_tab: gr.Tab("Output (loading...)"),
446
  excluded_tab: gr.Tab("Excluded (loading...)"),
447
  **{
448
- excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
449
  for step_to_run in pipeline_executor.pipeline
450
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
451
  },
452
  **{
453
- excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (~{len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}% of data)")
454
  for step_to_run in pipeline_executor.pipeline
455
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
456
  },
@@ -470,6 +464,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
470
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
471
  },
472
  }
 
 
473
 
474
  if __name__ == "__main__":
475
  demo.launch()
 
26
 
27
  nltk.download('punkt_tab')
28
  DUMP_TO_PROCESS = "CC-MAIN-2023-50"
 
 
29
 
30
  make_gallery_image_buttons_js = """
31
  function load() {
 
79
  }
80
  """
81
  css = """
 
 
 
 
 
 
82
  tr td {
83
  border-top: 1px solid black;
84
  }
 
391
 
392
  pipeline_executor = LocalPipelineExecutor(
393
  pipeline=[
394
+ JsonlReader(data_folder=f"output_text_extraction-full/base_processing/output/{DUMP_TO_PROCESS}", glob_pattern="*.jsonl.gz"),
395
  partial(increment_num_warc_samples, num_warc_samples_per_doc=2000 / 1687)
396
  ] + steps_to_run[2:] + [
397
+ lambda data, rank, world_size: islice(data, 100),
398
  lambda data, rank, world_size: map(output_docs.append, data)
399
  ],
400
  logging_dir="logs",
 
404
  pipeline_executor = LocalPipelineExecutor(
405
  pipeline=[
406
  WarcReader(data_folder="data", glob_pattern="*.warc.gz"),
407
+ increment_num_warc_samples
408
  ] + steps_to_run + [
409
+ lambda data, rank, world_size: islice(data, 100),
410
  lambda data, rank, world_size: map(output_docs.append, data)
411
  ],
412
  logging_dir="logs",
 
439
  output_tab: gr.Tab("Output (loading...)"),
440
  excluded_tab: gr.Tab("Excluded (loading...)"),
441
  **{
442
+ excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": []})
443
  for step_to_run in pipeline_executor.pipeline
444
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
445
  },
446
  **{
447
+ excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__}")
448
  for step_to_run in pipeline_executor.pipeline
449
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
450
  },
 
464
  if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
465
  },
466
  }
467
+
468
+ stop_button.click(cancels=[view_pipeline_results])
469
 
470
  if __name__ == "__main__":
471
  demo.launch()
output_all-2k/base_processing/output/CC-MAIN-2023-50/00000.jsonl.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e32e8d71f3bcf050fe2d0a744b21e411c66b95ee4f88a4fa8eac5f6b459ce345
3
- size 299465
 
 
 
 
{output_all-200 → output_text_extraction-full}/base_processing/output/CC-MAIN-2023-50/00000.jsonl.gz RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7798c4f4cb22b19532ea4c5a58dbc94b72dd7627007a90dc0b2d83cf6ac56007
3
- size 28010
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7d0b9e9dd069a639f56c3ad81e92bdb053e983899cb7910f633af174f44903d
3
+ size 28830205