victormiller committed on
Commit
2c39f2b
·
verified ·
1 Parent(s): 81bacff

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +60 -0
curated.py CHANGED
@@ -436,6 +436,35 @@ s2o_filter = pd.DataFrame(
436
  table_html_s2o = s2o_filter.to_html(index=False, border=0)
437
  table_div_s2o = Div(NotStr(table_html_s2o))
438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  med_filter = pd.DataFrame(
440
  {
441
  "Dataset": [
@@ -465,6 +494,35 @@ med_filter = pd.DataFrame(
465
  table_html_med = med_filter.to_html(index=False, border=0)
466
  table_div_med = Div(NotStr(table_html_med))
467
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  phil_filter = pd.DataFrame(
469
  {
470
  "Dataset": [
@@ -855,6 +913,7 @@ filtering_process = Div(
855
  style="margin-bottom: -3px",
856
  ),
857
  ),
 
858
  #Details(
859
  # Summary("S2ORC Abstract Filtering Examples "),
860
  # Div(
@@ -914,6 +973,7 @@ filtering_process = Div(
914
  ),
915
  ),
916
  table_div_med,
 
917
  Details(
918
  Summary("PubMed Filtering Examples"),
919
  Div(
 
436
  table_html_s2o = s2o_filter.to_html(index=False, border=0)
437
  table_div_s2o = Div(NotStr(table_html_s2o))
438
 
439
+ s2oa_filter = pd.DataFrame(
440
+ {
441
+ "Dataset": [
442
+ "S2ORC Abstract",
443
+ ],
444
+ "Lines Downloaded": [
445
+ "102324176",
446
+ ],
447
+ "Percent Removed After Language Filter": [
448
+ "18.04%",
449
+ ],
450
+ "Percent Removed After Min Word Count Filter": [
451
+ "1.17%",
452
+ ],
453
+ "Percent Removed After Unigram Probability Filter": [
454
+ "0.00%",
455
+ ],
456
+ "Percent Removed After Local Dedup": [
457
+ "0.13%",
458
+ ],
459
+ "Total Percentage Remaining": [
460
+ "80.66%",
461
+ ],
462
+ }
463
+ )
464
+
465
+ table_html_s2oa = s2oa_filter.to_html(index=False, border=0)
466
+ table_div_s2oa = Div(NotStr(table_html_s2oa))
467
+
468
  med_filter = pd.DataFrame(
469
  {
470
  "Dataset": [
 
494
  table_html_med = med_filter.to_html(index=False, border=0)
495
  table_div_med = Div(NotStr(table_html_med))
496
 
497
+ pma_filter = pd.DataFrame(
498
+ {
499
+ "Dataset": [
500
+ "PubMed - Abstract",
501
+ ],
502
+ "Lines Downloaded": [
503
+ "25787474",
504
+ ],
505
+ "Percent Removed After Language Filter": [
506
+ "0.01%",
507
+ ],
508
+ "Percent Removed After Min Word Count Filter": [
509
+ "0.14%",
510
+ ],
511
+ "Percent Removed After Unigram Probability Filter": [
512
+ "0.00%",
513
+ ],
514
+ "Percent Removed After Local Dedup": [
515
+ "0.00%",
516
+ ],
517
+ "Total Percentage Remaining": [
518
+ "98.85%",
519
+ ],
520
+ }
521
+ )
522
+
523
+ table_html_pma = pma_filter.to_html(index=False, border=0)
524
+ table_div_pma = Div(NotStr(table_html_pma))
525
+
526
  phil_filter = pd.DataFrame(
527
  {
528
  "Dataset": [
 
913
  style="margin-bottom: -3px",
914
  ),
915
  ),
916
+ table_div_s2oa,
917
  #Details(
918
  # Summary("S2ORC Abstract Filtering Examples "),
919
  # Div(
 
973
  ),
974
  ),
975
  table_div_med,
976
+ table_div_pma,
977
  Details(
978
  Summary("PubMed Filtering Examples"),
979
  Div(