victormiller committed on
Commit
5b83110
1 Parent(s): 7444772

Update web.py

Browse files
Files changed (1) hide show
  1. web.py +12 -12
web.py CHANGED
@@ -399,21 +399,21 @@ def web_data():
399
  ),
400
  P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics that can be used include:"),
401
  Ul(
402
- Li("the word count in the document", style = "margin-bottom: 3px"),
403
- Li("the mean word length", style = "margin-bottom: 3px"),
404
- Li("the number of sentences", style = "margin-bottom: 3px"),
405
- Li("the symbol-to-word ratio", style = "margin-bottom: 3px"),
406
- Li("the fraction of alphabetic words", style = "margin-bottom: 3px"),
407
- Li("and the number of stop words", style = "margin-bottom: 3px"),
408
  ),
409
  P("Specifically, we remove any document which satisfies any of the following criteria:"),
410
  Ul(
411
- Li("it contains less than 50 words or more than 100,000 words"),
412
- Li("its mean word length is outside the range of 3 to 10"),
413
- Li("it contains less than 3 sentences"),
414
- Li("its symbol-to-word ratio is greater than 0.1"),
415
- Li("the words that contain at least one alphabetic character are less than 80% of the whole words"),
416
- Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with"),
417
  ),
418
 
419
  P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
 
399
  ),
400
  P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics that can be used include:"),
401
  Ul(
402
+ Li("the word count in the document", style = "margin-bottom: 5px"),
403
+ Li("the mean word length", style = "margin-bottom: 5px"),
404
+ Li("the number of sentences", style = "margin-bottom: 5px"),
405
+ Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
406
+ Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
407
+ Li("and the number of stop words", style = "margin-bottom: 5px"),
408
  ),
409
  P("Specifically, we remove any document which satisfies any of the following criteria:"),
410
  Ul(
411
+ Li("it contains less than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
412
+ Li("its mean word length is outside the range of 3 to 10", style = "margin-bottom: 5px"),
413
+ Li("it contains less than 3 sentences", style = "margin-bottom: 5px"),
414
+ Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
415
+ Li("fewer than 80% of its words contain at least one alphabetic character", style = "margin-bottom: 5px"),
416
+ Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
417
  ),
418
 
419
  P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),