teven committed on
Commit
c8f45af
1 Parent(s): 96e0b3b

better description, flagged words

Files changed (2)
  1. app.py +20 -20
  2. en_examples_with_stats_ldnoob.json +2 -2
app.py CHANGED
@@ -30,51 +30,51 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
 
     if "special_%" in columns:
         special_ratio = st.sidebar.slider(
-            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by special characters ratio", 0.0, 50.0, 0.0, step=0.1
         )
         cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
         special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with <{special_cutoff:.1f}% special chars")
+        st.sidebar.text(f"No docs with <{special_cutoff:.1f}% special chars")
         keys.append(("special_%", special_cutoff, True))
 
     if "stop_%" in columns:
         stop_ratio = st.sidebar.slider(
-            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by stop word ratio", 0.0, 50.0, 0.0, step=0.1
         )
         cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
         stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
-        st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
+        st.sidebar.text(f"No docs with >{stop_cutoff:.2f}% stop words")
         keys.append(("stop_%", stop_cutoff, False))
 
     @st.cache(suppress_st_warning=True)
-    def recalculate_bad_words(file):
+    def recalculate_flagged_words(file):
 
-        def bad_word_ratio(text: str, bad_word_list):
-            return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())
+        def flagged_word_ratio(text: str, flagged_word_list):
+            return len([word for word in text.split() if word.lower().strip() in flagged_word_list]) / len(text.split())
 
-        bad_word_list = [word.decode().strip() for word in file.readlines()]
+        flagged_word_list = [word.decode().strip() for word in file.readlines()]
 
-        bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
-        data["bad_%"] = bad_word_ratios
+        flagged_word_ratios = [flagged_word_ratio(text, flagged_word_list) * 100 for text in data["text"]]
+        data["flagged_%"] = flagged_word_ratios
 
-    bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
+    flagged_word_file = st.sidebar.file_uploader("Upload your own list of flagged words (1 word per line)")
 
-    if "bad_%" in columns:
-        bad_ratio = st.sidebar.slider(
-            "% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
+    if "flagged_%" in columns:
+        flagged_ratio = st.sidebar.slider(
+            "% filtered by flaggedwords ratio", 0.0, 50.0, 0.0, step=0.1
         )
-        bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
-        bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
-        st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
-        keys.append(("bad_%", bad_cutoff, True))
+        flagged_index = max(0, math.floor((100 - flagged_ratio) * len(data.index) / 100) - 1)
+        flagged_cutoff = np.partition(data["flagged_%"], flagged_index)[flagged_index]
+        st.sidebar.text(f"No docs with >{flagged_cutoff:.2f}% flagged words")
+        keys.append(("flagged_%", flagged_cutoff, True))
 
     if "perplexity" in columns:
         ppl_ratio = st.sidebar.slider(
-            "% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
+            "% filtered by perplexity", 0.0, 50.0, 0.0, step=0.1
        )
         ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
         ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
-        st.sidebar.text(f"Kept text with <{ppl_cutoff:.0f} perplexity")
+        st.sidebar.text(f"No docs with >{ppl_cutoff:.0f} perplexity")
         keys.append(("perplexity", ppl_cutoff, True))
 
     cond = [
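
Each filter branch in the diff follows the same percentile pattern: the slider picks what fraction of documents to discard, and np.partition finds the corresponding threshold value without fully sorting the column. A minimal standalone sketch of that logic, where sample_pcts and ratio are illustrative stand-ins rather than names from the app:

import math
import numpy as np

# Hypothetical per-document percentages; the app reads these from
# columns such as data["flagged_%"] instead.
sample_pcts = np.array([0.0, 1.2, 3.4, 0.5, 12.0, 7.7, 2.1, 0.0, 5.5, 9.9])
ratio = 20.0  # slider value: discard the worst 20% of documents

# Keeping (100 - ratio)% of n documents puts the cutoff at sorted position
# floor((100 - ratio) * n / 100) - 1, clamped to 0.
cutoff_index = max(0, math.floor((100 - ratio) * len(sample_pcts) / 100) - 1)

# np.partition places the cutoff_index-th smallest value at that index in
# O(n), cheaper than a full sort when only one order statistic is needed.
cutoff = np.partition(sample_pcts, cutoff_index)[cutoff_index]

kept = sample_pcts[sample_pcts <= cutoff]
print(f"cutoff = {cutoff:.2f}%, keeping {len(kept)}/{len(sample_pcts)} docs")  # 7.70%, 8/10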
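
The renamed recalculation helper works on the raw upload: st.sidebar.file_uploader returns a binary buffer, so each line is decoded before membership testing against lowercased, whitespace-split tokens. A self-contained sketch of the same ratio computation, with io.BytesIO standing in for the uploaded file and made-up sample words:

import io

def flagged_word_ratio(text: str, flagged_word_list):
    # Share of whitespace-split tokens whose lowercased form is in the list.
    # Mirrors the app's helper, including its assumption of non-empty text.
    words = text.split()
    return len([w for w in words if w.lower().strip() in flagged_word_list]) / len(words)

# io.BytesIO mimics the binary buffer returned by st.sidebar.file_uploader.
uploaded = io.BytesIO(b"foo\nbar\n")
flagged_word_list = [line.decode().strip() for line in uploaded.readlines()]

print(flagged_word_ratio("Foo went to the bar", flagged_word_list) * 100)  # 40.0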
en_examples_with_stats_ldnoob.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f82e7ebdbb2054c2f6dbfab70ce589e0972fa61fd7e08b778e848e07537c4e1
-size 21187243
+oid sha256:9e4e2a111df4e1a3243d53c9516baf8a3f495f8faec5b86fe8787bc6dc2a03bc
+size 21206447