Spaces:
Runtime error
Runtime error
better sliders
Browse files
app.py
CHANGED
@@ -7,7 +7,6 @@ import matplotlib.pyplot as plt
|
|
7 |
|
8 |
|
9 |
def visualization(path_data, lang, num_docs, num_docs_for_words):
|
10 |
-
|
11 |
with open(path_data) as json_file:
|
12 |
data = json.load(json_file)
|
13 |
|
@@ -29,10 +28,9 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
29 |
|
30 |
st.header("Filtering based on document content")
|
31 |
|
32 |
-
|
33 |
if "special_%" in columns:
|
34 |
special_ratio = st.sidebar.slider(
|
35 |
-
"% filtered by special characters ratio", 0.0,
|
36 |
)
|
37 |
cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
|
38 |
special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
|
@@ -41,16 +39,33 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
41 |
|
42 |
if "stop_%" in columns:
|
43 |
stop_ratio = st.sidebar.slider(
|
44 |
-
"% filtered by stop word ratio", 0.0,
|
45 |
)
|
46 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
47 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
48 |
st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
|
49 |
keys.append(("stop_%", stop_cutoff, False))
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
if "bad_%" in columns:
|
52 |
bad_ratio = st.sidebar.slider(
|
53 |
-
"% filtered by badwords ratio", 0.0,
|
54 |
)
|
55 |
bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
|
56 |
bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
|
@@ -59,7 +74,7 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
59 |
|
60 |
if "perplexity" in columns:
|
61 |
ppl_ratio = st.sidebar.slider(
|
62 |
-
"% filtered by perplexity", 0.0,
|
63 |
)
|
64 |
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
|
65 |
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
|
@@ -82,13 +97,13 @@ def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
82 |
st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
|
83 |
st.dataframe(data_keep)
|
84 |
|
85 |
-
def plot_hist(dataframe, key, num_bins=50):
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
|
93 |
# for key, _, _ in keys:
|
94 |
# plot_hist(data, key)
|
|
|
7 |
|
8 |
|
9 |
def visualization(path_data, lang, num_docs, num_docs_for_words):
|
|
|
10 |
with open(path_data) as json_file:
|
11 |
data = json.load(json_file)
|
12 |
|
|
|
28 |
|
29 |
st.header("Filtering based on document content")
|
30 |
|
|
|
31 |
if "special_%" in columns:
|
32 |
special_ratio = st.sidebar.slider(
|
33 |
+
"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
|
34 |
)
|
35 |
cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
|
36 |
special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
|
|
|
39 |
|
40 |
if "stop_%" in columns:
|
41 |
stop_ratio = st.sidebar.slider(
|
42 |
+
"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
|
43 |
)
|
44 |
cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
|
45 |
stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
|
46 |
st.sidebar.text(f"Kept text with >{stop_cutoff:.1f}% stop words")
|
47 |
keys.append(("stop_%", stop_cutoff, False))
|
48 |
|
49 |
+
# def recalculate_bad_words(file):
|
50 |
+
#
|
51 |
+
# def bad_word_ratio(text: str, bad_word_list):
|
52 |
+
# return sum(
|
53 |
+
# [text.count(bad_word.decode()) * len(bad_word.decode().split()) for bad_word in bad_word_list]) / len(
|
54 |
+
# text.split())
|
55 |
+
#
|
56 |
+
# bad_word_list = file.readlines()
|
57 |
+
#
|
58 |
+
# bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
|
59 |
+
# data["bad_%"] = bad_word_ratios
|
60 |
+
#
|
61 |
+
# bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
|
62 |
+
#
|
63 |
+
# if bad_word_file is not None:
|
64 |
+
# recalculate_bad_words(bad_word_file)
|
65 |
+
|
66 |
if "bad_%" in columns:
|
67 |
bad_ratio = st.sidebar.slider(
|
68 |
+
"% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
|
69 |
)
|
70 |
bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
|
71 |
bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
|
|
|
74 |
|
75 |
if "perplexity" in columns:
|
76 |
ppl_ratio = st.sidebar.slider(
|
77 |
+
"% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
|
78 |
)
|
79 |
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
|
80 |
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
|
|
|
97 |
st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
|
98 |
st.dataframe(data_keep)
|
99 |
|
100 |
+
# def plot_hist(dataframe, key, num_bins=50):
|
101 |
+
# st.subheader(" ".join(key.split("_")))
|
102 |
+
# hist_values = dataframe[key].values
|
103 |
+
# max_range = np.max(hist_values)
|
104 |
+
# hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
|
105 |
+
# st.bar_chart(hist_values)
|
106 |
+
# st.markdown(f"Each bin is of size: {max_range/num_bins}.")
|
107 |
|
108 |
# for key, _, _ in keys:
|
109 |
# plot_hist(data, key)
|