Space status: Runtime error
Mariusz Kossakowski committed • d572e8e
Parent(s): 41b0597

Major refactor

The monolithic app.py is split into a thin dispatcher plus per-dataset modules under clarin_datasets/, each implementing a shared DatasetToShow interface.

Files changed:
- app.py (+16, -236)
- clarin_datasets/abusive_clauses_dataset.py (+188, -0, new file)
- clarin_datasets/dataset_to_show.py (+18, -0, new file)
- clarin_datasets/polemo_dataset.py (+200, -0, new file)
- clarin_datasets/utils.py (+16, -0, new file)
app.py
CHANGED
@@ -1,244 +1,24 @@
-import re
-from typing import Dict, List
-
-from datasets import load_dataset
-import pandas as pd
-import plotly.figure_factory as ff
-import plotly.graph_objects as go
 import streamlit as st
-from unidecode import unidecode
-
-DATA_SPLITS = ["train", "validation", "test"]
-
-
-def load_data() -> Dict[str, pd.DataFrame]:
-    return {
-        data: pd.read_csv(f"data/{data}.csv").rename(
-            {"label": "target"}, axis="columns"
-        )
-        for data in DATA_SPLITS
-    }
-
-
-def flatten_list(main_list: List[List]) -> List:
-    return [item for sublist in main_list for item in sublist]
-
-
-def count_num_of_characters(text: str) -> int:
-    return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))
-
-
-def count_num_of_words(text: str) -> int:
-    return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
 
+from clarin_datasets.polemo_dataset import PolemoDataset
+from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset
+from clarin_datasets.aspectemo_dataset import AspectEmoDataset
+
 selected_dataset = st.sidebar.selectbox(
     "Choose a dataset to load",
-    (
-        "clarin-pl/polemo2-official",
-        "laugustyniak/abusive-clauses-pl",
-    ),
+    (
+        "clarin-pl/polemo2-official",
+        "laugustyniak/abusive-clauses-pl",
+        "clarin-pl/aspectemo",
+    ),
 )
 
-
-def load_hf_dataset():
-    if selected_dataset == "clarin-pl/polemo2-official":
-        data = load_dataset("clarin-pl/polemo2-official")
-        DATA_DICT = {
-            "train": data["train"].to_pandas(),
-            "validation": data["validation"].to_pandas(),
-            "test": data["test"].to_pandas(),
-        }
-        DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
-        hotels, products, and university. It is human-annotated on a level of full reviews and individual
-        sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
-        sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
-        046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
-        annotated with four labels: positive, negative, neutral, or ambiguous. """
-    elif selected_dataset == "laugustyniak/abusive-clauses-pl":
-        DATA_DICT = load_data()
-        DATA_DESCRIPTION = """
-        ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
-        Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
-        But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
-        we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
-        contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
-        or many more. In all these situations, you will need to conclude the contract, but there is a high probability
-        that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
-        businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
-        requiring consumers to accept.
-
-        Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
-        clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
-        situation of imbalance between the duties and rights of the parties.
-
-        On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
-        we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
-        learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
-        agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
-        analyze contracts and understand what they agree upon.
-        """
-    return DATA_DICT, DATA_DESCRIPTION
-
-
-DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()
-
-header = st.container()
-description = st.container()
-dataframe_head = st.container()
-word_searching = st.container()
-dataset_statistics = st.container()
-
-with header:
-    st.title(selected_dataset)
-
-with description:
-    st.header("Dataset description")
-    st.write(DATA_DESCRIPTION)
-
-with dataframe_head:
-    filtering_options = DATA_DICT["train"]["target"].unique().tolist()
-    filtering_options.append("All classes")
-
-    st.header("First 10 observations of a chosen class")
-    class_to_show = st.selectbox(
-        label="Select class to show", options=filtering_options
-    )
-    df_to_show = pd.concat(
-        [
-            DATA_DICT["train"].copy(),
-            DATA_DICT["validation"].copy(),
-            DATA_DICT["test"].copy(),
-        ]
-    )
-    if class_to_show == "All classes":
-        df_to_show = df_to_show.head(10)
-    else:
-        df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(10)
-    st.dataframe(df_to_show)
-    st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
-    if selected_dataset == "clarin-pl/polemo2-official":
-        st.subheader("First 10 observations of a chosen domain and text type")
-        domain = st.selectbox(
-            label="Select domain",
-            options=["all", "hotels", "medicine", "products", "reviews"],
-        )
-        text_type = st.selectbox(
-            label="Select text type", options=["Full text", "Tokenized to sentences"]
-        )
-        text_type_mapping_dict = {
-            "Full text": "text",
-            "Tokenized to sentences": "sentence",
-        }
-
-        polemo_subset = load_dataset(
-            selected_dataset, f"{domain}_{text_type_mapping_dict[text_type]}"
-        )
-        df = pd.concat(
-            [
-                polemo_subset["train"].to_pandas(),
-                polemo_subset["validation"].to_pandas(),
-                polemo_subset["test"].to_pandas(),
-            ]
-        ).head(10)
-        st.dataframe(df)
-        st.text_area(label="Latex code", value=df.style.to_latex())
-
-with word_searching:
-    st.header("Observations containing a chosen word")
-    searched_word = st.text_input(label="Enter the word you are looking for below")
-    df_to_show = pd.concat(
-        [
-            DATA_DICT["train"].copy(),
-            DATA_DICT["validation"].copy(),
-            DATA_DICT["test"].copy(),
-        ]
-    )
-    df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
-    st.dataframe(df_to_show)
-    st.text_area(label="Latex code", value=df_to_show.style.to_latex())
-
-with dataset_statistics:
-    st.header("Dataset statistics")
-    st.subheader("Number of samples in each data split")
-    metrics_df = pd.DataFrame.from_dict(
-        {
-            "Train": DATA_DICT["train"].shape[0],
-            "Validation": DATA_DICT["validation"].shape[0],
-            "Test": DATA_DICT["test"].shape[0],
-            "Total": sum(
-                [
-                    DATA_DICT["train"].shape[0],
-                    DATA_DICT["validation"].shape[0],
-                    DATA_DICT["test"].shape[0],
-                ]
-            ),
-        },
-        orient="index",
-    ).reset_index()
-    metrics_df.columns = ["Subset", "Number of samples"]
-    st.dataframe(metrics_df)
-
-    latex_df = metrics_df.style.to_latex()
-    st.text_area(label="Latex code", value=latex_df)
-
-    # Class distribution in each subset
-    st.subheader("Class distribution in each subset")
-    target_unique_values = DATA_DICT["train"]["target"].unique()
-    hist = (
-        pd.DataFrame(
-            [
-                df["target"].value_counts(normalize=True).rename(k)
-                for k, df in DATA_DICT.items()
-            ]
-        )
-        .reset_index()
-        .rename({"index": "split_name"}, axis=1)
-    )
-    plot_data = [
-        go.Bar(
-            name=str(target_unique_values[i]),
-            x=DATA_SPLITS,
-            y=hist[target_unique_values[i]].values,
-        )
-        for i in range(len(target_unique_values))
-    ]
-    barchart_class_dist = go.Figure(data=plot_data)
-    barchart_class_dist.update_layout(
-        barmode="group",
-        title_text="Barchart - class distribution",
-        xaxis_title="Split name",
-        yaxis_title="Number of data points",
-    )
-    st.plotly_chart(barchart_class_dist, use_container_width=True)
-    st.dataframe(hist)
-    st.text_area(label="Latex code", value=hist.style.to_latex())
-
-    # Number of words per observation
-    st.subheader("Number of words per observation in each subset")
-    hist_data_num_words = [
-        df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
-    ]
-    fig_num_words = ff.create_distplot(
-        hist_data_num_words, DATA_SPLITS, show_rug=False, bin_size=1
-    )
-    fig_num_words.update_traces(
-        nbinsx=100, autobinx=True, selector={"type": "histogram"}
-    )
-    fig_num_words.update_layout(
-        title_text="Histogram - number of characters per observation",
-        xaxis_title="Number of characters",
-    )
-    st.plotly_chart(fig_num_words, use_container_width=True)
-
-    # Number of characters per observation
-    st.subheader("Number of characters per observation in each subset")
-    hist_data_num_characters = [
-        df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
-    ]
-    fig_num_chars = ff.create_distplot(
-        hist_data_num_characters, DATA_SPLITS, show_rug=False, bin_size=1
-    )
-    fig_num_chars.update_layout(
-        title_text="Histogram - number of characters per observation",
-        xaxis_title="Number of characters",
-    )
-    st.plotly_chart(fig_num_chars, use_container_width=True)
+if selected_dataset == "clarin-pl/polemo2-official":
+    dataset = PolemoDataset()
+elif selected_dataset == "laugustyniak/abusive-clauses-pl":
+    dataset = AbusiveClausesDataset()
+elif selected_dataset == "clarin-pl/aspectemo":
+    dataset = AspectEmoDataset()
+
+dataset.load_data()
+dataset.show_dataset()
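Note that the new app.py imports clarin_datasets.aspectemo_dataset, which is not among the files this commit adds - a likely cause of the Space's "Runtime error" status. A hypothetical sketch of the missing module, following the DatasetToShow pattern of the files below (everything in it is an assumption, not part of the commit):

from datasets import load_dataset

from clarin_datasets.dataset_to_show import DatasetToShow


class AspectEmoDataset(DatasetToShow):
    def __init__(self):
        self.dataset_name = "clarin-pl/aspectemo"
        self.data_dict = None
        self.subsets = ["train", "validation", "test"]  # split names assumed
        self.description = "..."  # dataset-card text would go here

    def load_data(self):
        # mirror PolemoDataset: pull the splits from the Hub into DataFrames
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self):
        # Streamlit rendering analogous to the classes below
        ...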
clarin_datasets/abusive_clauses_dataset.py
ADDED
@@ -0,0 +1,188 @@
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import (
    count_num_of_characters,
    count_num_of_words,
)


class AbusiveClausesDataset(DatasetToShow):
    def __init__(self):
        self.dataset_name = "laugustyniak/abusive-clauses-pl"
        self.data_dict = None
        self.subsets = ["train", "validation", "test"]
        self.description = """
        ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
        Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
        But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
        we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
        contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
        or many more. In all these situations, you will need to conclude the contract, but there is a high probability
        that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
        businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
        requiring consumers to accept.

        Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
        clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
        situation of imbalance between the duties and rights of the parties.

        On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
        we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
        learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
        agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
        analyze contracts and understand what they agree upon.
        """

    def load_data(self):
        self.data_dict = {
            subset: pd.read_csv(f"data/{subset}.csv").rename(
                {"label": "target"}, axis="columns"
            )
            for subset in self.subsets
        }

    def show_dataset(self):
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        word_searching = st.container()
        dataset_statistics = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        with dataframe_head:
            filtering_options = self.data_dict["train"]["target"].unique().tolist()
            filtering_options.append("All classes")

            st.header("First 10 observations of a chosen class")
            class_to_show = st.selectbox(
                label="Select class to show", options=filtering_options
            )
            df_to_show = pd.concat(
                [
                    self.data_dict["train"].copy(),
                    self.data_dict["validation"].copy(),
                    self.data_dict["test"].copy(),
                ]
            )
            if class_to_show == "All classes":
                df_to_show = df_to_show.head(10)
            else:
                df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
                    10
                )
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())

        with word_searching:
            st.header("Observations containing a chosen word")
            searched_word = st.text_input(
                label="Enter the word you are looking for below"
            )
            df_to_show = pd.concat(
                [
                    self.data_dict["train"].copy(),
                    self.data_dict["validation"].copy(),
                    self.data_dict["test"].copy(),
                ]
            )
            df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())

        with dataset_statistics:
            st.header("Dataset statistics")
            st.subheader("Number of samples in each data split")
            metrics_df = pd.DataFrame.from_dict(
                {
                    "Train": self.data_dict["train"].shape[0],
                    "Validation": self.data_dict["validation"].shape[0],
                    "Test": self.data_dict["test"].shape[0],
                    "Total": sum(
                        [
                            self.data_dict["train"].shape[0],
                            self.data_dict["validation"].shape[0],
                            self.data_dict["test"].shape[0],
                        ]
                    ),
                },
                orient="index",
            ).reset_index()
            metrics_df.columns = ["Subset", "Number of samples"]
            st.dataframe(metrics_df)

            latex_df = metrics_df.style.to_latex()
            st.text_area(label="Latex code", value=latex_df)

            # Class distribution in each subset
            st.subheader("Class distribution in each subset")
            target_unique_values = self.data_dict["train"]["target"].unique()
            hist = (
                pd.DataFrame(
                    [
                        df["target"].value_counts(normalize=True).rename(k)
                        for k, df in self.data_dict.items()
                    ]
                )
                .reset_index()
                .rename({"index": "split_name"}, axis=1)
            )
            plot_data = [
                go.Bar(
                    name=str(target_unique_values[i]),
                    x=self.subsets,
                    y=hist[target_unique_values[i]].values,
                )
                for i in range(len(target_unique_values))
            ]
            barchart_class_dist = go.Figure(data=plot_data)
            barchart_class_dist.update_layout(
                barmode="group",
                title_text="Barchart - class distribution",
                xaxis_title="Split name",
                yaxis_title="Number of data points",
            )
            st.plotly_chart(barchart_class_dist, use_container_width=True)
            st.dataframe(hist)
            st.text_area(label="Latex code", value=hist.style.to_latex())

            # Number of words per observation
            st.subheader("Number of words per observation in each subset")
            hist_data_num_words = [
                df["text"].apply(count_num_of_words) for df in self.data_dict.values()
            ]
            fig_num_words = ff.create_distplot(
                hist_data_num_words, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_words.update_traces(
                nbinsx=100, autobinx=True, selector={"type": "histogram"}
            )
            fig_num_words.update_layout(
                title_text="Histogram - number of words per observation",
                xaxis_title="Number of words",
            )
            st.plotly_chart(fig_num_words, use_container_width=True)

            # Number of characters per observation
            st.subheader("Number of characters per observation in each subset")
            hist_data_num_characters = [
                df["text"].apply(count_num_of_characters)
                for df in self.data_dict.values()
            ]
            fig_num_chars = ff.create_distplot(
                hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_chars.update_layout(
                title_text="Histogram - number of characters per observation",
                xaxis_title="Number of characters",
            )
            st.plotly_chart(fig_num_chars, use_container_width=True)
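As wired up in app.py above, the class is driven in three steps; a minimal standalone sketch (assuming data/train.csv, data/validation.csv and data/test.csv ship with the Space repository):

from clarin_datasets.abusive_clauses_dataset import AbusiveClausesDataset

dataset = AbusiveClausesDataset()
dataset.load_data()     # reads the three local CSV splits, renaming "label" to "target"
dataset.show_dataset()  # renders the Streamlit page for the dataset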
clarin_datasets/dataset_to_show.py
ADDED
@@ -0,0 +1,18 @@
from abc import ABC, abstractmethod


class DatasetToShow(ABC):
    @abstractmethod
    def __init__(self):
        self.dataset_name = None
        self.data_dict = None
        self.subsets = None
        self.description = None

    @abstractmethod
    def load_data(self):
        pass

    @abstractmethod
    def show_dataset(self):
        pass
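Because every method, including __init__, is declared @abstractmethod on an ABC, Python enforces the contract at instantiation time; a small sketch with a hypothetical incomplete subclass (IncompleteDataset is illustrative, not part of the commit):

from clarin_datasets.dataset_to_show import DatasetToShow

class IncompleteDataset(DatasetToShow):  # hypothetical: show_dataset is missing
    def __init__(self):
        self.dataset_name = "example"
        self.data_dict = None
        self.subsets = []
        self.description = ""

    def load_data(self):
        pass

IncompleteDataset()  # raises TypeError: Can't instantiate abstract class IncompleteDataset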
clarin_datasets/polemo_dataset.py
ADDED
@@ -0,0 +1,200 @@
from datasets import load_dataset
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import (
    count_num_of_characters,
    count_num_of_words,
)


class PolemoDataset(DatasetToShow):
    def __init__(self):
        self.dataset_name = "clarin-pl/polemo2-official"
        self.data_dict = None
        self.subsets = ["train", "validation", "test"]
        self.description = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
        hotels, products, and university. It is human-annotated on a level of full reviews and individual
        sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
        sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
        046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
        annotated with four labels: positive, negative, neutral, or ambiguous. """

    def load_data(self):
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self):
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        word_searching = st.container()
        dataset_statistics = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        with dataframe_head:
            filtering_options = self.data_dict["train"]["target"].unique().tolist()
            filtering_options.append("All classes")

            st.header("First 10 observations of a chosen class")
            class_to_show = st.selectbox(
                label="Select class to show", options=filtering_options
            )
            df_to_show = pd.concat(
                [
                    self.data_dict["train"].copy(),
                    self.data_dict["validation"].copy(),
                    self.data_dict["test"].copy(),
                ]
            )
            if class_to_show == "All classes":
                df_to_show = df_to_show.head(10)
            else:
                df_to_show = df_to_show.loc[df_to_show["target"] == class_to_show].head(
                    10
                )
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())

            st.subheader("First 10 observations of a chosen domain and text type")
            domain = st.selectbox(
                label="Select domain",
                options=["all", "hotels", "medicine", "products", "reviews"],
            )
            text_type = st.selectbox(
                label="Select text type",
                options=["Full text", "Tokenized to sentences"],
            )
            text_type_mapping_dict = {
                "Full text": "text",
                "Tokenized to sentences": "sentence",
            }

            polemo_subset = load_dataset(
                self.dataset_name,
                f"{domain}_{text_type_mapping_dict[text_type]}",
            )
            df = pd.concat(
                [
                    polemo_subset["train"].to_pandas(),
                    polemo_subset["validation"].to_pandas(),
                    polemo_subset["test"].to_pandas(),
                ]
            ).head(10)
            st.dataframe(df)
            st.text_area(label="Latex code", value=df.style.to_latex())

        with word_searching:
            st.header("Observations containing a chosen word")
            searched_word = st.text_input(
                label="Enter the word you are looking for below"
            )
            df_to_show = pd.concat(
                [
                    self.data_dict["train"].copy(),
                    self.data_dict["validation"].copy(),
                    self.data_dict["test"].copy(),
                ]
            )
            df_to_show = df_to_show.loc[df_to_show["text"].str.contains(searched_word)]
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())

        with dataset_statistics:
            st.header("Dataset statistics")
            st.subheader("Number of samples in each data split")
            metrics_df = pd.DataFrame.from_dict(
                {
                    "Train": self.data_dict["train"].shape[0],
                    "Validation": self.data_dict["validation"].shape[0],
                    "Test": self.data_dict["test"].shape[0],
                    "Total": sum(
                        [
                            self.data_dict["train"].shape[0],
                            self.data_dict["validation"].shape[0],
                            self.data_dict["test"].shape[0],
                        ]
                    ),
                },
                orient="index",
            ).reset_index()
            metrics_df.columns = ["Subset", "Number of samples"]
            st.dataframe(metrics_df)

            latex_df = metrics_df.style.to_latex()
            st.text_area(label="Latex code", value=latex_df)

            # Class distribution in each subset
            st.subheader("Class distribution in each subset")
            target_unique_values = self.data_dict["train"]["target"].unique()
            hist = (
                pd.DataFrame(
                    [
                        df["target"].value_counts(normalize=True).rename(k)
                        for k, df in self.data_dict.items()
                    ]
                )
                .reset_index()
                .rename({"index": "split_name"}, axis=1)
            )
            plot_data = [
                go.Bar(
                    name=str(target_unique_values[i]),
                    x=self.subsets,
                    y=hist[target_unique_values[i]].values,
                )
                for i in range(len(target_unique_values))
            ]
            barchart_class_dist = go.Figure(data=plot_data)
            barchart_class_dist.update_layout(
                barmode="group",
                title_text="Barchart - class distribution",
                xaxis_title="Split name",
                yaxis_title="Number of data points",
            )
            st.plotly_chart(barchart_class_dist, use_container_width=True)
            st.dataframe(hist)
            st.text_area(label="Latex code", value=hist.style.to_latex())

            # Number of words per observation
            st.subheader("Number of words per observation in each subset")
            hist_data_num_words = [
                df["text"].apply(count_num_of_words) for df in self.data_dict.values()
            ]
            fig_num_words = ff.create_distplot(
                hist_data_num_words, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_words.update_traces(
                nbinsx=100, autobinx=True, selector={"type": "histogram"}
            )
            fig_num_words.update_layout(
                title_text="Histogram - number of words per observation",
                xaxis_title="Number of words",
            )
            st.plotly_chart(fig_num_words, use_container_width=True)

            # Number of characters per observation
            st.subheader("Number of characters per observation in each subset")
            hist_data_num_characters = [
                df["text"].apply(count_num_of_characters)
                for df in self.data_dict.values()
            ]
            fig_num_chars = ff.create_distplot(
                hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_chars.update_layout(
                title_text="Histogram - number of characters per observation",
                xaxis_title="Number of characters",
            )
            st.plotly_chart(fig_num_chars, use_container_width=True)
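In show_dataset, the two selectboxes combine into a Hub configuration name via the f-string f"{domain}_{text_type_mapping_dict[text_type]}"; a quick sketch of the resulting calls (the concrete config names are implied by the selectbox options, assuming each domain/text-type pair exists on the Hub):

from datasets import load_dataset

# domain "hotels" + text type "Full text" -> config "hotels_text"
hotels_full = load_dataset("clarin-pl/polemo2-official", "hotels_text")
# domain "all" + text type "Tokenized to sentences" -> config "all_sentence"
all_sentences = load_dataset("clarin-pl/polemo2-official", "all_sentence")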
clarin_datasets/utils.py
ADDED
@@ -0,0 +1,16 @@
import re
from typing import List

from unidecode import unidecode


def flatten_list(main_list: List[List]) -> List:
    return [item for sublist in main_list for item in sublist]


def count_num_of_characters(text: str) -> int:
    return len(re.sub(r"[^a-zA-Z]", "", unidecode(text)))


def count_num_of_words(text: str) -> int:
    return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))