Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
·
afc4898
1
Parent(s):
934878d
Add app first version
Browse files- LICENSE +21 -0
- README.md +1 -12
- app.py +161 -0
- data/dev.csv +0 -0
- data/test.csv +0 -0
- data/train.csv +0 -0
- poetry.lock +0 -0
- pyproject.toml +23 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 CLARIN-PL
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,12 +1 @@
|
|
1 |
-
|
2 |
-
title: Abusive Clauses Dashboard
|
3 |
-
emoji: 🦀
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.10.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# abusive-clauses-dashboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import plotly.figure_factory as ff
|
5 |
+
import plotly.graph_objects as go
|
6 |
+
import pyperclip
|
7 |
+
import streamlit as st
|
8 |
+
from unidecode import unidecode
|
9 |
+
|
10 |
+
# Lay the page out across the full browser width.
st.set_page_config(layout="wide")

# Names of the dataset splits; each one maps to a CSV file under data/.
DATA_SPLITS = [
    "train",
    "dev",
    "test",
]
|
13 |
+
|
14 |
+
|
15 |
+
def load_data() -> dict[str, pd.DataFrame]:
    """Read every dataset split from its CSV file.

    Returns:
        A mapping from each name in ``DATA_SPLITS`` to the DataFrame parsed
        from ``data/<name>.csv``, in ``DATA_SPLITS`` order.
    """
    frames: dict[str, pd.DataFrame] = {}
    for split_name in DATA_SPLITS:
        frames[split_name] = pd.read_csv(f"data/{split_name}.csv")
    return frames
|
17 |
+
|
18 |
+
|
19 |
+
def flatten_list(main_list: list[list]) -> list:
    """Concatenate the items of every sublist into a single flat list.

    Only one level of nesting is removed; items that are themselves lists
    are kept as they are.
    """
    flattened: list = []
    for sublist in main_list:
        flattened.extend(sublist)
    return flattened
|
21 |
+
|
22 |
+
|
23 |
+
def count_num_of_characters(text: str) -> int:
    """Count the ASCII letters in ``text``.

    The text is first passed through ``unidecode``; every character of the
    result that is not in a-z or A-Z (digits, whitespace, punctuation) is
    then discarded and the remaining letters are counted.
    """
    ascii_text = unidecode(text)
    letters_only = re.sub(r"[^a-zA-Z]", "", ascii_text)
    return len(letters_only)
|
25 |
+
|
26 |
+
|
27 |
+
def count_num_of_words(text: str) -> int:
    """Count the whitespace-separated words in ``text``.

    The text is passed through ``unidecode``, then every character that is
    neither an ASCII letter nor whitespace is removed, and the remaining
    tokens are counted.

    Args:
        text: The raw text of one observation.

    Returns:
        The number of non-empty tokens; ``0`` for empty or letter-free text.
    """
    ascii_text = unidecode(text)
    # Keep *all* whitespace (not only " ") so that words separated by a
    # newline or tab are not glued together once other characters are removed.
    cleaned = re.sub(r"[^a-zA-Z\s]", "", ascii_text)
    # split() without arguments collapses runs of whitespace and ignores
    # leading/trailing whitespace, so stripped punctuation such as " - "
    # does not produce empty "words", and empty text yields 0 rather than 1.
    return len(cleaned.split())
|
29 |
+
|
30 |
+
|
31 |
+
# Load every dataset split up front; the sections below read from this mapping.
DATA_DICT = load_data()

# The containers are created in the order in which they appear on the page.
header, description, dataset_statistics, class_distribution = (
    st.container() for _ in range(4)
)

header.title("PAC - Polish Abusive Clauses Dataset")
|
40 |
+
|
41 |
+
# Dataset description, followed by the centred heading that introduces the
# statistics sections below.
with description:
    st.header("Dataset description")
    desc = """
    ''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
    Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
    But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
    we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
    contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
    or many more. In all these situations, you will need to conclude the contract, but there is a high probability
    that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
    businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
    requiring consumers to accept.

    Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
    clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
    situation of imbalance between the duties and rights of the parties.

    On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
    we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
    learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
    agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
    analyze contracts and understand what they agree upon.
    """
    st.write(desc)
    # No hard-coded text colour: a fixed "white" heading is unreadable on a
    # light background, so the colour is left to the active theme.
    st.markdown(
        "<h1 style='text-align: center;'>Dataset statistics</h1>",
        unsafe_allow_html=True,
    )
|
67 |
+
|
68 |
+
# Table with the number of rows in every split plus their total.
with dataset_statistics:
    st.header("Number of samples in each data split")
    split_sizes = [len(DATA_DICT[name]) for name in ("train", "dev", "test")]
    metrics_df = pd.DataFrame(
        {
            "Subset": ["Train", "Dev", "Test", "Total"],
            "Number of samples": [*split_sizes, sum(split_sizes)],
        }
    )
    st.dataframe(metrics_df)
    latex_df = metrics_df.style.to_latex()
    # NOTE(review): pyperclip presumably writes to the clipboard of the machine
    # running this script, not the viewer's browser - confirm for remote hosting.
    st.button(
        label="Copy table to LaTeX",
        on_click=lambda: pyperclip.copy(latex_df),
        key="copy_metrics_df",
    )
|
89 |
+
|
90 |
+
# Class distribution in each subset: grouped bar chart on the left, the same
# numbers as a table (with a LaTeX copy button) on the right.
with class_distribution:
    st.header("Class distribution in each subset")
    plot_column, table_column = st.columns(2)
    with plot_column:
        # One row per split, one column per label. normalize=True makes the
        # values fractions of the split, not absolute counts.
        hist = (
            pd.DataFrame(
                [
                    df["label"].value_counts(normalize=True).rename(k)
                    for k, df in DATA_DICT.items()
                ]
            )
            .reset_index()
            .rename({"index": "split_name"}, axis=1)
        )
        barchart_class_dist = go.Figure(
            data=[
                go.Bar(name=label, x=DATA_SPLITS, y=hist[label].values)
                for label in (
                    "BEZPIECZNE_POSTANOWIENIE_UMOWNE",
                    "KLAUZULA_ABUZYWNA",
                )
            ]
        )
        barchart_class_dist.update_layout(
            barmode="group",
            xaxis_title="Split name",
            # The plotted values are normalized, so label them as fractions.
            yaxis_title="Fraction of data points",
        )
        st.plotly_chart(barchart_class_dist, use_container_width=True)

    with table_column:
        # Empty text rows push the table down next to the chart.
        for _ in range(10):
            st.text("")
        st.dataframe(hist)
        latex_df_class_dist = hist.style.to_latex()
        st.button(
            label="Copy table to LaTeX",
            on_click=lambda: pyperclip.copy(latex_df_class_dist),
            key="copy_class_dist_df",
        )
|
133 |
+
|
134 |
+
# Number of words per observation: one distribution per data split.
hist_data_num_words = [
    df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
]
fig_num_words = ff.create_distplot(
    hist_data_num_words, DATA_SPLITS, show_rug=False, bin_size=1
)
# Only the histogram traces of the figure are re-binned.
fig_num_words.update_traces(
    nbinsx=100, autobinx=True, selector={"type": "histogram"}
)
# This figure shows word counts, so title and axis must say "words"
# (they previously repeated the labels of the characters plot).
fig_num_words.update_layout(
    title_text="Histogram - number of words per observation",
    xaxis_title="Number of words",
)
st.plotly_chart(fig_num_words, use_container_width=True)
|
149 |
+
|
150 |
+
# Number of characters per observation: one distribution per data split.
hist_data_num_characters = []
for split_df in DATA_DICT.values():
    hist_data_num_characters.append(split_df["text"].apply(count_num_of_characters))

fig_num_chars = ff.create_distplot(
    hist_data=hist_data_num_characters,
    group_labels=DATA_SPLITS,
    show_rug=False,
    bin_size=1,
)
chars_layout = {
    "title_text": "Histogram - number of characters per observation",
    "xaxis_title": "Number of characters",
}
fig_num_chars.update_layout(**chars_layout)
st.plotly_chart(fig_num_chars, use_container_width=True)
|
data/dev.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/test.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/train.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "abusive-clauses-dashboard"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["Your Name <you@example.com>"]
|
6 |
+
|
7 |
+
[tool.poetry.dependencies]
|
8 |
+
python = ">=3.10,<3.11"
|
9 |
+
streamlit = "^1.11.0"
|
10 |
+
gradio = "^3.0.26"
|
11 |
+
transformers = "^4.20.1"
|
12 |
+
datasets = "^2.3.2"
|
13 |
+
black = "^22.6.0"
|
14 |
+
pyperclip = "^1.8.2"
|
15 |
+
plotly = "^5.9.0"
|
16 |
+
Unidecode = "^1.3.4"
|
17 |
+
scipy = "^1.8.1"
|
18 |
+
|
19 |
+
[tool.poetry.dev-dependencies]
|
20 |
+
|
21 |
+
[build-system]
|
22 |
+
requires = ["poetry-core>=1.0.0"]
|
23 |
+
build-backend = "poetry.core.masonry.api"
|