# Streamlit application for sequentially viewing the changes made to a dataset.
import os
import pprint as pp
from collections import OrderedDict, defaultdict

import json
import diff_viewer
import pandas as pd
import streamlit as st
from datasets import load_dataset, get_dataset_config_names

# Dataset path handed to `load_dataset` when loading the per-operation check datasets (see `get_ds`).
CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = st.secrets["CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT"]
# Dataset path handed to `load_dataset` when loading the cleaning logs.
LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT = st.secrets["LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT"]
# Token passed as `use_auth_token` to every `load_dataset` call in this file.
HF_API_TOKEN = st.secrets["HF_API_TOKEN"]
# Substrings that mark, in the raw log, the line starting a new operation
# (see `get_log_stats_df`). Must remain a list: it is concatenated to another list there.
OPERATION_TYPES = [
    "Applied filter",
    "Applied deduplication function",
    "Applied map function",
]
# When a loaded check dataset has exactly this many rows, the page warns that only
# a subset of the modified / filtered examples can be shown.
MAX_LEN_DS_CHECKS = st.secrets["MAX_LEN_DS_CHECKS"]


def get_ds(config):
    """Load the check dataset for ``config`` and return its ``"train"`` split.

    The dataset is loaded with ``datasets.load_dataset`` from
    ``CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT``, authenticating with
    ``HF_API_TOKEN`` and with ``trust_remote_code`` enabled.

    Args:
        config: Name of the dataset configuration to load.

    Returns:
        The ``"train"`` split of the loaded dataset.
    """
    return load_dataset(
        CHECK_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT,
        config,
        use_auth_token=HF_API_TOKEN,
        trust_remote_code=True,
    )["train"]


def next_idx(idx: int):
    """Return the index after ``idx``, wrapping around the dataset length.

    The length is read from the dataset stored in ``st.session_state["ds"]``.
    """
    return (idx + 1) % len(st.session_state["ds"])


def previous_idx(idx: int):
    """Return the index before ``idx``, wrapping around the dataset length.

    The length is read from the dataset stored in ``st.session_state["ds"]``.
    Python's ``%`` yields a non-negative result for a positive modulus, so
    stepping back from index 0 lands on the last index.
    """
    return (idx - 1) % len(st.session_state["ds"])


def on_click_next():
    """Advance both example indices stored in the session state by one."""
    for state_key in ("idx_1", "idx_2"):
        st.session_state[state_key] = next_idx(st.session_state[state_key])


def on_click_previous():
    """Move both example indices stored in the session state back by one."""
    for state_key in ("idx_1", "idx_2"):
        st.session_state[state_key] = previous_idx(st.session_state[state_key])


def on_ds_change(config):
    """Load the dataset for ``config`` and reset the related session state.

    Writes the following ``st.session_state`` keys:

    * ``"ds"``: the dataset returned by ``get_ds(config)``;
    * ``"idx_1"``: reset to 0;
    * ``"idx_2"``: 1 when the dataset has more than one row, otherwise 0;
    * ``"ds_check_config"``: the configuration name that was loaded;
    * ``"ds_max_docs"``: the number of rows of the dataset.

    Args:
        config: Name of the dataset configuration to load.
    """
    state = st.session_state
    state["ds"] = get_ds(config)
    state["idx_1"] = 0

    n_docs = len(state["ds"])
    state["idx_2"] = 0 if n_docs <= 1 else 1
    state["ds_check_config"] = config
    state["ds_max_docs"] = n_docs


def get_log_stats_df(raw_log):
    """Parse a raw log into a DataFrame with one row of statistics per operation.

    The log is scanned line by line. A line containing one of
    ``OPERATION_TYPES`` starts a new operation; a line containing one of the
    tracked metric names is attached to the most recently started operation.
    In both cases the value kept is the second element obtained when the text
    following the matched name is split on single spaces (for a line such as
    ``"<name>: <value> ..."`` this is ``<value>``).

    Args:
        raw_log: Full text of the log.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Order``, ``Name``,
        ``Initial number of samples``, ``Final number of samples``,
        ``Initial size (GB)``, ``Final size (GB)``, ``% samples removed`` and
        ``Size (GB) % removed``.

    Raises:
        ValueError: If a metric line appears before any operation line, or if
            the same metric is reported twice for one operation.
        KeyError: If an operation lacks one of the tracked metrics.
        IndexError: If no space-separated token follows a matched name.
    """
    columns = (
        "Order",
        "Name",
        "Initial number of samples",
        "Final number of samples",
        "Initial size in bytes",
        "Final size in bytes",
    )
    data = OrderedDict((column, []) for column in columns)

    # "Order" and "Name" are derived while parsing, never searched for in the log.
    # Built once here instead of once per log line.
    searched_names = [
        column for column in columns if column not in ("Order", "Name")
    ] + list(OPERATION_TYPES)

    metric_dict = defaultdict(dict)
    operation_name = None  # no operation line seen yet
    order = 0
    for line in raw_log.split("\n"):
        for metric_name in searched_names:
            if metric_name not in line:
                continue

            value = line.split(metric_name)[1].split(" ")[1]

            if metric_name in OPERATION_TYPES:
                # A new operation starts: subsequent metrics belong to it.
                operation_name = value
                metric_dict[operation_name]["Order"] = order
                order += 1
                continue

            if operation_name is None:
                raise ValueError(
                    f"Metric '{metric_name}' found before any operation line.\n\nline: {line}"
                )

            # Explicit raise rather than `assert`, which is stripped under `python -O`.
            if metric_name in metric_dict[operation_name]:
                raise ValueError(
                    f"operation_name: {operation_name}\n\nvalue: {value}\n\nmetric_dict: {pp.pformat(metric_dict)} \n\nmetric_name: {metric_name} \n\nline: {line}"
                )
            metric_dict[operation_name][metric_name] = value

    for name, data_dict in metric_dict.items():
        for column in columns:
            data[column].append(name if column == "Name" else data_dict[column])

    df = pd.DataFrame(data)
    # NOTE(review): the size columns are only relabelled as "(GB)"; the parsed
    # values are not converted here - confirm the unit written in the logs.
    df.rename(
        {
            "Initial size in bytes": "Initial size (GB)",
            "Final size in bytes": "Final size (GB)",
        },
        axis=1,
        inplace=True,
    )

    initial_samples = df["Initial number of samples"].astype(float)
    final_samples = df["Final number of samples"].astype(float)
    df["% samples removed"] = (initial_samples - final_samples) / initial_samples * 100

    initial_size = df["Initial size (GB)"].astype(float)
    final_size = df["Final size (GB)"].astype(float)
    df["Size (GB) % removed"] = (initial_size - final_size) / initial_size * 100
    return df


def get_logs_stats(raw_log):
    """Display the statistics parsed from ``raw_log``, or a fallback log excerpt.

    The statistics table produced by ``get_log_stats_df`` is shown with
    ``st.dataframe``. If anything fails along the way, the exception is
    written to the page, followed by the log lines that contain
    ``"INFO - __main__"`` but neither ``"Examples of"`` nor ``"Examples n°"``.

    Args:
        raw_log: Full text of the log.
    """
    try:
        stats = get_log_stats_df(raw_log)
        st.dataframe(stats)
    except Exception as err:
        # Best-effort fallback: show the error and the most relevant log lines.
        st.write(err)
        st.write("Subset of the logs:")
        excluded_markers = ("Examples of", "Examples n°")
        kept_lines = []
        for log_line in raw_log.split("\n"):
            if "INFO - __main__" not in log_line:
                continue
            if any(marker in log_line for marker in excluded_markers):
                continue
            kept_lines.append(log_line)
        st.write(kept_lines)


def meta_component(idx_key: str = "idx_1"):
    """Show the ``"meta"`` field of the selected example inside an expander.

    The example is the row of ``st.session_state["ds"]`` whose index is stored
    in ``st.session_state[idx_key]``. Nothing is rendered when that row has no
    ``"meta"`` key.

    Args:
        idx_key: Session-state key holding the index of the example to show.
    """
    # Look the example up through `idx_key` for both the check and the display,
    # so the meta shown always belongs to the requested example.
    example = st.session_state["ds"][st.session_state[idx_key]]
    if "meta" not in example:
        return

    with st.expander("See meta field of the example"):
        st.write(example["meta"])


def filter_page():
    """Render the page showing two consecutive filtered-out examples side by side.

    A number input selects the first example; the second one is the following
    index (wrapping around, see ``next_idx``). Both indices are stored in
    ``st.session_state`` under ``"idx_1"`` and ``"idx_2"``. Each column shows
    the example number, its meta field (via ``meta_component``) and its text.
    """
    # Local import so this function does not depend on the file's import block.
    import html

    index_example = st.number_input(
        "Index of the chosen example",
        min_value=0,
        max_value=st.session_state["ds_max_docs"] - 1,
        value=0,
        step=1,
    )
    st.session_state["idx_1"] = index_example
    st.session_state["idx_2"] = next_idx(index_example)
    idx_1 = st.session_state["idx_1"]
    idx_2 = st.session_state["idx_2"]
    text_1 = st.session_state["ds"][idx_1]["text"]
    text_2 = st.session_state["ds"][idx_2]["text"]

    st.markdown(
        "<h1 style='text-align: center'>Some examples of filtered out texts</h1>",
        unsafe_allow_html=True,
    )

    shown_examples = (("idx_1", idx_1, text_1), ("idx_2", idx_2, text_2))
    for column, (idx_key, idx, text) in zip(st.columns(2), shown_examples):
        with column:
            st.subheader(f"Example n°{idx}")
            meta_component(idx_key=idx_key)
            # The text is injected into raw HTML: escape it first so that any
            # markup it contains is displayed literally instead of interpreted,
            # then turn line breaks into <br> tags.
            text_show = html.escape(text).replace("\n", "<br>")
            st.markdown(f"<div>{text_show}</div>", unsafe_allow_html=True)


def dedup_or_cleaning_page():
    """Render the page comparing the old and new text of one modified example.

    A number input selects the example; its index is stored in
    ``st.session_state["idx_1"]`` (and the following index in ``"idx_2"``).
    The page shows a diff between the example's ``"old_text"`` and ``"text"``
    fields, its meta field (via ``meta_component``), and an expander with both
    full texts side by side.
    """
    # Local import so this function does not depend on the file's import block.
    import html

    index_example = st.number_input(
        "Index of the chosen example",
        min_value=0,
        max_value=st.session_state["ds_max_docs"] - 1,
        value=0,
        step=1,
    )
    st.session_state["idx_1"] = index_example
    st.session_state["idx_2"] = next_idx(index_example)

    example = st.session_state["ds"][st.session_state["idx_1"]]
    text = example["text"]
    old_text = example["old_text"]

    # The closing tag now matches the opening <h2>.
    st.markdown(
        "<h2 style='text-align: center'>Changes applied</h2>", unsafe_allow_html=True
    )
    col_text_1, col_text_2 = st.columns(2)
    with col_text_1:
        st.subheader("Old text")
    with col_text_2:
        st.subheader("New text")
    diff_viewer.diff_viewer(old_text=old_text, new_text=text, lang="none")
    meta_component(idx_key="idx_1")

    with st.expander("See full old and new texts of the example"):
        # The texts are injected into raw HTML: escape them first so that any
        # markup they contain is displayed literally instead of interpreted,
        # then turn line breaks into <br> tags.
        text_show = html.escape(text).replace("\n", "<br>")
        old_text_show = html.escape(old_text).replace("\n", "<br>")

        col_1, col_2 = st.columns(2)
        with col_1:
            st.subheader("Old text")
            st.markdown(f"<div>{old_text_show}</div>", unsafe_allow_html=True)
        with col_2:
            st.subheader("New text")
            st.markdown(f"<div>{text_show}</div>", unsafe_allow_html=True)


# Streamlit page: executed top to bottom on every run of the script.
st.set_page_config(page_title="Dataset explorer", page_icon=":hugging_face:", layout="wide")
st.write(
    "The purpose of this application is to sequentially view the changes made to a dataset."
)

col_option_clean, col_option_ds = st.columns(2)

# The available configuration names are read from a local JSON file
# (they are not queried with `get_dataset_config_names`).
with open("dataset_configs.json", "r") as f:
    CHECK_CONFIGS = json.load(f)

# Each configuration name has the form
# "<cleaning version>_dsname_<dataset name>_operation_<check name>".
CLEANING_VERSIONS = set()
dataset_names = defaultdict(set)
checks_names = defaultdict(lambda: defaultdict(set))

for full_config_name in CHECK_CONFIGS:
    cleaning_version, name_and_check = full_config_name.split("_dsname_")
    dataset_name, checks_name = name_and_check.split("_operation_")
    CLEANING_VERSIONS.add(cleaning_version)
    dataset_names[cleaning_version].add(dataset_name)
    checks_names[cleaning_version][dataset_name].add(checks_name)

# Selection of the cleaning version, then of a dataset available for that version.
option_clean = col_option_clean.selectbox(
    "Select the cleaning version", sorted(CLEANING_VERSIONS, reverse=True)
)
option_ds = col_option_ds.selectbox(
    "Select the dataset", sorted(dataset_names[option_clean])
)

# Load the log of the selected (cleaning version, dataset) pair and show its statistics.
ds_log = load_dataset(
    LOGS_DATASET_DIR_PATH_BEFORE_CLEAN_SELECT,
    f"{option_clean}_dsname_{option_ds}",
    use_auth_token=HF_API_TOKEN,
    trust_remote_code=True,
)
log = ds_log["train"][0]["log"]
get_logs_stats(raw_log=log)

option_check = st.selectbox(
    "Select the operation applied to inspect",
    sorted(checks_names[option_clean][option_ds]),
)

ds_check_config = f"{option_clean}_dsname_{option_ds}_operation_{option_check}"

# (Re)load the check dataset only on the first run or when the selection changed.
if "ds" not in st.session_state:
    on_ds_change(ds_check_config)
elif ds_check_config != st.session_state["ds_check_config"]:
    on_ds_change(ds_check_config)

if len(st.session_state["ds"]) == MAX_LEN_DS_CHECKS:
    st.warning(
        f"Note: only a subset of size {MAX_LEN_DS_CHECKS} of the modified / filtered examples can be shown in this application"
    )
with st.expander("See details of the available checks"):
    st.write(st.session_state["ds"])

# Filter operations get the side-by-side page; every other operation gets the diff page.
if "_filter_" in option_check:
    filter_page()
else:
    dedup_or_cleaning_page()