import numpy as np import matplotlib.pyplot as plt import pandas as pd from datasets import load_dataset from sklearn.manifold import TSNE import streamlit as st from clarin_datasets.dataset_to_show import DatasetToShow from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE class CSTWikinewsDataset(DatasetToShow): def __init__(self): DatasetToShow.__init__(self) self.dataset_name = "clarin-pl/cst-wikinews" self.description = f""" Dataset link: https://huggingface.co/datasets/{self.dataset_name} """ def load_data(self): raw_dataset = load_dataset(self.dataset_name) self.data_dict = { subset: raw_dataset[subset].to_pandas() for subset in self.subsets } def show_dataset(self): header = st.container() dataframe_head = st.container() class_distribution = st.container() tsne_projection = st.container() with header: st.title(self.dataset_name) with dataframe_head: st.header("First 10 observations of the chosen subset") subset_to_show = st.selectbox( label="Select subset to see", options=self.subsets ) df_to_show = self.data_dict[subset_to_show].head(10) st.dataframe(df_to_show) st.text_area(label="LaTeX code", value=df_to_show.style.to_latex()) class_distribution_df = pd.merge( pd.DataFrame( self.data_dict["train"]["label"] .value_counts(normalize=True) .reset_index(drop=False) .rename({"index": "class"}, axis="columns") ), pd.DataFrame( self.data_dict["test"]["label"] .value_counts(normalize=True) .reset_index(drop=False) .rename({"index": "class"}, axis="columns") ), on="class", ).rename({"label_x": "train", "label_y": "test"}, axis="columns") with class_distribution: st.dataframe(class_distribution_df) with tsne_projection: st.header("t-SNE projection of the dataset") subset_to_project = st.selectbox( label="Select subset to project", options=self.subsets ) first_sentences = self.data_dict[subset_to_project]["sentence_1"].values second_sentences = self.data_dict[subset_to_project]["sentence_2"].values labels = self.data_dict[subset_to_project]["label"].values first_sentences_embedded = np.array([embed_sentence(x) for x in first_sentences]) second_sentences_embedded = np.array([embed_sentence(x) for x in second_sentences]) mean_embeddings = (first_sentences_embedded + second_sentences_embedded) / 2 reducer = TSNE( n_components=2 ) transformed_embeddings = reducer.fit_transform(mean_embeddings) fig, ax = plt.subplots() ax.scatter( x=transformed_embeddings[:, 0], y=transformed_embeddings[:, 1], c=[ PLOT_COLOR_PALETTE[i] for i in labels ] ) st.pyplot(fig)