Spaces:

clarin-pl
/

datasets-explorer

Runtime error

File size: 9,520 Bytes

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import (
    count_num_of_characters,
    count_num_of_words,
    embed_sentence,
    PLOT_COLOR_PALETTE
)


class PolemoDataset(DatasetToShow):
    """Streamlit page presenting the ``clarin-pl/polemo2-official`` dataset.

    Call ``load_data`` before ``show_dataset``: it fills ``self.data_dict``
    with one pandas DataFrame per split listed in ``self.subsets``, and
    ``show_dataset`` reads the ``text`` and ``target`` columns of those frames.
    """

    def __init__(self):
        super().__init__()
        self.dataset_name = "clarin-pl/polemo2-official"
        self.subsets = ["train", "validation", "test"]
        self.description = f"""
        Dataset link: https://huggingface.co/datasets/{self.dataset_name}
        
        The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine, 
        hotels, products, and university. It is human-annotated on a level of full reviews and individual 
        sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and 
        sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 
        197,046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is 
        annotated with four labels: positive, negative, neutral, or ambiguous. """

    def load_data(self):
        """Load the dataset and keep every split as a pandas DataFrame.

        Populates ``self.data_dict`` with ``{split_name: DataFrame}`` for each
        name in ``self.subsets``, using the dataset's default configuration.
        """
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self):
        """Render the dataset page: previews, word search, statistics, t-SNE.

        Requires ``self.data_dict`` to be populated by ``load_data``.
        """
        # Containers are created up front so the page layout follows this
        # order regardless of the order in which they are filled below.
        header = st.container()
        description = st.container()
        dataframe_head = st.container()
        word_searching = st.container()
        dataset_statistics = st.container()
        tsne_projection = st.container()

        # All splits stacked together; reused by the class preview and the
        # word search. pd.concat returns a new frame, so the per-split frames
        # in self.data_dict are never modified. The original per-split index
        # values are kept.
        all_splits_df = pd.concat(
            [self.data_dict[subset] for subset in self.subsets]
        )

        # NOTE: every "Latex code" text area below gets its own label. They
        # have no explicit key, so two of them with the same label and the
        # same content would produce the same auto-generated element ID and
        # make Streamlit raise a duplicate-element error.
        # NOTE(review): this can presumably happen when "All classes" and the
        # "all" / "Full text" subset show the same first rows - confirm
        # against the dataset's default configuration.

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        with dataframe_head:
            filtering_options = self.data_dict["train"]["target"].unique().tolist()
            filtering_options.append("All classes")

            st.header("First 10 observations of a chosen class")
            class_to_show = st.selectbox(
                label="Select class to show", options=filtering_options
            )
            if class_to_show == "All classes":
                df_to_show = all_splits_df.head(10)
            else:
                df_to_show = all_splits_df.loc[
                    all_splits_df["target"] == class_to_show
                ].head(10)
            st.dataframe(df_to_show)
            st.text_area(
                label="Latex code - chosen class",
                value=df_to_show.style.to_latex(),
            )

            st.subheader("First 10 observations of a chosen domain and text type")
            domain = st.selectbox(
                label="Select domain",
                options=["all", "hotels", "medicine", "products", "reviews"],
            )
            text_type = st.selectbox(
                label="Select text type",
                options=["Full text", "Tokenized to sentences"],
            )
            text_type_mapping_dict = {
                "Full text": "text",
                "Tokenized to sentences": "sentence",
            }

            # The configuration name is "<domain>_<text|sentence>".
            polemo_subset = load_dataset(
                self.dataset_name,
                f"{domain}_{text_type_mapping_dict[text_type]}",
            )
            df = pd.concat(
                [polemo_subset[subset].to_pandas() for subset in self.subsets]
            ).head(10)
            st.dataframe(df)
            st.text_area(
                label="Latex code - chosen domain and text type",
                value=df.style.to_latex(),
            )

        with word_searching:
            st.header("Observations containing a chosen word")
            searched_word = st.text_input(
                label="Enter the word you are looking for below"
            )
            # regex=False: the input is a literal word typed by the user, so
            # characters such as "(" or "*" must not be parsed as a pattern
            # (an invalid pattern would raise). na=False keeps the mask
            # strictly boolean if a text value is missing.
            matches = all_splits_df["text"].str.contains(
                searched_word, regex=False, na=False
            )
            df_to_show = all_splits_df.loc[matches]
            st.dataframe(df_to_show)
            st.text_area(
                label="Latex code - word search",
                value=df_to_show.style.to_latex(),
            )

        with dataset_statistics:
            st.header("Dataset statistics")
            st.subheader("Number of samples in each data split")
            split_sizes = {
                "Train": self.data_dict["train"].shape[0],
                "Validation": self.data_dict["validation"].shape[0],
                "Test": self.data_dict["test"].shape[0],
            }
            split_sizes["Total"] = sum(split_sizes.values())
            metrics_df = pd.DataFrame.from_dict(
                split_sizes, orient="index"
            ).reset_index()
            metrics_df.columns = ["Subset", "Number of samples"]
            st.dataframe(metrics_df)

            latex_df = metrics_df.style.to_latex()
            st.text_area(label="Latex code - split sizes", value=latex_df)

            # Class distribution in each subset
            st.subheader("Class distribution in each subset")
            target_unique_values = self.data_dict["train"]["target"].unique()
            # One row per split, one column per class; the values are
            # fractions of the split because of normalize=True.
            hist = (
                pd.DataFrame(
                    [
                        split_df["target"]
                        .value_counts(normalize=True)
                        .rename(split_name)
                        for split_name, split_df in self.data_dict.items()
                    ]
                )
                .reset_index()
                .rename({"index": "split_name"}, axis=1)
            )
            plot_data = [
                go.Bar(
                    name=str(target_value),
                    x=self.subsets,
                    y=hist[target_value].values,
                )
                for target_value in target_unique_values
            ]
            barchart_class_dist = go.Figure(data=plot_data)
            barchart_class_dist.update_layout(
                barmode="group",
                title_text="Barchart - class distribution",
                xaxis_title="Split name",
                # The bars show normalized frequencies, not raw counts.
                yaxis_title="Fraction of data points",
            )
            st.plotly_chart(barchart_class_dist, use_container_width=True)
            st.dataframe(hist)
            st.text_area(
                label="Latex code - class distribution",
                value=hist.style.to_latex(),
            )

            # Number of words per observation
            st.subheader("Number of words per observation in each subset")
            hist_data_num_words = [
                split_df["text"].apply(count_num_of_words)
                for split_df in self.data_dict.values()
            ]
            fig_num_words = ff.create_distplot(
                hist_data_num_words, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_words.update_traces(
                nbinsx=100, autobinx=True, selector={"type": "histogram"}
            )
            fig_num_words.update_layout(
                title_text="Histogram - number of words per observation",
                xaxis_title="Number of words",
            )
            st.plotly_chart(fig_num_words, use_container_width=True)

            # Number of characters per observation
            st.subheader("Number of characters per observation in each subset")
            hist_data_num_characters = [
                split_df["text"].apply(count_num_of_characters)
                for split_df in self.data_dict.values()
            ]
            fig_num_chars = ff.create_distplot(
                hist_data_num_characters, self.subsets, show_rug=False, bin_size=1
            )
            fig_num_chars.update_layout(
                title_text="Histogram - number of characters per observation",
                xaxis_title="Number of characters",
            )
            st.plotly_chart(fig_num_chars, use_container_width=True)

        with tsne_projection:
            st.header("t-SNE projection of the dataset")
            subset_to_project = st.selectbox(
                label="Select subset to project", options=self.subsets
            )
            subset_df = self.data_dict[subset_to_project]
            sentences = subset_df["text"].values
            reducer = TSNE(n_components=2)
            # embed_sentence is a project helper; presumably it returns one
            # fixed-length vector per text - TODO confirm.
            embedded_sentences = np.array(
                [embed_sentence(text) for text in sentences]
            )
            transformed_embeddings = reducer.fit_transform(embedded_sentences)
            fig, ax = plt.subplots()
            ax.scatter(
                x=transformed_embeddings[:, 0],
                y=transformed_embeddings[:, 1],
                # One colour per point, looked up by the point's target value.
                c=[PLOT_COLOR_PALETTE[target] for target in subset_df["target"].values],
            )
            st.pyplot(fig)