Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Files Community

Mariusz Kossakowski committed on Aug 18, 2022

Commit

010bd36

•

1 Parent(s): d572e8e

Add aspectemo dataset

Browse files

Files changed (1) hide show

clarin_datasets/aspectemo_dataset.py +64 -0

clarin_datasets/aspectemo_dataset.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import pandas as pd
+from datasets import load_dataset
+import streamlit as st
+from clarin_datasets.dataset_to_show import DatasetToShow
+class AspectEmoDataset(DatasetToShow):
+    def __init__(self):
+        self.dataset_name = "clarin-pl/aspectemo"
+        self.subsets = ["train", "test"]
+        self.description = """
+        Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
+        corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
+        analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
+        following domains: school, medicine, hotels, and products. All documents are annotated at the aspect level
+        with six sentiment categories: strong negative (minus_m), weak negative (minus_s), neutral (zero),
+        weak positive (plus_s), strong positive (plus_m).
+        Tasks (input, output and metrics)
+        Aspect-based sentiment analysis (ABSA) is a text analysis method that
+        categorizes data by aspects and identifies the sentiment assigned to each aspect. It is the sequence tagging
+        task.
+        Input ('tokens' column): sequence of tokens
+        Output ('labels' column): sequence of predicted tokens’ classes ("O" + 6 possible classes: strong negative (
+        a_minus_m), weak negative (a_minus_s), neutral (a_zero), weak positive (a_plus_s), strong positive (
+        a_plus_m), ambiguous (a_amb) )
+        Domain: school, medicine, hotels and products
+        Measurements:
+        Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
+        'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
+        'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
+        """
+    def load_data(self):
+        raw_dataset = load_dataset(self.dataset_name)
+        self.data_dict = {
+            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
+        }
+    def show_dataset(self):
+        header = st.container()
+        description = st.container()
+        dataframe_head = st.container()
+        with header:
+            st.title(self.dataset_name)
+        with description:
+            st.header("Dataset description")
+            st.write(self.description)
+        full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
+        with dataframe_head:
+            df_to_show = full_dataframe.head(10)
+            st.header("First 10 observations of the dataset")
+            st.dataframe(df_to_show)
+            st.text_area(label="Latex code", value=df_to_show.style.to_latex())