# Page banner captured together with this file (not part of the code): "Spaces: Runtime error"
import textwrap

import pandas as pd
import streamlit as st
from datasets import load_dataset

from clarin_datasets.dataset_to_show import DatasetToShow
class AspectEmoDataset(DatasetToShow):
    """Streamlit view of the ``clarin-pl/aspectemo`` dataset.

    ``load_data`` fetches the dataset and converts the configured subsets to
    pandas DataFrames; ``show_dataset`` renders the title, the description and
    a 10-row preview (with its LaTeX source) through Streamlit.
    """

    def __init__(self):
        """Set the dataset identifier, the subsets to load and the description."""
        self.dataset_name = "clarin-pl/aspectemo"
        self.subsets = ["train", "test"]
        # Filled by load_data(); None means the data has not been loaded yet.
        self.data_dict = None
        # NOTE(review): the text says "six sentiment categories" but the first
        # list names only five (the "ambiguous" class appears only in the
        # Output line below) -- confirm the missing tag before editing the text.
        # dedent() lets the literal follow the code's indentation while the
        # displayed text stays flush-left.
        self.description = textwrap.dedent(
            """
            Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
            corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
            analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
            following domains: school, medicine, hotels, and products. All documents are annotated at the aspect level
            with six sentiment categories: strong negative (minus_m), weak negative (minus_s), neutral (zero),
            weak positive (plus_s), strong positive (plus_m).
            Tasks (input, output and metrics)
            Aspect-based sentiment analysis (ABSA) is a text analysis method that
            categorizes data by aspects and identifies the sentiment assigned to each aspect. It is the sequence tagging
            task.
            Input ('tokens' column): sequence of tokens
            Output ('labels' column): sequence of predicted tokens’ classes ("O" + 6 possible classes: strong negative (
            a_minus_m), weak negative (a_minus_s), neutral (a_zero), weak positive (a_plus_s), strong positive (
            a_plus_m), ambiguous (a_amb) )
            Domain: school, medicine, hotels and products
            Measurements:
            Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
            'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
            'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
            """
        )

    def load_data(self):
        """Load the dataset and keep each configured subset as a DataFrame.

        Populates ``self.data_dict`` with one pandas DataFrame per name in
        ``self.subsets``, keyed by the subset name.
        """
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self):
        """Render the title, the description and a 10-row preview.

        The data is loaded on demand when ``load_data`` has not been called
        yet, so the method can be used directly on a fresh instance.
        """
        if not self.data_dict:
            self.load_data()

        # Containers are created up front so the page sections keep this order.
        header = st.container()
        description = st.container()
        dataframe_head = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        # Subsets are stacked row-wise in the order given by self.subsets.
        full_dataframe = pd.concat(list(self.data_dict.values()), axis="rows")

        with dataframe_head:
            df_to_show = full_dataframe.head(10)
            st.header("First 10 observations of the dataset")
            st.dataframe(df_to_show)
            # Same preview as LaTeX source, shown in an editable text box.
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())