Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
•
010bd36
1
Parent(s):
d572e8e
Add aspectemo dataset
Browse files
clarin_datasets/aspectemo_dataset.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from datasets import load_dataset
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
from clarin_datasets.dataset_to_show import DatasetToShow
|
6 |
+
|
7 |
+
|
8 |
+
class AspectEmoDataset(DatasetToShow):
    """Streamlit page for the ``clarin-pl/aspectemo`` dataset.

    The object stores the dataset identifier, the subsets to load and a
    human-readable description. ``load_data`` fetches the subsets as pandas
    DataFrames and ``show_dataset`` renders the title, the description and a
    short preview of the data.
    """

    def __init__(self) -> None:
        """Set the dataset name, the subsets to load and the description text."""
        self.dataset_name = "clarin-pl/aspectemo"
        self.subsets = ["train", "test"]
        # The text is passed to ``st.write`` and rendered as Markdown, so its
        # lines are kept flush-left: indented paragraphs would otherwise be
        # displayed as code blocks.
        self.description = """
Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
following domains: school, medicine, hotels, and products. All documents are annotated at the aspect level
with six sentiment categories: strong negative (minus_m), weak negative (minus_s), neutral (zero),
weak positive (plus_s), strong positive (plus_m), ambiguous (amb).

Tasks (input, output and metrics)

Aspect-based sentiment analysis (ABSA) is a text analysis method that
categorizes data by aspects and identifies the sentiment assigned to each aspect. It is the sequence tagging
task.

Input ('tokens' column): sequence of tokens

Output ('labels' column): sequence of predicted tokens’ classes ("O" + 6 possible classes: strong negative (
a_minus_m), weak negative (a_minus_s), neutral (a_zero), weak positive (a_plus_s), strong positive (
a_plus_m), ambiguous (a_amb) )

Domain: school, medicine, hotels and products

Measurements:

Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
"""

    def load_data(self) -> None:
        """Load every subset in ``self.subsets`` into ``self.data_dict``.

        ``self.data_dict`` maps each subset name to the pandas DataFrame
        produced by calling ``to_pandas()`` on that split of the dataset
        returned by ``load_dataset(self.dataset_name)``.
        """
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self) -> None:
        """Render the title, the description and a 10-row preview of the data.

        The preview is shown both as an interactive table and as LaTeX source
        in a text area. If ``load_data`` has not been called yet, it is called
        here first so the page does not fail on a missing ``data_dict``.
        """
        # Load lazily instead of failing with AttributeError when the caller
        # did not invoke load_data() beforehand.
        if getattr(self, "data_dict", None) is None:
            self.load_data()

        header = st.container()
        description = st.container()
        dataframe_head = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        # Only the first 10 rows are displayed, so take the head of each subset
        # before concatenating: the first 10 rows of the concatenation can only
        # come from the first 10 rows of each part, and this avoids copying the
        # full dataset. ``axis=0`` is the documented spelling for row-wise
        # concatenation.
        preview_rows = 10
        subset_heads = [
            frame.head(preview_rows) for frame in self.data_dict.values()
        ]
        df_to_show = pd.concat(subset_heads, axis=0).head(preview_rows)

        with dataframe_head:
            st.header("First 10 observations of the dataset")
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())
|