Mariusz Kossakowski committed on
Commit
010bd36
1 Parent(s): d572e8e

Add aspectemo dataset

Browse files
clarin_datasets/aspectemo_dataset.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from datasets import load_dataset
3
+ import streamlit as st
4
+
5
+ from clarin_datasets.dataset_to_show import DatasetToShow
6
+
7
+
8
class AspectEmoDataset(DatasetToShow):
    """Streamlit view of the ``clarin-pl/aspectemo`` dataset.

    ``load_data`` fetches the dataset with ``datasets.load_dataset`` and keeps
    every configured subset as a pandas DataFrame. ``show_dataset`` renders the
    dataset name, its description and a ten-row preview (together with the
    LaTeX source of that preview) on the current Streamlit page.
    """

    def __init__(self):
        # Identifier handed to ``datasets.load_dataset``.
        self.dataset_name = "clarin-pl/aspectemo"
        # Subsets that ``load_data`` converts to DataFrames.
        self.subsets = ["train", "test"]
        # Maps subset name -> DataFrame; stays None until ``load_data`` runs.
        self.data_dict = None
        # NOTE: the category list below now names all six categories; it
        # previously announced six but listed five, while the "Output" section
        # already included the ambiguous class.
        self.description = """
        Description AspectEmo Corpus is an extended version of a publicly available PolEmo 2.0
        corpus of Polish customer reviews used in many projects on the use of different methods in sentiment
        analysis. The AspectEmo corpus consists of four subcorpora, each containing online customer reviews from the
        following domains: school, medicine, hotels, and products. All documents are annotated at the aspect level
        with six sentiment categories: strong negative (minus_m), weak negative (minus_s), neutral (zero),
        weak positive (plus_s), strong positive (plus_m), ambiguous (amb).

        Tasks (input, output and metrics)

        Aspect-based sentiment analysis (ABSA) is a text analysis method that
        categorizes data by aspects and identifies the sentiment assigned to each aspect. It is the sequence tagging
        task.

        Input ('tokens' column): sequence of tokens

        Output ('labels' column): sequence of predicted tokens’ classes ("O" + 6 possible classes: strong negative (
        a_minus_m), weak negative (a_minus_s), neutral (a_zero), weak positive (a_plus_s), strong positive (
        a_plus_m), ambiguous (a_amb) )

        Domain: school, medicine, hotels and products

        Measurements:

        Example: ['Dużo', 'wymaga', ',', 'ale', 'bardzo', 'uczciwy', 'i', 'przyjazny', 'studentom', '.', 'Warto', 'chodzić',
        'na', 'konsultacje', '.', 'Docenia', 'postępy', 'i', 'zaangażowanie', '.', 'Polecam', '.'] → ['O', 'a_plus_s', 'O',
        'O', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'a_zero', 'O', 'a_plus_m', 'O', 'O', 'O', 'O', 'O', 'O']
        """

    def load_data(self):
        """Load the dataset and store each configured subset as a DataFrame.

        Populates ``self.data_dict`` with one ``subset name -> DataFrame``
        entry per name in ``self.subsets``.
        """
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def show_dataset(self):
        """Render the title, description and a ten-row preview in Streamlit.

        If ``load_data`` has not been called yet, it is called here first so
        that the preview never fails on a missing ``data_dict``.
        """
        if self.data_dict is None:
            self.load_data()

        header = st.container()
        description = st.container()
        dataframe_head = st.container()

        with header:
            st.title(self.dataset_name)

        with description:
            st.header("Dataset description")
            st.write(self.description)

        # Only ten rows are displayed, so concatenate the first ten rows of
        # each subset instead of copying every full subset; the leading ten
        # rows (and their index) of the result are the same either way.
        df_to_show = pd.concat(
            [subset_df.head(10) for subset_df in self.data_dict.values()],
            axis="rows",
        ).head(10)

        with dataframe_head:
            st.header("First 10 observations of the dataset")
            st.dataframe(df_to_show)
            st.text_area(label="Latex code", value=df_to_show.style.to_latex())