from datasets import load_dataset
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow


class KpwrNerDataset(DatasetToShow):
    """Streamlit page for the ``clarin-pl/kpwr-ner`` dataset.

    Holds the dataset identifier and a human-readable description, loads the
    dataset subsets into pandas DataFrames, and renders the page title and
    description.
    """

    def __init__(self) -> None:
        """Initialise the base class, then set the dataset name and description."""
        super().__init__()
        self.dataset_name = "clarin-pl/kpwr-ner"
        # Text shown under the "Dataset description" header. The fragments
        # below concatenate to the exact description text, including the
        # leading and trailing space.
        self.description = (
            " KPWR-NER is a part the Polish Corpus of Wrocław University of "
            "Technology (Korpus Języka Polskiego Politechniki Wrocławskiej). "
            "Its objective is named entity recognition for fine-grained "
            "categories of entities. It is the ‘n82’ version of the KPWr, "
            "which means that number of classes is restricted to 82 "
            "( originally 120). During corpus creation, texts were annotated "
            "by humans from various sources, covering many domains and "
            "genres. Tasks (input, output and metrics) Named entity "
            "recognition (NER) - tagging entities in text with their "
            "corresponding type. Input ('tokens' column): sequence of tokens "
            "Output ('ner' column): sequence of predicted tokens’ classes in "
            "BIO notation (82 possible classes, described in detail in the "
            "annotation guidelines) example: "
            "[‘Roboty’, ‘mają’, ‘kilkanaście’, ‘lat’, ‘i’, ‘pochodzą’, ‘z’, "
            "‘USA’, ‘,’, ‘Wysokie’, ‘napięcie’, ‘jest’, ‘dużo’, ‘młodsze’, "
            "‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → "
            "[‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, "
            "‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, "
            "‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, "
            "‘B-nam_loc_gpe_country’, ‘O’] "
        )

    def load_data(self) -> None:
        """Load the dataset and store one pandas DataFrame per subset.

        The result is stored in ``self.data_dict``, keyed by subset name.
        ``self.subsets`` is not set in this class; presumably the base class
        provides it (verify against ``DatasetToShow``).
        """
        loaded = load_dataset(self.dataset_name)
        frames = {}
        for subset_name in self.subsets:
            frames[subset_name] = loaded[subset_name].to_pandas()
        # Assign once at the end so the attribute is only set when every
        # subset converted successfully.
        self.data_dict = frames

    def show_dataset(self) -> None:
        """Render the page title and the dataset description."""
        # Containers are created in display order. The third one
        # (``dataframe_head``) is created but nothing is written into it
        # in this method.
        title_area, about_area, dataframe_head = (
            st.container(),
            st.container(),
            st.container(),
        )

        with title_area:
            st.title(self.dataset_name)

        with about_area:
            st.header("Dataset description")
            st.write(self.description)