Spaces:
Sleeping
Sleeping
# -*- coding:utf-8 -*- | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib | |
matplotlib.use('Agg') | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import sent_tokenize, word_tokenize | |
from n4a_analytics_lib.project import Project | |
class GlobalStatistics(Project):
    """Label-count statistics over every annotated file of a project.

    The project is loaded through ``Project`` with ``type="global"``; the
    resulting ``self.annotations`` mapping is flattened into pandas tables
    that count labels overall and per source file.
    """

    def __init__(self, zip_project):
        """Load the project and precompute the label count tables.

        Args:
            zip_project: Project archive, forwarded unchanged to ``Project``.
        """
        super().__init__(zip_project=zip_project, type="global")
        # One (source file, label) pair per entry of each file's 'labels' list.
        self.data = [
            (src_file, ne_label)
            for src_file, ann in self.annotations.items()
            for ne_label in ann['labels']
        ]
        self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
        # Occurrences of each label across the whole project.
        self.df_i = (
            self.df_base.groupby(["LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        # Occurrences of each label within each source file.
        self.df_details = (
            self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        self.total_annotations_project = self.df_i['TOTAL'].sum()

    def create_plot(self, type_data):
        """Draw a bar chart of the label counts of one source file.

        Args:
            type_data: Value of the ``SOURCE_FILE`` column to plot; it is
                also used as the figure title.

        Returns:
            The matplotlib figure that holds the bar plot.
        """
        # Local import: the module only imports the top-level ``matplotlib``
        # package, which does not guarantee the ``figure`` submodule is bound.
        from matplotlib.figure import Figure

        # apply data filter
        data_tab_filtered = self.df_details.loc[
            self.df_details['SOURCE_FILE'] == type_data
        ]
        # Draw on a dedicated figure. Without an explicit ``ax`` seaborn
        # reuses pyplot's current axes, so successive calls would stack
        # their bars, labels and titles on the same plot.
        ax = Figure().subplots()
        sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered, ax=ax)
        # add title to plot
        ax.figure.suptitle(type_data)
        # add value labels to bars
        for container in ax.containers:
            ax.bar_label(container)
        return ax.figure
class IaaStatistics(Project):
    """Per-annotator views of a project, for inter-annotator agreement.

    The project is loaded through ``Project`` with ``type="iaa"``. Coders in
    ``self.annotators`` are paired positionally with the entries of
    ``self.annotations`` (in iteration order).
    """

    def __init__(self, zip_project, baseline_text):
        """Load the project and index mentions and labels by coder.

        Args:
            zip_project: Project archive, forwarded unchanged to ``Project``.
            baseline_text: UTF-8 encoded bytes of the annotated text.
        """
        super().__init__(zip_project=zip_project, type="iaa")
        self.baseline_text = baseline_text.decode('utf-8')
        self.mentions_per_coder = self.extract_refs(
            self.annotations, self.annotators, type="mentions"
        )
        self.labels_per_coder = self.extract_refs(
            self.annotations, self.annotators, type="labels"
        )
        # For each coder, map every mention to the label stored at the same
        # position; a repeated mention keeps its last label.
        self.annotations_per_coders = {
            coder: dict(zip(ann['mentions'], ann['labels']))
            for coder, ann in zip(self.annotators, self.annotations.values())
        }

    # Must be a static method: ``__init__`` calls it through ``self`` while
    # passing ``annotations`` and ``annotators`` explicitly, so a plain
    # function here would receive the instance as ``annotations`` and fail.
    @staticmethod
    def extract_refs(annotations, annotators, type):
        """Map each coder to one field of the matching annotation record.

        Args:
            annotations: Mapping whose values are dict-like records.
            annotators: Coder names, paired positionally with the values of
                ``annotations``.
            type: Key to extract from each record; ``__init__`` uses
                ``"mentions"`` and ``"labels"``.

        Returns:
            A dict ``{coder: record[type]}``; coders whose record lacks the
            key are left out.
        """
        return {
            coder: ann[type]
            for coder, ann in zip(annotators, annotations.values())
            if type in ann
        }

    def analyze_text(self):
        """Return the size of the baseline text.

        Returns:
            A list ``[sentences, words, characters]``; sentences and words
            are counted with the NLTK French tokenizers.
        """
        return [
            len(sent_tokenize(self.baseline_text, language="french")),
            len(word_tokenize(self.baseline_text, language="french")),
            len(self.baseline_text),
        ]