# NOTE(review): the six lines below are Hugging Face file-page residue from a
# copy/paste; commented out so this module parses as Python.
# lterriel's picture
# add app
# 519b419
# raw
# history blame
# 2.59 kB
# -*- coding:utf-8 -*-
import pandas as pd
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from n4a_analytics_lib.project import Project
class GlobalStatistics(Project):
    """Aggregate annotation counts for a whole annotation project.

    Builds per-label and per-source-file count tables from the project's
    annotations and renders a bar plot of label totals for one source file.
    """

    def __init__(self, zip_project):
        """Load the project archive and precompute count DataFrames.

        :param zip_project: annotation-project archive, forwarded to
            :class:`Project` (loaded with ``type="global"``).
        """
        super().__init__(zip_project=zip_project, type="global")
        # One (source_file, label) row per named-entity annotation.
        self.data = [
            (src_file, ne_label)
            for src_file, ann in self.annotations.items()
            for ne_label in ann['labels']
        ]
        self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
        # Label totals over the whole project.
        self.df_i = (
            self.df_base.groupby(["LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        # Label totals broken down per source file.
        self.df_details = (
            self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"]
            .count()
            .reset_index(name="TOTAL")
        )
        self.total_annotations_project = self.df_i['TOTAL'].sum()

    def create_plot(self, type_data):
        """Return a bar-plot figure of label counts for one source file.

        :param type_data: ``SOURCE_FILE`` value used to filter ``df_details``.
        :return: a ``matplotlib.figure.Figure`` with one value-labeled bar
            per label.
        """
        from matplotlib.figure import Figure

        # apply data filter
        data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
        # BUG FIX: the original let seaborn draw on pyplot's implicit
        # *current* axes, so successive calls stacked bars and titles onto
        # the same figure. Draw on a dedicated Figure instead (this also
        # avoids accumulating figures in pyplot's global registry).
        fig = Figure()
        ax = fig.subplots()
        sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered, ax=ax)
        # add title to plot
        fig.suptitle(type_data)
        # add value labels to bars
        for container in ax.containers:
            ax.bar_label(container)
        return fig
class IaaStatistics(Project):
    """Inter-annotator-agreement view over an annotation project.

    Exposes, per annotator ("coder"), the annotated mentions, their labels,
    and a positional mention->label mapping, plus simple size metrics on the
    shared baseline text.
    """

    def __init__(self, zip_project, baseline_text):
        """Load the project archive (``type="iaa"``) and index annotations per coder.

        :param zip_project: annotation-project archive forwarded to :class:`Project`.
        :param baseline_text: UTF-8 encoded bytes of the text all coders annotated.
        """
        super().__init__(zip_project=zip_project, type="iaa")
        self.baseline_text = baseline_text.decode('utf-8')
        # self.docs = {}
        # self.pairwise = {}
        # self.similar_mention = []
        self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
        self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
        # coder -> {mention: label}, pairing mentions with labels positionally.
        # NOTE(review): assumes self.annotators and self.annotations.items()
        # are aligned in order — TODO confirm against Project.
        self.annotations_per_coders = {}
        for coder, (_, payload) in zip(self.annotators, self.annotations.items()):
            self.annotations_per_coders[coder] = dict(zip(payload['mentions'], payload['labels']))

    @staticmethod
    def extract_refs(annotations, annotators, type):
        """Map each coder to the ``type`` entry (e.g. "mentions", "labels")
        of its annotation payload; coders whose payload lacks ``type`` are
        omitted.
        """
        extracted = {}
        for coder, (_, payload) in zip(annotators, annotations.items()):
            for ref, data in payload.items():
                if ref == type:
                    extracted[coder] = data
        return extracted

    def analyze_text(self):
        """returns total sentences, words and characters
        in list format
        """
        sentences = sent_tokenize(self.baseline_text, language="french")
        words = word_tokenize(self.baseline_text, language="french")
        return [len(sentences), len(words), len(self.baseline_text)]