import logging
from typing import (
    Any,
    Callable,
    Dict,
    Generator,
    Iterator,
    List,
    NamedTuple,
    Optional,
    Tuple,
    Union,
)

import numpy as np
import torch
import tqdm
from torch.utils.data import IterableDataset
from transformers import AutoTokenizer, PreTrainedTokenizer

from relik.reader.data.relik_reader_data_utils import (
    add_noise_to_value,
    batchify,
    batchify_matrices,
    batchify_tensor,
    chunks,
    flatten,
)
from relik.reader.data.relik_reader_sample import (
    RelikReaderSample,
    load_relik_reader_samples,
)
from relik.reader.utils.special_symbols import NME_SYMBOL

logger = logging.getLogger(__name__)


class TokenizationOutput(NamedTuple):
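    """Container for the tensors produced when a sample is encoded.

    `prediction_mask` marks positions that are excluded from span prediction,
    while the two boolean masks flag the candidate special symbols (relation
    markers and, when entity types are used, entity-type markers).
    """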
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    token_type_ids: torch.Tensor
    prediction_mask: torch.Tensor
    special_symbols_mask: torch.Tensor
    special_symbols_mask_entities: torch.Tensor


class RelikREDataset(IterableDataset):
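    """Iterable dataset used by the ReLiK reader for relation extraction.

    Samples are read from `dataset_path` (or taken from `samples`), encoded
    together with their span/triplet candidates, labelled, and yielded as
    ready-made padded batches (see `materialize_batches`).
    """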
    def __init__(
        self,
        dataset_path: str,
        materialize_samples: bool,
        transformer_model: Union[str, PreTrainedTokenizer],
        special_symbols: List[str],
        shuffle_candidates: Optional[Union[bool, float]] = False,
        flip_candidates: Optional[Union[bool, float]] = False,
        for_inference: bool = False,
        special_symbols_types: Optional[List[str]] = None,
        noise_param: float = 0.1,
        sorting_fields: Optional[List[str]] = None,
        tokens_per_batch: int = 2048,
        batch_size: Optional[int] = None,
        max_batch_size: int = 128,
        section_size: int = 500_000,
        prebatch: bool = True,
        add_gold_candidates: bool = True,
        use_nme: bool = False,
        min_length: int = -1,
        max_length: int = 2048,
        max_triplets: int = 50,
        max_spans: int = 100,
        model_max_length: int = 2048,
        skip_empty_training_samples: bool = True,
        drop_last: bool = False,
        samples: Optional[Iterator[RelikReaderSample]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if special_symbols_types is None:
            special_symbols_types = []

        self.dataset_path = dataset_path
        self.materialize_samples = materialize_samples
        self.samples: Optional[List[RelikReaderSample]] = samples
        if self.materialize_samples and self.samples is None:
            self.samples = list()

        if isinstance(transformer_model, str):
            self.tokenizer = self._build_tokenizer(
                transformer_model, special_symbols + special_symbols_types
            )
        else:
            self.tokenizer = transformer_model
        self.special_symbols = special_symbols
        self.special_symbols_types = special_symbols_types
        self.shuffle_candidates = shuffle_candidates
        self.flip_candidates = flip_candidates
        self.for_inference = for_inference
        self.noise_param = noise_param
        self.batching_fields = ["input_ids"]
        self.sorting_fields = (
            sorting_fields if sorting_fields is not None else self.batching_fields
        )
        self.add_gold_candidates = add_gold_candidates
        self.use_nme = use_nme
        self.min_length = min_length
        self.max_length = max_length
        self.model_max_length = (
            model_max_length
            if model_max_length < self.tokenizer.model_max_length
            else self.tokenizer.model_max_length
        )
        self.transformer_model = transformer_model
        self.skip_empty_training_samples = skip_empty_training_samples
        self.drop_last = drop_last

        self.tokens_per_batch = tokens_per_batch
        self.batch_size = batch_size
        self.max_batch_size = max_batch_size
        self.max_triplets = max_triplets
        self.max_spans = max_spans
        self.section_size = section_size
        self.prebatch = prebatch

    def _build_tokenizer(self, transformer_model: str, special_symbols: List[str]):
        return AutoTokenizer.from_pretrained(
            transformer_model,
            additional_special_tokens=[ss for ss in special_symbols],
            add_prefix_space=True,
        )

    @staticmethod
    def get_special_symbols_re(num_entities: int, use_nme: bool = False) -> List[str]:
        if use_nme:
            return [NME_SYMBOL] + [f"[R-{i}]" for i in range(num_entities)]
        else:
            return [f"[R-{i}]" for i in range(num_entities)]

    @staticmethod
    def get_special_symbols(num_entities: int) -> List[str]:
        return [NME_SYMBOL] + [f"[E-{i}]" for i in range(num_entities)]

    @property
    def fields_batcher(self) -> Dict[str, Union[None, Callable[[list], Any]]]:
        fields_batchers = {
            "input_ids": lambda x: batchify(
                x, padding_value=self.tokenizer.pad_token_id
            ),
            "attention_mask": lambda x: batchify(x, padding_value=0),
            "token_type_ids": lambda x: batchify(x, padding_value=0),
            "prediction_mask": lambda x: batchify(x, padding_value=1),
            "global_attention": lambda x: batchify(x, padding_value=0),
            "token2word": None,
            "sample": None,
            "special_symbols_mask": lambda x: batchify(x, padding_value=False),
            "special_symbols_mask_entities": lambda x: batchify(x, padding_value=False),
            "start_labels": lambda x: batchify(x, padding_value=-100),
            "end_labels": lambda x: batchify_matrices(x, padding_value=-100),
            "disambiguation_labels": lambda x: batchify(x, padding_value=-100),
            "relation_labels": lambda x: batchify_tensor(x, padding_value=-100),
            "predictable_candidates": None,
        }
        if (
            isinstance(self.transformer_model, str)
            and "roberta" in self.transformer_model
        ) or (
            isinstance(self.transformer_model, PreTrainedTokenizer)
            and "roberta" in self.transformer_model.config.model_type
        ):
            del fields_batchers["token_type_ids"]

        return fields_batchers
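    # A None batcher means the field is passed through as a plain Python list;
    # label fields are padded with -100, the conventional ignore index for
    # PyTorch losses, so padded positions do not contribute to the loss.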

    def _build_input_ids(
        self, sentence_input_ids: List[int], candidates_input_ids: List[List[int]]
    ) -> List[int]:
        return (
            [self.tokenizer.cls_token_id]
            + sentence_input_ids
            + [self.tokenizer.sep_token_id]
            + flatten(candidates_input_ids)
            + [self.tokenizer.sep_token_id]
        )

    def _build_input(self, text: List[str], candidates: List[List[str]]) -> List[str]:
        return (
            text
            + [self.tokenizer.sep_token]
            + flatten(candidates)
            + [self.tokenizer.sep_token]
        )
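    # The resulting sequence is laid out as:
    #   [CLS] <sentence tokens> [SEP] <candidate 1> <candidate 2> ... [SEP]
    # where each candidate is rendered as "<special symbol> <candidate text>".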

    def _build_tokenizer_essentials(
        self, input_ids, original_sequence, ents=0
    ) -> TokenizationOutput:
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)

        if len(self.special_symbols_types) > 0:
            special_symbols_mask = input_ids >= self.tokenizer.vocab_size
            special_symbols_mask_entities = special_symbols_mask.clone()
            special_symbols_mask_entities[
                special_symbols_mask_entities.cumsum(0) > ents
            ] = False
            token_type_ids = (torch.cumsum(special_symbols_mask, dim=0) > 0).long()
            special_symbols_mask = special_symbols_mask ^ special_symbols_mask_entities
        else:
            special_symbols_mask = input_ids >= self.tokenizer.vocab_size
            special_symbols_mask_entities = special_symbols_mask.clone()
            token_type_ids = (torch.cumsum(special_symbols_mask, dim=0) > 0).long()

        prediction_mask = token_type_ids.roll(shifts=-1, dims=0)
        prediction_mask[-1] = 1
        prediction_mask[0] = 1

        assert len(prediction_mask) == len(input_ids)

        return TokenizationOutput(
            input_ids,
            attention_mask,
            token_type_ids,
            prediction_mask,
            special_symbols_mask,
            special_symbols_mask_entities,
        )
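    # Tokens added as special symbols get ids at or above the base vocabulary
    # size, which is why `input_ids >= vocab_size` recovers the candidate
    # markers. `prediction_mask` covers the candidate block plus the [CLS] and
    # final [SEP] positions; those positions are set to -100 in _build_labels.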

    @staticmethod
    def _subindex(lst, target_values, dims):
        for i, sublist in enumerate(lst):
            match = all(sublist[dim] == target_values[dim] for dim in dims)
            if match:
                return i
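    # Returns the index of the first element of `lst` matching `target_values`
    # on the given `dims`, or None when there is no match.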

    def _build_labels(
        self,
        sample,
        tokenization_output: TokenizationOutput,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        start_labels = [0] * len(tokenization_output.input_ids)
        end_labels = []
        end_labels_tensor = [0] * len(tokenization_output.input_ids)

        sample.entities.sort(key=lambda x: (x[0], x[1]))

        prev_start_bpe = -1
        entities_untyped = list(set([(ce[0], ce[1]) for ce in sample.entities]))
        entities_untyped.sort(key=lambda x: (x[0], x[1]))
        if len(self.special_symbols_types) > 0:
            sample.entities = [(ce[0], ce[1], ce[2]) for ce in sample.entities]
            disambiguation_labels = torch.zeros(
                len(entities_untyped),
                len(sample.span_candidates) + len(sample.triplet_candidates),
            )
        else:
            sample.entities = [(ce[0], ce[1], "") for ce in sample.entities]
            disambiguation_labels = torch.zeros(
                len(entities_untyped), len(sample.triplet_candidates)
            )
        ignored_labels_indices = tokenization_output.prediction_mask == 1
        offset = 0
        for idx, c_ent in enumerate(sample.entities):
            while len(sample.word2token[c_ent[0]]) == 0:
                c_ent = (c_ent[0] + 1, c_ent[1], c_ent[2])
                if len(sample.word2token) == c_ent[0]:
                    c_ent = None
                    break
            if c_ent is None:
                continue
            while len(sample.word2token[c_ent[1] - 1]) == 0:
                c_ent = (c_ent[0], c_ent[1] + 1, c_ent[2])
                if len(sample.word2token) == c_ent[1]:
                    c_ent = None
                    break
            if c_ent is None:
                continue
            start_bpe = sample.word2token[c_ent[0]][0] + 1
            end_bpe = sample.word2token[c_ent[1] - 1][-1] + 1
            class_index = idx
            start_labels[start_bpe] = class_index + 1
            if start_bpe != prev_start_bpe:
                end_labels.append(end_labels_tensor.copy())
                end_labels[-1][:start_bpe] = [-100] * start_bpe
                end_labels[-1][end_bpe] = class_index + 1
            elif end_labels[-1][end_bpe] == 0:
                end_labels[-1][end_bpe] = class_index + 1
            else:
                offset += 1
                prev_start_bpe = start_bpe
                continue
            if len(self.special_symbols_types) > 0:
                if c_ent[2] in sample.span_candidates:
                    entity_type_idx = sample.span_candidates.index(c_ent[2])
                else:
                    entity_type_idx = 0
                disambiguation_labels[idx - offset, entity_type_idx] = 1
            prev_start_bpe = start_bpe

        start_labels = torch.tensor(start_labels, dtype=torch.long)
        start_labels[ignored_labels_indices] = -100

        end_labels = torch.tensor(end_labels, dtype=torch.long)
        end_labels[ignored_labels_indices.repeat(len(end_labels), 1)] = -100

        relation_labels = torch.zeros(
            len(entities_untyped), len(entities_untyped), len(sample.triplet_candidates)
        )

        for re in sample.triplets:
            if re["relation"]["name"] not in sample.triplet_candidates:
                re_class_index = len(sample.triplet_candidates) - 1
            else:
                re_class_index = sample.triplet_candidates.index(re["relation"]["name"])

            subject_class_index = self._subindex(
                entities_untyped, (re["subject"]["start"], re["subject"]["end"]), (0, 1)
            )
            object_class_index = self._subindex(
                entities_untyped, (re["object"]["start"], re["object"]["end"]), (0, 1)
            )

            relation_labels[subject_class_index, object_class_index, re_class_index] = 1

            if len(self.special_symbols_types) > 0:
                disambiguation_labels[
                    subject_class_index, re_class_index + len(sample.span_candidates)
                ] = 1
                disambiguation_labels[
                    object_class_index, re_class_index + len(sample.span_candidates)
                ] = 1
            else:
                disambiguation_labels[subject_class_index, re_class_index] = 1
                disambiguation_labels[object_class_index, re_class_index] = 1
        return start_labels, end_labels, disambiguation_labels, relation_labels
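    # Shapes of the returned tensors, with seq = number of input ids,
    # E = number of distinct entity spans, S = span candidates and
    # T = triplet candidates:
    #   start_labels: (seq,)               end_labels: (#distinct starts, seq)
    #   disambiguation_labels: (E, S + T)  relation_labels: (E, E, T)
    # (disambiguation_labels is (E, T) when no entity-type symbols are used).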

    def __iter__(self):
        dataset_iterator = self.dataset_iterator_func()
        current_dataset_elements = []
        i = None
        for i, dataset_elem in enumerate(dataset_iterator, start=1):
            if (
                self.section_size is not None
                and len(current_dataset_elements) == self.section_size
            ):
                for batch in self.materialize_batches(current_dataset_elements):
                    yield batch
                current_dataset_elements = []
            current_dataset_elements.append(dataset_elem)
            if i % 50_000 == 0:
                logger.info(f"Processed {i} elements")
        if len(current_dataset_elements) != 0:
            for batch in self.materialize_batches(current_dataset_elements):
                yield batch
        if i is not None:
            logger.debug(f"Dataset finished: processed {i} elements")
        else:
            logger.warning("Dataset empty")

    def dataset_iterator_func(self):
        data_samples = (
            load_relik_reader_samples(self.dataset_path)
            if self.samples is None
            or (isinstance(self.samples, list) and len(self.samples) == 0)
            else self.samples
        )
        if self.materialize_samples:
            data_acc = []

        for sample in data_samples:
            if self.materialize_samples and sample.materialize is not None:
                # Reuse the cached tokenization output and labels.
                materialized = sample.materialize
                del sample.materialize
                yield {
                    "input_ids": materialized["tokenization_output"].input_ids,
                    "attention_mask": materialized[
                        "tokenization_output"
                    ].attention_mask,
                    "token_type_ids": materialized[
                        "tokenization_output"
                    ].token_type_ids,
                    "prediction_mask": materialized[
                        "tokenization_output"
                    ].prediction_mask,
                    "special_symbols_mask": materialized[
                        "tokenization_output"
                    ].special_symbols_mask,
                    "special_symbols_mask_entities": materialized[
                        "tokenization_output"
                    ].special_symbols_mask_entities,
                    "sample": sample,
                    "start_labels": materialized["start_labels"],
                    "end_labels": materialized["end_labels"],
                    "disambiguation_labels": materialized["disambiguation_labels"],
                    "relation_labels": materialized["relation_labels"],
                    "predictable_candidates": materialized["candidates_symbols"],
                }
                sample.materialize = materialized
                data_acc.append(sample)
                continue
            candidates_symbols = self.special_symbols
            candidates_entities_symbols = self.special_symbols_types

            if len(self.special_symbols_types) > 0:
                assert sample.span_candidates is not None
                if self.use_nme:
                    sample.span_candidates.insert(0, NME_SYMBOL)

            sample.triplet_candidates = sample.triplet_candidates[
                : min(len(candidates_symbols), self.max_triplets)
            ]

            if len(self.special_symbols_types) > 0:
                sample.span_candidates = sample.span_candidates[
                    : min(len(candidates_entities_symbols), self.max_spans)
                ]

            if not self.for_inference:
                if (
                    sample.triplets is None or len(sample.triplets) == 0
                ) and self.skip_empty_training_samples:
                    logger.warning(
                        "Sample {} has no labels, skipping".format(sample.id)
                    )
                    continue

                if self.add_gold_candidates:
                    candidates_set = set(sample.triplet_candidates)
                    candidates_to_add = set()
                    for candidate_title in sample.triplets:
                        if candidate_title["relation"]["name"] not in candidates_set:
                            candidates_to_add.add(candidate_title["relation"]["name"])
                    if len(candidates_to_add) > 0:
                        # Inject the gold relations that are missing from the
                        # retrieved candidates, overwriting non-gold candidates
                        # starting from the tail of the list.
                        candidates_to_add = list(candidates_to_add)
                        added_gold_candidates = 0
                        gold_candidates_titles_set = set(
                            ct["relation"]["name"] for ct in sample.triplets
                        )
                        for i in reversed(range(len(sample.triplet_candidates))):
                            if (
                                sample.triplet_candidates[i]
                                not in gold_candidates_titles_set
                                and sample.triplet_candidates[i] != NME_SYMBOL
                            ):
                                sample.triplet_candidates[i] = candidates_to_add[
                                    added_gold_candidates
                                ]
                                added_gold_candidates += 1
                                if len(candidates_to_add) == added_gold_candidates:
                                    break

                        candidates_still_to_add = (
                            len(candidates_to_add) - added_gold_candidates
                        )
                        while (
                            len(sample.triplet_candidates)
                            <= min(len(candidates_symbols), self.max_triplets)
                            and candidates_still_to_add != 0
                        ):
                            sample.triplet_candidates.append(
                                candidates_to_add[added_gold_candidates]
                            )
                            added_gold_candidates += 1
                            candidates_still_to_add -= 1

            def shuffle_cands(shuffle_candidates, candidates):
                if (
                    isinstance(shuffle_candidates, bool) and shuffle_candidates
                ) or (
                    isinstance(shuffle_candidates, float)
                    and np.random.uniform() < shuffle_candidates
                ):
                    np.random.shuffle(candidates)
                    if NME_SYMBOL in candidates:
                        candidates.remove(NME_SYMBOL)
                        candidates.insert(0, NME_SYMBOL)
                return candidates

            def flip_cands(flip_candidates, candidates):
                if (isinstance(flip_candidates, bool) and flip_candidates) or (
                    isinstance(flip_candidates, float)
                    and np.random.uniform() < flip_candidates
                ):
                    for i in range(len(candidates) - 1):
                        if np.random.uniform() < 0.5:
                            candidates[i], candidates[i + 1] = (
                                candidates[i + 1],
                                candidates[i],
                            )
                    if NME_SYMBOL in candidates:
                        candidates.remove(NME_SYMBOL)
                        candidates.insert(0, NME_SYMBOL)
                return candidates

            if self.shuffle_candidates:
                sample.triplet_candidates = shuffle_cands(
                    self.shuffle_candidates, sample.triplet_candidates
                )
                if len(self.special_symbols_types) > 0:
                    sample.span_candidates = shuffle_cands(
                        self.shuffle_candidates, sample.span_candidates
                    )
            elif self.flip_candidates:
                sample.triplet_candidates = flip_cands(
                    self.flip_candidates, sample.triplet_candidates
                )
                if len(self.special_symbols_types) > 0:
                    sample.span_candidates = flip_cands(
                        self.flip_candidates, sample.span_candidates
                    )

            candidates_symbols = candidates_symbols[: len(sample.triplet_candidates)]

            candidates_encoding = [
                ["{} {}".format(cs, ct)] if ct != NME_SYMBOL else [NME_SYMBOL]
                for cs, ct in zip(candidates_symbols, sample.triplet_candidates)
            ]
            if len(self.special_symbols_types) > 0:
                candidates_entities_symbols = candidates_entities_symbols[
                    : len(sample.span_candidates)
                ]
                candidates_types_encoding = [
                    ["{} {}".format(cs, ct)] if ct != NME_SYMBOL else [NME_SYMBOL]
                    for cs, ct in zip(
                        candidates_entities_symbols, sample.span_candidates
                    )
                ]
                candidates_encoding = (
                    candidates_types_encoding
                    + [[self.tokenizer.sep_token]]
                    + candidates_encoding
                )

            pretoken_input = self._build_input(sample.words, candidates_encoding)
            input_tokenized = self.tokenizer(
                pretoken_input,
                return_offsets_mapping=True,
                add_special_tokens=False,
            )

            window_tokens = input_tokenized.input_ids
            window_tokens = flatten(window_tokens)

            offsets_mapping = [
                [
                    (
                        ss + sample.token2char_start[str(i)],
                        se + sample.token2char_start[str(i)],
                    )
                    for ss, se in input_tokenized.offset_mapping[i]
                ]
                for i in range(len(sample.words))
            ]

            offsets_mapping = flatten(offsets_mapping)

            token2char_start = {str(i): s for i, (s, _) in enumerate(offsets_mapping)}
            token2char_end = {str(i): e for i, (_, e) in enumerate(offsets_mapping)}
            token2word_start = {
                str(i): int(sample._d["char2token_start"][str(s)])
                for i, (s, _) in enumerate(offsets_mapping)
                if str(s) in sample._d["char2token_start"]
            }
            token2word_end = {
                str(i): int(sample._d["char2token_end"][str(e)])
                for i, (_, e) in enumerate(offsets_mapping)
                if str(e) in sample._d["char2token_end"]
            }

            word2token_start = {str(v): int(k) for k, v in token2word_start.items()}
            word2token_end = {str(v): int(k) for k, v in token2word_end.items()}

            sample._d.update(
                dict(
                    tokens=window_tokens,
                    token2char_start=token2char_start,
                    token2char_end=token2char_end,
                    token2word_start=token2word_start,
                    token2word_end=token2word_end,
                    word2token_start=word2token_start,
                    word2token_end=word2token_end,
                )
            )

            input_subwords = flatten(input_tokenized["input_ids"][: len(sample.words)])
            offsets = input_tokenized["offset_mapping"][: len(sample.words)]
            token2word = []
            word2token = {}
            count = 0
            for i, offset in enumerate(offsets):
                word2token[i] = []
                for token in offset:
                    token2word.append(i)
                    word2token[i].append(count)
                    count += 1

            sample.token2word = token2word
            sample.word2token = word2token
            candidates_encoding_result = input_tokenized["input_ids"][
                len(sample.words) + 1 : -1
            ]

            i = 0
            cum_len = 0

            if (
                sum(map(len, candidates_encoding_result))
                + len(input_subwords)
                + 20
                > self.model_max_length
            ):
                if self.for_inference:
                    # At inference time, keep as many candidates as fit in the
                    # model's maximum length.
                    acceptable_tokens_from_candidates = (
                        self.model_max_length - 20 - len(input_subwords)
                    )

                    while (
                        cum_len + len(candidates_encoding_result[i])
                        < acceptable_tokens_from_candidates
                    ):
                        cum_len += len(candidates_encoding_result[i])
                        i += 1

                    assert i > 0

                    candidates_encoding_result = candidates_encoding_result[:i]
                    if len(self.special_symbols_types) > 0:
                        candidates_symbols = candidates_symbols[
                            : i - len(sample.span_candidates)
                        ]
                        sample.triplet_candidates = sample.triplet_candidates[
                            : i - len(sample.span_candidates)
                        ]
                    else:
                        candidates_symbols = candidates_symbols[:i]
                        sample.triplet_candidates = sample.triplet_candidates[:i]
                else:
                    # At training time, always keep the gold candidates and fill
                    # the remaining token budget with the other candidates.
                    gold_candidates_set = set(
                        [wl["relation"]["name"] for wl in sample.triplets]
                    )
                    gold_candidates_indices = [
                        i
                        for i, wc in enumerate(sample.triplet_candidates)
                        if wc in gold_candidates_set
                    ]
                    if len(self.special_symbols_types) > 0:
                        gold_candidates_indices = [
                            i + len(sample.span_candidates)
                            for i in gold_candidates_indices
                        ]

                        gold_candidates_indices = gold_candidates_indices + list(
                            range(len(sample.span_candidates))
                        )
                    necessary_taken_tokens = sum(
                        map(
                            len,
                            [
                                candidates_encoding_result[i]
                                for i in gold_candidates_indices
                            ],
                        )
                    )

                    acceptable_tokens_from_candidates = (
                        self.model_max_length
                        - 20
                        - len(input_subwords)
                        - necessary_taken_tokens
                    )
                    if acceptable_tokens_from_candidates <= 0:
                        logger.warning(
                            "Sample {} has no candidates after truncation due to max length".format(
                                sample.id
                            )
                        )
                        continue

                    i = 0
                    cum_len = 0
                    while (
                        cum_len + len(candidates_encoding_result[i])
                        < acceptable_tokens_from_candidates
                    ):
                        if i not in gold_candidates_indices:
                            cum_len += len(candidates_encoding_result[i])
                        i += 1

                    new_indices = sorted(
                        list(set(list(range(i)) + gold_candidates_indices))
                    )

                    candidates_encoding_result = [
                        candidates_encoding_result[i] for i in new_indices
                    ]
                    if len(self.special_symbols_types) > 0:
                        sample.triplet_candidates = [
                            sample.triplet_candidates[i - len(sample.span_candidates)]
                            for i in new_indices[len(sample.span_candidates) :]
                        ]
                        candidates_symbols = candidates_symbols[
                            : i - len(sample.span_candidates)
                        ]
                    else:
                        candidates_symbols = [
                            candidates_symbols[i] for i in new_indices
                        ]
                        sample.triplet_candidates = [
                            sample.triplet_candidates[i] for i in new_indices
                        ]
                    if len(sample.triplet_candidates) == 0:
                        logger.warning(
                            "Sample {} has no candidates after truncation due to max length".format(
                                sample.sample_id
                            )
                        )
                        continue

            input_ids = self._build_input_ids(
                sentence_input_ids=input_subwords,
                candidates_input_ids=candidates_encoding_result,
            )

            tokenization_output = self._build_tokenizer_essentials(
                input_ids,
                input_subwords,
                min(len(sample.span_candidates), len(self.special_symbols_types))
                if sample.span_candidates is not None
                else 0,
            )

            start_labels, end_labels, disambiguation_labels, relation_labels = (
                None,
                None,
                None,
                None,
            )
            if sample.entities is not None and len(sample.entities) > 0:
                (
                    start_labels,
                    end_labels,
                    disambiguation_labels,
                    relation_labels,
                ) = self._build_labels(
                    sample,
                    tokenization_output,
                )
            if self.materialize_samples:
                sample.materialize = {
                    "tokenization_output": tokenization_output,
                    "start_labels": start_labels,
                    "end_labels": end_labels,
                    "disambiguation_labels": disambiguation_labels,
                    "relation_labels": relation_labels,
                    "candidates_symbols": candidates_symbols,
                }
                data_acc.append(sample)
            yield {
                "input_ids": tokenization_output.input_ids,
                "attention_mask": tokenization_output.attention_mask,
                "token_type_ids": tokenization_output.token_type_ids,
                "prediction_mask": tokenization_output.prediction_mask,
                "special_symbols_mask": tokenization_output.special_symbols_mask,
                "special_symbols_mask_entities": tokenization_output.special_symbols_mask_entities,
                "sample": sample,
                "start_labels": start_labels,
                "end_labels": end_labels,
                "disambiguation_labels": disambiguation_labels,
                "relation_labels": relation_labels,
                "predictable_candidates": candidates_symbols,
            }
        if self.materialize_samples:
            self.samples = data_acc
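        # When materialize_samples is enabled, the processed samples (with their
        # cached tokenization output and labels) replace self.samples, so later
        # iterations take the fast path at the top of the loop.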

    def preshuffle_elements(self, dataset_elements: List):
        if not self.for_inference:
            dataset_elements = np.random.permutation(dataset_elements)

        sorting_fn = (
            lambda elem: add_noise_to_value(
                sum(len(elem[k]) for k in self.sorting_fields),
                noise_param=self.noise_param,
            )
            if not self.for_inference
            else sum(len(elem[k]) for k in self.sorting_fields)
        )

        dataset_elements = sorted(dataset_elements, key=sorting_fn)

        if self.for_inference:
            return dataset_elements

        ds = list(chunks(dataset_elements, 64))
        np.random.shuffle(ds)
        return flatten(ds)
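    # Sorting by (noisy) length groups elements of similar size, which reduces
    # padding inside a token-budgeted batch; shuffling 64-element chunks then
    # breaks the global ordering so training batches are not strictly sorted.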

    def materialize_batches(
        self, dataset_elements: List[Dict[str, Any]]
    ) -> Generator[Dict[str, Any], None, None]:
        if self.prebatch:
            dataset_elements = self.preshuffle_elements(dataset_elements)

        current_batch = []

        def output_batch() -> Dict[str, Any]:
            assert (
                len(
                    set([len(elem["predictable_candidates"]) for elem in current_batch])
                )
                == 1
            ), " ".join(
                map(
                    str, [len(elem["predictable_candidates"]) for elem in current_batch]
                )
            )

            batch_dict = dict()

            de_values_by_field = {
                fn: [de[fn] for de in current_batch if fn in de]
                for fn in self.fields_batcher
            }

            # drop fields that are not present in any element
            de_values_by_field = {
                fn: fvs for fn, fvs in de_values_by_field.items() if len(fvs) > 0
            }

            assert len(set([len(v) for v in de_values_by_field.values()]))

            # drop fields that contain None values (e.g. labels at inference time)
            de_values_by_field = {
                fn: fvs
                for fn, fvs in de_values_by_field.items()
                if all([fv is not None for fv in fvs])
            }

            for field_name, field_values in de_values_by_field.items():
                field_batch = (
                    self.fields_batcher[field_name](field_values)
                    if self.fields_batcher[field_name] is not None
                    else field_values
                )

                batch_dict[field_name] = field_batch

            return batch_dict

        max_len_discards, min_len_discards = 0, 0

        should_token_batch = self.batch_size is None

        curr_pred_elements = -1
        for de in dataset_elements:
            if (
                should_token_batch
                and self.max_batch_size != -1
                and len(current_batch) == self.max_batch_size
            ) or (not should_token_batch and len(current_batch) == self.batch_size):
                yield output_batch()
                current_batch = []
                curr_pred_elements = -1

            too_long_fields = [
                k
                for k in de
                if self.max_length != -1
                and torch.is_tensor(de[k])
                and len(de[k]) > self.max_length
            ]
            if len(too_long_fields) > 0:
                max_len_discards += 1
                continue

            too_short_fields = [
                k
                for k in de
                if self.min_length != -1
                and torch.is_tensor(de[k])
                and len(de[k]) < self.min_length
            ]
            if len(too_short_fields) > 0:
                min_len_discards += 1
                continue

            if should_token_batch:
                de_len = sum(len(de[k]) for k in self.batching_fields)

                future_max_len = max(
                    de_len,
                    max(
                        [
                            sum(len(bde[k]) for k in self.batching_fields)
                            for bde in current_batch
                        ],
                        default=0,
                    ),
                )

                future_tokens_per_batch = future_max_len * (len(current_batch) + 1)

                num_predictable_candidates = len(de["predictable_candidates"])

                if len(current_batch) > 0 and (
                    future_tokens_per_batch >= self.tokens_per_batch
                    or (
                        num_predictable_candidates != curr_pred_elements
                        and curr_pred_elements != -1
                    )
                ):
                    yield output_batch()
                    current_batch = []

            current_batch.append(de)
            curr_pred_elements = len(de["predictable_candidates"])

        if len(current_batch) != 0 and not self.drop_last:
            yield output_batch()

        if max_len_discards > 0:
            if self.for_inference:
                logger.warning(
                    f"Inference mode is enabled, but {max_len_discards} samples longer than the max "
                    f"length were found and will be DISCARDED. If you are running an evaluation, this "
                    f"can INVALIDATE the results. This may happen if max length was not set to -1 or "
                    f"if the sample length exceeds the maximum length supported by the current model."
                )
            else:
                logger.warning(
                    f"During iteration, {max_len_discards} elements were "
                    f"discarded since longer than max length {self.max_length}"
                )

        if min_len_discards > 0:
            if self.for_inference:
                logger.warning(
                    f"Inference mode is enabled, but {min_len_discards} samples shorter than the min "
                    f"length were found and will be DISCARDED. If you are running an evaluation, this "
                    f"can INVALIDATE the results. This may happen if min length was not set to -1 or "
                    f"if the sample length is shorter than the minimum length supported by the "
                    f"current model."
                )
            else:
                logger.warning(
                    f"During iteration, {min_len_discards} elements were "
                    f"discarded since shorter than min length {self.min_length}"
                )
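    # When batch_size is None, batches are built under a token budget: an
    # element is added as long as the padded size (max element length times the
    # would-be batch size) stays below tokens_per_batch and the batch has fewer
    # than max_batch_size elements. A batch is also closed whenever the number
    # of predictable candidates changes, since output_batch asserts that this
    # number is constant within a batch.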

    @staticmethod
    def _new_output_format(sample: RelikReaderSample) -> RelikReaderSample:
        predicted_spans = set()
        for prediction in sample.predicted_entities:
            predicted_spans.add(
                (
                    prediction[0],
                    prediction[1],
                    prediction[2],
                )
            )

        predicted_spans = sorted(predicted_spans, key=lambda x: x[0])
        predicted_triples = []

        for prediction in sample.predicted_relations:
            start_entity_index = [
                i
                for i, p in enumerate(predicted_spans)
                if p[:2]
                == (prediction["subject"]["start"], prediction["subject"]["end"])
            ][0]
            end_entity_index = [
                i
                for i, p in enumerate(predicted_spans)
                if p[:2] == (prediction["object"]["start"], prediction["object"]["end"])
            ][0]

            predicted_triples.append(
                (
                    start_entity_index,
                    prediction["relation"]["name"],
                    end_entity_index,
                    prediction["relation"]["probability"],
                )
            )
        sample.predicted_spans = predicted_spans
        sample.predicted_triples = predicted_triples
        return sample
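    # After this step, `predicted_spans` is a list of (start, end, type) tuples
    # sorted by start position and `predicted_triples` holds
    # (subject index, relation name, object index, probability) tuples, where
    # the indices point into `predicted_spans`.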

    @staticmethod
    def _convert_annotations(sample: RelikReaderSample) -> RelikReaderSample:
        triplets = []
        entities = []

        for entity in sample.predicted_entities:
            # Predicted boundaries are token-level and shifted by the leading
            # [CLS]; snap them to tokens that map back to words, then convert
            # them to word indices.
            span_start = entity[0] - 1
            span_end = entity[1] - 1
            if str(span_start) not in sample.token2word_start:
                while str(span_start) not in sample.token2word_start:
                    span_start -= 1

                    if span_start < 0:
                        break
            if str(span_end) not in sample.token2word_end:
                while str(span_end) not in sample.token2word_end:
                    span_end += 1

                    if span_end >= len(sample.tokens):
                        break

            if span_start < 0 or span_end >= len(sample.tokens):
                continue

            entities.append(
                (
                    sample.token2word_start[str(span_start)],
                    sample.token2word_end[str(span_end)] + 1,
                    sample.span_candidates[entity[2]]
                    if sample.span_candidates and len(entity) > 2
                    else "NME",
                )
            )
        for predicted_triplet, predicted_triplet_probabilities in zip(
            sample.predicted_relations, sample.predicted_relations_probabilities
        ):
            subject, object_, relation = predicted_triplet
            subject = entities[subject]
            object_ = entities[object_]
            relation = sample.triplet_candidates[relation]
            triplets.append(
                {
                    "subject": {
                        "start": subject[0],
                        "end": subject[1],
                        "type": subject[2],
                    },
                    "relation": {
                        "name": relation,
                        "probability": float(predicted_triplet_probabilities.round(2)),
                    },
                    "object": {
                        "start": object_[0],
                        "end": object_[1],
                        "type": object_[2],
                    },
                }
            )

        sample.predicted_entities = entities
        sample.predicted_relations = triplets
        del sample._d["predicted_relations_probabilities"]

        return sample

    @staticmethod
    def convert_to_word_annotations(sample: RelikReaderSample) -> RelikReaderSample:
        sample = RelikREDataset._convert_annotations(sample)
        return RelikREDataset._new_output_format(sample)

    @staticmethod
    def convert_to_char_annotations(
        sample: RelikReaderSample,
        remove_nmes: bool = True,
    ) -> RelikReaderSample:
        RelikREDataset._convert_annotations(sample)
        if "token2char_start" in sample._d:
            entities = []
            for entity in sample.predicted_entities:
                entity = list(entity)
                token_start = sample.word2token_start[str(entity[0])]
                entity[0] = sample.token2char_start[str(token_start)]
                token_end = sample.word2token_end[str(entity[1] - 1)]
                entity[1] = sample.token2char_end[str(token_end)]
                entities.append(entity)
            sample.predicted_entities = entities
            for triplet in sample.predicted_relations:
                triplet["subject"]["start"] = sample.token2char_start[
                    str(sample.word2token_start[str(triplet["subject"]["start"])])
                ]
                triplet["subject"]["end"] = sample.token2char_end[
                    str(sample.word2token_end[str(triplet["subject"]["end"] - 1)])
                ]
                triplet["object"]["start"] = sample.token2char_start[
                    str(sample.word2token_start[str(triplet["object"]["start"])])
                ]
                triplet["object"]["end"] = sample.token2char_end[
                    str(sample.word2token_end[str(triplet["object"]["end"] - 1)])
                ]

        sample = RelikREDataset._new_output_format(sample)

        return sample

    @staticmethod
    def merge_patches_predictions(sample) -> None:
        pass
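

def _example_iteration(dataset: RelikREDataset) -> None:
    # Minimal usage sketch (this helper is illustrative and not used elsewhere
    # in the module): the dataset already yields fully padded batches, so a
    # DataLoader only has to forward them; automatic batching is therefore
    # disabled with batch_size=None.
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=None, num_workers=0)
    for batch in loader:
        # Each batch is a dict with the padded input tensors, the label tensors
        # (when available), the original samples and the candidate symbols.
        print(batch["input_ids"].shape, len(batch["predictable_candidates"]))
        break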


def main():
    special_symbols = [NME_SYMBOL] + [f"R-{i}" for i in range(50)]

    relik_dataset = RelikREDataset(
        "/home/huguetcabot/alby-re/alby/data/nyt-alby+/valid.jsonl",
        materialize_samples=False,
        transformer_model="microsoft/deberta-v3-base",
        special_symbols=special_symbols,
        shuffle_candidates=False,
        flip_candidates=False,
        for_inference=True,
    )

    for batch in relik_dataset:
        print(batch)
        exit(0)


if __name__ == "__main__":
    main()