Prajwal Kailas committed

Commit: 45c1511
1 Parent(s): b5a124c

dependency to run

This view is limited to 50 files because the commit contains too many changes.
- deid/__init__.py +2 -0
- deid/text_deid.py +307 -0
- deid/utils.py +43 -0
- ner_datasets/__init__.py +5 -0
- ner_datasets/__pycache__/__init__.cpython-37.pyc +0 -0
- ner_datasets/dataset_builder/__init__.py +3 -0
- ner_datasets/dataset_builder/dataset.py +119 -0
- ner_datasets/dataset_builder/labels/__init__.py +4 -0
- ner_datasets/dataset_builder/labels/mismatch_error.py +7 -0
- ner_datasets/dataset_builder/labels/ner_predict_token_labels.py +30 -0
- ner_datasets/dataset_builder/labels/ner_token_labels.py +156 -0
- ner_datasets/dataset_builder/sentence_dataset.py +355 -0
- ner_datasets/dataset_creator.py +322 -0
- ner_datasets/dataset_splitter.py +294 -0
- ner_datasets/distribution/__init__.py +4 -0
- ner_datasets/distribution/dataset_splits.py +218 -0
- ner_datasets/distribution/ner_distribution.py +54 -0
- ner_datasets/distribution/print_distribution.py +49 -0
- ner_datasets/preprocessing/__init__.py +2 -0
- ner_datasets/preprocessing/preprocessing_loader.py +63 -0
- ner_datasets/preprocessing/sentencizers/__init__.py +3 -0
- ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py +37 -0
- ner_datasets/preprocessing/sentencizers/note_sentencizer.py +33 -0
- ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py +37 -0
- ner_datasets/preprocessing/tokenizers/__init__.py +4 -0
- ner_datasets/preprocessing/tokenizers/abbreviations/check.txt +20 -0
- ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt +87 -0
- ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt +459 -0
- ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py +73 -0
- ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py +58 -0
- ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py +49 -0
- ner_datasets/preprocessing/tokenizers/utils/__init__.py +4 -0
- ner_datasets/preprocessing/tokenizers/utils/clean_regex.py +64 -0
- ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py +309 -0
- ner_datasets/preprocessing/tokenizers/utils/date_regex.py +104 -0
- ner_datasets/span_fixer.py +380 -0
- ner_datasets/span_validation.py +91 -0
- sequence_tagging/.DS_Store +0 -0
- sequence_tagging/__init__.py +2 -0
- sequence_tagging/__pycache__/__init__.cpython-37.pyc +0 -0
- sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc +0 -0
- sequence_tagging/arguments/__init__.py +8 -0
- sequence_tagging/arguments/data_training_arguments.py +115 -0
- sequence_tagging/arguments/evaluation_arguments.py +26 -0
- sequence_tagging/arguments/model_arguments.py +43 -0
- sequence_tagging/dataset_builder/__init__.py +5 -0
- sequence_tagging/dataset_builder/dataset_tokenizer.py +178 -0
- sequence_tagging/dataset_builder/label_mapper.py +87 -0
- sequence_tagging/dataset_builder/ner_dataset.py +102 -0
- sequence_tagging/dataset_builder/ner_labels.py +67 -0
deid/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .text_deid import TextDeid
+__all__ = ["TextDeid"]
deid/text_deid.py
ADDED
@@ -0,0 +1,307 @@
+import json
+import re
+from argparse import ArgumentParser
+from typing import Sequence, List, Tuple, Mapping, Union, Any, Type
+
+import regex
+from seqeval.scheme import IOB1, IOB2, IOBES, BILOU, Entities
+
+from .utils import remove, replace_tag_type, replace_informative
+
+
+class TextDeid(object):
+
+    def __init__(self, notation, span_constraint):
+        self._span_constraint = span_constraint
+        if self._span_constraint == 'strict':
+            self._scheme = TextDeid.__get_scheme('IO')
+        elif self._span_constraint == 'super_strict':
+            self._scheme = TextDeid.__get_scheme('IO')
+        else:
+            self._scheme = TextDeid.__get_scheme(notation)
+
+    def decode(self, tokens, predictions):
+        if self._span_constraint == 'exact':
+            return predictions
+        elif self._span_constraint == 'strict':
+            return TextDeid.__get_relaxed_predictions(predictions)
+        elif self._span_constraint == 'super_strict':
+            return TextDeid.__get_super_relaxed_predictions(tokens, predictions)
+
+    def get_predicted_entities_positions(
+            self,
+            tokens: Sequence[Mapping[str, Union[str, int]]],
+            predictions: List[str],
+            suffix: bool
+    ) -> List[List[Union[Tuple[Union[str, int], Union[str, int]], Any]]]:
+        """
+        Use the seqeval get_entities method, which goes through the predictions and returns
+        where each span starts and ends. E.g. [O, O, B-AGE, I-AGE, O, O] will return a span
+        that starts at token 2 and ends at token 3 - with type AGE. We then extract the
+        position of the tokens in the note (character positions) - so we return that
+        this span starts at character 32 and ends at character 37. The function returns a nested list
+        that contains a tuple of tag position (character positions) and tag type.
+        Example: [[(3, 9), LOC], [(34, 41), PATIENT], ...]
+        Args:
+            tokens (Sequence[Mapping[str, Union[str, int]]]): The list of tokens in the note
+            predictions (Sequence[str]): The list of predictions for the note
+            suffix (bool): Whether the B, I etc is in the prefix or the suffix
+        Returns:
+            positions_info (List[Tuple[Tuple[int, int], str]]): List containing tuples of tag positions and tag type
+        """
+        positions_info = list()
+        entities = Entities(sequences=[predictions], scheme=self._scheme, suffix=suffix)
+        for entity_list in entities.entities:
+            for entity in entity_list:
+                position = (tokens[entity.start]['start'], tokens[entity.end - 1]['end'])
+                positions_info.append([position, entity.tag])
+        return positions_info
+
+    def run_deid(
+            self,
+            input_file,
+            predictions_file,
+            deid_strategy,
+            keep_age: bool = False,
+            metadata_key: str = 'meta',
+            note_id_key: str = 'note_id',
+            tokens_key: str = 'tokens',
+            predictions_key: str = 'predictions',
+            text_key: str = 'text'
+    ):
+        # Store note_id to note mapping
+        note_map = dict()
+        for line in open(input_file, 'r'):
+            note = json.loads(line)
+            note_id = note[metadata_key][note_id_key]
+            note_map[note_id] = note
+        # Go through note predictions and de-identify the note accordingly
+        for line in open(predictions_file, 'r'):
+            note = json.loads(line)
+            # Get the note_id for this note
+            note_id = note[note_id_key]
+            # Get the note from the note_map dict
+            deid_note = note_map[note_id]
+            # Get predictions
+            predictions = self.decode(tokens=note[tokens_key], predictions=note[predictions_key])
+            # Get entities and their positions
+            entity_positions = self.get_predicted_entities_positions(
+                tokens=note[tokens_key],
+                predictions=predictions,
+                suffix=False
+            )
+            yield TextDeid.__get_deid_text(
+                deid_note=deid_note,
+                entity_positions=entity_positions,
+                deid_strategy=deid_strategy,
+                keep_age=keep_age,
+                text_key=text_key
+            )
+
+    @staticmethod
+    def __get_deid_text(
+            deid_note,
+            entity_positions,
+            deid_strategy,
+            keep_age: bool = False,
+            text_key: str = 'text'
+    ):
+        tag_mapping = TextDeid.__get_tag_mapping(deid_strategy=deid_strategy)
+        age_pattern = '((?<!\d+)([1-7]\d?)(?!\d+))|((?<!\d+)(8[0-8]?)(?!\d+))'
+        # Sort positions - store the last occurring tag first - i.e in descending order
+        # of start positions.
+        entity_positions.sort(key=lambda info: info[0][0], reverse=True)
+        # Get text and de-identify it
+        note_text = deid_note[text_key]
+        deid_text = deid_note[text_key]
+        # Go through the entities and their positions and de-identify the text
+        # Since we have the positions in sorted order (descending by start positions)
+        # we de-identify the text from the end to the start - i.e back to front
+        for positions, tag in entity_positions:
+            start_pos, end_pos = positions
+            deid_tag = tag_mapping[tag]
+            age_unchanged = False
+            if tag == 'AGE' and keep_age:
+                span_text = note_text[start_pos:end_pos]
+                if regex.search(age_pattern, span_text, flags=regex.IGNORECASE):
+                    deid_tag = span_text
+                    age_unchanged = True
+                else:
+                    deid_tag = deid_tag
+            if deid_strategy == 'replace_informative' and not age_unchanged:
+                deid_text = deid_text[:start_pos] + deid_tag.format(note_text[start_pos:end_pos]) + deid_text[end_pos:]
+            else:
+                deid_text = deid_text[:start_pos] + deid_tag + deid_text[end_pos:]
+        deid_note['deid_text'] = regex.sub('[\n]+', '\n', regex.sub('[ \t\r\f\v]+', ' ', deid_text)).strip()
+        return deid_note
+
+    @staticmethod
+    def __get_tag_mapping(deid_strategy):
+        if deid_strategy == 'remove':
+            return remove()
+        elif deid_strategy == 'replace_tag_type':
+            return replace_tag_type()
+        elif deid_strategy == 'replace_informative':
+            return replace_informative()
+
+    @staticmethod
+    def __get_relaxed_predictions(predictions):
+        return ['I-' + prediction[2:] if '-' in prediction else prediction for prediction in predictions]
+
+    @staticmethod
+    def __get_super_relaxed_predictions(tokens, predictions):
+        # Super relaxed
+        # 360 Longwood Ave, OBI, Boston
+        # Tokens: ['360', 'Longwood', 'Ave', ',', 'OBI', ',', 'Boston']
+        # Predictions: [B-LOC, I-LOC, L-LOC, O, U-LOC, O, U-LOC]
+        # Relaxed: [I-LOC, I-LOC, I-LOC, O, I-LOC, O, I-LOC]
+        # Super relaxed: [I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC]
+        relaxed_predictions = TextDeid.__get_relaxed_predictions(predictions)
+        prev_type = None
+        replace_indexes = list()
+        super_relaxed_predictions = list()
+        for index, (token, prediction) in enumerate(zip(tokens, relaxed_predictions)):
+            super_relaxed_predictions.append(prediction)
+            # Check special characters that appear after a prediction
+            # we can assign the prediction label to this sequence of special characters
+            if prediction == 'O' and prev_type is not None:
+                # [a-zA-Z0-9]
+                if re.search('^(\W|_)+$', token['text'], flags=re.IGNORECASE | re.DOTALL):
+                    replace_indexes.append(index)
+                else:
+                    prev_type = None
+                    replace_indexes = list()
+            # Replace all the tokens identified above with the NER prediction type
+            # This is done only if the current prediction type matches the previous type
+            elif prediction != 'O':
+                if prediction[2:] == prev_type and replace_indexes != []:
+                    for replace_index in replace_indexes:
+                        super_relaxed_predictions[replace_index] = 'I-' + prev_type
+                    # Reset list and previous type
+                    replace_indexes = list()
+                prev_type = prediction[2:]
+            else:
+                prev_type = None
+        return super_relaxed_predictions
+
+    @staticmethod
+    def __get_scheme(notation: str) -> Union[Type[IOB2], Type[IOBES], Type[BILOU], Type[IOB1]]:
+        """
+        Get the seqeval scheme based on the notation
+        Args:
+            notation (str): The NER notation
+        Returns:
+            (Union[IOB2, IOBES, BILOU, IOB1]): The seqeval scheme
+        """
+        if notation == 'BIO':
+            return IOB2
+        elif notation == 'BIOES':
+            return IOBES
+        elif notation == 'BILOU':
+            return BILOU
+        elif notation == 'IO':
+            return IOB1
+        else:
+            raise ValueError('Invalid Notation')
+
+
+def main():
+    # The following code sets up the arguments to be passed via CLI or via a JSON file
+    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
+    cli_parser.add_argument(
+        '--input_file',
+        type=str,
+        required=True,
+        help='the jsonl file that contains the notes'
+    )
+    cli_parser.add_argument(
+        '--predictions_file',
+        type=str,
+        required=True,
+        help='the location where the predictions are'
+    )
+    cli_parser.add_argument(
+        '--span_constraint',
+        type=str,
+        required=True,
+        choices=['exact', 'strict', 'super_strict'],
+        help='whether we want to modify the predictions, e.g. make the process of removing PHI more strict'
+    )
+    cli_parser.add_argument(
+        '--notation',
+        type=str,
+
+        required=True,
+        help='the NER notation in the predictions'
+    )
+    cli_parser.add_argument(
+        '--deid_strategy',
+        type=str,
+        required=True,
+        choices=['remove', 'replace_tag_type', 'replace_informative'],
+        help='the strategy used to replace the predicted PHI spans'
+    )
+    cli_parser.add_argument(
+        '--keep_age',
+        action='store_true',
+        help='whether to keep ages below 89'
+    )
+    cli_parser.add_argument(
+        '--text_key',
+        type=str,
+        default='text',
+        help='the key where the note text is present in the json object'
+    )
+    cli_parser.add_argument(
+        '--metadata_key',
+        type=str,
+        default='meta',
+        help='the key where the note metadata is present in the json object'
+    )
+    cli_parser.add_argument(
+        '--note_id_key',
+        type=str,
+        default='note_id',
+        help='the key where the note id is present in the json object'
+    )
+    cli_parser.add_argument(
+        '--tokens_key',
+        type=str,
+        default='tokens',
+        help='the key where the tokens for the notes are present in the json object'
+    )
+    cli_parser.add_argument(
+        '--predictions_key',
+        type=str,
+        default='predictions',
+        help='the key where the note predictions are present in the json object'
+    )
+    cli_parser.add_argument(
+        '--output_file',
+        type=str,
+        required=True,
+        help='the location where the de-identified notes will be written'
+    )
+    # Parse args
+    args = cli_parser.parse_args()
+    text_deid = TextDeid(notation=args.notation, span_constraint=args.span_constraint)
+    deid_notes = text_deid.run_deid(
+        input_file=args.input_file,
+        predictions_file=args.predictions_file,
+        deid_strategy=args.deid_strategy,
+        keep_age=args.keep_age,
+        metadata_key=args.metadata_key,
+        note_id_key=args.note_id_key,
+        tokens_key=args.tokens_key,
+        predictions_key=args.predictions_key,
+        text_key=args.text_key
+    )
+    # Write the dataset to the output file
+    with open(args.output_file, 'w') as file:
+        for deid_note in deid_notes:
+            file.write(json.dumps(deid_note) + '\n')
+
+
+if __name__ == "__main__":
+    # Get deid notes
+    main()
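
For orientation, deid/text_deid.py can also be driven programmatically; the short sketch below mirrors the module's own main() function. The file names (notes.jsonl, predictions.jsonl, deid_notes.jsonl) are hypothetical stand-ins, and the notation must match whatever scheme the predictions were produced in.

import json

from deid import TextDeid

# notes.jsonl: one JSON note per line, with the note id under note['meta']['note_id']
# predictions.jsonl: one JSON object per line, holding 'tokens' and 'predictions' for a note
text_deid = TextDeid(notation='BILOU', span_constraint='super_strict')
deid_notes = text_deid.run_deid(
    input_file='notes.jsonl',
    predictions_file='predictions.jsonl',
    deid_strategy='replace_informative',  # PHI spans become surrogates such as <<PATIENT:...>>
)
# run_deid is a generator, so notes are de-identified lazily as they are written out
with open('deid_notes.jsonl', 'w') as out:
    for deid_note in deid_notes:
        out.write(json.dumps(deid_note) + '\n')
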
deid/utils.py
ADDED
@@ -0,0 +1,43 @@
+def remove():
+    return {'PATIENT': '',
+            'STAFF': '',
+            'AGE': '',
+            'DATE': '',
+            'PHONE': '',
+            'MRN': '',
+            'ID': '',
+            'EMAIL': '',
+            'PATORG': '',
+            'LOC': '',
+            'HOSP': '',
+            'OTHERPHI': ''}
+
+
+def replace_tag_type():
+    return {'PATIENT': 'PATIENT',
+            'STAFF': 'STAFF',
+            'AGE': 'AGE',
+            'DATE': 'DATE',
+            'PHONE': 'PHONE',
+            'MRN': 'MRN',
+            'ID': 'ID',
+            'EMAIL': 'EMAIL',
+            'PATORG': 'PATORG',
+            'LOC': 'LOCATION',
+            'HOSP': 'HOSPITAL',
+            'OTHERPHI': 'OTHERPHI'}
+
+
+def replace_informative():
+    return {'PATIENT': '<<PATIENT:{}>>',
+            'STAFF': '<<STAFF:{}>>',
+            'AGE': '<<AGE:{}>>',
+            'DATE': '<<DATE:{}>>',
+            'PHONE': '<<PHONE:{}>>',
+            'MRN': '<<MRN:{}>>',
+            'ID': '<<ID:{}>>',
+            'EMAIL': '<<EMAIL:{}>>',
+            'PATORG': '<<PATORG:{}>>',
+            'LOC': '<<LOCATION:{}>>',
+            'HOSP': '<<HOSPITAL:{}>>',
+            'OTHERPHI': '<<OTHERPHI:{}>>'}
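
These helpers only return tag-to-surrogate mappings; TextDeid looks up the predicted tag and, for the replace_informative strategy, formats the original span text into the surrogate with str.format. A tiny sketch of that lookup (the span text 'John' is made up for illustration):

from deid.utils import remove, replace_informative

tag_mapping = replace_informative()
span_text = 'John'  # hypothetical span predicted as PATIENT
print(tag_mapping['PATIENT'].format(span_text))  # <<PATIENT:John>>

# the 'remove' strategy simply deletes the span text
print(repr(remove()['PATIENT']))  # ''
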
ner_datasets/__init__.py
ADDED
@@ -0,0 +1,5 @@
+from ehr_deidentification.sequence_tagging.dataset_builder.ner_labels import NERLabels
+from .span_fixer import SpanFixer
+from .dataset_splitter import DatasetSplitter
+from .dataset_creator import DatasetCreator
+__all__ = ["NERLabels", "SpanFixer", "DatasetSplitter", "DatasetCreator"]
ner_datasets/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (487 Bytes)
ner_datasets/dataset_builder/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .dataset import Dataset
+from .sentence_dataset import SentenceDataset
+__all__ = ["SentenceDataset", "Dataset"]
ner_datasets/dataset_builder/dataset.py
ADDED
@@ -0,0 +1,119 @@
+import random
+import re
+from typing import Iterable, Dict, Sequence, Union, Mapping, Optional, List
+
+from .labels import NERTokenLabels, NERPredictTokenLabels, MismatchError
+
+random.seed(41)
+
+
+class Dataset(object):
+    """
+    Build a NER token classification dataset. Each token should have a corresponding label
+    based on the annotated spans.
+    For training we will build the dataset using the annotated spans (e.g from prodigy).
+    For predictions we will assign default labels, to keep the format of the dataset the same.
+    The dataset is on a sentence level, i.e each note is split into sentences and the
+    task is run on a sentence level. Even the predictions are run on a sentence level.
+    The dataset would be something like:
+    Tokens: [tok1, tok2, ... tok n]
+    Labels: [lab1, lab2, ... lab n]
+    For the prediction mode the labels would be: [default, default, default .... default]
+    This script can also be used for predictions, the labels will be filled with some
+    default value. This is done so that we can use the same script for building a dataset to train a model
+    and a dataset to obtain predictions using a model
+    """
+
+    def __init__(
+            self,
+            sentencizer,
+            tokenizer
+    ):
+        """
+        Build a NER token classification dataset.
+        For training we will build the dataset using the annotated spans (e.g from prodigy).
+        For predictions we will assign default labels.
+        The dataset is on a sentence level, i.e each note is split into sentences and the de-id
+        task is run on a sentence level. Even the predictions are run on a sentence level.
+        The dataset would be something like:
+        Tokens: [tok1, tok2, ... tok n]
+        Labels: [lab1, lab2, ... lab n]
+        This script can also be used for predictions, the labels will be filled with some
+        default value. This is done so that we can use the same script for building a dataset to train a model
+        and a dataset to obtain predictions using a model
+        Args:
+            sentencizer (Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]): The sentencizer to use
+                                                                                            for splitting notes into
+                                                                                            sentences
+            tokenizer (Union[ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer]): The tokenizer to use for
+                                                                                         splitting text into tokens
+        """
+        self._sentencizer = sentencizer
+        self._tokenizer = tokenizer
+
+    def get_tokens(
+            self,
+            text: str,
+            spans: Optional[List[Mapping[str, Union[str, int]]]] = None,
+            notation: str = 'BIO',
+            token_text_key: str = 'text',
+            label_key: str = 'label'
+    ) -> Iterable[Sequence[Dict[str, Union[str, int]]]]:
+        """
+        Get a nested list of tokens where the inner list represents the tokens in the
+        sentence and the outer list will contain all the sentences in the note
+        Args:
+            text (str): The text present in the note
+            spans (Optional[List[Mapping[str, Union[str, int]]]]): The NER spans in the note. This will be None if
+                                                                   building the dataset for prediction
+            notation (str): The notation we will be using for the label scheme (e.g BIO, BILOU etc)
+            token_text_key (str): The key where the note text is present
+            label_key (str): The key where the note label for each token is present
+        Returns:
+            Iterable[Sequence[Dict[str, Union[str, int]]]]: Iterable that iterates through all the sentences
+                                                            and yields the list of tokens in each sentence
+        """
+        # Initialize the object that will be used to align tokens and spans based on the notation
+        # as mentioned earlier - this will be used only when mode is train - because we have
+        # access to labelled spans for the notes
+        if spans is None:
+            label_spans = NERPredictTokenLabels('O')
+        else:
+            label_spans = NERTokenLabels(spans=spans, notation=notation)
+        # Iterate through the sentences in the note
+        for sentence in self._sentencizer.get_sentences(text=text):
+            # This is used to determine the position of the tokens with respect to the entire note
+            offset = sentence['start']
+            # Keeps track of the tokens in the sentence
+            tokens = list()
+            for token in self._tokenizer.get_tokens(text=sentence['text']):
+                # Get the token position (start, end) in the note
+                token['start'] += offset
+                token['end'] += offset
+                if token[token_text_key].strip() in ['\n', '\t', ' ', ''] or token['start'] == token['end']:
+                    continue
+                # Shorten consecutive sequences of special characters, this can prevent BERT from truncating
+                # extremely long sentences - that could arise because of these characters
+                elif re.search('(\W|_){9,}', token[token_text_key]):
+                    print('WARNING - Shortening a long sequence of special characters from {} to 8'.format(
+                        len(token[token_text_key])))
+                    token[token_text_key] = re.sub('(?P<specchar>(\W|_)){8,}', '\g<specchar>' * 8,
+                                                   token[token_text_key])
+                elif len(token[token_text_key].split(' ')) != 1:
+                    print('WARNING - Token contains a space character - will be replaced with hyphen')
+                    token[token_text_key] = token[token_text_key].replace(' ', '-')
+                # Get the labels for each token based on the notation (BIO)
+                # In predict mode - the default label (e.g O) will be assigned
+                try:
+                    # Get the label for the token - based on the notation
+                    label = label_spans.get_labels(token=token)
+                    if label[2:] == 'OTHERISSUE':
+                        raise ValueError('Fix OTHERISSUE spans')
+                # Check if there is a token and span mismatch, i.e the token and span do not align
+                except MismatchError:
+                    print(token)
+                    raise ValueError('Token-Span mismatch')
+                token[label_key] = label
+                tokens.append(token)
+            if tokens:
+                yield tokens
ner_datasets/dataset_builder/labels/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .mismatch_error import MismatchError
+from .ner_token_labels import NERTokenLabels
+from .ner_predict_token_labels import NERPredictTokenLabels
+__all__ = ["NERTokenLabels", "NERPredictTokenLabels", "MismatchError"]
ner_datasets/dataset_builder/labels/mismatch_error.py
ADDED
@@ -0,0 +1,7 @@
+# Exception thrown when there is a mismatch between a token and a span
+# The token and spans don't line up due to a tokenization issue
+# E.g - 79M - span is AGE - 79, but token is 79M
+# There is a mismatch and an error will be thrown - that is the token does
+# not line up with the span
+class MismatchError(Exception):
+    pass
ner_datasets/dataset_builder/labels/ner_predict_token_labels.py
ADDED
@@ -0,0 +1,30 @@
+from typing import Mapping, Union, NoReturn
+
+
+class NERPredictTokenLabels(object):
+    """
+    Assign a default label while creating the dataset for prediction.
+    This is done since the sequence tagging code expects the input
+    file to contain a labels field, hence we assign a default label
+    to meet this requirement
+    """
+
+    def __init__(self, default_label: str) -> NoReturn:
+        """
+        Initialize the default label
+        Args:
+            default_label (str): Default label that will be used
+        """
+        # Keeps track of the default label that will be assigned to every token
+        self._default_label = default_label
+
+    def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
+        """
+        Given a token, return the default label.
+        Args:
+            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                                                   in the text
+        Returns:
+            default_label (str): default label
+        """
+        return self._default_label
ner_datasets/dataset_builder/labels/ner_token_labels.py
ADDED
@@ -0,0 +1,156 @@
+from typing import Mapping, Union, Sequence, List
+from .mismatch_error import MismatchError
+
+
+class NERTokenLabels(object):
+    """
+    This class is used to align tokens with the spans.
+    Each token is assigned one of the following labels
+    'B-LABEL', 'I-LABEL', 'O'. For example the text
+    360 Longwood Avenue is 3 tokens - [360, Longwood, Avenue]
+    and each token would be assigned the following labels
+    [B-LOC, I-LOC, I-LOC] (this would also depend on what
+    notation we are using). Generally the data after prodigy
+    annotation has all the tokens and all the spans.
+    We would have tokens: [tok1, tok2, ... tokn]
+    and spans: [span1:[tok1, tok2, tok3], span2:[tok7], ... span k]
+    This would be used to convert into the format we are using,
+    which is to assign the label to each token based on which span it
+    belongs to.
+    """
+
+    def __init__(
+            self,
+            spans: List[Mapping[str, Union[str, int]]],
+            notation: str
+    ):
+        """
+        Initialize variables that will be used to align tokens
+        and span labels. The spans variable will contain all the spans
+        in the note. Notation is whether we would like to use BIO, IO, BILOU,
+        when assigning the label to each token based on which span it belongs to.
+        Keep track of the total number of spans etc.
+        Args:
+            spans (Sequence[Mapping[str, Union[str, int]]]): List of all the spans in the text
+            notation (str): NER label notation
+        """
+        # Keeps track of all the spans (list) in the text (note)
+        self._spans = spans
+        for span in self._spans:
+            if type(span['start']) != int or type(span['end']) != int:
+                raise ValueError('The start and end keys of the span must be of type int')
+        self._spans.sort(key=lambda _span: (_span['start'], _span['end']))
+        # The current span is the first element of the list
+        self._current_span = 0
+        # Boolean variable that indicates whether the token is inside
+        # the span (I-LABEL)
+        self._inside = False
+        # Total number of spans
+        self._span_count = len(self._spans)
+        # Depending on the notation passed, we will return the label for
+        # the token accordingly
+        if notation == 'BIO':
+            self._prefix_single = 'B-'
+            self._prefix_begin = 'B-'
+            self._prefix_inside = 'I-'
+            self._prefix_end = 'I-'
+            self._prefix_outside = 'O'
+        elif notation == 'BIOES':
+            self._prefix_single = 'S-'
+            self._prefix_begin = 'B-'
+            self._prefix_inside = 'I-'
+            self._prefix_end = 'E-'
+            self._prefix_outside = 'O'
+        elif notation == 'BILOU':
+            self._prefix_single = 'U-'
+            self._prefix_begin = 'B-'
+            self._prefix_inside = 'I-'
+            self._prefix_end = 'L-'
+            self._prefix_outside = 'O'
+        elif notation == 'IO':
+            self._prefix_single = 'I-'
+            self._prefix_begin = 'I-'
+            self._prefix_inside = 'I-'
+            self._prefix_end = 'I-'
+            self._prefix_outside = 'O'
+
+    def __check_begin(self, token: Mapping[str, Union[str, int]]) -> str:
+        """
+        Given a token, return the label (B-LABEL) and check whether the token
+        covers the entire span or is a subset of the span
+        Args:
+            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                                                   in the text
+        Returns:
+            (str): The label - 'B-LABEL'
+        """
+        # Set the inside flag to true to indicate that the next token that is checked
+        # will be checked to see if it belongs 'inside' the span
+        self._inside = True
+        if token['end'] > int(self._spans[self._current_span]['end']):
+            raise MismatchError('Span and Token mismatch - Begin Token extends longer than the span')
+        # If this token does not cover the entire span then we expect another token
+        # to be in the span and that token should be assigned the I-LABEL
+        elif token['end'] < int(self._spans[self._current_span]['end']):
+            return self._prefix_begin + self._spans[self._current_span]['label']
+        # If this token does cover the entire span then we set inside = False
+        # to indicate this span is complete and increment the current span
+        # to move onto the next span in the text
+        elif token['end'] == int(self._spans[self._current_span]['end']):
+            self._current_span += 1
+            self._inside = False
+            return self._prefix_single + self._spans[self._current_span - 1]['label']
+
+    def __check_inside(self, token: Mapping[str, Union[str, int]]) -> str:
+        """
+        Given a token, return the label (I-LABEL) and check whether the token
+        covers the entire span or is still inside the span.
+        Args:
+            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                                                   in the text
+        Returns:
+            (str): The label - 'I-LABEL'
+        """
+
+        if (token['start'] >= int(self._spans[self._current_span]['end'])
+                or token['end'] > int(self._spans[self._current_span]['end'])):
+            raise MismatchError('Span and Token mismatch - Inside Token starts after the span ends')
+        # If this token does not cover the entire span then we expect another token
+        # to be in the span and that token should be assigned the I-LABEL
+        elif token['end'] < int(self._spans[self._current_span]['end']):
+            return self._prefix_inside + self._spans[self._current_span]['label']
+        # If this token does cover the entire span then we set inside = False
+        # to indicate this span is complete and increment the current span
+        # to move onto the next span in the text
+        elif token['end'] == int(self._spans[self._current_span]['end']):
+            self._current_span += 1
+            self._inside = False
+            return self._prefix_end + self._spans[self._current_span - 1]['label']
+
+    def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
+        """
+        Given a token, return the label (B-LABEL, I-LABEL, O) based on
+        the spans present in the text & the desired notation.
+        Args:
+            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                                                   in the text
+        Returns:
+            (str): One of the labels according to the notation - 'B-LABEL', 'I-LABEL', 'O'
+        """
+        # If we have iterated through all the spans in the text (note), all the tokens that
+        # come after the last span will be marked as 'O' - since they don't belong to any span
+        if self._current_span >= self._span_count:
+            return self._prefix_outside
+        # Check if the span can be assigned the B-LABEL
+        if token['start'] == int(self._spans[self._current_span]['start']):
+            return self.__check_begin(token)
+        # Check if the span can be assigned the I-LABEL
+        elif token['start'] > int(self._spans[self._current_span]['start']) and self._inside is True:
+            return self.__check_inside(token)
+        # Check if the token is outside a span
+        elif self._inside is False and (token['end'] <= int(self._spans[self._current_span]['start'])):
+            return self._prefix_outside
+        else:
+            raise MismatchError(
+                'Span and Token mismatch - the span and tokens don\'t line up. There might be a tokenization issue '
+                'that needs to be fixed')
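
To make the docstring's "360 Longwood Avenue" example concrete, the sketch below runs hand-computed token offsets through NERTokenLabels under two notations; only the prefix of the final token differs (I- for BIO, L- for BILOU). The offsets are written out by hand for this illustrative string.

from ner_datasets.dataset_builder.labels import NERTokenLabels

tokens = [
    {'text': '360', 'start': 0, 'end': 3},
    {'text': 'Longwood', 'start': 4, 'end': 12},
    {'text': 'Avenue', 'start': 13, 'end': 19},
]
spans = [{'start': 0, 'end': 19, 'label': 'LOC'}]

for notation in ('BIO', 'BILOU'):
    # fresh instance per notation, since the labeller tracks its position in the span list
    labeller = NERTokenLabels(spans=list(spans), notation=notation)
    print(notation, [labeller.get_labels(token=tok) for tok in tokens])
# BIO   -> ['B-LOC', 'I-LOC', 'I-LOC']
# BILOU -> ['B-LOC', 'I-LOC', 'L-LOC']
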
ner_datasets/dataset_builder/sentence_dataset.py
ADDED
@@ -0,0 +1,355 @@
+from collections import deque
+from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple
+
+
+class SentenceDataset(object):
+    """
+    When we mention previous sentence and next sentence, we don't mean exactly one sentence
+    but rather a previous chunk and a next chunk. This can include one or more sentences and
+    it does not mean that the sentence has to be complete (it can be cut off in between) - hence a chunk.
+    This class is used to build a dataset at the sentence
+    level. It takes as input all the tokenized sentences in the note. So the input is
+    a list of lists where the outer list represents the sentences in the note and the inner list
+    is a list of tokens in the sentence. It then returns a dataset where each sentence is
+    concatenated with a previous and a next chunk. This is done so that when we build a model
+    we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc
+    will be computed and updated based on the current sentence. The previous and next chunks will
+    only be used to add context. We could have different sizes of previous and next chunks
+    depending on the position of the sentence etc. Essentially we build a sentence level dataset
+    where we can also provide context to the sentence by including the previous and next chunks
+    """
+
+    def __init__(
+            self,
+            max_tokens: int,
+            max_prev_sentence_token: int,
+            max_next_sentence_token: int,
+            default_chunk_size: int,
+            ignore_label: str
+    ) -> NoReturn:
+        """
+        Set the maximum token length a given training example (sentence level) can have.
+        That is the total length of the current sentence + previous chunk + next chunk.
+        We also set the maximum length of the previous and next chunks, that is how many
+        tokens can be in these chunks. However if the total length exceeds max_tokens, tokens in the
+        previous and next chunks will be dropped to ensure that the total length is < max_tokens.
+        The default chunk size ensures that the length of the chunks will be a minimum number of
+        tokens based on the value passed. For example if default_chunk_size=10, the length
+        of the previous chunks and next chunks will be at least 10 tokens.
+        Args:
+            max_tokens (int): maximum token length a given training example (sentence level) can have
+            max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence
+                                           (training/prediction example) in the note
+            max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence
+                                           (training/prediction example) in the note
+            default_chunk_size (int): the training example will always include a chunk of this length
+                                      as part of the previous and next chunks
+            ignore_label (str): The label assigned to the previous and next chunks to distinguish
+                                them from the current sentence
+        """
+        self._id_num = None
+        self._max_tokens = max_tokens
+        self._max_prev_sentence_token = max_prev_sentence_token
+        self._max_next_sentence_token = max_next_sentence_token
+        self._default_chunk_size = default_chunk_size
+        self._ignore_label = ignore_label
+
+    @staticmethod
+    def chunker(
+            seq: Sequence[Mapping[str, Union[str, int]]],
+            size: int
+    ) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
+        """
+        Return chunks of the sequence. The size of each chunk will be based
+        on the value passed to the size argument.
+        Args:
+            seq (Sequence): The sequence to be chunked
+            size (int): The max chunk size for the chunks
+        Return:
+            (Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of
+                                                                 the input sequence
+
+        """
+        return (seq[pos:pos + size] for pos in range(0, len(seq), size))
+
+    def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
+        """
+        Go through all the sentences in the medical note and create a list of
+        previous sentences. The output of this function will be a list of chunks
+        where each index of the list contains the sentences (chunks) - (tokens) present before
+        the sentence at that index in the medical note. For example prev_sent[0] will
+        be empty since there is no sentence before the first sentence in the note,
+        prev_sent[1] will be equal to sent[0], that is the previous sentence of the
+        second sentence will be the first sentence. We make use of a deque, where we
+        start to dequeue elements when it starts to exceed max_prev_sentence_token. This
+        list of previous sentences will be used to define the previous chunks
+        Args:
+            sent_tokens (Sequence[str]): Sentences in the note and
+                                         each element of the list contains a
+                                         list of tokens in that sentence
+        Returns:
+            previous_sentences (List[deque]): A list of deque objects where each index contains a
+                                              list (queue) of previous tokens (chunk) with respect
+                                              to the sentence represented by that index in the note
+        """
+        previous_sentences = list()
+        # Create a queue and specify the capacity of the queue
+        # Tokens will be popped from the queue when the capacity is exceeded
+        prev_sentence = deque(maxlen=self._max_prev_sentence_token)
+        # The first previous chunk is empty since the first sentence in the note does not have
+        # anything before it
+        previous_sentences.append(prev_sentence.copy())
+        # As we iterate through the list of sentences in the note, we add the tokens from the previous chunks
+        # to the queue. Since we have a queue, as soon as the capacity is exceeded we pop tokens from
+        # the queue
+        for sent_token in sent_tokens[:-1]:
+            for token in sent_token:
+                prev_sentence.append(token)
+            # As soon as each sentence in the list is processed
+            # we add a copy of the current queue to a list - this list keeps track of the
+            # previous chunks for a sentence
+            previous_sentences.append(prev_sentence.copy())
+
+        return previous_sentences
+
+    def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
+        """
+        Go through all the sentences in the medical note and create a list of
+        next sentences. The output of this function will be a list of lists
+        where each index of the list contains the list of sentences present after
+        the sentence at that index in the medical note. For example next_sent[-1] will
+        be empty since there is no sentence after the last sentence in the note,
+        next_sent[0] will be equal to sent[1:], that is the next sentence of the
+        first sentence will be the subsequent sentences. We make use of a deque, where we
+        start to dequeue elements when it starts to exceed max_next_sentence_token. This
+        list of next sentences will be used to define the next chunks
+        Args:
+            sent_tokens (Sequence[str]): Sentences in the note and each
+                                         element of the list contains a
+                                         list of tokens in that sentence
+        Returns:
+            next_sentences (List[deque]): A list of deque objects where each index contains a list (queue)
+                                          of next tokens (chunk) with respect to the sentence represented
+                                          by that index in the note
+        """
+        # A list of next sentences is first created and reversed
+        next_sentences = list()
+        # Create a queue and specify the capacity of the queue
+        # Tokens will be popped from the queue when the capacity is exceeded
+        next_sentence = deque(maxlen=self._max_next_sentence_token)
+        # The first (which becomes the last chunk when we reverse this list) next chunk is empty since
+        # the last sentence in the note does not have
+        # anything after it
+        next_sentences.append(next_sentence.copy())
+        for sent_token in reversed(sent_tokens[1:]):
+            for token in reversed(sent_token):
+                next_sentence.appendleft(token)
+            next_sentences.append(next_sentence.copy())
+        # The list is reversed - since we went through the sentences in the reverse order in
+        # the earlier steps
+        return [next_sent for next_sent in reversed(next_sentences)]
+
+    def get_sentences(
+            self,
+            sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
+            token_text_key: str = 'text',
+            label_key: str = 'label',
+            start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
+            end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
+            sub: bool = False
+    ) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
+        """
+        When we mention previous sentence and next sentence, we don't mean exactly one sentence
+        but rather a previous chunk and a next chunk. This can include one or more sentences and
+        it does not mean that the sentence has to be complete (it can be cut off in between) - hence a chunk.
+        We iterate through all the tokenized sentences in the note. So the input is
+        a list of lists where the outer list represents the sentences in the note and the inner list
+        is a list of tokens in the sentence. It then returns a dataset where each sentence is
+        concatenated with the previous and the next sentence. This is done so that when we build a model
+        we can use the previous and next sentence to add context to the model. The weights and loss etc
+        will be computed and updated based on the current sentence. The previous and next sentence will
+        only be used to add context. We could have different sizes of previous and next chunks
+        depending on the position of the sentence etc, since we split a note into several sentences which are
+        then used as training data.
+        ignore_label is used to differentiate between the current sentence and the previous and next
+        chunks. The chunks will have the label NA and the current sentence
+        will have the label (DATE, AGE etc) so that they can be distinguished.
+        If however we are building a dataset for predictions
+        the current sentence will have the default label O, but the next and previous chunks will still
+        have the label NA. However if the total length exceeds max_tokens, tokens in the
+        previous and next chunks will be dropped to ensure that the total length is < max_tokens.
+        The default chunk size ensures that the length of the chunks will be a minimum number of
+        tokens based on the value passed. For example if default_chunk_size=10, the length
+        of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens
+        even after decreasing the sizes of the previous and next chunks, then we split this long
+        sentence into sub sentences and repeat the process described above.
+        Args:
+            sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence
+                                                                             contains the tokens (dict) in that
+                                                                             sentence; the token dict object contains
+                                                                             the token text, start, end etc
+            token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text
+                                  key to extract the text of the token from the dictionary
+            label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key
+                             key to extract the label of the token from the dictionary (if it does not have a label
+                             the default label will be assigned)
+            start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence with some
+                                                                             pre-defined chunk
+            end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence with some
+                                                                           pre-defined chunk
+            sub (bool): Whether the function is called to process sub-sentences (used when we are splitting
+                        long sentences into smaller sub sentences to keep sentence length < max_tokens)
+        Returns:
+            (Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the
+                                                                                                   returned sentences,
+                                                                                                   where each sentence
+                                                                                                   has the previous
+                                                                                                   chunks and next
+                                                                                                   chunks attached
+                                                                                                   to it.
+        """
+        # Id num keeps track of the id of the sentence - that is the position the sentence occurs in
+        # the note. We keep the id of sub sentences the same as the sentence, so that the user
+        # knows that these sub sentences are chunked from a longer sentence.
+        # <SENT 0> <SENT 1>. Say length of sent 0 with the previous and next chunks is less than max_tokens
+        # we return sent 0 with id 0. For sent 1, say the length is longer, we split it into sub
+        # sentences - <SUB 1><SUB 2> - we return SUB 1, and SUB 2 with id 1 - so we know that it belongs
+        # to <SENT 1> in the note.
+        if not sub:
+            self._id_num = -1
+        # Initialize the object that will take all the sentences in the note and return
+        # a dataset where each row represents a sentence in the note. The sentence in each
+        # row will also contain a previous chunk and next chunk (tokens) that will act as context
+        # when training the model
+        # [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
+        # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
+        # provide context to the current sentence
+        # Get the previous sentences (chunks) for each sentence in the note
+        previous_sentences = self.get_previous_sentences(sent_tokens)
+        # Get the next sentences (chunks) for each sentence in the note
+        next_sentences = self.get_next_sentences(sent_tokens)
+        # For the note we are going to iterate through all the sentences in the note and
+        # concatenate each sentence with the previous and next chunks. (This forms the data that
+        # will be used for training/predictions) Each sentence with the concatenated chunks will be
+        # a training sample. We would do the same thing for getting predictions on a sentence as well
+        # The only difference would be the labels that are used. We would use the default label O for
+        # prediction and the annotated labels for training
+        if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
+            raise ValueError('Sentence length mismatch')
+        for index, (previous_sent, current_sent, next_sent) in enumerate(
+                zip(previous_sentences, sent_tokens, next_sentences)):
+            sent_tokens_text = list()
+            sent_labels = list()
+            sent_toks = list()
+            # Get the tokens and labels for the current sentence
+            for token in current_sent:
+                # We store this, if we need to process sub sentences when a sentence exceeds max_tokens
+                sent_toks.append(token)
+                sent_tokens_text.append(token[token_text_key])
+                sent_labels.append(token[label_key])
+            # We check if the number of tokens in the current sentence + previous chunk
+            # + next chunk exceeds max tokens. If it does we start popping tokens from the previous and next chunks
+            # until the number of tokens is equal to max tokens
+            previous_sent_length = len(previous_sent)
+            current_sent_length = len(sent_tokens_text)
+            next_sent_length = len(next_sent)
+            total_length = previous_sent_length + current_sent_length + next_sent_length
+            # If the length of the current sentence plus the length of the previous and next
+            # chunks exceeds the max_tokens, start popping tokens from the previous and next
+            # chunks until either total length < max_tokens or the number of tokens in the previous and
+            # next chunks goes below the default chunk size
+            while total_length > self._max_tokens and \
+                    (next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
+                if next_sent_length >= previous_sent_length:
+                    next_sent.pop()
+                    next_sent_length -= 1
+                    total_length -= 1
+                elif previous_sent_length > next_sent_length:
+                    previous_sent.popleft()
+                    previous_sent_length -= 1
+                    total_length -= 1
+            # If this is not a sub sentence, increment the ID to
+            # indicate the processing of the next sentence of the note
+            # If it is a sub sentence, keep the ID the same, to indicate
+            # it belongs to a larger sentence
+            if not sub:
+                self._id_num += 1
+            # If total length < max_tokens - process the sentence with the current sentence
+            # and add on the previous and next chunks and return
+            if total_length <= self._max_tokens:
+                # Check if we want to add a pre-defined chunk for the first sentence in the note
+                if index == 0 and start_chunk is not None:
+                    previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \
+                                           [prev_token[token_text_key] for prev_token in list(previous_sent)]
+                else:
+                    previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)]
+                # Check if we want to add a pre-defined chunk for the last sentence in the note
+                if index == len(sent_tokens) - 1 and end_chunk is not None:
+                    next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \
+                                       [chunk[token_text_key] for chunk in end_chunk]
+                else:
+                    next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)]
+                previous_sent_length = len(previous_sent_tokens)
+                next_sent_length = len(next_sent_tokens)
+                # Store information about the current sentence - start and end pos etc
+                # this can be used to distinguish from the next and previous chunks
+                # current_sent_info = {'token_info':current_sent}
+                # Assign a different label (the ignore label) to the chunks - since they are used only for context
+                previous_sent_labels = list()
+                next_sent_labels = list()
+                if self._ignore_label == 'NA':
+                    previous_sent_labels = [self._ignore_label] * previous_sent_length
+                    next_sent_labels = [self._ignore_label] * next_sent_length
+                elif self._ignore_label == 'label':
+                    if index == 0 and start_chunk is not None:
+                        previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \
+                                               [prev_token[label_key] for prev_token in list(previous_sent)]
+                    else:
+                        previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)]
+                    if index == len(sent_tokens) - 1 and end_chunk is not None:
+                        next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \
+                                           [chunk[label_key] for chunk in end_chunk]
+                    else:
+                        next_sent_labels = [next_token[label_key] for next_token in list(next_sent)]
+                # Concatenate the chunks and the sentence
+                # sent_tokens_text.append(token[token_text_key])
+                tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
+                labels_data = previous_sent_labels + sent_labels + next_sent_labels
+                # Return processed sentences
+                yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
+            # Process the sub sentences - we take a long sentence
+            # and split it into smaller chunks - and we recursively call the function on this list
+            # of smaller chunks - as mentioned before the smaller chunks (sub sentences) will have the
+            # same ID as the original sentence
+            else:
+                # Store the smaller chunks - say <SENT1> is too long
+                # <PREV CHUNK><SENT1><NEXT CHUNK>
+                # We chunk sent 1 into <SUB1><SUB2><SUB3> and we pass this [<SUB1><SUB2><SUB3>] to the function
+                # as a recursive call. This list is now processed as a smaller note that essentially belongs
+                # to a sentence. But as you can see we did not pass <PREV CHUNK> & <NEXT CHUNK>, because
+                # these are chunks that are not part of the current sentence, but they still need to be
+                # included in the final output - and the work around is mentioned below
+                # So that we have a previous chunk for <SUB1> and next chunk for <SUB3>
+                # we include the previous_sent_tokens and next_sent_tokens as the start chunk
+                # and the next chunk in the function call below
+                # <PREV CHUNK><SUB1><NEXT SUB1>, id = x
+                # <PREV SUB2><SUB2><NEXT SUB2>, id = x
+                # <PREV SUB3><SUB3><NEXT CHUNK>, id = x
+                sub_sentences = list()
+                # Prefix the first sentence in these smaller chunks
+                previous_sent_tokens = list(previous_sent)
+                # Suffix the last sentence in these smaller chunks
+                next_sent_tokens = list(next_sent)
+                # Get chunks
+                for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
+                    sub_sentences.append(chunk)
+                # Process list of smaller chunks
+                for sub_sent in self.get_sentences(
+                        sub_sentences,
+                        token_text_key,
+                        label_key,
+                        start_chunk=previous_sent_tokens,
+                        end_chunk=next_sent_tokens,
|
353 |
+
sub=True
|
354 |
+
):
|
355 |
+
yield sub_sent
|
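For illustration, a minimal, hedged sketch of the trimming logic above, reduced to a standalone function: context tokens are popped from whichever chunk is longer until the total fits the budget, never touching the current sentence. The function name trim_context and the toy inputs are hypothetical, and the previous chunk is assumed to be a deque (matching the popleft()/pop() calls in the diff).

    from collections import deque

    def trim_context(prev_chunk, current, next_chunk, max_tokens, default_chunk_size):
        # prev_chunk: deque of tokens, current/next_chunk: lists of tokens.
        # Pop context tokens until the total fits max_tokens, or both context
        # chunks are already at or below the default chunk size.
        total = len(prev_chunk) + len(current) + len(next_chunk)
        while total > max_tokens and (len(next_chunk) > default_chunk_size or
                                      len(prev_chunk) > default_chunk_size):
            if len(next_chunk) >= len(prev_chunk):
                next_chunk.pop()        # drop from the right edge of the next chunk
            else:
                prev_chunk.popleft()    # drop from the left edge of the previous chunk
            total -= 1
        return prev_chunk, current, next_chunk

    # Example: a 5-token sentence with 4-token context chunks, capped at 9 tokens total
    prev = deque(['a', 'b', 'c', 'd'])
    nxt = ['w', 'x', 'y', 'z']
    print(trim_context(prev, ['t1', 't2', 't3', 't4', 't5'], nxt, max_tokens=9, default_chunk_size=1))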
ner_datasets/dataset_creator.py
ADDED
@@ -0,0 +1,322 @@
import json
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from typing import Iterable, Dict, List, Union, Optional, Sequence, NoReturn

from .dataset_builder import Dataset, SentenceDataset
from .preprocessing import PreprocessingLoader

random.seed(41)


class DatasetCreator(object):
    """
    Build a NER token classification dataset.
    For training we build the dataset using the annotated spans (e.g. from prodigy);
    for predictions we assign default labels.
    The dataset is on the sentence level, i.e. each note is split into sentences and the de-id
    task is run on the sentence level. Predictions are also run on the sentence level.
    The dataset looks like:
    Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
    Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
    where each inner list represents a sentence - the tokens in the sentence and the respective
    labels for each token. The labels depend on the notation.
    This script can also be used for predictions, in which case the labels are filled with some
    default value. This is done so that we can use the same script to build a dataset to train a model
    and a dataset to obtain predictions using a model.
    Example:
    Note: Bruce Wayne Jr is a 60yo man. He lives in Gotham
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]
    We can also create sentences that use previous/next chunks as context - in this case the dataset
    would look something like this (assume we limit the size of the chunks to 3 tokens):
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
    NA indicates that the token is used only for context.
    """

    def __init__(
            self,
            sentencizer: str,
            tokenizer: str,
            abbreviations: Optional[Sequence[str]] = None,
            max_tokens: int = 128,
            max_prev_sentence_token: int = 32,
            max_next_sentence_token: int = 32,
            default_chunk_size: int = 32,
            ignore_label: str = 'NA'
    ) -> NoReturn:
        """
        Initialize the sentencizer and tokenizer
        Args:
            sentencizer (str): Specify which sentencizer you want to use
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split
                                                     - works only with the custom clinical tokenizer.
            max_tokens (int): The maximum number of tokens allowed in a sentence/training example,
                              truncate if it exceeds.
            max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
                                           sentence/training example
            max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
                                           sentence/training example.
            ignore_label (str): The label assigned to the previous and next chunks to distinguish
                                them from the current sentence
        """
        self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer)
        self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)
        # Initialize the object that will be used to get the tokens and the sentences
        self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)
        # Initialize the object that will take all the sentences in the note and return
        # a dataset where each row represents a sentence in the note. The sentence in each
        # row will also contain a previous chunk and next chunk (tokens) that will act as context
        # when training the model
        # [ps1, ps2, ... ps-i], [cs1, cs2, ... cs-j], [ns1, ns2, ... ns-k] - the current sentence,
        # which is the sentence we train on (or predict on), is in the middle - the surrounding tokens
        # provide context to the current sentence
        self._sentence_dataset = SentenceDataset(
            max_tokens=max_tokens,
            max_prev_sentence_token=max_prev_sentence_token,
            max_next_sentence_token=max_next_sentence_token,
            default_chunk_size=default_chunk_size,
            ignore_label=ignore_label
        )

    def create(
            self,
            input_file: str,
            mode: str = 'predict',
            notation: str = 'BIO',
            token_text_key: str = 'text',
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            label_key: str = 'labels',
            span_text_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]:
        """
        This function is used to get the sentences that will be part of the NER dataset.
        We check whether the note belongs to the desired dataset split. If it does,
        we fix any spans that can cause token-span alignment errors. Then we extract
        all the sentences in the notes and the tokens in each sentence. Finally we
        add some context tokens to the sentence if required. This function returns
        an iterable that iterates through each of the processed sentences.
        Args:
            input_file (str): Input jsonl file. Make sure the spans are in ascending order (based on start position)
            mode (str): Dataset being built for train or predict.
            notation (str): The NER labelling notation
            token_text_key (str): The key where the note text and token text is present in the json object
            metadata_key (str): The key where the note metadata is present in the json object
            note_id_key (str): The key where the note id is present in the json object
            label_key (str): The key where the token label will be stored in the json object
            span_text_key (str): The key where the note spans are present in the json object
        Returns:
            (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): Iterate through the processed
                                                                                       sentences/training examples
        """
        # Go through the notes
        for line in open(input_file, 'r'):
            note = json.loads(line)
            note_text = note[token_text_key]
            note_id = note[metadata_key][note_id_key]
            if mode == 'train':
                note_spans = note[span_text_key]
            # No spans in predict mode
            elif mode == 'predict':
                note_spans = None
            else:
                raise ValueError("Invalid mode - can only be train/predict")
            # Store the list of tokens in the sentence.
            # Eventually this list will contain all the tokens in the note (split on the sentence level).
            # Store the start and end positions of the sentence in the note. This can
            # be used later to reconstruct the note from the sentences.
            # We also store the note_id for each sentence so that we can map it back
            # to the note and therefore have all the sentences mapped back to the notes they belong to.
            sent_tokens = [sent_tok for sent_tok in self._dataset.get_tokens(
                text=note_text,
                spans=note_spans,
                notation=notation
            )]
            # The following loop goes through each sentence in the note and returns
            # the current sentence and the previous and next chunks that will be used for context.
            # The chunks will have a default label (e.g NA) to distinguish them from the current sentence
            # and so that we can ignore these chunks when calculating loss and updating weights
            # during training
            for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
                    sent_tokens=sent_tokens,
                    token_text_key=token_text_key,
                    label_key=label_key
            ):
                # Return the processed sentence. This sentence will then be used
                # by the model
                current_sent_info = ner_sentence['current_sent_info']
                note_sent_info_store = {'start': current_sent_info[0]['start'],
                                        'end': current_sent_info[-1]['end'], 'note_id': note_id}
                ner_sentence['note_sent_info'] = note_sent_info_store
                yield ner_sentence


def main():
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes. Spans need to be sorted in ascending order (based on start '
             'position)'
    )
    cli_parser.add_argument(
        '--notation',
        type=str,
        default='BIO',
        help='the notation we will be using for the label scheme'
    )
    cli_parser.add_argument(
        '--max_tokens',
        type=int,
        default=128,
        help='the max tokens that a given sentence (training/prediction example) in the note can have'
    )
    cli_parser.add_argument(
        '--default_chunk_size',
        type=int,
        default=32,
        help='the default chunk size for the previous and next chunks for a given sentence (training/prediction '
             'example) in the note'
    )
    cli_parser.add_argument(
        '--max_prev_sentence_token',
        type=int,
        default=32,
        help='the max chunk size for the previous chunks for a given sentence (training/prediction example) in the '
             'note'
    )
    cli_parser.add_argument(
        '--max_next_sentence_token',
        type=int,
        default=32,
        help='the max chunk size for the next chunks for a given sentence (training/prediction example) in the note'
    )
    cli_parser.add_argument(
        '--mode',
        type=str,
        choices=['train', 'predict'],
        required=True,
        help='whether we are building the dataset for training or prediction'
    )
    cli_parser.add_argument(
        '--sentencizer',
        type=str,
        required=True,
        help='the sentencizer to use for splitting notes into sentences'
    )
    cli_parser.add_argument(
        '--tokenizer',
        type=str,
        required=True,
        help='the tokenizer to use for splitting text into tokens'
    )
    cli_parser.add_argument(
        '--abbreviations',
        type=str,
        default=None,
        help='file that will be used by the clinical tokenizer to handle abbreviations'
    )
    cli_parser.add_argument(
        '--ignore_label',
        type=str,
        default='NA',
        help='the label assigned to the previous and next context chunks'
    )
    cli_parser.add_argument(
        '--token_text_key',
        type=str,
        default='text',
        help='the key where the note text is present in the json object'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    cli_parser.add_argument(
        '--label_key',
        type=str,
        default='label',
        help='the key where the label for each token is present in the json object'
    )
    cli_parser.add_argument(
        '--span_text_key',
        type=str,
        default='spans',
        help='the key where the note annotated spans are present in the json object'
    )
    cli_parser.add_argument(
        '--format',
        type=str,
        default='jsonl',
        help='format to store the dataset in: jsonl or conll'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        help='the file where the NER dataset will be stored'
    )
    args = cli_parser.parse_args()
    dataset_creator = DatasetCreator(
        sentencizer=args.sentencizer,
        tokenizer=args.tokenizer,
        abbreviations=args.abbreviations,
        max_tokens=args.max_tokens,
        max_prev_sentence_token=args.max_prev_sentence_token,
        max_next_sentence_token=args.max_next_sentence_token,
        default_chunk_size=args.default_chunk_size,
        ignore_label=args.ignore_label)
    ner_notes = dataset_creator.create(
        input_file=args.input_file,
        mode=args.mode,
        notation=args.notation,
        token_text_key=args.token_text_key,
        metadata_key=args.metadata_key,
        note_id_key=args.note_id_key,
        label_key=args.label_key,
        span_text_key=args.span_text_key
    )
    # Store the NER dataset in the desired format
    if args.format == 'jsonl':
        # Write the dataset to the output file
        with open(args.output_file, 'w') as file:
            for ner_sentence in ner_notes:
                file.write(json.dumps(ner_sentence) + '\n')
    elif args.format == 'conll':
        with open(args.output_file, 'w') as file:
            for ner_sentence in ner_notes:
                tokens = ner_sentence['tokens']
                labels = ner_sentence['labels']
                current_sent_info = ner_sentence['current_sent_info']
                note_id = ner_sentence['note_sent_info']['note_id']
                if len(tokens) != len(labels) or len(labels) != len(current_sent_info):
                    raise ValueError('Length mismatch')
                for token, label, sent_info in zip(tokens, labels, current_sent_info):
                    sent_info['note_id'] = note_id
                    data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n'
                    file.write(data)
                file.write('\n')


if __name__ == '__main__':
    main()
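A hedged usage sketch of the class above, driven from Python rather than the CLI. The package import path and the jsonl layout ({'text': ..., 'meta': {'note_id': ...}, 'spans': [...]}) are assumptions based on the defaults in create() and may differ in a real setup; the notes file name is hypothetical.

    # Hypothetical driver script; the import path and file name are assumptions.
    from ner_datasets.dataset_creator import DatasetCreator

    creator = DatasetCreator(
        sentencizer='en_core_sci_lg',   # one of the sentencizers handled by PreprocessingLoader
        tokenizer='clinical',           # the custom clinical tokenizer
        max_tokens=128,
        default_chunk_size=32,
        ignore_label='NA'
    )
    # Each yielded item has 'tokens', 'labels', 'current_sent_info' and 'note_sent_info'
    for sentence in creator.create(input_file='notes.jsonl', mode='train', notation='BIO'):
        print(sentence['tokens'], sentence['labels'])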
ner_datasets/dataset_splitter.py
ADDED
@@ -0,0 +1,294 @@
import json
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import Counter
from typing import NoReturn, List

from .distribution import NERDistribution, DatasetSplits, PrintDistribution

random.seed(41)


class DatasetSplitter(object):
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute the NER distribution in our dataset and, based on it, create and store
    a dictionary that records which notes (in a dataset) belong to which split.
    Based on this distribution and whether we want to keep certain notes
    grouped (e.g. by patient) we assign notes to a split, such that the
    final NER type distribution in each split is similar.
    """

    def __init__(
            self,
            train_proportion: int = 70,
            validation_proportion: int = 15,
            test_proportion: int = 15
    ) -> NoReturn:
        """
        Initialize the proportions of the splits.
        Args:
            train_proportion (int): Ratio of train dataset
            validation_proportion (int): Ratio of validation dataset
            test_proportion (int): Ratio of test dataset
        """
        self._train_proportion = train_proportion
        self._validation_proportion = validation_proportion
        self._test_proportion = test_proportion
        self._split = None
        self._lookup_split = dict()

    def get_split(self, split: str) -> List[str]:
        return [key for key in self._lookup_split[split].keys()]

    def set_split(self, split: str) -> NoReturn:
        """
        Set the split that you are currently checking/processing.
        Based on the split you can perform certain checks and
        computation. Once the split is set, we work with the hash map
        whose keys are the note_ids/patient ids that belong to the split.
        This hashmap can then be used to check if a particular note
        belongs to this split.
        Args:
            split (str): The split - train, validation or test
        """
        if split not in ['train', 'validation', 'test']:
            raise ValueError('Invalid split')
        self._split = split

    def __update_split(self, key: str) -> NoReturn:
        """
        Update the hash map whose keys (e.g. note_id) belong to the split. This hashmap
        can then be used to check if a particular note belongs to this split.
        Args:
            key (str): The key that identifies the note belonging to the split
        """
        self._lookup_split[self._split][key] = 1

    def check_note(self, key: str) -> bool:
        """
        Use the hash map built while assigning splits to check if the note
        belongs to the currently set split (train, validation, test).
        If it does, return True, else False.
        Args:
            key (str): The key that identifies the note belonging to the split
        Returns:
            (bool): True if the note belongs to the split, False otherwise
        """
        if self._split is None:
            raise ValueError('Split not set')
        if self._lookup_split[self._split].get(key, False):
            return True
        else:
            return False

    def assign_splits(
            self,
            input_file: str,
            spans_key: str = 'spans',
            metadata_key: str = 'meta',
            group_key: str = 'note_id',
            margin: float = 0.3
    ) -> NoReturn:
        """
        Build the dataset splits - training, validation & testing splits -
        based on the NER distribution and whether we want to keep certain
        notes grouped (e.g. by patient). Record, for each group key, which
        split its notes belong to; this can later be used to filter notes
        based on the splits.
        Args:
            input_file (str): The input file
            spans_key (str): The key where the note spans are present
            metadata_key (str): The key where the note metadata is present
            group_key (str): The key where the note group (e.g. note_id or patient id) is present.
                             This field is what the notes will be grouped by, and all notes belonging
                             to this grouping will be in the same split
            margin (float): Margin of error when maintaining proportions in the splits
        """
        # Compute the distribution of NER types in the grouped notes.
        # For example the distribution of NER types in all notes belonging to a
        # particular patient
        self._lookup_split = {
            'train': dict(),
            'validation': dict(),
            'test': dict()
        }
        ner_distribution = NERDistribution()
        for line in open(input_file, 'r'):
            note = json.loads(line)
            key = note[metadata_key][group_key]
            ner_distribution.update_distribution(spans=note[spans_key], key=key)
        # Initialize the dataset splits object
        dataset_splits = DatasetSplits(
            ner_distribution=ner_distribution,
            train_proportion=self._train_proportion,
            validation_proportion=self._validation_proportion,
            test_proportion=self._test_proportion,
            margin=margin
        )
        # Check the note and assign it to a split
        for line in open(input_file, 'r'):
            note = json.loads(line)
            key = note[metadata_key][group_key]
            split = dataset_splits.get_split(key=key)
            self.set_split(split)
            self.__update_split(key)
        return None


def main() -> NoReturn:
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute the NER distribution in our dataset. Based on this distribution
    and whether we want to keep certain notes grouped (e.g. by patient)
    we assign notes to a split, such that the final NER type distribution
    in each split is similar.
    """
    # Compute the distribution of NER types in the grouped notes.
    # For example the distribution of NER types in all notes belonging to a
    # particular patient
    # The following code sets up the arguments to be passed via CLI or via a JSON file
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--spans_key',
        type=str,
        default='spans',
        help='the key where the note spans are present in the json object'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--group_key',
        type=str,
        default='note_id',
        help='the key to group notes by in the json object'
    )
    cli_parser.add_argument(
        '--train_proportion',
        type=int,
        default=70,
        help='ratio of train dataset'
    )
    cli_parser.add_argument(
        '--train_file',
        type=str,
        default=None,
        help='the file to store the train data'
    )
    cli_parser.add_argument(
        '--validation_proportion',
        type=int,
        default=15,
        help='ratio of validation dataset'
    )
    cli_parser.add_argument(
        '--validation_file',
        type=str,
        default=None,
        help='the file to store the validation data'
    )
    cli_parser.add_argument(
        '--test_proportion',
        type=int,
        default=15,
        help='ratio of test dataset'
    )
    cli_parser.add_argument(
        '--test_file',
        type=str,
        default=None,
        help='the file to store the test data'
    )
    cli_parser.add_argument(
        '--margin',
        type=float,
        default=0.3,
        help='margin of error when maintaining proportions in the splits'
    )
    cli_parser.add_argument(
        '--print_dist',
        action='store_true',
        help='whether to print the label distribution in the splits'
    )
    args = cli_parser.parse_args()
    dataset_splitter = DatasetSplitter(
        train_proportion=args.train_proportion,
        validation_proportion=args.validation_proportion,
        test_proportion=args.test_proportion
    )
    dataset_splitter.assign_splits(
        input_file=args.input_file,
        spans_key=args.spans_key,
        metadata_key=args.metadata_key,
        group_key=args.group_key,
        margin=args.margin
    )

    if args.train_proportion > 0:
        with open(args.train_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('train')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.validation_proportion > 0:
        with open(args.validation_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('validation')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.test_proportion > 0:
        with open(args.test_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('test')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.print_dist:
        # Read the input file and compute the NER type distribution
        key_counts = Counter()
        ner_distribution = NERDistribution()
        for line in open(args.input_file, 'r'):
            note = json.loads(line)
            key = note[args.metadata_key][args.group_key]
            key_counts[key] += 1
            ner_distribution.update_distribution(spans=note[args.spans_key], key=key)
        print_distribution = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
        train_splits = dataset_splitter.get_split('train')
        validation_splits = dataset_splitter.get_split('validation')
        test_splits = dataset_splitter.get_split('test')
        all_splits = train_splits + validation_splits + test_splits
        # Print distribution for each split
        print_distribution.split_distribution(split='total', split_info=all_splits)
        print_distribution.split_distribution(split='train', split_info=train_splits)
        print_distribution.split_distribution(split='validation', split_info=validation_splits)
        print_distribution.split_distribution(split='test', split_info=test_splits)


if __name__ == "__main__":
    main()
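A hedged sketch of using the splitter programmatically rather than through main(); the import path, the notes file name and the example key 'note_001' are assumptions, and the jsonl is assumed to carry spans under 'spans' and the group key under meta.note_id as in the defaults above.

    # Hypothetical usage; file names and keys are assumptions.
    from ner_datasets.dataset_splitter import DatasetSplitter

    splitter = DatasetSplitter(train_proportion=70, validation_proportion=15, test_proportion=15)
    splitter.assign_splits(input_file='notes.jsonl', group_key='note_id', margin=0.3)
    # After assignment, check which split a given note (or patient group) landed in
    splitter.set_split('train')
    print(splitter.check_note('note_001'))      # True if note_001 was assigned to train
    print(len(splitter.get_split('train')))     # number of grouped keys in the train split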
ner_datasets/distribution/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .dataset_splits import DatasetSplits
from .ner_distribution import NERDistribution
from .print_distribution import PrintDistribution
__all__ = ["DatasetSplits", "NERDistribution", "PrintDistribution"]
ner_datasets/distribution/dataset_splits.py
ADDED
@@ -0,0 +1,218 @@
import random
from collections import Counter
from typing import NoReturn

from .ner_distribution import NERDistribution

random.seed(41)


class DatasetSplits(object):
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute the NER distribution in the dataset. Based on this we assign
    notes to different splits while keeping the distribution of
    NER types in each split similar.
    Keep track of the split information - which notes are present in which split,
    the label distribution in each split, and the number of notes in each split.
    """

    def __init__(
            self,
            ner_distribution: NERDistribution,
            train_proportion: int,
            validation_proportion: int,
            test_proportion: int,
            margin: float
    ) -> NoReturn:
        """
        Maintain split information. Assign notes based on the proportion of
        the splits, while keeping the label distribution in each split similar.
        Keep track of the split information - which notes are present in which split,
        the label distribution in each split, and the number of notes in each split.
        These will be used to assign the different notes to different
        splits while keeping the proportion of NER types similar in each split.
        Get the maximum number of NER objects that can be present in the train,
        validation and test splits. The total count will be used to
        calculate the current proportion of NER types in the split. This can be used
        to keep the proportion of NER types consistent among different splits.
        Args:
            ner_distribution (NERDistribution): The NER distribution in the dataset
            train_proportion (int): Ratio of train dataset
            validation_proportion (int): Ratio of validation dataset
            test_proportion (int): Ratio of test dataset
            margin (float): Margin by which the label distribution can be exceeded in the split
        """
        self._ner_distribution = ner_distribution
        # Compute the counts of NER types in the entire dataset
        total_distribution = Counter()
        for key, counts in ner_distribution.get_ner_distribution().items():
            for label, count in counts.items():
                total_distribution[label] += count
        # Compute the percentages of NER types in the entire dataset
        self._total_ner = sum(total_distribution.values())
        self._label_dist_percentages = {
            ner_type: float(count) / self._total_ner * 100 if self._total_ner else 0
            for ner_type, count in total_distribution.items()
        }
        self._margin = margin
        # The three splits
        self._splits = ['train', 'validation', 'test']
        self._split_weights = None
        self._splits_info = None
        # Keep track of the patient_ids that have been processed.
        # A patient can have multiple notes and we already know the
        # ner distribution for this patient across all the notes (i.e the ner types
        # and counts that appear in all the notes associated with this patient).
        # We also keep all the notes associated with a patient in the same split,
        # so we check if adding all the notes associated with this patient will
        # disturb the ner distribution (proportions) as mentioned before.
        self._processed_keys = dict()
        # Based on these proportions we compute train_ner_count, validation_ner_count, test_ner_count.
        # Say the proportions are 85, 10, 5.
        # The train split will have a maximum of 85% of the overall ner, validation will have 10 and test 5.
        # That is, if the total count of all ner is 100, on splitting the dataset
        # the train split will have a total of 85 ner, the validation split a total of 10 ner and the
        # test split a total of 5 ner
        train_ner_count = int(train_proportion * self._total_ner / 100)
        validation_ner_count = int(validation_proportion * self._total_ner / 100)
        test_ner_count = int(test_proportion * self._total_ner / 100)
        # So based on this, we check if adding a note keeps the balance in proportion or not.
        # If it does not, we check the splits given in the "remain" field of the dict (which is
        # the 2 other splits)
        self._split_weights = [train_proportion, validation_proportion, test_proportion]
        # Based on the split proportions, ner counts and ner distribution
        # we need to split our dataset into train, validation and test splits.
        # For each split we try and maintain the same distribution (proportions) between ner types
        # that we computed from the entire dataset (given by ner_distribution).
        # If the entire dataset had AGE:50%, DATE:30%, LOC:20%, we want the same proportions
        # in each of the train, validation and test splits.
        self._splits_info = {'train': {'remain': ['validation', 'test'],
                                       'total': train_ner_count,
                                       'remain_weights': [validation_proportion, test_proportion],
                                       'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
                             'validation': {'remain': ['train', 'test'],
                                            'total': validation_ner_count,
                                            'remain_weights': [train_proportion, test_proportion],
                                            'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
                             'test': {'remain': ['validation', 'train'],
                                      'total': test_ner_count,
                                      'remain_weights': [validation_proportion, train_proportion],
                                      'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()}}

    def __set_split(self, split: str) -> NoReturn:
        """
        Set the split that you are currently checking/processing.
        Based on the split you can perform certain checks and
        computation for that split.
        Args:
            split (str): The split - train, validation or test
        """
        self._split = split

    def __update_label_dist(self, distribution: Counter) -> NoReturn:
        """
        Once we have determined that a note can be added to the split we need to
        update the current count of the ner types in the split. So we pass the ner counts
        of the note and update the counts of the ner types in the split.
        Args:
            distribution (Counter): Contains the ner types and their counts (distribution)
        """
        self._splits_info[self._split]['label_dist'].update(distribution)

    def __update_groups(self, note_group_key: str) -> NoReturn:
        """
        Once we have determined that a note can be added to the split, we append
        some distinct element of the note (e.g note_id) to a list. This list will
        contain the note_ids of the notes that belong to this split.
        Args:
            note_group_key (str): Contains the note grouping key - e.g note_id, patient id etc
        """
        self._processed_keys[note_group_key] = self._split
        self._splits_info[self._split]['groups'].append(note_group_key)

    def __check_split(self, distribution: Counter) -> bool:
        """
        Check the resulting ner distribution in the split if this note were added to it.
        We check how the proportion of ner changes if this note is added to the split.
        If the proportion exceeds the desired proportion then we return False
        to indicate that adding this note would upset the ner distribution across splits, so we should
        instead try adding this note to another split. If it does not upset the balance then we return
        True, which means we can add this note to this split. The desired proportion of ner is given
        by the percentages computed from the entire dataset - one for each ner type.
        Args:
            distribution (Counter): Contains the mapping between ner type and count
        Returns:
            (bool): True if the note can be added to the split, False otherwise
        """
        # Get the current ner types and counts in the split
        split_label_dist = self._splits_info[self._split]['label_dist']
        # Get the max ner count that can be present in the split.
        # This will be used to compute the ner proportions in the split
        split_total = self._splits_info[self._split]['total']
        # Check if the proportion of the split picked is zero
        # and return False because we can't add any note to this split
        if split_total == 0:
            return False
        for ner_type, count in distribution.items():
            percentage = (split_label_dist.get(ner_type, 0) + count) / split_total * 100
            # Check if the proportion on adding this note exceeds the desired proportion
            # within the margin of error.
            # If it does, return False
            if percentage > self._label_dist_percentages[ner_type] + self._margin:
                return False
        return True

    def get_split(self, key: str) -> str:
        """
        Assign a split to the note - based on the distribution of ner types in the note
        and the distribution of ner types in the split. Essentially assign a note to a split
        such that the distribution of ner types in each split is similar, once all notes have
        been assigned to their respective splits.
        Args:
            key (str): The note id or patient id of the note (some grouping key)
        Returns:
            (str): The split
        """
        current_splits = self._splits
        current_weights = self._split_weights
        distribution = self._ner_distribution.get_group_distribution(key=key)
        if self._processed_keys.get(key, False):
            return self._processed_keys[key]
        while True:
            # Pick and set the split
            check_split = random.choices(current_splits, current_weights)[0]
            self.__set_split(check_split)
            # Get the ner distribution for this particular patient (across all the notes associated
            # with this patient) and check if the notes can be added to this split.
            # The margin of error for the ner proportions: as said above we try and keep the proportions
            # across the splits the same, but we allow for some flexibility, so we can go +- the amount
            # given by margin.
            include = self.__check_split(distribution=distribution)
            if include:
                self.__update_groups(key)
                self.__update_label_dist(distribution=distribution)
                return check_split
            else:
                # Check the two other possible splits
                if len(current_splits) == 3:
                    current_splits = self._splits_info[check_split]['remain']
                    current_weights = self._splits_info[check_split]['remain_weights']
                # Check the one other possible split (when one of the above two other split checks returns False)
                elif len(current_splits) == 2 and current_weights[1 - current_splits.index(check_split)] != 0:
                    index = current_splits.index(check_split)
                    current_splits = [current_splits[1 - index]]
                    current_weights = [100]
                # If it can't be added to any split - choose a split randomly
                else:
                    current_splits = self._splits
                    current_weights = self._split_weights
                    check_split = random.choices(current_splits, current_weights)[0]
                    self.__set_split(check_split)
                    self.__update_groups(key)
                    self.__update_label_dist(distribution=distribution)
                    return check_split
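To make the proportion check concrete, here is a toy, self-contained re-statement of the test performed in __check_split (the numbers are invented): the note is rejected if adding its counts would push any NER type above its dataset-wide percentage plus the margin.

    # Toy illustration of the proportion check, assuming an overall distribution of
    # AGE 50%, DATE 30%, LOC 20% and a margin of 0.3 percentage points.
    from collections import Counter

    split_label_dist = Counter({'AGE': 40, 'DATE': 25, 'LOC': 15})   # counts already in the split
    split_total = 100                                                # max NER count allowed in the split
    note_dist = Counter({'AGE': 12, 'LOC': 2})                       # counts in the candidate note
    percentages = {'AGE': 50.0, 'DATE': 30.0, 'LOC': 20.0}
    margin = 0.3

    fits = all(
        (split_label_dist.get(t, 0) + c) / split_total * 100 <= percentages[t] + margin
        for t, c in note_dist.items()
    )
    print(fits)  # False: AGE would reach 52%, above 50% + 0.3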
ner_datasets/distribution/ner_distribution.py
ADDED
@@ -0,0 +1,54 @@
from collections import Counter, defaultdict
from typing import Sequence, Mapping, NoReturn


class NERDistribution(object):
    """
    Store the distribution of ner types based on some key.
    That is, we store the NER type distribution for some given key value and we update
    the distribution when spans related to that key are passed.
    """

    def __init__(self) -> NoReturn:
        """
        Initialize the NER type - count mapping
        """
        # Counter that captures the ner types and counts per patient/note_id in the dataset,
        # depending on what we set the group_key as. Basically gather counts with respect
        # to some grouping of the notes
        # E.g - {PATIENT 1: {AGE: 99, DATE: 55, ...}, PATIENT 2: {AGE: 5, DATE: 9, ...}, ...}
        self._ner_distribution = defaultdict(Counter)

    def update_distribution(self, spans: Sequence[Mapping[str, str]], key: str) -> NoReturn:
        """
        Update the distribution of ner types for the given key
        Args:
            spans (Sequence[Mapping[str, str]]): The list of spans in the note
            key (str): The note id or patient id of the note (some grouping)
        """
        # Go through the spans in the note and compute the ner distribution.
        # Compute both the overall ner distribution and the ner distribution per
        # patient (i.e the ner types in all the notes associated with the patient)
        if not self._ner_distribution.get(key, False):
            self._ner_distribution[key] = Counter()
        for span in spans:
            self._ner_distribution[key][span['label']] += 1

    def get_ner_distribution(self) -> defaultdict:
        """
        Return the overall ner distribution - the NER type distribution for every key.
        Returns:
            ner_distribution (defaultdict(Counter)): Overall NER type distribution for all keys
        """
        return self._ner_distribution

    def get_group_distribution(self, key: str) -> Counter:
        """
        Return the NER type distribution for the given key
        Returns:
            (Counter): ner distribution w.r.t some grouping (key)
        """
        if key in self._ner_distribution.keys():
            return self._ner_distribution[key]
        else:
            raise ValueError('Key not found')
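A small usage sketch with made-up spans; only the 'label' field of each span is read by update_distribution, and the import path is an assumption.

    from ner_datasets.distribution import NERDistribution   # import path is an assumption

    dist = NERDistribution()
    dist.update_distribution(spans=[{'label': 'AGE'}, {'label': 'DATE'}], key='patient_1')
    dist.update_distribution(spans=[{'label': 'AGE'}], key='patient_1')
    print(dist.get_group_distribution('patient_1'))   # Counter({'AGE': 2, 'DATE': 1})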
ner_datasets/distribution/print_distribution.py
ADDED
@@ -0,0 +1,49 @@
from collections import Counter
from typing import Sequence, NoReturn

from .ner_distribution import NERDistribution


class PrintDistribution(object):
    """
    This class is used to print the distribution of NER types
    """

    def __init__(self, ner_distribution: NERDistribution, key_counts: Counter) -> NoReturn:
        """
        Initialize
        Args:
            ner_distribution (NERDistribution): NERDistribution object that keeps track of the NER type distributions
            key_counts (Counter): Number of notes per key/group (e.g note_ids, patient ids etc)
        """
        self._ner_distribution = ner_distribution
        self._key_counts = key_counts

    def split_distribution(self, split: str, split_info: Sequence[str]) -> NoReturn:
        """
        Print the NER type distribution
        Args:
            split (str): The dataset split
            split_info (Sequence[str]): The keys belonging to that split
        """
        split_distribution = Counter()
        number_of_notes = 0
        for key in split_info:
            number_of_notes += self._key_counts[key]
            split_distribution.update(self._ner_distribution.get_group_distribution(key))
        total_ner = sum(split_distribution.values())
        percentages = {ner_type: float(count) / total_ner * 100 if total_ner else 0
                       for ner_type, count in split_distribution.items()}
        print('{:^70}'.format('============ ' + split.upper() + ' NER Distribution ============='))
        print('{:<20}{:<10}'.format('Number of Notes: ', number_of_notes))
        print('{:<20}{:<10}\n'.format('Number of Groups: ', len(split_info)))
        for ner_type, count in split_distribution.most_common():
            print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
                'NER Type: ', ner_type,
                'Count: ', count,
                'Percentage: ', '{:0.2f}'.format(percentages[ner_type]))
            )
        print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
            'NER Type:', 'TOTALS', 'Count: ', total_ner, 'Percentage: ', '{:0.2f}'.format(100))
        )
        print('\n')
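A hedged sketch of printing the distribution for a hand-built split, reusing the NERDistribution example above; the key counts and key name are made up, and the import path is an assumption.

    from collections import Counter
    from ner_datasets.distribution import NERDistribution, PrintDistribution  # path assumed

    dist = NERDistribution()
    dist.update_distribution(spans=[{'label': 'AGE'}, {'label': 'DATE'}], key='patient_1')
    printer = PrintDistribution(ner_distribution=dist, key_counts=Counter({'patient_1': 3}))
    printer.split_distribution(split='train', split_info=['patient_1'])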
ner_datasets/preprocessing/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .preprocessing_loader import PreprocessingLoader
__all__ = ["PreprocessingLoader"]
ner_datasets/preprocessing/preprocessing_loader.py
ADDED
@@ -0,0 +1,63 @@
from typing import Union, Optional, Sequence

from .sentencizers import SpacySentencizer, NoteSentencizer
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer


class PreprocessingLoader(object):

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize the sentencizer.
        We can either use a spacy model (en_core_sci_lg or en_core_web_sm) or
        consider the entire note as a single sentence.
        Args:
            sentencizer (str): Specify which sentencizer you want to use
        Returns:
            Union[SpacySentencizer, NoteSentencizer]: An object of the requested sentencizer class
        """
        if sentencizer == 'en_core_sci_lg':
            return SpacySentencizer(spacy_model='en_core_sci_lg')
        elif sentencizer == 'en_core_web_sm':
            return SpacySentencizer(spacy_model='en_core_web_sm')
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
            tokenizer: str,
            abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize the tokenizer based on the CLI arguments.
        We can either use the default scispacy (en_core_sci_lg or en_core_web_sm)
        or the modified scispacy (with regex rules) tokenizer.
        It also supports the corenlp tokenizer.
        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split -
                                                     works only with the custom clinical tokenizer
        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class
        """
        if tokenizer == 'en_core_sci_lg':
            return SpacyTokenizer(spacy_model='en_core_sci_lg')
        elif tokenizer == 'en_core_web_sm':
            return SpacyTokenizer(spacy_model='en_core_web_sm')
        elif tokenizer == 'en':
            return SpacyTokenizer(spacy_model='en')
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # Abbreviations - we won't split tokens that match these (e.g 18F-FDG);
            # abbreviations may be None, in which case no abbreviation rules are applied
            return ClinicalSpacyTokenizer(spacy_model='en_core_sci_lg', abbreviations=abbreviations)
        else:
            raise ValueError('Invalid tokenizer - does not exist')
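A hedged usage sketch of the loader; the import path is an assumption and the requested model names must match what is installed locally.

    from ner_datasets.preprocessing import PreprocessingLoader   # import path is an assumption

    sentencizer = PreprocessingLoader.get_sentencizer('note')        # whole note as one sentence
    tokenizer = PreprocessingLoader.get_tokenizer('en_core_web_sm')  # plain spacy tokenizer
    print(type(sentencizer).__name__, type(tokenizer).__name__)      # NoteSentencizer SpacyTokenizer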
ner_datasets/preprocessing/sentencizers/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .note_sentencizer import NoteSentencizer
from .spacy_sentencizer import SpacySentencizer
__all__ = ["NoteSentencizer", "SpacySentencizer"]
ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py
ADDED
@@ -0,0 +1,37 @@
1 |
+
from typing import Iterable, Dict, Union
|
2 |
+
|
3 |
+
import stanza
|
4 |
+
|
5 |
+
|
6 |
+
class MimicStanzaSentencizer(object):
|
7 |
+
"""
|
8 |
+
This class is used to read text and split it into
|
9 |
+
sentences (and their start and end positions)
|
10 |
+
using the mimic stanza package
|
11 |
+
"""
|
12 |
+
|
13 |
+
def __init__(self, package: str):
|
14 |
+
"""
|
15 |
+
Initialize a mimic stanza model to read text and split it into
|
16 |
+
sentences.
|
17 |
+
Args:
|
18 |
+
package (str): Name of the mimic model
|
19 |
+
"""
|
20 |
+
self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=True)
|
21 |
+
|
22 |
+
def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
23 |
+
"""
|
24 |
+
Return an integrator that iterates through the sentences in the text
|
25 |
+
Args:
|
26 |
+
text (str): The text
|
27 |
+
Returns:
|
28 |
+
(Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
|
29 |
+
the start position of the sentence in the entire text
|
30 |
+
and the end position of the sentence in the entire text
|
31 |
+
"""
|
32 |
+
doc = self._nlp(text)
|
33 |
+
for sentence in doc.sentences:
|
34 |
+
yield {'text': sentence.text,
|
35 |
+
'start': sentence.tokens[0].start_char,
|
36 |
+
'end': sentence.tokens[-1].end_char,
|
37 |
+
'last_token': sentence.tokens[-1].text}
|
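A hedged sketch of how MimicStanzaSentencizer might be used (not part of the commit): it assumes the stanza 'mimic' tokenize package has been downloaded, and it imports the class by module path because it is not re-exported from the sentencizers __init__. The example note is invented.

# Sketch only; requires the stanza mimic package (one-time download below).
import stanza
from ner_datasets.preprocessing.sentencizers.mimic_stanza_sentencizer import MimicStanzaSentencizer

stanza.download('en', package='mimic', processors='tokenize')
sentencizer = MimicStanzaSentencizer(package='mimic')
for sentence in sentencizer.get_sentences('Pt admitted overnight. Stable this morning.'):
    print(sentence['start'], sentence['end'], repr(sentence['text']), sentence['last_token'])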
ner_datasets/preprocessing/sentencizers/note_sentencizer.py
ADDED
@@ -0,0 +1,33 @@
1 |
+
from typing import Iterable, Dict, Union
|
2 |
+
|
3 |
+
|
4 |
+
class NoteSentencizer(object):
|
5 |
+
"""
|
6 |
+
This class is used to read text and split it into
|
7 |
+
sentences (and their start and end positions)
|
8 |
+
This class considers an entire note or text as
|
9 |
+
a single sentence
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self):
|
13 |
+
"""
|
14 |
+
Nothing to be initialized
|
15 |
+
"""
|
16 |
+
|
17 |
+
def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
18 |
+
"""
|
19 |
+
Return an iterator that iterates through the sentences in the text.
|
20 |
+
In this case it just returns the text itself.
|
21 |
+
Args:
|
22 |
+
text (str): The text
|
23 |
+
Returns:
|
24 |
+
(Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
|
25 |
+
the start position of the sentence in the entire text
|
26 |
+
and the end position of the sentence in the entire text
|
27 |
+
"""
|
28 |
+
yield {
|
29 |
+
'text': text,
|
30 |
+
'start': 0,
|
31 |
+
'end': len(text),
|
32 |
+
'last_token': None
|
33 |
+
}
|
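For contrast with the spacy and stanza sentencizers, a small sketch (invented example, not part of the commit) showing that NoteSentencizer yields the whole note as a single sentence:

from ner_datasets.preprocessing.sentencizers import NoteSentencizer

sentencizer = NoteSentencizer()
note = 'Line one.\nLine two.'
# A single 'sentence' spanning the entire note is yielded.
print(list(sentencizer.get_sentences(note)))
# [{'text': 'Line one.\nLine two.', 'start': 0, 'end': 19, 'last_token': None}]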
ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py
ADDED
@@ -0,0 +1,37 @@
1 |
+
from typing import Iterable, Dict, Union
|
2 |
+
|
3 |
+
import spacy
|
4 |
+
|
5 |
+
|
6 |
+
class SpacySentencizer(object):
|
7 |
+
"""
|
8 |
+
This class is used to read text and split it into
|
9 |
+
sentences (and their start and end positions)
|
10 |
+
using a spacy model
|
11 |
+
"""
|
12 |
+
|
13 |
+
def __init__(self, spacy_model: str):
|
14 |
+
"""
|
15 |
+
Initialize a spacy model to read text and split it into
|
16 |
+
sentences.
|
17 |
+
Args:
|
18 |
+
spacy_model (str): Name of the spacy model
|
19 |
+
"""
|
20 |
+
self._nlp = spacy.load(spacy_model)
|
21 |
+
|
22 |
+
def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
23 |
+
"""
|
24 |
+
Return an iterator that iterates through the sentences in the text
|
25 |
+
Args:
|
26 |
+
text (str): The text
|
27 |
+
Returns:
|
28 |
+
(Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
|
29 |
+
the start position of the sentence in the entire text
|
30 |
+
and the end position of the sentence in the entire text
|
31 |
+
"""
|
32 |
+
document = self._nlp(text)
|
33 |
+
for sentence in document.sents:
|
34 |
+
yield {'text': sentence.text,
|
35 |
+
'start': sentence.start_char,
|
36 |
+
'end': sentence.end_char,
|
37 |
+
'last_token': None}
|
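A small sketch of SpacySentencizer (not part of the commit), assuming en_core_web_sm is installed; it checks that the returned offsets index back into the original text.

from ner_datasets.preprocessing.sentencizers import SpacySentencizer

sentencizer = SpacySentencizer(spacy_model='en_core_web_sm')
text = 'No acute distress. Plan to discharge tomorrow.'
for sentence in sentencizer.get_sentences(text):
    # start/end are character offsets into the original text.
    assert text[sentence['start']:sentence['end']] == sentence['text']
    print(sentence['start'], sentence['end'], sentence['text'])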
ner_datasets/preprocessing/tokenizers/__init__.py
ADDED
@@ -0,0 +1,4 @@
1 |
+
from .spacy_tokenizer import SpacyTokenizer
|
2 |
+
from .core_nlp_tokenizer import CoreNLPTokenizer
|
3 |
+
from .clinical_spacy_tokenizer import ClinicalSpacyTokenizer
|
4 |
+
__all__=["SpacyTokenizer", "CoreNLPTokenizer", "ClinicalSpacyTokenizer"]
|
ner_datasets/preprocessing/tokenizers/abbreviations/check.txt
ADDED
@@ -0,0 +1,20 @@
1 |
+
sec.
|
2 |
+
secs.
|
3 |
+
Sec.
|
4 |
+
Secs.
|
5 |
+
fig.
|
6 |
+
figs.
|
7 |
+
Fig.
|
8 |
+
Figs.
|
9 |
+
eq.
|
10 |
+
eqs.
|
11 |
+
Eq.
|
12 |
+
Eqs.
|
13 |
+
no.
|
14 |
+
nos.
|
15 |
+
No.
|
16 |
+
Nos.
|
17 |
+
al.
|
18 |
+
gen.
|
19 |
+
sp.
|
20 |
+
nov.
|
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt
ADDED
@@ -0,0 +1,87 @@
1 |
+
-ve
|
2 |
+
+ve
|
3 |
+
a.c.
|
4 |
+
a/g
|
5 |
+
b.i.d.
|
6 |
+
C&S
|
7 |
+
C/O
|
8 |
+
D/C
|
9 |
+
D&C
|
10 |
+
D and C
|
11 |
+
H&H
|
12 |
+
H&P
|
13 |
+
h.s.
|
14 |
+
H/O
|
15 |
+
h/o
|
16 |
+
I&D
|
17 |
+
M/H
|
18 |
+
N/V
|
19 |
+
O&P
|
20 |
+
O.D.
|
21 |
+
O.S.
|
22 |
+
O.U.
|
23 |
+
p¯
|
24 |
+
p.o.
|
25 |
+
p.r.n.
|
26 |
+
q.d.
|
27 |
+
q.i.d.
|
28 |
+
R/O
|
29 |
+
s/p
|
30 |
+
T&A
|
31 |
+
t.i.d.
|
32 |
+
u/a
|
33 |
+
u**
|
34 |
+
y.o.
|
35 |
+
F/u
|
36 |
+
Crohn's
|
37 |
+
R.N.
|
38 |
+
S/p
|
39 |
+
S/P
|
40 |
+
s/P
|
41 |
+
N/A
|
42 |
+
n/a
|
43 |
+
N/a
|
44 |
+
n/A
|
45 |
+
w/
|
46 |
+
Pt.
|
47 |
+
pt.
|
48 |
+
PT.
|
49 |
+
cf.
|
50 |
+
CF.
|
51 |
+
Cf.
|
52 |
+
dr.
|
53 |
+
DR.
|
54 |
+
Dr.
|
55 |
+
ft.
|
56 |
+
FT.
|
57 |
+
Ft.
|
58 |
+
lt.
|
59 |
+
LT.
|
60 |
+
Lt.
|
61 |
+
mr.
|
62 |
+
MR.
|
63 |
+
Mr.
|
64 |
+
ms.
|
65 |
+
MS.
|
66 |
+
Ms.
|
67 |
+
mt.
|
68 |
+
MT.
|
69 |
+
Mt.
|
70 |
+
mx.
|
71 |
+
MX.
|
72 |
+
Mx.
|
73 |
+
ph.
|
74 |
+
PH.
|
75 |
+
Ph.
|
76 |
+
rd.
|
77 |
+
RD.
|
78 |
+
Rd.
|
79 |
+
st.
|
80 |
+
ST.
|
81 |
+
St.
|
82 |
+
vs.
|
83 |
+
VS.
|
84 |
+
Vs.
|
85 |
+
wm.
|
86 |
+
WM.
|
87 |
+
Wm.
|
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt
ADDED
@@ -0,0 +1,459 @@
1 |
+
+ve
|
2 |
+
x/12
|
3 |
+
x/40
|
4 |
+
x/52
|
5 |
+
x/7
|
6 |
+
18F-FDG
|
7 |
+
2°
|
8 |
+
2/2
|
9 |
+
3TC
|
10 |
+
5-FU
|
11 |
+
5-HIAA
|
12 |
+
5-HT
|
13 |
+
6MP
|
14 |
+
a.a.
|
15 |
+
A1C
|
16 |
+
Aa.
|
17 |
+
AAOx3
|
18 |
+
A/B
|
19 |
+
a.c.
|
20 |
+
AC&BC
|
21 |
+
ad.
|
22 |
+
part.
|
23 |
+
A+E
|
24 |
+
AF-AFP
|
25 |
+
a.h.
|
26 |
+
altern.
|
27 |
+
d.
|
28 |
+
Anti-
|
29 |
+
A&O
|
30 |
+
A/O
|
31 |
+
A&Ox3
|
32 |
+
A&Ox4
|
33 |
+
a.p.
|
34 |
+
A&P
|
35 |
+
A/P
|
36 |
+
applic.
|
37 |
+
aq.
|
38 |
+
bull.
|
39 |
+
calid.
|
40 |
+
dist.
|
41 |
+
gel.
|
42 |
+
ASC-H
|
43 |
+
ASC-US
|
44 |
+
A-T
|
45 |
+
AT-III
|
46 |
+
aur.
|
47 |
+
dextro.
|
48 |
+
aurist.
|
49 |
+
A&W
|
50 |
+
A/W
|
51 |
+
b.i.d.
|
52 |
+
b/l
|
53 |
+
bl.cult
|
54 |
+
B/O
|
55 |
+
BRCA1
|
56 |
+
BRCA2
|
57 |
+
C1
|
58 |
+
C2
|
59 |
+
c/b
|
60 |
+
CBC/DIFF
|
61 |
+
C/C/E
|
62 |
+
CCK-PZ
|
63 |
+
CHEM-7
|
64 |
+
CHEM-20
|
65 |
+
C/O
|
66 |
+
c/o
|
67 |
+
CO2
|
68 |
+
COX-1
|
69 |
+
COX-2
|
70 |
+
COX-3
|
71 |
+
C/S
|
72 |
+
C&S
|
73 |
+
C-section
|
74 |
+
C-spine
|
75 |
+
C-SSRS
|
76 |
+
c/a/p
|
77 |
+
c/w
|
78 |
+
D5
|
79 |
+
D25
|
80 |
+
D4T
|
81 |
+
D5W
|
82 |
+
D&C
|
83 |
+
D/C
|
84 |
+
D&E
|
85 |
+
DHEA-S
|
86 |
+
Di-Di
|
87 |
+
DM2
|
88 |
+
D/O
|
89 |
+
D/T
|
90 |
+
Ex-n
|
91 |
+
F/C
|
92 |
+
F/C/S
|
93 |
+
FEF25–75
|
94 |
+
FEV1
|
95 |
+
fl.oz.
|
96 |
+
FTA-ABS
|
97 |
+
F/U
|
98 |
+
G6PD
|
99 |
+
G-CSF
|
100 |
+
GM-CSF
|
101 |
+
H/A
|
102 |
+
HbA1c
|
103 |
+
HCO3
|
104 |
+
HDL-C
|
105 |
+
H&E
|
106 |
+
H/H
|
107 |
+
H&H
|
108 |
+
H&M
|
109 |
+
HMG-CoA
|
110 |
+
H-mole
|
111 |
+
H/O
|
112 |
+
H&P
|
113 |
+
H/oPI
|
114 |
+
h.s.
|
115 |
+
I131
|
116 |
+
ICD-10
|
117 |
+
I&D
|
118 |
+
IgG4-RD
|
119 |
+
IgG4-RKD
|
120 |
+
IgG4-ROD
|
121 |
+
IgG4-TIN
|
122 |
+
INF(-α/-β/-γ)
|
123 |
+
I&O
|
124 |
+
IV-DSA
|
125 |
+
L&D
|
126 |
+
LDL-C
|
127 |
+
L-DOPA
|
128 |
+
L/S
|
129 |
+
MC&S
|
130 |
+
M/E
|
131 |
+
MgSO4
|
132 |
+
MHA-TP
|
133 |
+
M&M
|
134 |
+
MMR-D
|
135 |
+
Mono-Di
|
136 |
+
Mono-Mono
|
137 |
+
MS-AFP
|
138 |
+
MSO4
|
139 |
+
MVo2
|
140 |
+
No.
|
141 |
+
rep.
|
142 |
+
n.s.
|
143 |
+
n/t
|
144 |
+
N&V
|
145 |
+
n/v
|
146 |
+
O2
|
147 |
+
OB-GYN
|
148 |
+
ob-gyne
|
149 |
+
O/E
|
150 |
+
O/N
|
151 |
+
O&P
|
152 |
+
P&A
|
153 |
+
PAI-1
|
154 |
+
PAPP-A
|
155 |
+
p.c.
|
156 |
+
PIG-A
|
157 |
+
PM&R
|
158 |
+
p.r.
|
159 |
+
Pt.
|
160 |
+
p.v.
|
161 |
+
P-Y
|
162 |
+
q2wk
|
163 |
+
q6h
|
164 |
+
q6°
|
165 |
+
q.a.d.
|
166 |
+
q.AM
|
167 |
+
q.d.
|
168 |
+
q.d.s.
|
169 |
+
q.h.
|
170 |
+
q.h.s.
|
171 |
+
q.i.d.
|
172 |
+
q.l.
|
173 |
+
q.m.t.
|
174 |
+
q.n.
|
175 |
+
q.n.s.
|
176 |
+
q.o.d.
|
177 |
+
q.o.h.
|
178 |
+
q.s.
|
179 |
+
q.v.
|
180 |
+
q.wk.
|
181 |
+
r/g/m
|
182 |
+
R&M
|
183 |
+
R/O
|
184 |
+
r/r/w
|
185 |
+
R/t
|
186 |
+
RT-PCR
|
187 |
+
S1
|
188 |
+
S2
|
189 |
+
S3
|
190 |
+
S4
|
191 |
+
S&O
|
192 |
+
S.D.
|
193 |
+
op.
|
194 |
+
SMA-6
|
195 |
+
SMA-7
|
196 |
+
s/p
|
197 |
+
spp.
|
198 |
+
Sp.
|
199 |
+
fl.
|
200 |
+
gr.
|
201 |
+
S/S
|
202 |
+
S/Sx
|
203 |
+
Staph.
|
204 |
+
Strep.
|
205 |
+
Strepto.
|
206 |
+
T&A
|
207 |
+
T&C
|
208 |
+
T&S
|
209 |
+
TAH-BSO
|
210 |
+
T2DM
|
211 |
+
T/F
|
212 |
+
T&H
|
213 |
+
Tib-Fib
|
214 |
+
TRF'd
|
215 |
+
TSHR-Ab
|
216 |
+
T.S.T.H.
|
217 |
+
U/A
|
218 |
+
U&E
|
219 |
+
U/O
|
220 |
+
V-fib
|
221 |
+
V/Q
|
222 |
+
WAIS-R
|
223 |
+
W/C
|
224 |
+
WISC-R
|
225 |
+
W/O
|
226 |
+
w/o
|
227 |
+
w/u
|
228 |
+
X-AFP
|
229 |
+
y/o
|
230 |
+
a.c.h.s.
|
231 |
+
ac&hs
|
232 |
+
a.d.
|
233 |
+
ad.
|
234 |
+
add.
|
235 |
+
lib.
|
236 |
+
admov.
|
237 |
+
us.
|
238 |
+
æq.
|
239 |
+
agit.
|
240 |
+
alt.
|
241 |
+
d.
|
242 |
+
dieb.
|
243 |
+
h.
|
244 |
+
hor.
|
245 |
+
a.m.
|
246 |
+
amp.
|
247 |
+
com.
|
248 |
+
dest.
|
249 |
+
ferv.
|
250 |
+
a.l.
|
251 |
+
a.s.
|
252 |
+
a.u.
|
253 |
+
b.d.s.
|
254 |
+
bib.
|
255 |
+
b.i.d.
|
256 |
+
b.d.
|
257 |
+
ind.
|
258 |
+
bol.
|
259 |
+
Ph.Br.
|
260 |
+
b.t.
|
261 |
+
bucc.
|
262 |
+
cap.
|
263 |
+
caps.
|
264 |
+
cap.
|
265 |
+
c.m.
|
266 |
+
c.m.s.
|
267 |
+
c.
|
268 |
+
cib.
|
269 |
+
c.c.
|
270 |
+
cf.
|
271 |
+
c.n.
|
272 |
+
cochl.
|
273 |
+
ampl.
|
274 |
+
infant.
|
275 |
+
mag.
|
276 |
+
mod.
|
277 |
+
parv.
|
278 |
+
colet.
|
279 |
+
comp.
|
280 |
+
contin.
|
281 |
+
cpt.
|
282 |
+
cr.
|
283 |
+
cuj.
|
284 |
+
c.v.
|
285 |
+
cyath.
|
286 |
+
vinos.
|
287 |
+
D5LR
|
288 |
+
D5NS
|
289 |
+
D5W
|
290 |
+
D10W
|
291 |
+
D10W
|
292 |
+
D/C
|
293 |
+
decoct.
|
294 |
+
det.
|
295 |
+
dil.
|
296 |
+
dim.
|
297 |
+
p.
|
298 |
+
æ.
|
299 |
+
disp.
|
300 |
+
div.
|
301 |
+
d.t.d.
|
302 |
+
elix.
|
303 |
+
e.m.p.
|
304 |
+
emuls.
|
305 |
+
exhib.
|
306 |
+
f.
|
307 |
+
f.h.
|
308 |
+
fl.
|
309 |
+
fld.
|
310 |
+
f.m.
|
311 |
+
pil.
|
312 |
+
f.s.a.
|
313 |
+
ft.
|
314 |
+
garg.
|
315 |
+
gutt.
|
316 |
+
habt.
|
317 |
+
decub.
|
318 |
+
intermed.
|
319 |
+
tert.
|
320 |
+
inj.
|
321 |
+
i.m.
|
322 |
+
inf.
|
323 |
+
i.v.
|
324 |
+
i.v.p.
|
325 |
+
lat.
|
326 |
+
dol.
|
327 |
+
lb.
|
328 |
+
l.c.d.
|
329 |
+
liq.
|
330 |
+
lot.
|
331 |
+
M.
|
332 |
+
m.
|
333 |
+
max.
|
334 |
+
m.d.u.
|
335 |
+
mg/dL
|
336 |
+
min.
|
337 |
+
mist.
|
338 |
+
mit.
|
339 |
+
mitt.
|
340 |
+
præscript.
|
341 |
+
neb.
|
342 |
+
noct.
|
343 |
+
n.p.o.
|
344 |
+
1/2NS
|
345 |
+
o 2
|
346 |
+
o2
|
347 |
+
o.d.
|
348 |
+
o.m.
|
349 |
+
omn.
|
350 |
+
bih.
|
351 |
+
o.n.
|
352 |
+
o.s.
|
353 |
+
o.u.
|
354 |
+
p.c.h.s.
|
355 |
+
pc&hs
|
356 |
+
Ph.Br.
|
357 |
+
Ph.Eur.
|
358 |
+
Ph.Int.
|
359 |
+
pig./pigm.
|
360 |
+
p.m.
|
361 |
+
p.o.
|
362 |
+
ppt.
|
363 |
+
p.r.
|
364 |
+
p.r.n.
|
365 |
+
pt.
|
366 |
+
pulv.
|
367 |
+
p.v.
|
368 |
+
q.1
|
369 |
+
q.1°
|
370 |
+
q4PM
|
371 |
+
q.a.m.
|
372 |
+
q.d./q.1.d.
|
373 |
+
q.d.a.m.
|
374 |
+
q.d.p.m.
|
375 |
+
q.p.m.
|
376 |
+
q.q.
|
377 |
+
q.q.h.
|
378 |
+
a.d
|
379 |
+
rep.
|
380 |
+
rept.
|
381 |
+
R/L
|
382 |
+
s.
|
383 |
+
s.a.
|
384 |
+
sem.
|
385 |
+
s.i.d.
|
386 |
+
sig.
|
387 |
+
sing.
|
388 |
+
s.l.
|
389 |
+
sol.
|
390 |
+
s.o.s.
|
391 |
+
s.s.
|
392 |
+
st.
|
393 |
+
sum.
|
394 |
+
supp.
|
395 |
+
susp.
|
396 |
+
syr.
|
397 |
+
tab.
|
398 |
+
tal.
|
399 |
+
t.
|
400 |
+
t.d.s.
|
401 |
+
t.i.d.
|
402 |
+
t.d.
|
403 |
+
tinct.
|
404 |
+
t.i.w.
|
405 |
+
top.
|
406 |
+
tinc.
|
407 |
+
trit.
|
408 |
+
troch.
|
409 |
+
u.d.
|
410 |
+
ut.
|
411 |
+
dict.
|
412 |
+
ung.
|
413 |
+
vag.
|
414 |
+
w/a
|
415 |
+
w/f
|
416 |
+
y.o.
|
417 |
+
ADD-RT
|
418 |
+
A-T
|
419 |
+
PDD-NOS
|
420 |
+
Alzheimer's
|
421 |
+
Age-related
|
422 |
+
Aldosterone-producing
|
423 |
+
Alcohol-related
|
424 |
+
Ataxia-telangiectasia
|
425 |
+
Binswanger's
|
426 |
+
Becker's
|
427 |
+
Bloom's
|
428 |
+
Brown-Séquard
|
429 |
+
Crimean-Congo
|
430 |
+
Cerebro-oculo-facio-skeletal
|
431 |
+
Carbapenem-resistant
|
432 |
+
Drug-resistant
|
433 |
+
End-stage
|
434 |
+
Graft-versus-host
|
435 |
+
Huntington's
|
436 |
+
High-functioning
|
437 |
+
Hypoxanthine-guanine
|
438 |
+
Legionnaires'
|
439 |
+
Low-functioning
|
440 |
+
Multi-drug-resistant
|
441 |
+
Multi-infarct
|
442 |
+
Machado-Joseph
|
443 |
+
Maturity-onset
|
444 |
+
Multi-sensory
|
445 |
+
Obsessive-compulsive
|
446 |
+
Parkinson's
|
447 |
+
kinase-associated
|
448 |
+
Post-polio
|
449 |
+
Port-wine
|
450 |
+
Reye's
|
451 |
+
Sensory-based
|
452 |
+
Vitus's
|
453 |
+
Septo-optic
|
454 |
+
ST-elevation
|
455 |
+
Short-lasting
|
456 |
+
Urticaria-deafness-amyloidosis
|
457 |
+
Wilson's
|
458 |
+
drug-resistant
|
459 |
+
X-linked
|
ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py
ADDED
@@ -0,0 +1,73 @@
1 |
+
import re
|
2 |
+
import spacy
|
3 |
+
from spacy.symbols import ORTH
|
4 |
+
from .spacy_tokenizer import SpacyTokenizer
|
5 |
+
from .utils import DateRegex, CleanRegex, ClinicalRegex
|
6 |
+
|
7 |
+
|
8 |
+
class ClinicalSpacyTokenizer(SpacyTokenizer):
|
9 |
+
"""
|
10 |
+
This class is used to read text and return the tokens
|
11 |
+
present in the text (and their start and end positions)
|
12 |
+
"""
|
13 |
+
|
14 |
+
def __init__(self, spacy_model, abbreviations,
|
15 |
+
split_multiple=True, split_temperature=True,
|
16 |
+
split_percentage=True):
|
17 |
+
"""
|
18 |
+
Initialize a spacy model to read text and split it into
|
19 |
+
tokens.
|
20 |
+
Args:
|
21 |
+
spacy_model (str): Name of the spacy model
|
22 |
+
"""
|
23 |
+
super().__init__(spacy_model)
|
24 |
+
self._nlp.tokenizer.prefix_search = self.__get_prefix_regex(split_multiple, split_temperature,
|
25 |
+
split_percentage).search
|
26 |
+
self._nlp.tokenizer.infix_finditer = self.__get_infix_regex().finditer
|
27 |
+
self._nlp.tokenizer.suffix_search = self.__get_suffix_regex().search
|
28 |
+
new_rules = {}
|
29 |
+
for orth, exc in self._nlp.tokenizer.rules.items():
|
30 |
+
if re.search('((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[.]$)|(^(W|w)ed$)', orth):
|
31 |
+
continue
|
32 |
+
new_rules[orth] = exc
|
33 |
+
self._nlp.tokenizer.rules = new_rules
|
34 |
+
if abbreviations is not None:
|
35 |
+
for abbreviation in abbreviations:
|
36 |
+
special_case = [{ORTH: abbreviation}]
|
37 |
+
self._nlp.tokenizer.add_special_case(abbreviation, special_case)
|
38 |
+
# this matches any lower case tokens - abstract this part out - whether to lowercase abbreviations or not
|
39 |
+
exclusions_uncased = {abbreviation.lower(): [{ORTH: abbreviation.lower()}] for abbreviation in
|
40 |
+
abbreviations}
|
41 |
+
for k, excl in exclusions_uncased.items():
|
42 |
+
try:
|
43 |
+
self._nlp.tokenizer.add_special_case(k, excl)
|
44 |
+
except Exception:
|
45 |
+
print('failed to add exception: {}'.format(k))
|
46 |
+
|
47 |
+
def __get_prefix_regex(self, split_multiple, split_temperature, split_percentage):
|
48 |
+
|
49 |
+
date_prefix = DateRegex.get_infixes()
|
50 |
+
clinical_prefix = ClinicalRegex.get_prefixes(split_multiple, split_temperature, split_percentage)
|
51 |
+
clean_prefix = CleanRegex.get_prefixes()
|
52 |
+
digit_infix = ClinicalRegex.get_infixes()
|
53 |
+
prefixes = clean_prefix + self._nlp.Defaults.prefixes + date_prefix + clinical_prefix + digit_infix
|
54 |
+
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
|
55 |
+
return prefix_regex
|
56 |
+
|
57 |
+
def __get_suffix_regex(self):
|
58 |
+
clean_suffix = CleanRegex.get_suffixes()
|
59 |
+
suffixes = clean_suffix + self._nlp.Defaults.suffixes
|
60 |
+
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
|
61 |
+
return suffix_regex
|
62 |
+
|
63 |
+
def __get_infix_regex(self):
|
64 |
+
|
65 |
+
date_infixes = DateRegex.get_infixes()
|
66 |
+
clean_infixes = CleanRegex.get_infixes()
|
67 |
+
digit_infix = ClinicalRegex.get_infixes()
|
68 |
+
infixes = self._nlp.Defaults.infixes + date_infixes + clean_infixes
|
69 |
+
infix_re = spacy.util.compile_infix_regex(infixes)
|
70 |
+
return infix_re
|
71 |
+
|
72 |
+
def get_nlp(self):
|
73 |
+
return self._nlp
|
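A hedged sketch of building the clinical tokenizer with the abbreviation lists added in this commit (not part of the committed code); it assumes the scispacy model en_core_sci_lg is installed and that the relative file paths below are resolved from the repository root. Multi-word entries are skipped here because spacy special cases cannot contain whitespace.

from ner_datasets.preprocessing.tokenizers import ClinicalSpacyTokenizer

abbreviation_files = [
    'ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt',
    'ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt',
]
abbreviations = []
for path in abbreviation_files:
    with open(path, encoding='utf-8') as handle:
        # Skip blank lines and multi-word entries such as 'D and C'.
        abbreviations.extend(line.strip() for line in handle if line.strip() and ' ' not in line.strip())

tokenizer = ClinicalSpacyTokenizer(spacy_model='en_core_sci_lg', abbreviations=abbreviations)
# Abbreviations such as 's/p' or 'Dr.' stay as single tokens instead of being split.
for token in tokenizer.get_tokens('75yo man s/p CABG, seen by Dr. Smith'):
    print(token)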
ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py
ADDED
@@ -0,0 +1,58 @@
1 |
+
import json
|
2 |
+
from typing import Iterable, Mapping, Dict, Union
|
3 |
+
|
4 |
+
from pycorenlp import StanfordCoreNLP
|
5 |
+
|
6 |
+
|
7 |
+
class CoreNLPTokenizer(object):
|
8 |
+
"""
|
9 |
+
This class is used to read text and return the tokens
|
10 |
+
present in the text (and their start and end positions)
|
11 |
+
using core nlp tokenization
|
12 |
+
"""
|
13 |
+
|
14 |
+
def __init__(self, port: int = 9000):
|
15 |
+
"""
|
16 |
+
Initialize a core nlp server to read text and split it into
|
17 |
+
tokens using the core nlp annotators
|
18 |
+
Args:
|
19 |
+
port (int): The port to run the server on
|
20 |
+
"""
|
21 |
+
self._core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(port))
|
22 |
+
|
23 |
+
def get_stanford_annotations(self, text: str, annotators: str = 'tokenize,ssplit,pos,lemma') -> Dict:
|
24 |
+
"""
|
25 |
+
Use the core nlp server to annotate the text and return the
|
26 |
+
results as a json object
|
27 |
+
Args:
|
28 |
+
text (str): The text to annotate
|
29 |
+
annotators (str): The core nlp annotations to run on the text
|
30 |
+
Returns:
|
31 |
+
output (Dict): The core nlp results
|
32 |
+
"""
|
33 |
+
output = self._core_nlp.annotate(text, properties={
|
34 |
+
"timeout": "50000",
|
35 |
+
"ssplit.newlineIsSentenceBreak": "two",
|
36 |
+
'annotators': annotators,
|
37 |
+
'outputFormat': 'json'
|
38 |
+
})
|
39 |
+
if type(output) is str:
|
40 |
+
output = json.loads(output, strict=False)
|
41 |
+
return output
|
42 |
+
|
43 |
+
def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
44 |
+
"""
|
45 |
+
Return an iterable that iterates through the tokens in the text
|
46 |
+
Args:
|
47 |
+
text (str): The text to annotate
|
48 |
+
Returns:
|
49 |
+
(Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
|
50 |
+
the start position of the token in the entire text
|
51 |
+
and the end position of the token in the entire text
|
52 |
+
"""
|
53 |
+
stanford_output = self.get_stanford_annotations(text)
|
54 |
+
for sentence in stanford_output['sentences']:
|
55 |
+
for token in sentence['tokens']:
|
56 |
+
yield {'text': token['originalText'],
|
57 |
+
'start': token['characterOffsetBegin'],
|
58 |
+
'end': token['characterOffsetEnd']}
|
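A sketch of calling the CoreNLP-backed tokenizer (not part of the commit); it assumes pycorenlp is installed and that a Stanford CoreNLP server is already listening on localhost:9000, started separately.

from ner_datasets.preprocessing.tokenizers import CoreNLPTokenizer

tokenizer = CoreNLPTokenizer(port=9000)
for token in tokenizer.get_tokens('Seen in clinic on 01/02/2020.'):
    # Offsets are the character positions reported by CoreNLP for the original string.
    print(token['text'], token['start'], token['end'])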
ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py
ADDED
@@ -0,0 +1,49 @@
1 |
+
import spacy
|
2 |
+
from typing import Tuple, Iterable, Mapping, Dict, Union
|
3 |
+
|
4 |
+
|
5 |
+
class SpacyTokenizer(object):
|
6 |
+
"""
|
7 |
+
This class is used to read text and return the tokens
|
8 |
+
present in the text (and their start and end positions)
|
9 |
+
using spacy
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self, spacy_model: str):
|
13 |
+
"""
|
14 |
+
Initialize a spacy model to read text and split it into
|
15 |
+
tokens.
|
16 |
+
Args:
|
17 |
+
spacy_model (str): Name of the spacy model
|
18 |
+
"""
|
19 |
+
self._nlp = spacy.load(spacy_model)
|
20 |
+
|
21 |
+
@staticmethod
|
22 |
+
def __get_start_and_end_offset(token: spacy.tokens.Token) -> Tuple[int, int]:
|
23 |
+
"""
|
24 |
+
Return the start position of the token in the entire text
|
25 |
+
and the end position of the token in the entire text
|
26 |
+
Args:
|
27 |
+
token (spacy.tokens.Token): The spacy token object
|
28 |
+
Returns:
|
29 |
+
start (int): the start position of the token in the entire text
|
30 |
+
end (int): the end position of the token in the entire text
|
31 |
+
"""
|
32 |
+
start = token.idx
|
33 |
+
end = start + len(token)
|
34 |
+
return start, end
|
35 |
+
|
36 |
+
def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
37 |
+
"""
|
38 |
+
Return an iterable that iterates through the tokens in the text
|
39 |
+
Args:
|
40 |
+
text (str): The text to annotate
|
41 |
+
Returns:
|
42 |
+
(Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
|
43 |
+
the start position of the token in the entire text
|
44 |
+
and the end position of the token in the entire text
|
45 |
+
"""
|
46 |
+
document = self._nlp(text)
|
47 |
+
for token in document:
|
48 |
+
start, end = SpacyTokenizer.__get_start_and_end_offset(token)
|
49 |
+
yield {'text': token.text, 'start': start, 'end': end}
|
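A quick sketch of SpacyTokenizer (not part of the commit), assuming en_core_web_sm is installed; the returned offsets slice back to the token text.

from ner_datasets.preprocessing.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(spacy_model='en_core_web_sm')
text = 'BP 120/80, HR 72.'
for token in tokenizer.get_tokens(text):
    # The start/end offsets recover the token text from the original string.
    assert text[token['start']:token['end']] == token['text']
    print(token)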
ner_datasets/preprocessing/tokenizers/utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
1 |
+
from .date_regex import DateRegex
|
2 |
+
from .clean_regex import CleanRegex
|
3 |
+
from .clinical_regex import ClinicalRegex
|
4 |
+
__all__=["DateRegex", "CleanRegex", "ClinicalRegex"]
|
ner_datasets/preprocessing/tokenizers/utils/clean_regex.py
ADDED
@@ -0,0 +1,64 @@
1 |
+
from typing import List
|
2 |
+
class CleanRegex(object):
|
3 |
+
"""
|
4 |
+
This class is used to define the regexes that will be used by the
|
5 |
+
spacy tokenizer rules. Mainly the regexes are used to clean up
|
6 |
+
tokens that have unwanted characters (e.g extra hyphens).
|
7 |
+
"""
|
8 |
+
#Staff - 3
|
9 |
+
#Hosp - 4, 5
|
10 |
+
#Loc - 2
|
11 |
+
@staticmethod
|
12 |
+
def get_prefixes() -> List[str]:
|
13 |
+
"""
|
14 |
+
This function is used to build the regex that will clean up dirty characters
|
15 |
+
present at the prefix position (start position) of a token. For example the token ---clean
|
16 |
+
has three hyphens that need to be split from the word clean. This regex
|
17 |
+
will be used by spacy to clean it up. This rule treats any character that is
|
18 |
+
not a letter or a digit as a dirty character
|
19 |
+
Examples: ----------------9/36, :63, -ESH
|
20 |
+
Returns:
|
21 |
+
(list): List of regexes to clean the prefix of the token
|
22 |
+
"""
|
23 |
+
#Handles case 5 of HOSP
|
24 |
+
return ['((?P<prefix>([^a-zA-Z0-9.]))(?P=prefix)*)', '([.])(?!\d+(\W+|$))']
|
25 |
+
|
26 |
+
@staticmethod
|
27 |
+
def get_suffixes() -> List[str]:
|
28 |
+
"""
|
29 |
+
This function is used to build the regex that will clean up dirty characters
|
30 |
+
present at the suffix position (end position) of a token. For example the token clean---
|
31 |
+
has three hyphens that need to be split from the word clean. This regex
|
32 |
+
will be used by spacy to clean it up. This rule treats any character that is
|
33 |
+
not a letter or a digit as a dirty character
|
34 |
+
Examples: FRANK^, regimen---------------, no)
|
35 |
+
Returns:
|
36 |
+
(list): List of regexes to clean the suffix of the token
|
37 |
+
"""
|
38 |
+
return ['((?P<suffix>([^a-zA-Z0-9]))(?P=suffix)*)']
|
39 |
+
|
40 |
+
@staticmethod
|
41 |
+
def get_infixes() -> List[str]:
|
42 |
+
"""
|
43 |
+
This function is used to build the regex that will clean up dirty characters
|
44 |
+
present at the infix position (in-between position) of a token. For example the token
|
45 |
+
clean---me has three hyphens that need to be split from the word clean and me. This regex
|
46 |
+
will be used by spacy to clean it up. This rule treats any character that is
|
47 |
+
not a letter or a digit as a dirty character
|
48 |
+
Examples: FRANK^08/30/76^UNDERWOOD, regimen---------------1/37
|
49 |
+
Returns:
|
50 |
+
(list): List of regexes to clean the infix of the token
|
51 |
+
"""
|
52 |
+
#Handles case 3 of STAFF
|
53 |
+
#Handles case 4 of HOSP
|
54 |
+
#Handles case 2 of LOC
|
55 |
+
connector_clean = '\^|;|&#|([\(\)\[\]:="])'
|
56 |
+
#full_stop_clean = '(?<=[a-zA-Z])(\.)(?=([A-Z][A-Za-z]+)|[^a-zA-Z0-9_.]+)'
|
57 |
+
bracket_comma_clean = '(((?<=\d)[,)(](?=[a-zA-Z]+))|((?<=[a-zA-Z])[,)(](?=\w+)))'
|
58 |
+
#special_char_clean = '(?<=[a-zA-Z])(\W{3,}|[_]{3,})(?=[A-Za-z]+)'
|
59 |
+
special_char_clean = '(?<=[a-zA-Z])([_\W_]{3,})(?=[A-Za-z]+)'
|
60 |
+
#Sometimes when there is no space between a period and a comma - it becomes part of the same token
|
61 |
+
#e.g John.,M.D - we need to split this up.
|
62 |
+
comma_period_clean = '(?<=[a-zA-Z])(\.,)(?=[A-Za-z]+)'
|
63 |
+
|
64 |
+
return [connector_clean, bracket_comma_clean, special_char_clean, comma_period_clean]
|
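To make the prefix/suffix behaviour concrete, a standalone sketch (not part of the commit) that applies the raw CleanRegex patterns with re; in the pipeline they are compiled into the spacy tokenizer together with spacy's default rules rather than used directly.

import re
from ner_datasets.preprocessing.tokenizers.utils import CleanRegex

prefix_pattern = re.compile('|'.join(CleanRegex.get_prefixes()))
suffix_pattern = re.compile('|'.join(CleanRegex.get_suffixes()))

print(prefix_pattern.match('---clean').group())  # '---' is peeled off the front of the token
print(suffix_pattern.match('^^^').group())       # a run of identical symbol characters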
ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py
ADDED
@@ -0,0 +1,309 @@
1 |
+
from typing import List
|
2 |
+
class ClinicalRegex(object):
|
3 |
+
"""
|
4 |
+
This class is used to define the regexes that will be used by the
|
5 |
+
spacy tokenizer rules. Mainly the regexes are used to clean up
|
6 |
+
tokens that have unwanted characters and typos (e.g missing spaces).
|
7 |
+
In the descriptions when we mention symbol we refer to any character
|
8 |
+
that is not a letter or a digit or underscore. The spacy tokenizer splits
|
9 |
+
the text by whitespace and applies these rules (along with some default rules)
|
10 |
+
to the individual tokens.
|
11 |
+
"""
|
12 |
+
#Patient - 2, 3, 5
|
13 |
+
#Staff - 1, 2
|
14 |
+
#Hosp - 2, 3
|
15 |
+
#Loc - 1, 3
|
16 |
+
@staticmethod
|
17 |
+
def get_word_typo_prefix():
|
18 |
+
"""
|
19 |
+
If token contains a typo. What we mean by a typo is when two tokens
|
20 |
+
that should be separate tokens are fused into one token because there
|
21 |
+
is a missing space.
|
22 |
+
Examples: JohnMarital Status - John is the name that is fused into the
|
23 |
+
token Marital because of a missing space.
|
24 |
+
The regex checks if we have a sequence of characters followed by another
|
25 |
+
sequence of characters that starts with a capital letter, followed by two or
|
26 |
+
more small letters, we assume this is a typo and split the tokens (two sequences) up.
|
27 |
+
If there is a symbol separating the two sequences, we ease the condition saying
|
28 |
+
the capital letter can be followed by two or more capital/small letters.
|
29 |
+
Returns:
|
30 |
+
(str): regex to clean tokens that are fused because of a missing space
|
31 |
+
"""
|
32 |
+
#Handles cases 2 of PATIENT
|
33 |
+
#Handles cases 1 & 2 of STAFF
|
34 |
+
#Handles cases 2 & 3 of HOSP
|
35 |
+
#Handles cases 1 & 3 of LOC
|
36 |
+
#'(([a-z]+)|([A-Z]+)|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
|
37 |
+
return '(([a-z]+)|([A-Z]{2,})|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
|
38 |
+
|
39 |
+
@staticmethod
|
40 |
+
def get_word_symbol_digit_prefix() -> str:
|
41 |
+
"""
|
42 |
+
If text is followed by one or more symbols and then followed by one or more digits
|
43 |
+
we make the assumption that the text is a separate token. Spacy will use this regex
|
44 |
+
to extract the text portion as one token and will then move on to
|
45 |
+
process the rest (symbol and tokens) based on the defined rules.
|
46 |
+
Examples: Yang(4986231) - "Yang" will become a separate token & "(4986231)" will
|
47 |
+
be processed as new token
|
48 |
+
Returns:
|
49 |
+
(str): regex to clean text followed by symbols followed by digits
|
50 |
+
"""
|
51 |
+
#Handles cases 3 & 5 of patient
|
52 |
+
return '([a-zA-Z]+)(?=\W+\d+)'
|
53 |
+
|
54 |
+
@staticmethod
|
55 |
+
def get_multiple_prefix(split_multiple: bool) -> str:
|
56 |
+
"""
|
57 |
+
If text is of the format take it x2 times, this function
|
58 |
+
can be used to treat the entire thing as one token or
|
59 |
+
split into two separate tokens
|
60 |
+
Args:
|
61 |
+
split_multiple (bool): whether to treat it as one token or split them up
|
62 |
+
Returns:
|
63 |
+
(str): regex to either keep as one token or split into two
|
64 |
+
"""
|
65 |
+
if(split_multiple):
|
66 |
+
return '([x])(?=(\d{1,2}$))'
|
67 |
+
else:
|
68 |
+
return '[x]\d{1,2}$'
|
69 |
+
|
70 |
+
@staticmethod
|
71 |
+
def get_pager_prefix():
|
72 |
+
return '([pXxPb])(?=(\d{4,}|\d+[-]\d+))'
|
73 |
+
|
74 |
+
@staticmethod
|
75 |
+
def get_age_word_prefix():
|
76 |
+
return '([MFmf])(?=\d{2,3}(\W+|$))'
|
77 |
+
|
78 |
+
@staticmethod
|
79 |
+
def get_id_prefix():
|
80 |
+
return '(ID|id|Id)(?=\d{3,})'
|
81 |
+
|
82 |
+
@staticmethod
|
83 |
+
def get_word_period_prefix():
|
84 |
+
return '((cf|CF|Cf|dr|DR|Dr|ft|FT|Ft|lt|LT|Lt|mr|MR|Mr|ms|MS|Ms|mt|MT|Mt|mx|MX|Mx|ph|PH|Ph|rd|RD|Rd|st|ST|St|vs|VS|Vs|wm|WM|Wm|[A-Za-z]{1})[.])(?=((\W+|$)))'
|
85 |
+
|
86 |
+
@staticmethod
|
87 |
+
def get_chemical_prefix():
|
88 |
+
#Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
|
89 |
+
"""
|
90 |
+
There are certain chemicals, vitamins etc that should not be split. They
|
91 |
+
should be kept as a single token - for example the token "B12" in
|
92 |
+
"Vitamin B12". This regex checks if there is a single capital letter
|
93 |
+
followed by some digits (there can be a hyphen in between those digits)
|
94 |
+
then this most likely represents a token that should not be split
|
95 |
+
Returns:
|
96 |
+
(str): regex to keep vitamin/chemical names as a single token
|
97 |
+
"""
|
98 |
+
#return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
|
99 |
+
return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
|
100 |
+
|
101 |
+
@staticmethod
|
102 |
+
def get_chemical_prefix_small():
|
103 |
+
#Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
|
104 |
+
"""
|
105 |
+
There are certain chemicals, vitamins etc that should not be split. They
|
106 |
+
should be kept as a single token - for example the token "B12" in
|
107 |
+
"Vitamin B12". This regex checks if there is a single capital letter
|
108 |
+
followed by some digits (there can be a hyphen in between those digits)
|
109 |
+
then this most likely represents a token that should not be split
|
110 |
+
Returns:
|
111 |
+
(str): regex to keep vitamin/chemical names as a single token
|
112 |
+
"""
|
113 |
+
#return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
|
114 |
+
return '((\d)?[a-eg-ln-oq-wyz]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
|
115 |
+
|
116 |
+
@staticmethod
|
117 |
+
def get_instrument_prefix():
|
118 |
+
"""
|
119 |
+
There are cases when there are tokens like L1-L2-L3, we want to keep these as one
|
120 |
+
single token. This regex checks for a capital letter followed by digits, repeated with the same separator
|
121 |
+
Returns:
|
122 |
+
(str): regex to keep instrument/code names like L1-L2-L3 as a single token
|
123 |
+
"""
|
124 |
+
return '([A-Z]{1,2}\d+(?P<instrument>[-:]+)[A-Z]{1,2}\d+((?P=instrument)[A-Z]{1,2}\d+)*)'
|
125 |
+
|
126 |
+
@staticmethod
|
127 |
+
def get_instrument_prefix_small():
|
128 |
+
"""
|
129 |
+
There are cases when there are tokens like L1-L2-L3, we want to keep these as one
|
130 |
+
single token. This regex checks for a lowercase letter followed by digits, repeated with the same separator
|
131 |
+
Returns:
|
132 |
+
(str): regex to keep instrument/code names like l1-l2-l3 as a single token
|
133 |
+
"""
|
134 |
+
return '([a-z]{1,2}\d+(?P<instrument_small>[-:]+)[a-z]{1,2}\d+((?P=instrument_small)[a-z]{1,2}\d+)*)'
|
135 |
+
|
136 |
+
#Handles Case 3, 4, 5 of MRN
|
137 |
+
#Handles Case 1, 2, 3 of PHONE
|
138 |
+
#Handles Case 7, 10 of AGE
|
139 |
+
#Handles Case 1 of IDNUM
|
140 |
+
#Handles Case 3, 5 of PATIENT
|
141 |
+
#Handles Case 7 of HOSP
|
142 |
+
#Handles Case 1 of General
|
143 |
+
@staticmethod
|
144 |
+
def get_age_typo_prefix():
|
145 |
+
"""
|
146 |
+
There are cases when there is no space between the text and the age
|
147 |
+
Example: Plan88yo - we want Plan to be a separate token
|
148 |
+
Returns:
|
149 |
+
(str):
|
150 |
+
"""
|
151 |
+
age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
|
152 |
+
'|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
|
153 |
+
'(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
|
154 |
+
return '([a-zA-Z]+)(?=((\d{1,3})' + age_suffix + '$))'
|
155 |
+
|
156 |
+
@staticmethod
|
157 |
+
def get_word_digit_split_prefix():
|
158 |
+
#Word followed by more than 3 digits - might not be part of the same token
|
159 |
+
#and could be a typo
|
160 |
+
#This need not be true - maybe we have an id like BFPI980801 - this will be split
|
161 |
+
#BFPI 980801 - but it might be okay to split - need to check
|
162 |
+
#([A-Z][a-z]{2,})(?=\d+)
|
163 |
+
return '([A-Z][a-z]{2,})(?=[A-Za-z]*\d+)'
|
164 |
+
|
165 |
+
@staticmethod
|
166 |
+
def get_word_digit_mix_prefix():
|
167 |
+
#Mix of letters and characters - most likely a typo if the
|
168 |
+
#following characters is a capital letter followed by more than
|
169 |
+
#2 small letters
|
170 |
+
#return '([A-Z]+\d+([A-Z]+(?!([a-z]{2,}))))(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
|
171 |
+
return '([A-Z]+\d+)(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
|
172 |
+
|
173 |
+
@staticmethod
|
174 |
+
def get_word_digit_mix_prefix_small():
|
175 |
+
#Mix of letters and characters - most likely a typo if the
|
176 |
+
#following characters is a capital letter followed by more than
|
177 |
+
#2 small letters
|
178 |
+
return '([a-z]+\d+)(?=(\W+|[A-Z][a-z]{2,}|[A-Z]{3,}))'
|
179 |
+
|
180 |
+
@staticmethod
|
181 |
+
def get_word_id_split_prefix():
|
182 |
+
return '([a-zA-Z]+)(?=(\d+[-./]+(\d+|$)))'
|
183 |
+
|
184 |
+
@staticmethod
|
185 |
+
def get_word_section_prefix():
|
186 |
+
#Fix JOHNID/CC - missing space from previous section - JOHN
|
187 |
+
return '([A-Za-z]+)(?=(((?P<slash>[/:]+)[A-Za-z]+)((?P=slash)[A-Za-z]+)*\W+\d+))'
|
188 |
+
|
189 |
+
@staticmethod
|
190 |
+
def get_colon_prefix():
|
191 |
+
#Split tokens before and after the token
|
192 |
+
#Does not split time - we make sure the token before the colon
|
193 |
+
#starts with a letter.
|
194 |
+
#Splits patterns like <CHAR 1>:<CHAR 2> where CHAR 1 starts with a
|
195 |
+
#letter and is followed by one more letters/digits
|
196 |
+
#CHAR 2 is a combination of letters/digits of length greater than 2
|
197 |
+
#This wont split time, but assumes that when the colon is present
|
198 |
+
#the entities on either side of the token are different tokens
|
199 |
+
#A:9 - not split - more likely this makes sense as a single token (could be a chemical)
|
200 |
+
return '([A-Za-z][A-Za-z0-9]+)(?=([:][A-Za-z0-9]{2,}))'
|
201 |
+
|
202 |
+
@staticmethod
|
203 |
+
def get_temperature_prefix(split_temperature):
|
204 |
+
if(split_temperature):
|
205 |
+
return '((\d+)|(\d+[.]\d+))(?=(\u00B0([FCK]{1}|$)))'
|
206 |
+
else:
|
207 |
+
return '(((\d+)|(\d+[.]\d+))\u00B0([FCK]{1}|$))|(\u00A9[FCK]{1})'
|
208 |
+
|
209 |
+
@staticmethod
|
210 |
+
def get_percentage_prefix(split_percentage):
|
211 |
+
"""
|
212 |
+
If text is of the format take it 20% times, this function
|
213 |
+
can be used to treat the entire thing as one token or
|
214 |
+
split into two separate tokens
|
215 |
+
Args:
|
216 |
+
split_percentage (bool): whether to treat it as one token or split them up
|
217 |
+
Returns:
|
218 |
+
(str): regex to either keep as one token or split into two
|
219 |
+
"""
|
220 |
+
if(split_percentage):
|
221 |
+
return '(((\d+)|(\d+[.]\d+)))(?=(%(\W+|$)))'
|
222 |
+
else:
|
223 |
+
return '(((\d+)|(\d+[.]\d+))%(\W+|$))'
|
224 |
+
|
225 |
+
@staticmethod
|
226 |
+
def get_value_range_prefixes():
|
227 |
+
#The following regex might not work on .4-.5 - no number before decimal point
|
228 |
+
#need to figure this out without breaking anything else
|
229 |
+
value_range_1 = '(\d{1})(?=([-]((\d{1,2}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
230 |
+
value_range_2 = '(\d{2})(?=([-]((\d{2,3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
231 |
+
value_range_3 = '(\d{3})(?=([-]((\d{3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
232 |
+
return value_range_1, value_range_2, value_range_3
|
233 |
+
|
234 |
+
@staticmethod
|
235 |
+
def get_year_range_prefix():
|
236 |
+
return '(\d{4})(?=([-](\d{4})([a-zA-Z]+|[\W]*$)))'
|
237 |
+
|
238 |
+
@staticmethod
|
239 |
+
def get_short_digit_id_prefix():
|
240 |
+
#4A, 3C etc
|
241 |
+
return '(\d{1,2}[A-EG-LN-WZ]{1}(?=(\W+|$)))'
|
242 |
+
|
243 |
+
#Handles Case 1, 2 of MRN
|
244 |
+
#Handles Case 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19 of AGE
|
245 |
+
#Handles Case 2, 3, 5 of IDNUM
|
246 |
+
#Handles Case 1 of HOSP
|
247 |
+
@staticmethod
|
248 |
+
def get_digit_symbol_word_prefix():
|
249 |
+
return '((\d+)|(\d+[.]\d+))(?=\W+[a-zA-Z]+)'
|
250 |
+
|
251 |
+
@staticmethod
|
252 |
+
def get_digit_age_split_prefix():
|
253 |
+
age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
|
254 |
+
'|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
|
255 |
+
'(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
|
256 |
+
return '((\d{1,3}))(?=(' + age_suffix + '\W*$))'
|
257 |
+
|
258 |
+
@staticmethod
|
259 |
+
def get_digit_word_short_prefix():
|
260 |
+
return '((\d+)|(\d+[.]\d+))([a-z]{1,2}|[A-Z]{1,2})(?=(\W*$))'
|
261 |
+
|
262 |
+
@staticmethod
|
263 |
+
def get_digit_word_typo_prefix():
|
264 |
+
return '((\d+)|(\d+[.]\d+))(?=[a-zA-Z]{1}[a-zA-Z\W]+)'
|
265 |
+
|
266 |
+
@staticmethod
|
267 |
+
def get_prefixes(split_multiple, split_temperature, split_percentage):
|
268 |
+
word_typo_prefix = ClinicalRegex.get_word_typo_prefix()
|
269 |
+
word_symbol_digit_prefix = ClinicalRegex.get_word_symbol_digit_prefix()
|
270 |
+
pager_prefix = ClinicalRegex.get_pager_prefix()
|
271 |
+
age_word_prefix = ClinicalRegex.get_age_word_prefix()
|
272 |
+
word_period_prefix = ClinicalRegex.get_word_period_prefix()
|
273 |
+
id_prefix = ClinicalRegex.get_id_prefix()
|
274 |
+
multiple_prefix = ClinicalRegex.get_multiple_prefix(split_multiple)
|
275 |
+
chemical_prefix = ClinicalRegex.get_chemical_prefix()
|
276 |
+
chemical_prefix_small = ClinicalRegex.get_chemical_prefix_small()
|
277 |
+
instrument_prefix = ClinicalRegex.get_instrument_prefix()
|
278 |
+
instrument_prefix_small = ClinicalRegex.get_instrument_prefix_small()
|
279 |
+
age_typo_prefix = ClinicalRegex.get_age_typo_prefix()
|
280 |
+
word_digit_split_prefix = ClinicalRegex.get_word_digit_split_prefix()
|
281 |
+
word_digit_mix_prefix = ClinicalRegex.get_word_digit_mix_prefix()
|
282 |
+
word_digit_mix_prefix_small = ClinicalRegex.get_word_digit_mix_prefix_small()
|
283 |
+
word_id_split_prefix = ClinicalRegex.get_word_id_split_prefix()
|
284 |
+
word_section_prefix = ClinicalRegex.get_word_section_prefix()
|
285 |
+
colon_prefix = ClinicalRegex.get_colon_prefix()
|
286 |
+
temperature_prefix = ClinicalRegex.get_temperature_prefix(split_temperature)
|
287 |
+
percentage_prefix = ClinicalRegex.get_percentage_prefix(split_percentage)
|
288 |
+
value_range_1, value_range_2, value_range_3 = ClinicalRegex.get_value_range_prefixes()
|
289 |
+
year_range_prefix = ClinicalRegex.get_year_range_prefix()
|
290 |
+
short_digit_id_prefix = ClinicalRegex.get_short_digit_id_prefix()
|
291 |
+
digit_symbol_word_prefix = ClinicalRegex.get_digit_symbol_word_prefix()
|
292 |
+
digit_age_split_prefix = ClinicalRegex.get_digit_age_split_prefix()
|
293 |
+
digit_word_short_prefix = ClinicalRegex.get_digit_word_short_prefix()
|
294 |
+
digit_word_typo_prefix = ClinicalRegex.get_digit_word_typo_prefix()
|
295 |
+
|
296 |
+
return [word_typo_prefix, word_symbol_digit_prefix, pager_prefix, age_word_prefix,\
|
297 |
+
word_period_prefix, id_prefix, multiple_prefix, chemical_prefix, chemical_prefix_small,\
|
298 |
+
instrument_prefix, instrument_prefix_small, age_typo_prefix, word_digit_split_prefix,\
|
299 |
+
word_id_split_prefix, word_digit_mix_prefix, word_digit_mix_prefix_small, \
|
300 |
+
word_section_prefix, colon_prefix, temperature_prefix,\
|
301 |
+
percentage_prefix, value_range_1, value_range_2, value_range_3, year_range_prefix,\
|
302 |
+
short_digit_id_prefix, digit_symbol_word_prefix, digit_age_split_prefix,\
|
303 |
+
digit_word_short_prefix, digit_word_typo_prefix]
|
304 |
+
|
305 |
+
@staticmethod
|
306 |
+
def get_infixes():
|
307 |
+
digit_infix = '(\d+(?P<sep>[-:]+)\d+((?P=sep)\d+)*)'
|
308 |
+
return [digit_infix, ]
|
309 |
+
|
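A standalone illustration (not part of the commit) of two of the rules documented above; in the pipeline these strings are compiled into the spacy prefix regex rather than matched directly like this.

import re
from ner_datasets.preprocessing.tokenizers.utils import ClinicalRegex

age_typo = re.compile(ClinicalRegex.get_age_typo_prefix())
print(age_typo.match('Plan88yo').group())      # 'Plan' is split off from '88yo'

word_typo = re.compile(ClinicalRegex.get_word_typo_prefix())
print(word_typo.match('JohnMarital').group())  # 'John' is split off from 'Marital'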
ner_datasets/preprocessing/tokenizers/utils/date_regex.py
ADDED
@@ -0,0 +1,104 @@
1 |
+
class DateRegex(object):
|
2 |
+
|
3 |
+
@staticmethod
|
4 |
+
def __get_day_attributes():
|
5 |
+
# day of the month with optional suffix, such as 7th, 22nd
|
6 |
+
dd = '(([0-2]?[0-9]|3[01])(\s*)([sS][tT]|[nN][dD]|[rR][dD]|[tT][hH])?)'
|
7 |
+
# two-digit numeric day of the month
|
8 |
+
DD = '(0[0-9]|[1-2][0-9]|3[01])'
|
9 |
+
|
10 |
+
return dd, DD
|
11 |
+
|
12 |
+
@staticmethod
|
13 |
+
def __get_month_attributes():
|
14 |
+
|
15 |
+
m = \
|
16 |
+
'([jJ][aA][nN]([uU][aA][rR][yY])?|'+\
|
17 |
+
'[fF][eE][bB]([rR][uU][aA][rR][yY])?|'+\
|
18 |
+
'[mM][aA][rR]([cC][hH])?|'+\
|
19 |
+
'[aA][pP][rR]([iI][lL])?|'+\
|
20 |
+
'[mM][aA][yY]|'+\
|
21 |
+
'[jJ][uU][nN]([eE])?|'+\
|
22 |
+
'[jJ][uU][lL]([yY])?|'+\
|
23 |
+
'[aA][uU][gG]([uU][sS][tT])?|'+\
|
24 |
+
'[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
|
25 |
+
'[oO][cC][tT]([oO][bB][eE][rR])?|'+\
|
26 |
+
'[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
|
27 |
+
'[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
|
28 |
+
M = m
|
29 |
+
|
30 |
+
# numeric month
|
31 |
+
mm = '(0?[0-9]|1[0-2]|' + m + ')'
|
32 |
+
|
33 |
+
# two digit month
|
34 |
+
MM = '(0[0-9]|1[0-2]|' + m + ')'
|
35 |
+
|
36 |
+
return m, M, mm, MM
|
37 |
+
|
38 |
+
@staticmethod
|
39 |
+
def __get_year_attributes():
|
40 |
+
|
41 |
+
# two or four digit year
|
42 |
+
y = '([0-9]{4}|[0-9]{2})'
|
43 |
+
|
44 |
+
# two digit year
|
45 |
+
yy = '([0-9]{2})'
|
46 |
+
|
47 |
+
# four digit year
|
48 |
+
YY = '([0-9]{4})'
|
49 |
+
|
50 |
+
return y, yy, YY
|
51 |
+
|
52 |
+
@staticmethod
|
53 |
+
def __get_sep_attributes():
|
54 |
+
|
55 |
+
date_sep = '[-./]'
|
56 |
+
date_sep_optional = '[-./]*'
|
57 |
+
date_sep_no_full = '[-/]'
|
58 |
+
|
59 |
+
return date_sep, date_sep_optional, date_sep_no_full
|
60 |
+
|
61 |
+
#def get_week_attributes():
|
62 |
+
# w = \
|
63 |
+
# '([mM][oO][nN]([dD][aA][yY])?|'+\
|
64 |
+
# '[tT][uU][eE]([sS][dD][aA][yY])?|'+\
|
65 |
+
# '[wW][eE][dD]([nN][eE][sS][dD][aA][yY])?|'+\
|
66 |
+
# '[tT][hH][uU][gG]([uU][sS][tT])?|'+\
|
67 |
+
# '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
|
68 |
+
# '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
|
69 |
+
# '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
|
70 |
+
# '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
|
71 |
+
|
72 |
+
@staticmethod
|
73 |
+
def get_infixes():
|
74 |
+
|
75 |
+
dd, DD = DateRegex.__get_day_attributes()
|
76 |
+
m, M, mm, MM = DateRegex.__get_month_attributes()
|
77 |
+
y, yy, YY = DateRegex.__get_year_attributes()
|
78 |
+
date_sep, date_sep_optional, date_sep_no_full = DateRegex.__get_sep_attributes()
|
79 |
+
|
80 |
+
date_1 = y + '/' + mm + '/' + dd + '(?!([/]+|\d+))'
|
81 |
+
date_2 = y + '/' + dd + '/' + mm + '(?!([/]+|\d+))'
|
82 |
+
date_3 = dd + '/' + mm + '/' + y + '(?!([/]+|\d+))'
|
83 |
+
date_4 = mm + '/' + dd + '/' + y + '(?!([/]+|\d+))'
|
84 |
+
#Do I make this optional (date_sep_optional) - need to check
|
85 |
+
date_5 = y + date_sep + m + date_sep + dd + '(?!\d)'
|
86 |
+
date_6 = y + date_sep + dd + date_sep + m
|
87 |
+
date_7 = dd + date_sep + m + date_sep + y
|
88 |
+
date_8 = m + date_sep + dd + date_sep + y
|
89 |
+
date_9 = y + date_sep + m
|
90 |
+
date_10 = m + date_sep + y
|
91 |
+
date_11 = dd + date_sep + m
|
92 |
+
date_12 = m + date_sep + dd
|
93 |
+
date_13 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
|
94 |
+
date_14 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
|
95 |
+
date_15 = '(?<!([/]|\d))' + dd + '/' + y + '(?!([/]+|\d+))'
|
96 |
+
date_16 = '(?<!([/]|\d))' + mm + '/' + y + '(?!([/]+|\d+))'
|
97 |
+
date_17 = '(?<!([/]|\d))' + dd + '/' + mm + '(?!([/]+|\d+))'
|
98 |
+
date_18 = '(?<!([/]|\d))' + mm + '/' + dd + '(?!([/]+|\d+))'
|
99 |
+
|
100 |
+
date_infixes = [date_1, date_2, date_3, date_4, date_5, date_6,\
|
101 |
+
date_7, date_8, date_9, date_10, date_11, date_12,\
|
102 |
+
date_13, date_14, date_15, date_16, date_17, date_18]
|
103 |
+
|
104 |
+
return date_infixes
|
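A quick standalone check (not part of the commit) that the date patterns pick dates out of fused tokens; they are normally compiled into the spacy tokenizer's prefix/infix rules rather than searched directly, and the example strings are invented.

import re
from ner_datasets.preprocessing.tokenizers.utils import DateRegex

date_pattern = re.compile('|'.join(DateRegex.get_infixes()))
for example in ['01/02/2020', '2020-Jan-07', 'seen12/25/19today']:
    match = date_pattern.search(example)
    print(example, '->', match.group() if match else None)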
ner_datasets/span_fixer.py
ADDED
@@ -0,0 +1,380 @@
1 |
+
import re
|
2 |
+
import json
|
3 |
+
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
4 |
+
from typing import Iterable, Dict, List, Sequence, Union, Mapping, Tuple, NoReturn
|
5 |
+
|
6 |
+
from .preprocessing import PreprocessingLoader
|
7 |
+
|
8 |
+
|
9 |
+
class SpanFixer(object):
|
10 |
+
"""
|
11 |
+
The tokens and spans may not align depending on the tokenizer used.
|
12 |
+
This class expands the span to cover the tokens, so that we don't have a mismatch.
|
13 |
+
A mismatch is when a span_start will not coincide with some token_start or the span_end
|
14 |
+
will not coincide with some token_end. This class changes the span_start and span_end
|
15 |
+
so that the span_start will coincide with some token_start and the span_end
|
16 |
+
will coincide with some token_end - and we don't get any position mismatch errors while
|
17 |
+
building our dataset. This entire process involves updating span positions which can lead to duplicate
|
18 |
+
or overlapping spans, which then need to be removed.
|
19 |
+
E.g we have text: The patient is 75yo man
|
20 |
+
AGE Span: 75
|
21 |
+
Token: 75yo
|
22 |
+
As you can see the span is smaller than the token, which will lead to an error when
|
23 |
+
building the NER dataset.
|
24 |
+
To ensure this does not happen, we correct the span. We change the span from
|
25 |
+
75 to 75yo -> So now AGE Span is 75yo instead of 75. This script essentially changes
|
26 |
+
the annotated spans to match the tokens. In an ideal case we wouldn't need this script
|
27 |
+
but since medical notes have many typos, this script becomes necessary to deal with
|
28 |
+
issues and changes that arise from different tokenizers.
|
29 |
+
Also sort the spans and convert the start and end keys of the spans to integers
|
30 |
+
"""
|
31 |
+
|
32 |
+
def __init__(
|
33 |
+
self,
|
34 |
+
sentencizer: str,
|
35 |
+
tokenizer: str,
|
36 |
+
ner_priorities: Mapping[str, int],
|
37 |
+
verbose: bool = True
|
38 |
+
) -> NoReturn:
|
39 |
+
"""
|
40 |
+
Initialize the sentencizer and tokenizer
|
41 |
+
Args:
|
42 |
+
sentencizer (str): The sentencizer to use for splitting text into sentences
|
43 |
+
tokenizer (str): The tokenizer to use for splitting text into tokens
|
44 |
+
ner_priorities (Mapping[str, int]): The priority when choosing which duplicates to remove.
|
45 |
+
Mapping that represents a priority for each NER type
|
46 |
+
verbose (bool): To print out warnings etc
|
47 |
+
"""
|
48 |
+
self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer)
|
49 |
+
self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer)
|
50 |
+
self._ner_priorities = ner_priorities
|
51 |
+
self._verbose = verbose
|
52 |
+
|
53 |
+
def __get_token_positions(self, text: str) -> Tuple[Dict[int, int], Dict[int, int]]:
|
54 |
+
"""
|
55 |
+
Get the start and end positions of all the tokens in the note.
|
56 |
+
Args:
|
57 |
+
text (str): The text present in the note
|
58 |
+
Returns:
|
59 |
+
token_start_positions (Mapping[int, int]): The start positions of all the tokens in the note
|
60 |
+
token_end_positions (Mapping[int, int]): The end positions of all the tokens in the note
|
61 |
+
"""
|
62 |
+
token_start_positions = dict()
|
63 |
+
token_end_positions = dict()
|
64 |
+
for sentence in self._sentencizer.get_sentences(text):
|
65 |
+
offset = sentence['start']
|
66 |
+
for token in self._tokenizer.get_tokens(sentence['text']):
|
67 |
+
start = token['start'] + offset
|
68 |
+
end = token['end'] + offset
|
69 |
+
token_start_positions[start] = 1
|
70 |
+
token_end_positions[end] = 1
|
71 |
+
return token_start_positions, token_end_positions
|
72 |
+
|
73 |
+
def get_duplicates(
|
74 |
+
self,
|
75 |
+
spans: List[Dict[str, Union[str, int]]],
|
76 |
+
) -> List[int]:
|
77 |
+
"""
|
78 |
+
Return the indexes where there are duplicate/overlapping spans. A duplicate or
|
79 |
+
overlapping span is one where the same token can have two labels.
|
80 |
+
E.g:
|
81 |
+
Token: BWH^Bruce
|
82 |
+
This is a single token where BWH is the hospital label and Bruce is the Patient label
|
83 |
+
The fix_alignment function assigns this entire token the hospital label but it also
|
84 |
+
assigns this entire token the patient label. Since we have two labels for the same
|
85 |
+
token, we need to remove one of them.
|
86 |
+
We assign this entire token one label - either hospital label or the patient label
|
87 |
+
In this case we assign patient because of higher priority. So now we need to remove
|
88 |
+
the hospital label from the dataset (since it is essentially a duplicate label). This
|
89 |
+
script handles this case.
|
90 |
+
There are cases when two different labels match the same token partially
|
91 |
+
E.g
|
92 |
+
Text: JT/781-815-9090
|
93 |
+
Spans: JT - hospital, 781-815-9090 - Phone
|
94 |
+
Tokens: (JT/781) & (- 815 - 9090)
|
95 |
+
As you can see the token JT/781 will be assigned the hospital label by the fix_alignment function
|
96 |
+
but 781-815-9090 is also phone and the 781 portion is overlapped, and we need to resolve this.
|
97 |
+
In this script, we resolve it by treating JT/781 as one span (hospital) and
|
98 |
+
-815-9090 as another span (phone).
|
99 |
+
Args:
|
100 |
+
spans ([List[Dict[str, Union[str, int]]]): The NER spans in the note
|
101 |
+
Returns:
|
102 |
+
remove_spans (Sequence[int]): A list of indexes of the spans to remove
|
103 |
+
"""
|
104 |
+
remove_spans = list()
|
105 |
+
prev_start = -1
|
106 |
+
prev_end = -1
|
107 |
+
prev_label = None
|
108 |
+
prev_index = None
|
109 |
+
spans.sort(key=lambda _span: (_span['start'], _span['end']))
|
110 |
+
for index, span in enumerate(spans):
|
111 |
+
current_start = span['start']
|
112 |
+
current_end = span['end']
|
113 |
+
current_label = span['label']
|
114 |
+
if type(current_start) != int or type(current_end) != int:
|
115 |
+
raise ValueError('The start and end keys of the span must be of type int')
|
116 |
+
# Check if the current span matches another span
|
117 |
+
# that is if this span covers the same tokens as the
|
118 |
+
# previous spans (but has a different label)
|
119 |
+
# Based on the priority, treat the span with the low
|
120 |
+
# priority label as a duplicate label and add it to the
|
121 |
+
# list of spans that need to be removed
|
122 |
+
if current_start == prev_start and current_end == prev_end:
|
123 |
+
if self._ner_priorities[current_label] > self._ner_priorities[prev_label]:
|
124 |
+
# Store index of the previous span if it has lower priority
|
125 |
+
remove_spans.append(prev_index)
|
126 |
+
# Reset span details
|
127 |
+
prev_start = current_start
|
128 |
+
prev_end = current_end
|
129 |
+
prev_index = index
|
130 |
+
prev_label = current_label
|
131 |
+
if self._verbose:
|
132 |
+
print('DUPLICATE: ', span)
|
133 |
+
print('REMOVED: ', spans[remove_spans[-1]])
|
134 |
+
elif self._ner_priorities[current_label] <= self._ner_priorities[prev_label]:
|
135 |
+
# Store current index of span if it has lower priority
|
136 |
+
remove_spans.append(index)
|
137 |
+
if self._verbose:
|
138 |
+
print('DUPLICATE: ', spans[prev_index])
|
139 |
+
print('REMOVED: ', spans[remove_spans[-1]])
|
140 |
+
# Check for overlapping span
|
141 |
+
elif current_start < prev_end:
|
142 |
+
# If the current span end matches the overlapping span end
|
143 |
+
# Remove the current span, since it is smaller
|
144 |
+
if current_end <= prev_end:
|
145 |
+
remove_spans.append(index)
|
146 |
+
if self._verbose:
|
147 |
+
print('DUPLICATE: ', spans[prev_index])
|
148 |
+
print('REMOVED: ', spans[remove_spans[-1]])
|
149 |
+
# If the current end is greater than the prev_end
|
150 |
+
# then we split it into tow spans. We treat the previous span
|
151 |
+
# as one span and the end of the previous span to the end of the current span
|
152 |
+
# as another span.
|
153 |
+
elif current_end > prev_end:
|
154 |
+
# Create the new span - start=previous_span_end, end=current_span_end
|
155 |
+
overlap_length = spans[prev_index]['end'] - current_start
|
156 |
+
new_text = span['text'][overlap_length:]
|
157 |
+
# Remove extra spaces that may arise during this span separation
|
158 |
+
new_text = re.sub('^(\s+)', '', new_text, flags=re.DOTALL)
|
159 |
+
span['start'] = current_end - len(new_text)
|
160 |
+
span['text'] = new_text
|
161 |
+
if self._verbose:
|
162 |
+
print('OVERLAP: ', spans[prev_index])
|
163 |
+
print('UPDATED: ', span)
|
164 |
+
# Reset span details
|
165 |
+
prev_start = current_start
|
166 |
+
prev_end = current_end
|
167 |
+
prev_label = current_label
|
168 |
+
prev_index = index
|
169 |
+
# Reset span details
|
170 |
+
else:
|
171 |
+
prev_start = current_start
|
172 |
+
prev_end = current_end
|
173 |
+
prev_label = current_label
|
174 |
+
prev_index = index
|
175 |
+
return remove_spans
|
176 |
+
|
177 |
+
def fix_alignment(
|
178 |
+
self,
|
179 |
+
text: str,
|
180 |
+
spans: Sequence[Dict[str, Union[str, int]]]
|
181 |
+
) -> Iterable[Dict[str, Union[str, int]]]:
|
182 |
+
"""
|
183 |
+
Align the span and tokens. When the tokens and spans don't align, we change the
|
184 |
+
start and end positions of the spans so that they align with the tokens. This is
|
185 |
+
needed when a different tokenizer is used and the spans which are defined against
|
186 |
+
a different tokenizer don't line up with the new tokenizer. Also remove spaces present
|
187 |
+
at the start or end of the span.
|
188 |
+
E.g:
|
189 |
+
Token: BWH^Bruce
|
190 |
+
This is a single token where BWH is the hospital label and Bruce is the Patient label
|
191 |
+
The fix_alignment function assigns this entre token the hospital label but it also
|
192 |
+
assigns this entire token the patient label. This function basically expands the span
|
193 |
+
so that it matches the start and end positions of some token. By doing this it may create
|
194 |
+
overlapping and duplicate spans. As you can see it expands the patient label to match the
|
195 |
+
start of the token and it expands the hospital label to match the end of the token.
|
196 |
+
function.
|
197 |
+
Args:
|
198 |
+
text (str): The text present in the note
|
199 |
+
spans ([Sequence[Dict[str, Union[str, int]]]): The NER spans in the note
|
200 |
+
Returns:
|
201 |
+
(Iterable[Dict[str, Union[str, int]]]): Iterable through the modified spans
|
202 |
+
"""
|
203 |
+
# Get token start and end positions so that we can check if a span
|
204 |
+
# coincides with the start and end position of some token.
|
205 |
+
token_start_positions, token_end_positions = self.__get_token_positions(text)
|
206 |
+
for span in spans:
|
207 |
+
start = span['start']
|
208 |
+
end = span['end']
|
209 |
+
if type(start) != int or type(end) != int:
|
210 |
+
raise ValueError('The start and end keys of the span must be of type int')
|
211 |
+
if re.search('^\s', text[start:end]):
|
212 |
+
if self._verbose:
|
213 |
+
print('WARNING - space present in the start of the span')
|
214 |
+
start = start + 1
|
215 |
+
if re.search('(\s+)$', text[start:end], flags=re.DOTALL):
|
216 |
+
new_text = re.sub('(\s+)$', '', text[start:end], flags=re.DOTALL)
|
217 |
+
end = start + len(new_text)
|
218 |
+
# When a span does not coincide with the start and end position of some token
|
219 |
+
# it means there will be an error when building the ner dataset, we try and avoid
|
220 |
+
# that error by updating the spans itself, that is we expand the start/end positions
|
221 |
+
# of the spans so that it is aligned with the tokens.
|
222 |
+
while token_start_positions.get(start, False) is False:
|
223 |
+
start -= 1
|
224 |
+
while token_end_positions.get(end, False) is False:
|
225 |
+
end += 1
|
226 |
+
# Print what the old span was and what the new expanded span will look like
|
227 |
+
if self._verbose and (int(span['start']) != start or int(span['end']) != end):
|
228 |
+
print('OLD SPAN: ', text[int(span['start']):int(span['end'])])
|
229 |
+
print('NEW SPAN: ', text[start:end])
|
230 |
+
# Update the span with its new start and end positions
|
231 |
+
span['start'] = start
|
232 |
+
span['end'] = end
|
233 |
+
span['text'] = text[start:end]
|
234 |
+
yield span
|
235 |
+
|
236 |
+
def fix_note(
|
237 |
+
self,
|
238 |
+
text: str,
|
239 |
+
spans: Sequence[Dict[str, Union[str, int]]],
|
240 |
+
) -> Iterable[Dict[str, Union[str, int]]]:
|
241 |
+
"""
|
242 |
+
This function changes the span_start and span_end
|
243 |
+
so that the span_start will coincide with some token_start and the span_end
|
244 |
+
will coincide with some token_end and also removes duplicate/overlapping spans
|
245 |
+
that may arise when we change the span start and end positions. The resulting
|
246 |
+
spans from this function will always coincide with some token start and token
|
247 |
+
end, and hence will not have any token and span mismatch errors when building the
|
248 |
+
NER dataset. For more details and examples check the documentation of the
|
249 |
+
fix_alignment and get_duplicates functions.
|
250 |
+
Args:
|
251 |
+
text (str): The text present in the note
|
252 |
+
spans ([Sequence[Mapping[str, Union[str, int]]]): The NER spans in the note
|
253 |
+
Returns:
|
254 |
+
(Iterable[Mapping[str, Union[str, int]]]): Iterable through the fixed spans
|
255 |
+
"""
|
256 |
+
# Fix span position alignment
|
257 |
+
spans = [span for span in self.fix_alignment(text=text, spans=spans)]
|
258 |
+
# Check for duplicate/overlapping spans
|
259 |
+
remove_spans = self.get_duplicates(spans=spans)
|
260 |
+
for index, span in enumerate(spans):
|
261 |
+
# Remove the duplicate/overlapping spans
|
262 |
+
if index not in remove_spans:
|
263 |
+
yield span
|
264 |
+
|
265 |
+
def fix(
|
266 |
+
self,
|
267 |
+
input_file: str,
|
268 |
+
text_key: str = 'text',
|
269 |
+
spans_key: str = 'spans'
|
270 |
+
) -> Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]:
|
271 |
+
"""
|
272 |
+
This function changes the span_start and span_end
|
273 |
+
so that the span_start will coincide with some token_start and the span_end
|
274 |
+
will coincide with some token_end and also removes duplicate/overlapping spans
|
275 |
+
that may arise when we change the span start and end positions. The resulting
|
276 |
+
spans from this function will always coincide with some token start and token
|
277 |
+
end, and hence will not have any token and span mismatch errors when building the
|
278 |
+
NER dataset. For more details and examples check the documentation of the
|
279 |
+
fix_alignment and get_duplicates functions. Fix spans that arise due to bad typos,
|
280 |
+
which are not fixed during tokenization. This essentially updates the spans so that
|
281 |
+
they line up with the start and end positions of tokens - so that there is no error
|
282 |
+
when we assign labels to tokens based on these spans
|
283 |
+
Args:
|
284 |
+
input_file (str): The file that contains the notes that we want to fix the token issues in
|
285 |
+
text_key (str) the key where the note & token text is present in the json object
|
286 |
+
spans_key (str): The key where the note spans are present in the json object
|
287 |
+
Returns:
|
288 |
+
(Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]): Iterable through the fixed
|
289 |
+
notes
|
290 |
+
"""
|
291 |
+
for line in open(input_file, 'r'):
|
292 |
+
note = json.loads(line)
|
293 |
+
note[spans_key] = [span for span in self.fix_note(text=note[text_key], spans=note[spans_key])]
|
294 |
+
yield note
|
295 |
+
|
296 |
+
|
297 |
+
def main():
|
298 |
+
# The following code sets up the arguments to be passed via CLI or via a JSON file
|
299 |
+
cli_parser = ArgumentParser(
|
300 |
+
description='configuration arguments provided at run time from the CLI',
|
301 |
+
formatter_class=ArgumentDefaultsHelpFormatter
|
302 |
+
)
|
303 |
+
cli_parser.add_argument(
|
304 |
+
'--input_file',
|
305 |
+
type=str,
|
306 |
+
required=True,
|
307 |
+
help='the the jsonl file that contains the notes'
|
308 |
+
)
|
309 |
+
cli_parser.add_argument(
|
310 |
+
'--sentencizer',
|
311 |
+
type=str,
|
312 |
+
required=True,
|
313 |
+
help='the sentencizer to use for splitting notes into sentences'
|
314 |
+
)
|
315 |
+
cli_parser.add_argument(
|
316 |
+
'--tokenizer',
|
317 |
+
type=str,
|
318 |
+
required=True,
|
319 |
+
help='the tokenizer to use for splitting text into tokens'
|
320 |
+
)
|
321 |
+
cli_parser.add_argument(
|
322 |
+
'--abbreviations_file',
|
323 |
+
type=str,
|
324 |
+
default=None,
|
325 |
+
help='file that will be used by clinical tokenizer to handle abbreviations'
|
326 |
+
)
|
327 |
+
cli_parser.add_argument(
|
328 |
+
'--ner_types',
|
329 |
+
nargs="+",
|
330 |
+
require=True,
|
331 |
+
help='the NER types'
|
332 |
+
)
|
333 |
+
cli_parser.add_argument(
|
334 |
+
'--ner_priorities',
|
335 |
+
nargs="+",
|
336 |
+
require=True,
|
337 |
+
help='the priorities for the NER types - the priority when choosing which duplicates to remove'
|
338 |
+
)
|
339 |
+
cli_parser.add_argument(
|
340 |
+
'--text_key',
|
341 |
+
type=str,
|
342 |
+
default='text',
|
343 |
+
help='the key where the note & token text is present in the json object'
|
344 |
+
)
|
345 |
+
cli_parser.add_argument(
|
346 |
+
'--spans_key',
|
347 |
+
type=str,
|
348 |
+
default='spans',
|
349 |
+
help='the key where the note spans is present in the json object'
|
350 |
+
)
|
351 |
+
cli_parser.add_argument(
|
352 |
+
'--output_file',
|
353 |
+
type=str,
|
354 |
+
required=True,
|
355 |
+
help='the output json file that will contain the new fixed spans'
|
356 |
+
)
|
357 |
+
args = cli_parser.parse_args()
|
358 |
+
# Mapping that represents a priority for each PHI type
|
359 |
+
# For example, the PATIENT type will have a higher priority as
|
360 |
+
# compared to STAFF.
|
361 |
+
if len(args.ner_types) == len(args.ner_priorities):
|
362 |
+
ner_priorities = {ner_type: priority for ner_type, priority in zip(args.ner_types, args.ner_priorities)}
|
363 |
+
else:
|
364 |
+
raise ValueError('Length of ner_types and ner_priorities must be the same')
|
365 |
+
span_fixer = SpanFixer(
|
366 |
+
tokenizer=args.tokenizer,
|
367 |
+
sentencizer=args.sentencizer,
|
368 |
+
ner_priorities=ner_priorities
|
369 |
+
)
|
370 |
+
with open(args.output_file, 'w') as file:
|
371 |
+
for note in span_fixer.fix(
|
372 |
+
input_file=args.input_file,
|
373 |
+
text_key=args.text_key,
|
374 |
+
spans_key=args.spans_key
|
375 |
+
):
|
376 |
+
file.write(json.dumps(note) + '\n')
|
377 |
+
|
378 |
+
|
379 |
+
if __name__ == '__main__':
|
380 |
+
main()
|
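A minimal usage sketch of the SpanFixer API above, mirroring the calls made in main(); the import path, the tokenizer/sentencizer names and the file paths are assumptions, not part of this commit:

# Hedged sketch: assumes SpanFixer is exported by ner_datasets/__init__.py and that
# the tokenizer/sentencizer strings below are valid choices for this package;
# the jsonl paths and the priority values are placeholders.
import json
from ner_datasets import SpanFixer  # assumption: re-exported at package level

ner_priorities = {'PATIENT': 2, 'HOSP': 1}
span_fixer = SpanFixer(
    tokenizer='clinical',
    sentencizer='en_core_sci_sm',
    ner_priorities=ner_priorities,
)
with open('notes_fixed.jsonl', 'w') as out:
    for note in span_fixer.fix(input_file='notes.jsonl', text_key='text', spans_key='spans'):
        out.write(json.dumps(note) + '\n')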
ner_datasets/span_validation.py
ADDED
@@ -0,0 +1,91 @@
import json
import random
from argparse import ArgumentParser
from typing import Union, NoReturn, Iterable, Dict, List

random.seed(41)


class SpanValidation(object):
    """
    This class is used to build a mapping between the note id
    and the annotated spans in that note. This will be used during the
    evaluation of the models. This is required to perform span level
    evaluation.
    """
    @staticmethod
    def get_spans(
            input_file: str,
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            spans_key: str = 'spans'
    ):
        """
        Get a mapping between the note id
        and the annotated spans in that note. This will mainly be used during the
        evaluation of the models.
        Args:
            input_file (str): The input file
            metadata_key (str): The key where the note metadata is present
            note_id_key (str): The key where the note id is present
            spans_key (str): The key that contains the annotated spans for a note dictionary
        Returns:
            (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
                                                                     and contains the note id and annotated spans
                                                                     for that note
        """
        # Read the input file (data source)
        for line in open(input_file, 'r'):
            note = json.loads(line)
            note_id = note[metadata_key][note_id_key]
            # Store the note_id and the annotated spans
            note[spans_key].sort(key=lambda _span: (_span['start'], _span['end']))
            yield {'note_id': note_id, 'note_spans': note[spans_key]}


def main() -> NoReturn:
    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    cli_parser.add_argument(
        '--spans_key',
        type=str,
        default='spans',
        help='the key where the annotated spans for the notes are present in the json object'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the file where the note id and the corresponding spans for that note are to be saved'
    )
    args = cli_parser.parse_args()

    # Write the dataset to the output file
    with open(args.output_file, 'w') as file:
        for span_info in SpanValidation.get_spans(
                input_file=args.input_file,
                metadata_key=args.metadata_key,
                note_id_key=args.note_id_key,
                spans_key=args.spans_key):
            file.write(json.dumps(span_info) + '\n')


if __name__ == "__main__":
    main()
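A short sketch of how SpanValidation.get_spans can be used to build the span file consumed later during span-level evaluation; the file names are placeholders:

# Hedged sketch: writes one {'note_id': ..., 'note_spans': [...]} record per note.
import json
from ner_datasets.span_validation import SpanValidation

with open('validation_spans.jsonl', 'w') as out:
    for span_info in SpanValidation.get_spans(input_file='validation.jsonl',
                                              metadata_key='meta',
                                              note_id_key='note_id',
                                              spans_key='spans'):
        out.write(json.dumps(span_info) + '\n')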
sequence_tagging/.DS_Store
ADDED
Binary file (6.15 kB)
sequence_tagging/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .sequence_tagger import SequenceTagger
__all__ = ["SequenceTagger"]
sequence_tagging/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (267 Bytes)
sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc
ADDED
Binary file (13.6 kB)
sequence_tagging/arguments/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .model_arguments import ModelArguments
from .evaluation_arguments import EvaluationArguments
from .data_training_arguments import DataTrainingArguments
__all__ = [
    "ModelArguments",
    "DataTrainingArguments",
    "EvaluationArguments",
]
sequence_tagging/arguments/data_training_arguments.py
ADDED
@@ -0,0 +1,115 @@
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """
    task_name: Optional[str] = field(
        default="ner",
        metadata={"help": "The name of the task (ner, pos...)."}
    )
    notation: str = field(
        default="BIO",
        metadata={"help": "NER notation e.g BIO"},
    )
    ner_types: Optional[str] = field(
        default=None,
        metadata={"help": "Pass a list of NER types"},
    )
    train_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (a csv or JSON file)."}
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
    )
    test_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
    )
    output_predictions_file: Optional[str] = field(
        default=None,
        metadata={"help": "A location where to write the output of the test data"},
    )
    text_column_name: Optional[str] = field(
        default='tokens',
        metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
    )
    label_column_name: Optional[str] = field(
        default='labels',
        metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to model maximum sentence length. "
                    "If False, will pad the samples dynamically when batching to the maximum length in the batch. "
                    "More efficient on GPU but very bad for TPU."
        },
    )
    truncation: bool = field(
        default=True,
        metadata={
            "help": "Activates and controls truncation"
        },
    )
    max_length: int = field(
        default=512,
        metadata={
            "help": "Controls the maximum length to use by one of the truncation/padding parameters."
        },
    )
    do_lower_case: bool = field(
        default=False,
        metadata={
            "help": "Whether to lowercase the text"
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
                    "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                    "value if set."
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                    "value if set."
        },
    )
    label_all_tokens: bool = field(
        default=False,
        metadata={
            "help": "Whether to put the label for one word on all tokens generated by that word or just on the "
                    "first one (in which case the other tokens will have a padding index)."
        },
    )
    return_entity_level_metrics: bool = field(
        default=True,
        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
    )
    token_ignore_label: str = field(
        default='NA',
        metadata={"help": "The label that indicates where the tokens will be ignored in loss computation. Used for "
                          "indicating context tokens to the model"}
    )
sequence_tagging/arguments/evaluation_arguments.py
ADDED
@@ -0,0 +1,26 @@
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class EvaluationArguments:
    """
    Arguments pertaining to the evaluation process.
    """
    model_eval_script: Optional[str] = field(
        default=None,
        metadata={"help": "The script that is used for evaluation"},
    )
    evaluation_mode: Optional[str] = field(
        default=None,
        metadata={"help": "Strict or default mode for sequence evaluation"},
    )
    validation_spans_file: Optional[str] = field(
        default=None,
        metadata={"help": "A span evaluation data file to evaluate on span level (json file). This will contain a "
                          "mapping between the note_ids and note spans"},
    )
    ner_type_maps: Optional[str] = field(
        default=None,
        metadata={"help": "List that contains the mappings between the original NER types and another set of NER "
                          "types. Used mainly for evaluation, to map NER token labels to another set of NER token "
                          "labels."},
    )
sequence_tagging/arguments/model_arguments.py
ADDED
@@ -0,0 +1,43 @@
from typing import Optional
from dataclasses import dataclass, field


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
                    "with private models)."
        },
    )
    post_process: str = field(
        default='argmax',
        metadata={"help": "What post processing to use on the model logits"},
    )
    threshold: Optional[float] = field(
        default=None,
        metadata={"help": "Threshold cutoff for softmax"},
    )
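The three argument dataclasses above are typically parsed together with transformers' TrainingArguments; a hedged sketch of that wiring (the actual training entry point is not shown in this section of the diff):

# Hedged sketch: parse the CLI (or a JSON config) into the argument dataclasses.
from transformers import HfArgumentParser, TrainingArguments
from sequence_tagging.arguments import ModelArguments, DataTrainingArguments, EvaluationArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, EvaluationArguments, TrainingArguments))
model_args, data_args, eval_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.model_name_or_path, data_args.notation, eval_args.evaluation_mode)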
sequence_tagging/dataset_builder/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .ner_labels import NERLabels
from .ner_dataset import NERDataset
from .label_mapper import LabelMapper
from .dataset_tokenizer import DatasetTokenizer
__all__ = ["NERLabels", "NERDataset", "LabelMapper", "DatasetTokenizer"]
sequence_tagging/dataset_builder/dataset_tokenizer.py
ADDED
@@ -0,0 +1,178 @@
from typing import Mapping, Sequence, List, Union, Optional, NoReturn
from datasets import Dataset
from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer


class DatasetTokenizer(object):
    """
    The main goal of this class is to solve the problem described below.
    Most of the comments have been copied from the huggingface webpage.
    What this class does is initialize a tokenizer with the desired parameters
    and then tokenize our dataset and align the tokens with the labels,
    while keeping in mind the problem & solution described below. We can use this
    class for training and for predictions - we just assume the predictions dataset
    will have a label column filled with some values (so this code can be re-used).
    Now we arrive at a common obstacle with using pre-trained models for
    token-level classification: many of the tokens in the dataset may not
    be in the tokenizer vocabulary. Bert and many models like it use a method
    called WordPiece Tokenization, meaning that single words are split into multiple
    tokens such that each token is likely to be in the vocabulary. For example,
    the tokenizer would split the date (token) 2080 into the tokens ['208', '##0'].
    This is a problem for us because we have exactly one tag per token (2080 -> B-DATE).
    If the tokenizer splits a token into multiple sub-tokens, then we will end up with
    a mismatch between our tokens and our labels (208, ##0) - two tokens but one label (B-DATE).
    One way to handle this is to only train on the tag labels for the first subtoken of a
    split token. We can do this in huggingface Transformers by setting the labels
    we wish to ignore to -100. In the example above, if the label for 2080 is B-DATE
    and say the id (from the label to id mapping) for B-DATE is 3, we would set the labels
    of ['208', '##0'] to [3, -100]. This tells the model to ignore the tokens labelled with
    -100 while updating the weights etc.
    """

    def __init__(
            self,
            tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
            token_column: str,
            label_column: str,
            label_to_id: Mapping[str, int],
            b_to_i_label: Sequence[int],
            padding: Union[bool, str],
            truncation: Union[bool, str],
            is_split_into_words: bool,
            max_length: Optional[int],
            label_all_tokens: bool,
            token_ignore_label: Optional[str]
    ) -> NoReturn:
        """
        Set the tokenizer we are using to subword tokenize our dataset, the name of the
        column that contains the pre-split tokens, the name of the column that contains
        the labels for each token, and the label to id mapping.
        Set the padding strategy of the input. Set whether to truncate the input tokens.
        Indicate whether the input is pre-split into tokens. Set the max length of the
        input tokens (post subword tokenization). This will be used in conjunction with truncation.
        Set whether we want to label even the sub tokens.
        In the description above we say for 2080 (B-DATE) - [208, ##0]
        we do [3, -100] - which says assume the label of token 2080 is the one
        predicted for 208 - or we can label both sub tokens,
        in which case it would be [3, 3] - so we would label 208 as DATE
        and ##0 as DATE - then we would have to figure out how to merge these
        labels etc.
        Args:
            tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Tokenizer from huggingface
            token_column (str): The column that contains the tokens in the dataset
            label_column (str): The column that contains the labels in the dataset
            label_to_id (Mapping[str, int]): The mapping between labels and IDs
            b_to_i_label (Sequence[int]): The mapping from B- label ids to the corresponding I- label ids
            padding (Union[bool, str]): Padding strategy
            truncation (Union[bool, str]): Truncation strategy
            is_split_into_words (bool): Is the input pre-split (tokenized)
            max_length (Optional[int]): Max subword tokenized length for the model
            label_all_tokens (bool): Whether to label sub words
            token_ignore_label (str): The value of the token ignore label - we ignore these in the loss computation
        """
        self._tokenizer = tokenizer
        self._token_column = token_column
        self._label_column = label_column
        self._label_to_id = label_to_id
        self._b_to_i_label = b_to_i_label
        # We can tell the tokenizer that we're dealing with ready-split tokens rather than full
        # sentence strings by passing is_split_into_words=True.
        # Set the following parameters using the kwargs
        self._padding = padding
        self._truncation = truncation
        self._is_split_into_words = is_split_into_words
        self._max_length = max_length
        self._label_all_tokens = label_all_tokens
        self._token_ignore_label = token_ignore_label
        self._ignore_label = -100

    def tokenize_and_align_labels(self, dataset: Dataset) -> Dataset:
        """
        This function is the one that is used to read the input dataset.
        Run the subword tokenization on the pre-split tokens and then,
        as mentioned above, align the subtokens and labels and add the ignore
        label. This will read the input - say [60, year, old, in, 2080] -
        and will return the subtokens - [60, year, old, in, 208, ##0],
        some other information like token_type_ids etc,
        and the labels [0, 20, 20, 20, 3, -100] (0 corresponds to B-AGE, 20 corresponds to O
        and 3 corresponds to B-DATE). This returned input serves as input for training the model
        or for gathering predictions from a trained model.
        Another important thing to note is that we have mentioned before that
        we add chunks of tokens that appear before and after the current chunk for context. We would
        also need to assign the label -100 (ignore_label) to these chunks, since we are using them
        only to provide context. Basically, if a token has the label NA, we don't use it for
        training or evaluation. For example the input would be something
        like tokens: [James, Doe, 60, year, old, in, 2080, BWH, tomorrow, only],
        labels: [NA, NA, B-AGE, O, O, O, B-DATE, NA, NA, NA]. NA represents the tokens used for context.
        This function would return some tokenizer info (e.g attention mask etc), along with
        the information that maps the tokens to the subtokens -
        [James, Doe, 60, year, old, in, 208, ##0, BW, ##h, tomorrow, only]
        and the labels - [-100, -100, 0, 20, 20, 20, 3, -100, -100, -100, -100, -100]
        (if label_all_tokens was true, we would return [-100, -100, 0, 20, 20, 20, 3, 3, -100, -100, -100, -100]).
        Args:
            dataset (Dataset): The pre-split (tokenized) dataset that contains labels
        Returns:
            tokenized_inputs (Dataset): Subword tokenized and label aligned dataset
        """
        # Run the tokenizer - subword tokenization
        tokenized_inputs = self._tokenizer(
            dataset[self._token_column],
            padding=self._padding,
            truncation=self._truncation,
            max_length=self._max_length,
            is_split_into_words=self._is_split_into_words,
        )
        # Align the subwords and tokens
        labels = [self.__get_labels(
            labels,
            tokenized_inputs.word_ids(batch_index=index)
        ) for index, labels in enumerate(dataset[self._label_column])]
        tokenized_inputs[self._label_column] = labels

        return tokenized_inputs

    def __get_labels(
            self,
            labels: Sequence[str],
            word_ids: Sequence[int]
    ) -> List[int]:
        """
        Go through the subword tokens - which are given as word_ids. Two different tokens,
        2080 & John, will have different word_ids, but the subword tokens 208 & ##0 will
        have the same word_id; we use this to align and assign the labels accordingly.
        If a subword token belongs to [CLS] or [SEP], append the ignore label (-100) to the
        list of labels. If the subword token (##0) belongs to a token - 2080 -
        then the labels would be [3, -100] if label_all_tokens is false. Also, if the token
        is used only for context (with label NA), it gets the value -100 for its label.
        Args:
            labels (Sequence[str]): The list of labels for the input (example)
            word_ids (Sequence[int]): The word_ids after subword tokenization of the input
        Returns:
            label_ids (List[int]): The list of label ids for the input with the ignore label (-100) added
                                   as required.
        """
        label_ids = list()
        previous_word_idx = None
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(self._ignore_label)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                if labels[word_idx] == self._token_ignore_label:
                    label_ids.append(self._ignore_label)
                else:
                    label_ids.append(self._label_to_id[labels[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                if labels[word_idx] == self._token_ignore_label:
                    label_ids.append(self._ignore_label)
                else:
                    label_ids.append(
                        self._b_to_i_label[self._label_to_id[labels[word_idx]]]
                        if self._label_all_tokens else self._ignore_label
                    )
            previous_word_idx = word_idx
        return label_ids
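A hedged sketch of DatasetTokenizer in use: it relies on a fast tokenizer (for word_ids) and the NERLabels mappings defined later in this commit; the model checkpoint, column names and the identity b_to_i_label map below are assumptions, not part of the diff:

# Hedged sketch: align subword tokens and labels for a tiny in-memory dataset.
from datasets import Dataset
from transformers import AutoTokenizer
from sequence_tagging.dataset_builder import NERLabels, DatasetTokenizer

ner_labels = NERLabels(notation='BIO', ner_types=['AGE', 'DATE'])
label_to_id = ner_labels.get_label_to_id()
# Identity map as a placeholder; only used when label_all_tokens=True
b_to_i_label = list(range(len(label_to_id)))

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
dataset_tokenizer = DatasetTokenizer(
    tokenizer=tokenizer, token_column='tokens', label_column='labels',
    label_to_id=label_to_id, b_to_i_label=b_to_i_label,
    padding=False, truncation=True, is_split_into_words=True,
    max_length=512, label_all_tokens=False, token_ignore_label='NA',
)
data = Dataset.from_dict({'tokens': [['60', 'year', 'old', 'in', '2080']],
                          'labels': [['B-AGE', 'O', 'O', 'O', 'B-DATE']]})
tokenized = data.map(dataset_tokenizer.tokenize_and_align_labels, batched=True)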
sequence_tagging/dataset_builder/label_mapper.py
ADDED
@@ -0,0 +1,87 @@
from typing import List, Sequence, Mapping, Optional, NoReturn, Dict, Union
from .ner_labels import NERLabels


class LabelMapper(object):
    """
    This class is used to map one set of NER labels to another set of NER labels.
    For example, we might want to map all NER labels to binary HIPAA labels.
    E.g:
    We change the token labels - [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] to
    [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O], or if we wanted binary I2B2 labels we map it to
    [B-I2B2, O, O, U-I2B2, B-I2B2, L-I2B2, O, B-I2B2, I-I2B2, L-I2B2].
    We do this mapping at the token and the span level. That is, if we have a span from, say, start=9, end=15
    labelled as LOC, we map this label to HIPAA or I2B2. This class maps an existing set of labels to
    another set of labels.
    """

    def __init__(
            self,
            notation: str,
            ner_types: Sequence[str],
            ner_types_maps: Sequence[str],
            description: str
    ) -> NoReturn:
        """
        Initialize the variables that will be used to map the NER labels and spans.
        The ner_map and spans_map should correspond to each other and contain the same NER types.
        Args:
            notation (str): The notation used for the NER labels (e.g BIO, BILOU)
            ner_types (Sequence[str]): The original NER types
            ner_types_maps (Sequence[str]): The NER types that the original types are mapped to
            description (str): A description of the label/span maps used
        """
        self._description = description
        self._types = list(set(ner_types_maps))
        self._types.sort()
        self._spans_map = {ner_type: ner_type_map for ner_type, ner_type_map in zip(ner_types, ner_types_maps)}
        ner_labels = NERLabels(notation=notation, ner_types=ner_types)
        self._ner_map = dict()
        for label in ner_labels.get_label_list():
            if label == 'O' or self._spans_map[label[2:]] == 'O':
                self._ner_map[label] = 'O'
            else:
                self._ner_map[label] = label[0:2] + self._spans_map[label[2:]]

    def map_sequence(self, tag_sequence: Sequence[str]) -> List[str]:
        """
        Map a sequence of NER labels to another set of NER labels.
        E.g: If we use a binary HIPAA mapping,
        this sequence [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] will be mapped to
        [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O].
        Return the original sequence if no mapping is used (i.e the maps are == None).
        Args:
            tag_sequence (Sequence[str]): A sequence of NER labels
        Returns:
            (List[str]): A mapped sequence of NER labels
        """
        # Return the original sequence if no mapping is used
        return [self._ner_map[tag] for tag in tag_sequence]

    def map_spans(self, spans: Sequence[Mapping[str, Union[str, int]]]) -> Sequence[Dict[str, Union[str, int]]]:
        """
        Map a sequence of NER spans to another set of NER spans.
        E.g: If we use a binary HIPAA mapping,
        the spans: [{start:0, end:5, label: DATE}, {start:17, end:25, label: STAFF}, {start:43, end:54, label: PATIENT}]
        will be mapped to: [{start:0, end:5, label: HIPAA}, {start:17, end:25, label: O}, {start:43, end:54, label: HIPAA}]
        Return the original list of spans if no mapping is used (i.e the maps are == None).
        Args:
            spans (Sequence[Mapping[str, Union[str, int]]]): A sequence of NER spans
        Returns:
            (Sequence[Dict[str, Union[str, int]]]): A mapped sequence of NER spans
        """
        return [{'start': span['start'], 'end': span['end'], 'label': self._spans_map[span['label']]}
                for span in spans]

    def get_ner_description(self) -> str:
        """
        Get the description of the NER label and span maps used.
        Returns:
            (str): A description of the label/span maps used
        """
        return self._description

    def get_ner_types(self) -> List[str]:
        """
        Get the PHI types back from the list of NER labels:
        [B-AGE, I-AGE, B-DATE, I-DATE ..] ---> [AGE, DATE, ...]
        Returns:
            ner_types (List[str]): The list of unique NER types
        """
        return self._types
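A small sketch of LabelMapper collapsing all PHI types to binary HIPAA labels; the type lists and mapping shown here are illustrative only:

# Hedged sketch: map BILOU labels for four types down to a single HIPAA type.
from sequence_tagging.dataset_builder import LabelMapper

mapper = LabelMapper(
    notation='BILOU',
    ner_types=['AGE', 'DATE', 'LOC', 'STAFF'],
    ner_types_maps=['HIPAA', 'HIPAA', 'HIPAA', 'O'],
    description='binary HIPAA mapping',
)
print(mapper.map_sequence(['B-AGE', 'O', 'U-LOC', 'B-STAFF', 'L-STAFF']))
# -> ['B-HIPAA', 'O', 'U-HIPAA', 'O', 'O']
print(mapper.map_spans([{'start': 0, 'end': 5, 'label': 'DATE'}]))
# -> [{'start': 0, 'end': 5, 'label': 'HIPAA'}]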
sequence_tagging/dataset_builder/ner_dataset.py
ADDED
@@ -0,0 +1,102 @@
from typing import Sequence, Optional, NoReturn

from datasets import load_dataset, Dataset


class NERDataset(object):
    """
    This class is a wrapper around the huggingface datasets library.
    It maintains the train, validation and test datasets based on the
    train, validation and test files passed, by loading the dataset object
    from the files, and provides a get function to access each of the datasets.
    """

    def __init__(
            self,
            train_file: Optional[Sequence[str]] = None,
            validation_file: Optional[Sequence[str]] = None,
            test_file: Optional[Sequence[str]] = None,
            extension: str = 'json',
            shuffle: bool = True,
            seed: int = 41
    ) -> NoReturn:
        """
        Load the train, validation and test datasets from the files passed. Read the files and convert
        them into a huggingface dataset.
        Args:
            train_file (Optional[Sequence[str]]): The list of files that contain train data
            validation_file (Optional[Sequence[str]]): The list of files that contain validation data
            test_file (Optional[Sequence[str]]): The list of files that contain test data
            extension (str): The format of the data files (e.g json, csv)
            shuffle (bool): Whether to shuffle the dataset
            seed (int): Shuffle seed
        """
        self._datasets = NERDataset.__prepare_data(
            train_file,
            validation_file,
            test_file,
            extension,
            shuffle,
            seed
        )

    @staticmethod
    def __prepare_data(
            train_file: Optional[Sequence[str]],
            validation_file: Optional[Sequence[str]],
            test_file: Optional[Sequence[str]],
            extension: str,
            shuffle: bool,
            seed: int
    ) -> Dataset:
        """
        Get the train, validation and test datasets from the files passed. Read the files and convert
        them into a huggingface dataset.
        Args:
            train_file (Optional[Sequence[str]]): The list of files that contain train data
            validation_file (Optional[Sequence[str]]): The list of files that contain validation data
            test_file (Optional[Sequence[str]]): The list of files that contain test data
            extension (str): The format of the data files (e.g json, csv)
            shuffle (bool): Whether to shuffle the dataset
            seed (int): Shuffle seed
        Returns:
            (Dataset): The huggingface dataset with train, validation, test splits (if included)
        """
        # Read the datasets (train, validation, test etc).
        data_files = {}
        if train_file is not None:
            data_files['train'] = train_file
        if validation_file is not None:
            data_files['validation'] = validation_file
        if test_file is not None:
            data_files['test'] = test_file
        # Shuffle the dataset
        if shuffle:
            datasets = load_dataset(extension, data_files=data_files).shuffle(seed=seed)
        else:
            # Don't shuffle the dataset
            datasets = load_dataset(extension, data_files=data_files)
        return datasets

    def get_train_dataset(self) -> Dataset:
        """
        Return the train dataset.
        Returns:
            (Dataset): The huggingface dataset - train split
        """
        return self._datasets['train']

    def get_validation_dataset(self) -> Dataset:
        """
        Return the validation dataset.
        Returns:
            (Dataset): The huggingface dataset - validation split
        """
        return self._datasets['validation']

    def get_test_dataset(self) -> Dataset:
        """
        Return the test dataset.
        Returns:
            (Dataset): The huggingface dataset - test split
        """
        return self._datasets['test']
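A short sketch of NERDataset loading jsonl splits; the file paths are placeholders:

# Hedged sketch: build the train/validation splits from jsonl files.
from sequence_tagging.dataset_builder import NERDataset

ner_dataset = NERDataset(
    train_file=['train.jsonl'],
    validation_file=['validation.jsonl'],
    extension='json',
    shuffle=True,
    seed=41,
)
train_split = ner_dataset.get_train_dataset()
print(train_split.column_names)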
sequence_tagging/dataset_builder/ner_labels.py
ADDED
@@ -0,0 +1,67 @@
from typing import Sequence, List, NoReturn, Dict


class NERLabels(object):
    """
    Prepare the labels that will be used by the model. Parse the NER types
    and prepare the NER labels. For example - NER types: [AGE, DATE] -
    it will create a list like this (for BIO notation): [B-AGE, I-AGE, B-DATE, I-DATE, O].
    These are the labels that will be assigned to the tokens based on the PHI type.
    Say we had the following NER types: NAME, AGE, HOSP.
    The NER labels in the BIO notation would be B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O.
    This script creates a list of the NER labels ([B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O])
    based on the NER types (NAME, AGE, HOSP) that have been defined. Labels are sorted.
    The script also returns the number of labels, the label_to_id mapping and the id_to_label mapping.
    label_to_id mapping: {B-AGE: 0, B-HOSP: 1, B-NAME: 2, I-AGE: 3, I-HOSP: 4, I-NAME: 5, O: 6}
    This information will be used during training, evaluation and prediction.
    """

    def __init__(self, notation: str, ner_types: Sequence[str]) -> NoReturn:
        """
        Initialize the notation that we are using for the NER task.
        Args:
            notation (str): The notation that will be used for the NER labels
            ner_types (Sequence[str]): The list of NER categories
        """
        self._notation = notation
        self._ner_types = ner_types

    def get_label_list(self) -> List[str]:
        """
        Given the NER types, return the NER labels.
        NER types: [AGE, DATE] -> return a list like this (for BIO notation): [B-AGE, I-AGE, B-DATE, I-DATE, O]
        Returns:
            ner_labels (List[str]): The list of NER labels based on the NER notation (e.g BIO)
        """
        # Add the 'O' (Outside - Non-PHI) label to the list
        if 'O' not in self._ner_types:
            ner_labels = ['O']
        else:
            ner_labels = list()
        # Go through each type and prefix it based on the notation (e.g - B, I etc)
        for ner_type in self._ner_types:
            for ner_tag in list(self._notation):
                if ner_tag != 'O':
                    ner_labels.append(ner_tag + '-' + ner_type)
        ner_labels.sort()
        return ner_labels

    def get_label_to_id(self) -> Dict[str, int]:
        """
        Return a label to id mapping.
        Returns:
            label_to_id (Dict[str, int]): label to id mapping
        """
        labels = self.get_label_list()
        label_to_id = {label: index_id for index_id, label in enumerate(labels)}
        return label_to_id

    def get_id_to_label(self) -> Dict[int, str]:
        """
        Return an id to label mapping.
        Returns:
            id_to_label (Dict[int, str]): id to label mapping
        """
        labels = self.get_label_list()
        id_to_label = {index_id: label for index_id, label in enumerate(labels)}
        return id_to_label
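A quick sketch of NERLabels for a two-type BIO setup:

# Hedged sketch: derive the label set and mappings from the NER types.
from sequence_tagging.dataset_builder import NERLabels

ner_labels = NERLabels(notation='BIO', ner_types=['AGE', 'DATE'])
print(ner_labels.get_label_list())   # ['B-AGE', 'B-DATE', 'I-AGE', 'I-DATE', 'O']
print(ner_labels.get_label_to_id())  # {'B-AGE': 0, 'B-DATE': 1, 'I-AGE': 2, 'I-DATE': 3, 'O': 4}
print(ner_labels.get_id_to_label())  # {0: 'B-AGE', 1: 'B-DATE', 2: 'I-AGE', 3: 'I-DATE', 4: 'O'}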