Prajwal Kailas committed
Commit 45c1511
1 Parent(s): b5a124c

dependency to run

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. deid/__init__.py +2 -0
  2. deid/text_deid.py +307 -0
  3. deid/utils.py +43 -0
  4. ner_datasets/__init__.py +5 -0
  5. ner_datasets/__pycache__/__init__.cpython-37.pyc +0 -0
  6. ner_datasets/dataset_builder/__init__.py +3 -0
  7. ner_datasets/dataset_builder/dataset.py +119 -0
  8. ner_datasets/dataset_builder/labels/__init__.py +4 -0
  9. ner_datasets/dataset_builder/labels/mismatch_error.py +7 -0
  10. ner_datasets/dataset_builder/labels/ner_predict_token_labels.py +30 -0
  11. ner_datasets/dataset_builder/labels/ner_token_labels.py +156 -0
  12. ner_datasets/dataset_builder/sentence_dataset.py +355 -0
  13. ner_datasets/dataset_creator.py +322 -0
  14. ner_datasets/dataset_splitter.py +294 -0
  15. ner_datasets/distribution/__init__.py +4 -0
  16. ner_datasets/distribution/dataset_splits.py +218 -0
  17. ner_datasets/distribution/ner_distribution.py +54 -0
  18. ner_datasets/distribution/print_distribution.py +49 -0
  19. ner_datasets/preprocessing/__init__.py +2 -0
  20. ner_datasets/preprocessing/preprocessing_loader.py +63 -0
  21. ner_datasets/preprocessing/sentencizers/__init__.py +3 -0
  22. ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py +37 -0
  23. ner_datasets/preprocessing/sentencizers/note_sentencizer.py +33 -0
  24. ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py +37 -0
  25. ner_datasets/preprocessing/tokenizers/__init__.py +4 -0
  26. ner_datasets/preprocessing/tokenizers/abbreviations/check.txt +20 -0
  27. ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt +87 -0
  28. ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt +459 -0
  29. ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py +73 -0
  30. ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py +58 -0
  31. ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py +49 -0
  32. ner_datasets/preprocessing/tokenizers/utils/__init__.py +4 -0
  33. ner_datasets/preprocessing/tokenizers/utils/clean_regex.py +64 -0
  34. ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py +309 -0
  35. ner_datasets/preprocessing/tokenizers/utils/date_regex.py +104 -0
  36. ner_datasets/span_fixer.py +380 -0
  37. ner_datasets/span_validation.py +91 -0
  38. sequence_tagging/.DS_Store +0 -0
  39. sequence_tagging/__init__.py +2 -0
  40. sequence_tagging/__pycache__/__init__.cpython-37.pyc +0 -0
  41. sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc +0 -0
  42. sequence_tagging/arguments/__init__.py +8 -0
  43. sequence_tagging/arguments/data_training_arguments.py +115 -0
  44. sequence_tagging/arguments/evaluation_arguments.py +26 -0
  45. sequence_tagging/arguments/model_arguments.py +43 -0
  46. sequence_tagging/dataset_builder/__init__.py +5 -0
  47. sequence_tagging/dataset_builder/dataset_tokenizer.py +178 -0
  48. sequence_tagging/dataset_builder/label_mapper.py +87 -0
  49. sequence_tagging/dataset_builder/ner_dataset.py +102 -0
  50. sequence_tagging/dataset_builder/ner_labels.py +67 -0
deid/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .text_deid import TextDeid
+ __all__ = ["TextDeid"]
deid/text_deid.py ADDED
@@ -0,0 +1,307 @@
+ import json
+ import re
+ from argparse import ArgumentParser
+ from typing import Sequence, List, Tuple, Mapping, Union, Any, Type
+
+ import regex
+ from seqeval.scheme import IOB1, IOB2, IOBES, BILOU, Entities
+
+ from .utils import remove, replace_tag_type, replace_informative
+
+
+ class TextDeid(object):
+
+     def __init__(self, notation, span_constraint):
+         self._span_constraint = span_constraint
+         if self._span_constraint == 'strict':
+             self._scheme = TextDeid.__get_scheme('IO')
+         elif self._span_constraint == 'super_strict':
+             self._scheme = TextDeid.__get_scheme('IO')
+         else:
+             self._scheme = TextDeid.__get_scheme(notation)
+
+     def decode(self, tokens, predictions):
+         if self._span_constraint == 'exact':
+             return predictions
+         elif self._span_constraint == 'strict':
+             return TextDeid.__get_relaxed_predictions(predictions)
+         elif self._span_constraint == 'super_strict':
+             return TextDeid.__get_super_relaxed_predictions(tokens, predictions)
+
+     def get_predicted_entities_positions(
+             self,
+             tokens: Sequence[Mapping[str, Union[str, int]]],
+             predictions: List[str],
+             suffix: bool
+     ) -> List[List[Union[Tuple[Union[str, int], Union[str, int]], Any]]]:
+         """
+         Use the seqeval get_entities method, which goes through the predictions and returns
+         where the span starts and ends. - [O, O, B-AGE, I-AGE, O, O] this will return a span
+         that starts at token 2 and ends at token 3 - with type AGE. We then extract the
+         position of the token in the note (character position) - so we return that
+         this span starts at 32 and ends at 37. The function then returns a nested list
+         that contains a tuple of tag type and tag position (character positions).
+         Example: [[(3, 9), LOC], [(34, 41), PATIENT], ...]
+         Args:
+             tokens (Sequence[Mapping[str, Union[str, int]]]): The list of tokens in the note
+             predictions (Sequence[str]): The list of predictions for the note
+             suffix (bool): Whether the B, I etc. is in the prefix or the suffix
+         Returns:
+             positions_info (List[Tuple[Tuple[int, int], str]]): List containing tuples of tag positions and tag type
+         """
+         positions_info = list()
+         entities = Entities(sequences=[predictions], scheme=self._scheme, suffix=suffix)
+         for entity_list in entities.entities:
+             for entity in entity_list:
+                 position = (tokens[entity.start]['start'], tokens[entity.end - 1]['end'])
+                 positions_info.append([position, entity.tag])
+         return positions_info
+
+     def run_deid(
+             self,
+             input_file,
+             predictions_file,
+             deid_strategy,
+             keep_age: bool = False,
+             metadata_key: str = 'meta',
+             note_id_key: str = 'note_id',
+             tokens_key: str = 'tokens',
+             predictions_key: str = 'predictions',
+             text_key: str = 'text'
+     ):
+         # Store note_id to note mapping
+         note_map = dict()
+         for line in open(input_file, 'r'):
+             note = json.loads(line)
+             note_id = note[metadata_key][note_id_key]
+             note_map[note_id] = note
+         # Go through note predictions and de-identify the note accordingly
+         for line in open(predictions_file, 'r'):
+             note = json.loads(line)
+             # Get the text using the note_id for this note from the note_map dict
+             note_id = note[note_id_key]
+             # Get the note from the note_map dict
+             deid_note = note_map[note_id]
+             # Get predictions
+             predictions = self.decode(tokens=note[tokens_key], predictions=note[predictions_key])
+             # Get entities and their positions
+             entity_positions = self.get_predicted_entities_positions(
+                 tokens=note[tokens_key],
+                 predictions=predictions,
+                 suffix=False
+             )
+             yield TextDeid.__get_deid_text(
+                 deid_note=deid_note,
+                 entity_positions=entity_positions,
+                 deid_strategy=deid_strategy,
+                 keep_age=keep_age,
+                 text_key=text_key
+             )
+
+     @staticmethod
+     def __get_deid_text(
+             deid_note,
+             entity_positions,
+             deid_strategy,
+             keep_age: bool = False,
+             text_key: str = 'text'
+     ):
+         tag_mapping = TextDeid.__get_tag_mapping(deid_strategy=deid_strategy)
+         age_pattern = r'((?<!\d+)([1-7]\d?)(?!\d+))|((?<!\d+)(8[0-8]?)(?!\d+))'
+         # Sort positions - store the last occurring tag first - i.e. in descending order
+         # of start positions.
+         entity_positions.sort(key=lambda info: info[0][0], reverse=True)
+         # Get text and de-identify it
+         note_text = deid_note[text_key]
+         deid_text = deid_note[text_key]
+         # Go through the entities and their positions and de-identify the text
+         # Since we have the positions in sorted order (descending by start positions)
+         # we de-identify the text from the end to the start - i.e. back to front
+         for positions, tag in entity_positions:
+             start_pos, end_pos = positions
+             deid_tag = tag_mapping[tag]
+             age_unchanged = False
+             if tag == 'AGE' and keep_age:
+                 span_text = note_text[start_pos:end_pos]
+                 if regex.search(age_pattern, span_text, flags=regex.IGNORECASE):
+                     deid_tag = span_text
+                     age_unchanged = True
+                 else:
+                     deid_tag = deid_tag
+             if deid_strategy == 'replace_informative' and not age_unchanged:
+                 deid_text = deid_text[:start_pos] + deid_tag.format(note_text[start_pos:end_pos]) + deid_text[end_pos:]
+             else:
+                 deid_text = deid_text[:start_pos] + deid_tag + deid_text[end_pos:]
+         deid_note['deid_text'] = regex.sub('[\n]+', '\n', regex.sub('[ \t\r\f\v]+', ' ', deid_text)).strip()
+         return deid_note
+
+     @staticmethod
+     def __get_tag_mapping(deid_strategy):
+         if deid_strategy == 'remove':
+             return remove()
+         elif deid_strategy == 'replace_tag_type':
+             return replace_tag_type()
+         elif deid_strategy == 'replace_informative':
+             return replace_informative()
+
+     @staticmethod
+     def __get_relaxed_predictions(predictions):
+         return ['I-' + prediction[2:] if '-' in prediction else prediction for prediction in predictions]
+
+     @staticmethod
+     def __get_super_relaxed_predictions(tokens, predictions):
+         # Super relaxed
+         # 360 Longwood Ave, OBI, Boston
+         # Tokens: ['360', 'Longwood', 'Ave', ',', 'OBI', ',', 'Boston']
+         # Predictions: [B-LOC, I-LOC, L-LOC, O, U-LOC, O, U-LOC]
+         # Relaxed: [I-LOC, I-LOC, I-LOC, O, I-LOC, O, I-LOC]
+         # Super relaxed: [I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC]
+         relaxed_predictions = TextDeid.__get_relaxed_predictions(predictions)
+         prev_type = None
+         replace_indexes = list()
+         super_relaxed_predictions = list()
+         for index, (token, prediction) in enumerate(zip(tokens, relaxed_predictions)):
+             super_relaxed_predictions.append(prediction)
+             # Check special characters that appear after a prediction
+             # we can assign the prediction label to this sequence of special characters
+             if prediction == 'O' and prev_type is not None:
+                 # [a-zA-Z0-9]
+                 if re.search(r'^(\W|_)+$', token['text'], flags=re.IGNORECASE | re.DOTALL):
+                     replace_indexes.append(index)
+                 else:
+                     prev_type = None
+                     replace_indexes = list()
+             # Replace all the tokens identified above with the NER prediction type
+             # This is done only if the current prediction type matches the previous type
+             elif prediction != 'O':
+                 if prediction[2:] == prev_type and replace_indexes != []:
+                     for replace_index in replace_indexes:
+                         super_relaxed_predictions[replace_index] = 'I-' + prev_type
+                     # Reset list and previous type
+                     replace_indexes = list()
+                 prev_type = prediction[2:]
+             else:
+                 prev_type = None
+         return super_relaxed_predictions
+
+     @staticmethod
+     def __get_scheme(notation: str) -> Union[Type[IOB2], Type[IOBES], Type[BILOU], Type[IOB1]]:
+         """
+         Get the seqeval scheme based on the notation
+         Args:
+             notation (str): The NER notation
+         Returns:
+             (Union[IOB2, IOBES, BILOU, IOB1]): The seqeval scheme
+         """
+         if notation == 'BIO':
+             return IOB2
+         elif notation == 'BIOES':
+             return IOBES
+         elif notation == 'BILOU':
+             return BILOU
+         elif notation == 'IO':
+             return IOB1
+         else:
+             raise ValueError('Invalid Notation')
+
+
+ def main():
+     # The following code sets up the arguments to be passed via CLI or via a JSON file
+     cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
+     cli_parser.add_argument(
+         '--input_file',
+         type=str,
+         required=True,
+         help='the jsonl file that contains the notes'
+     )
+     cli_parser.add_argument(
+         '--predictions_file',
+         type=str,
+         required=True,
+         help='the location where the predictions are'
+     )
+     cli_parser.add_argument(
+         '--span_constraint',
+         type=str,
+         required=True,
+         choices=['exact', 'strict', 'super_strict'],
+         help='whether we want to modify the predictions to make the process of removing PHI more strict, etc.'
+     )
+     cli_parser.add_argument(
+         '--notation',
+         type=str,
+
+         required=True,
+         help='the NER notation in the predictions'
+     )
+     cli_parser.add_argument(
+         '--deid_strategy',
+         type=str,
+         required=True,
+         choices=['remove', 'replace_tag_type', 'replace_informative'],
+         help='the de-identification strategy to use'
+     )
+     cli_parser.add_argument(
+         '--keep_age',
+         action='store_true',
+         help='whether to keep ages below 89'
+     )
+     cli_parser.add_argument(
+         '--text_key',
+         type=str,
+         default='text',
+         help='the key where the note text is present in the json object'
+     )
+     cli_parser.add_argument(
+         '--metadata_key',
+         type=str,
+         default='meta',
+         help='the key where the note metadata is present in the json object'
+     )
+     cli_parser.add_argument(
+         '--note_id_key',
+         type=str,
+         default='note_id',
+         help='the key where the note id is present in the json object'
+     )
+     cli_parser.add_argument(
+         '--tokens_key',
+         type=str,
+         default='tokens',
+         help='the key where the tokens for the notes are present in the json object'
+     )
+     cli_parser.add_argument(
+         '--predictions_key',
+         type=str,
+         default='predictions',
+         help='the key where the note predictions are present in the json object'
+     )
+     cli_parser.add_argument(
+         '--output_file',
+         type=str,
+         required=True,
+         help='the location we would write the deid notes'
+     )
+     # Parse args
+     args = cli_parser.parse_args()
+     text_deid = TextDeid(notation=args.notation, span_constraint=args.span_constraint)
+     deid_notes = text_deid.run_deid(
+         input_file=args.input_file,
+         predictions_file=args.predictions_file,
+         deid_strategy=args.deid_strategy,
+         keep_age=args.keep_age,
+         metadata_key=args.metadata_key,
+         note_id_key=args.note_id_key,
+         tokens_key=args.tokens_key,
+         predictions_key=args.predictions_key,
+         text_key=args.text_key
+     )
+     # Write the dataset to the output file
+     with open(args.output_file, 'w') as file:
+         for deid_note in deid_notes:
+             file.write(json.dumps(deid_note) + '\n')
+
+
+ if __name__ == "__main__":
+     # Get deid notes
+     main()
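To make the span_constraint behaviour concrete, here is a small standalone sketch (not part of the commit) of the 'strict' relaxation performed by __get_relaxed_predictions, using the example from the comment in __get_super_relaxed_predictions; the values are illustrative only.

# Illustrative only: mirrors TextDeid.__get_relaxed_predictions for the 'strict' constraint.
predictions = ['B-LOC', 'I-LOC', 'L-LOC', 'O', 'U-LOC', 'O', 'U-LOC']

# Collapse every B-/I-/L-/U- prefix to I- so seqeval (with the IO scheme) only needs
# contiguous runs of the same entity type rather than exact boundary tags.
relaxed = ['I-' + p[2:] if '-' in p else p for p in predictions]
print(relaxed)  # ['I-LOC', 'I-LOC', 'I-LOC', 'O', 'I-LOC', 'O', 'I-LOC']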
deid/utils.py ADDED
@@ -0,0 +1,43 @@
+ def remove():
+     return {'PATIENT': '',
+             'STAFF': '',
+             'AGE': '',
+             'DATE': '',
+             'PHONE': '',
+             'MRN': '',
+             'ID': '',
+             'EMAIL': '',
+             'PATORG': '',
+             'LOC': '',
+             'HOSP': '',
+             'OTHERPHI': ''}
+
+
+ def replace_tag_type():
+     return {'PATIENT': 'PATIENT',
+             'STAFF': 'STAFF',
+             'AGE': 'AGE',
+             'DATE': 'DATE',
+             'PHONE': 'PHONE',
+             'MRN': 'MRN',
+             'ID': 'ID',
+             'EMAIL': 'EMAIL',
+             'PATORG': 'PATORG',
+             'LOC': 'LOCATION',
+             'HOSP': 'HOSPITAL',
+             'OTHERPHI': 'OTHERPHI'}
+
+
+ def replace_informative():
+     return {'PATIENT': '<<PATIENT:{}>>',
+             'STAFF': '<<STAFF:{}>>',
+             'AGE': '<<AGE:{}>>',
+             'DATE': '<<DATE:{}>>',
+             'PHONE': '<<PHONE:{}>>',
+             'MRN': '<<MRN:{}>>',
+             'ID': '<<ID:{}>>',
+             'EMAIL': '<<EMAIL:{}>>',
+             'PATORG': '<<PATORG:{}>>',
+             'LOC': '<<LOCATION:{}>>',
+             'HOSP': '<<HOSPITAL:{}>>',
+             'OTHERPHI': '<<OTHERPHI:{}>>'}
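As a quick illustration of how these mappings are consumed by TextDeid.__get_deid_text, the sketch below (not part of the commit; the note text and offsets are made up) fills a 'replace_informative' template with the original span text:

# Hypothetical note and span offsets, for illustration only.
mapping = {'HOSP': '<<HOSPITAL:{}>>'}
note_text = 'Seen at OBI yesterday.'
start, end, tag = 8, 11, 'HOSP'

# The matched surface form is substituted into the '{}' placeholder of the template.
deid_text = note_text[:start] + mapping[tag].format(note_text[start:end]) + note_text[end:]
print(deid_text)  # Seen at <<HOSPITAL:OBI>> yesterday.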
ner_datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from ehr_deidentification.sequence_tagging.dataset_builder.ner_labels import NERLabels
+ from .span_fixer import SpanFixer
+ from .dataset_splitter import DatasetSplitter
+ from .dataset_creator import DatasetCreator
+ __all__ = ["NERLabels", "SpanFixer", "DatasetSplitter", "DatasetCreator"]
ner_datasets/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (487 Bytes)
ner_datasets/dataset_builder/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .dataset import Dataset
+ from .sentence_dataset import SentenceDataset
+ __all__ = ["SentenceDataset", "Dataset"]
ner_datasets/dataset_builder/dataset.py ADDED
@@ -0,0 +1,119 @@
+ import random
+ import re
+ from typing import Iterable, Dict, Sequence, Union, Mapping, Optional, List
+
+ from .labels import NERTokenLabels, NERPredictTokenLabels, MismatchError
+
+ random.seed(41)
+
+
+ class Dataset(object):
+     """
+     Build a NER token classification dataset. Each token should have a corresponding label
+     based on the annotated spans
+     For training we will build the dataset using the annotated spans (e.g. from prodigy)
+     For predictions we will assign default labels to keep the format of the dataset the same
+     The dataset is on a sentence level, i.e each note is split into sentences and the
+     task is run on a sentence level. Even the predictions are run on a sentence level
+     The dataset would be something like:
+     Tokens: [tok1, tok2, ... tok n]
+     Labels: [lab1, lab2, ... lab n]
+     For the prediction mode the labels would be: [default, default, default .... default]
+     This script can also be used for predictions, the Labels will be filled with some
+     default value. This is done so that we can use the same script for building a dataset to train a model
+     and a dataset to obtain predictions using a model
+     """
+
+     def __init__(
+             self,
+             sentencizer,
+             tokenizer
+     ):
+         """
+         Build a NER token classification dataset
+         For training we will build the dataset using the annotated spans (e.g. from prodigy)
+         For predictions we will assign default labels.
+         The dataset is on a sentence level, i.e each note is split into sentences and the de-id
+         task is run on a sentence level. Even the predictions are run on a sentence level
+         The dataset would be something like:
+         Tokens: [tok1, tok2, ... tok n]
+         Labels: [lab1, lab2, ... lab n]
+         This script can also be used for predictions, the Labels will be filled with some
+         default value. This is done so that we can use the same script for building a dataset to train a model
+         and a dataset to obtain predictions using a model
+         Args:
+             sentencizer (Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]): The sentencizer to use for
+                 splitting notes into
+                 sentences
+             tokenizer (Union[ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer]): The tokenizer to use for
+                 splitting text into tokens
+         """
+         self._sentencizer = sentencizer
+         self._tokenizer = tokenizer
+
+     def get_tokens(
+             self,
+             text: str,
+             spans: Optional[List[Mapping[str, Union[str, int]]]] = None,
+             notation: str = 'BIO',
+             token_text_key: str = 'text',
+             label_key: str = 'label'
+     ) -> Iterable[Sequence[Dict[str, Union[str, int]]]]:
+         """
+         Get a nested list of tokens where the inner list represents the tokens in the
+         sentence and the outer list will contain all the sentences in the note
+         Args:
+             text (str): The text present in the note
+             spans (Optional[List[Mapping[str, Union[str, int]]]]): The NER spans in the note. This will be none if
+                 building the dataset for prediction
+             notation (str): The notation we will be using for the label scheme (e.g BIO, BILOU etc)
+             token_text_key (str): The key where the note text is present
+             label_key (str): The key where the note label for each token is present
+         Returns:
+             Iterable[Sequence[Dict[str, Union[str, int]]]]: Iterable that iterates through all the sentences
+                 and yields the list of tokens in each sentence
+         """
+         # Initialize the object that will be used to align tokens and spans based on the notation
+         # as mentioned earlier - this will be used only when mode is train - because we have
+         # access to labelled spans for the notes
+         if spans is None:
+             label_spans = NERPredictTokenLabels('O')
+         else:
+             label_spans = NERTokenLabels(spans=spans, notation=notation)
+         # Iterate through the sentences in the note
+         for sentence in self._sentencizer.get_sentences(text=text):
+             # This is used to determine the position of the tokens with respect to the entire note
+             offset = sentence['start']
+             # Keeps track of the tokens in the sentence
+             tokens = list()
+             for token in self._tokenizer.get_tokens(text=sentence['text']):
+                 # Get the token position (start, end) in the note
+                 token['start'] += offset
+                 token['end'] += offset
+                 if token[token_text_key].strip() in ['\n', '\t', ' ', ''] or token['start'] == token['end']:
+                     continue
+                 # Shorten consecutive sequences of special characters, this can prevent BERT from truncating
+                 # extremely long sentences - that could arise because of these characters
+                 elif re.search(r'(\W|_){9,}', token[token_text_key]):
+                     print('WARNING - Shortening a long sequence of special characters from {} to 8'.format(
+                         len(token[token_text_key])))
+                     token[token_text_key] = re.sub(r'(?P<specchar>(\W|_)){8,}', r'\g<specchar>' * 8,
+                                                    token[token_text_key])
+                 elif len(token[token_text_key].split(' ')) != 1:
+                     print('WARNING - Token contains a space character - will be replaced with hyphen')
+                     token[token_text_key] = token[token_text_key].replace(' ', '-')
+                 # Get the labels for each token based on the notation (BIO)
+                 # In predict mode - the default label (e.g O) will be assigned
+                 try:
+                     # Get the label for the token - based on the notation
+                     label = label_spans.get_labels(token=token)
+                     if label[2:] == 'OTHERISSUE':
+                         raise ValueError('Fix OTHERISSUE spans')
+                 # Check if there is a token and span mismatch, i.e the token and span does not align
+                 except MismatchError:
+                     print(token)
+                     raise ValueError('Token-Span mismatch')
+                 token[label_key] = label
+                 tokens.append(token)
+             if tokens:
+                 yield tokens
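For reference, a sketch of the shape of data get_tokens is expected to yield for one sentence under the BIO notation; the note text, spans, and offsets below are invented for illustration and are not taken from the repository's data.

# Hypothetical input.
text = 'Bruce Wayne is 60 yo.'
spans = [{'start': 0, 'end': 11, 'label': 'PATIENT'},
         {'start': 15, 'end': 17, 'label': 'AGE'}]

# One yielded sentence: token dicts carrying note-level offsets plus the aligned label.
expected = [
    {'text': 'Bruce', 'start': 0, 'end': 5, 'label': 'B-PATIENT'},
    {'text': 'Wayne', 'start': 6, 'end': 11, 'label': 'I-PATIENT'},
    {'text': 'is', 'start': 12, 'end': 14, 'label': 'O'},
    {'text': '60', 'start': 15, 'end': 17, 'label': 'B-AGE'},
    {'text': 'yo', 'start': 18, 'end': 20, 'label': 'O'},
    {'text': '.', 'start': 20, 'end': 21, 'label': 'O'},
]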
ner_datasets/dataset_builder/labels/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .mismatch_error import MismatchError
+ from .ner_token_labels import NERTokenLabels
+ from .ner_predict_token_labels import NERPredictTokenLabels
+ __all__ = ["NERTokenLabels", "NERPredictTokenLabels", "MismatchError"]
ner_datasets/dataset_builder/labels/mismatch_error.py ADDED
@@ -0,0 +1,7 @@
+ # Exception thrown when there is a mismatch between a token and span
+ # The token and spans don't line up due to a tokenization issue
+ # E.g - 79M - span is AGE - 79, but token is 79M
+ # There is a mismatch and an error will be thrown - that is the token does
+ # not line up with the span
+ class MismatchError(Exception):
+     pass
ner_datasets/dataset_builder/labels/ner_predict_token_labels.py ADDED
@@ -0,0 +1,30 @@
+ from typing import Mapping, Union, NoReturn
+
+
+ class NERPredictTokenLabels(object):
+     """
+     Assign a default label while creating the dataset for prediction.
+     This is done since the sequence tagging code expects the input
+     file to contain a labels field, hence we assign a default label
+     to meet this requirement
+     """
+
+     def __init__(self, default_label: str) -> NoReturn:
+         """
+         Initialize the default label
+         Args:
+             default_label (str): Default label that will be used
+         """
+         # Keeps track of all the spans (list) in the text (note)
+         self._default_label = default_label
+
+     def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
+         """
+         Given a token, return the default label.
+         Args:
+             token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                 in the text
+         Returns:
+             default_label (str): default label
+         """
+         return self._default_label
ner_datasets/dataset_builder/labels/ner_token_labels.py ADDED
@@ -0,0 +1,156 @@
+ from typing import Mapping, Union, Sequence, List
+ from .mismatch_error import MismatchError
+
+
+ class NERTokenLabels(object):
+     """
+     This class is used to align tokens with the spans
+     Each token is assigned one of the following labels
+     'B-LABEL', 'I-LABEL', 'O'. For example the text
+     360 Longwood Avenue is 3 tokens - [360, Longwood, Avenue]
+     and each token would be assigned the following labels
+     [B-LOC, I-LOC, I-LOC] (this would also depend on what
+     notation we are using). Generally the data after prodigy
+     annotation has all the tokens and all the spans.
+     We would have tokens:[tok1, tok2, ... tokn]
+     and spans:[span1:[tok1, tok2, tok3], span2:[tok7], ... span k]
+     This would be used to convert into the format we are using
+     which is assign the label to each token based on which span it
+     belongs to.
+     """
+
+     def __init__(
+             self,
+             spans: List[Mapping[str, Union[str, int]]],
+             notation: str
+     ):
+         """
+         Initialize variables that will be used to align tokens
+         and span labels. The spans variable will contain all the spans
+         in the note. Notation is whether we would like to use BIO, IO, BILOU,
+         when assigning the label to each token based on which span it belongs to.
+         Keep track of the total number of spans etc.
+         Args:
+             spans (Sequence[Mapping[str, Union[str, int]]]): List of all the spans in the text
+             notation (str): NER label notation
+         """
+         # Keeps track of all the spans (list) in the text (note)
+         self._spans = spans
+         for span in self._spans:
+             if type(span['start']) != int or type(span['end']) != int:
+                 raise ValueError('The start and end keys of the span must be of type int')
+         self._spans.sort(key=lambda _span: (_span['start'], _span['end']))
+         # The current span is the first element of the list
+         self._current_span = 0
+         # Boolean variable that indicates whether the token is inside
+         # the span (I-LABEL)
+         self._inside = False
+         # Total number of spans
+         self._span_count = len(self._spans)
+         # Depending on the notation passed, we will return the label for
+         # the token accordingly
+         if notation == 'BIO':
+             self._prefix_single = 'B-'
+             self._prefix_begin = 'B-'
+             self._prefix_inside = 'I-'
+             self._prefix_end = 'I-'
+             self._prefix_outside = 'O'
+         elif notation == 'BIOES':
+             self._prefix_single = 'S-'
+             self._prefix_begin = 'B-'
+             self._prefix_inside = 'I-'
+             self._prefix_end = 'E-'
+             self._prefix_outside = 'O'
+         elif notation == 'BILOU':
+             self._prefix_single = 'U-'
+             self._prefix_begin = 'B-'
+             self._prefix_inside = 'I-'
+             self._prefix_end = 'L-'
+             self._prefix_outside = 'O'
+         elif notation == 'IO':
+             self._prefix_single = 'I-'
+             self._prefix_begin = 'I-'
+             self._prefix_inside = 'I-'
+             self._prefix_end = 'I-'
+             self._prefix_outside = 'O'
+
+     def __check_begin(self, token: Mapping[str, Union[str, int]]) -> str:
+         """
+         Given a token, return the label (B-LABEL) and check whether the token
+         covers the entire span or is a sub set of the span
+         Args:
+             token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                 in the text
+         Returns:
+             (str): The label - 'B-LABEL'
+         """
+         # Set the inside flag to true to indicate that the next token that is checked
+         # will be checked to see if it belongs 'inside' the span
+         self._inside = True
+         if token['end'] > int(self._spans[self._current_span]['end']):
+             raise MismatchError('Span and Token mismatch - Begin Token extends longer than the span')
+         # If this token does not cover the entire span then we expect another token
+         # to be in the span and that token should be assigned the I-LABEL
+         elif token['end'] < int(self._spans[self._current_span]['end']):
+             return self._prefix_begin + self._spans[self._current_span]['label']
+         # If this token does cover the entire span then we set inside = False
+         # to indicate this span is complete and increment the current span
+         # to move onto the next span in the text
+         elif token['end'] == int(self._spans[self._current_span]['end']):
+             self._current_span += 1
+             self._inside = False
+             return self._prefix_single + self._spans[self._current_span - 1]['label']
+
+     def __check_inside(self, token: Mapping[str, Union[str, int]]) -> str:
+         """
+         Given a token, return the label (I-LABEL) and check whether the token
+         covers the entire span or is still inside the span.
+         Args:
+             token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                 in the text
+         Returns:
+             (str): The label - 'I-LABEL'
+         """
+
+         if (token['start'] >= int(self._spans[self._current_span]['end'])
+                 or token['end'] > int(self._spans[self._current_span]['end'])):
+             raise MismatchError('Span and Token mismatch - Inside Token starts after the span ends')
+         # If this token does not cover the entire span then we expect another token
+         # to be in the span and that token should be assigned the I-LABEL
+         elif token['end'] < int(self._spans[self._current_span]['end']):
+             return self._prefix_inside + self._spans[self._current_span]['label']
+         # If this token does cover the entire span then we set inside = False
+         # to indicate this span is complete and increment the current span
+         # to move onto the next span in the text
+         elif token['end'] == int(self._spans[self._current_span]['end']):
+             self._current_span += 1
+             self._inside = False
+             return self._prefix_end + self._spans[self._current_span - 1]['label']
+
+     def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
+         """
+         Given a token, return the label (B-LABEL, I-LABEL, O) based on
+         the spans present in the text & the desired notation.
+         Args:
+             token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
+                 in the text
+         Returns:
+             (str): One of the labels according to the notation - 'B-LABEL', 'I-LABEL', 'O'
+         """
+         # If we have iterated through all the spans in the text (note), all the tokens that
+         # come after the last span will be marked as 'O' - since they don't belong to any span
+         if self._current_span >= self._span_count:
+             return self._prefix_outside
+         # Check if the span can be assigned the B-LABEL
+         if token['start'] == int(self._spans[self._current_span]['start']):
+             return self.__check_begin(token)
+         # Check if the span can be assigned the I-LABEL
+         elif token['start'] > int(self._spans[self._current_span]['start']) and self._inside is True:
+             return self.__check_inside(token)
+         # Check if the token is outside a span
+         elif self._inside is False and (token['end'] <= int(self._spans[self._current_span]['start'])):
+             return self._prefix_outside
+         else:
+             raise MismatchError(
+                 'Span and Token mismatch - the span and tokens don\'t line up. There might be a tokenization issue '
+                 'that needs to be fixed')
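The small sketch below (illustrative, derived directly from the notation branches in __init__ above) summarises the prefixes NERTokenLabels assigns for a multi-token span and a single-token span under each supported notation:

# (single-token, begin, inside, end) prefixes per notation, as set in NERTokenLabels.__init__.
prefixes = {
    'BIO':   ('B-', 'B-', 'I-', 'I-'),
    'BIOES': ('S-', 'B-', 'I-', 'E-'),
    'BILOU': ('U-', 'B-', 'I-', 'L-'),
    'IO':    ('I-', 'I-', 'I-', 'I-'),
}
# A 3-token LOC span ("360 Longwood Avenue") and a 1-token AGE span ("60") would be labelled:
for notation, (single, begin, inside, end) in prefixes.items():
    print(notation, [begin + 'LOC', inside + 'LOC', end + 'LOC'], [single + 'AGE'])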
ner_datasets/dataset_builder/sentence_dataset.py ADDED
@@ -0,0 +1,355 @@
+ from collections import deque
+ from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple
+
+
+ class SentenceDataset(object):
+     """
+     When we mention previous sentence and next sentence, we don't mean exactly one sentence
+     but rather a previous chunk and a next chunk. This can include one or more sentences and
+     it does not mean that the sentence has to be complete (it can be cut off in between) - hence a chunk
+     This class is used to build a dataset at the sentence
+     level. It takes as input all the tokenized sentences in the note. So the input is
+     a list of lists where the outer list represents the sentences in the note and the inner list
+     is a list of tokens in the sentence. It then returns a dataset where each sentence is
+     concatenated with the previous and a next chunk. This is done so that when we build a model
+     we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc
+     will be computed and updated based on the current sentence. The previous and next chunks will
+     only be used to add context. We could have different sizes of previous and next chunks
+     depending on the position of the sentence etc. Essentially we build a sentence level dataset
+     where we can also provide context to the sentence by including the previous and next chunks
+     """
+
+     def __init__(
+             self,
+             max_tokens: int,
+             max_prev_sentence_token: int,
+             max_next_sentence_token: int,
+             default_chunk_size: int,
+             ignore_label: str
+     ) -> NoReturn:
+         """
+         Set the maximum token length a given training example (sentence level) can have.
+         That is the total length of the current sentence + previous chunk + next chunk
+         We also set the maximum length of the previous and next chunks. That is how many
+         tokens can be in these chunks. However if the total length exceeds, tokens in the
+         previous and next chunks will be dropped to ensure that the total length is < max_tokens
+         The default chunk size ensures that the length of the chunks will be a minimum number of
+         tokens based on the value passed. For example, if default_chunk_size=10, the length
+         of the previous chunks and next chunks will be at least 10 tokens.
+         Args:
+             max_tokens (int): maximum token length a given training example (sentence level) can have
+             max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence
+                 (training/prediction example) in the note can have
+             max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence
+                 (training/prediction example) in the note can have
+             default_chunk_size (int): the training example will always include a chunk of this length
+                 as part of the previous and next chunks
+             ignore_label (str): The label assigned to the previous and next chunks to distinguish
+                 from the current sentence
+         """
+         self._id_num = None
+         self._max_tokens = max_tokens
+         self._max_prev_sentence_token = max_prev_sentence_token
+         self._max_next_sentence_token = max_next_sentence_token
+         self._default_chunk_size = default_chunk_size
+         self._ignore_label = ignore_label
+
+     @staticmethod
+     def chunker(
+             seq: Sequence[Mapping[str, Union[str, int]]],
+             size: int
+     ) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
+         """
+         Return chunks of the sequence. The size of each chunk will be based
+         on the value passed to the size argument.
+         Args:
+             seq (Sequence): maximum token length a given training example (sentence level) can have
+             size (int): The max chunk size for the chunks
+         Return:
+             (Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of
+                 the input sequence chunked version of the sequence
+
+         """
+         return (seq[pos:pos + size] for pos in range(0, len(seq), size))
+
+     def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
+         """
+         Go through all the sentences in the medical note and create a list of
+         previous sentences. The output of this function will be a list of chunks
+         where each index of the list contains the sentences (chunks) - (tokens) present before
+         the sentence at that index in the medical note. For example prev_sent[0] will
+         be empty since there is no sentence before the first sentence in the note
+         prev_sent[1] will be equal to sent[0], that is the previous sentence of the
+         second sentence will be the first sentence. We make use of deque, where we
+         start to dequeue elements when it starts to exceed max_prev_sentence_token. This
+         list of previous sentences will be used to define the previous chunks
+         Args:
+             sent_tokens (Sequence[str]): Sentences in the note and
+                 each element of the list contains a
+                 list of tokens in that sentence
+         Returns:
+             previous_sentences (List[deque]): A list of deque objects where each index contains a
+                 list (queue) of previous tokens (chunk) with respect
+                 to the sentence represented by that index in the note
+         """
+         previous_sentences = list()
+         # Create a queue and specify the capacity of the queue
+         # Tokens will be popped from the queue when the capacity is exceeded
+         prev_sentence = deque(maxlen=self._max_prev_sentence_token)
+         # The first previous chunk is empty since the first sentence in the note does not have
+         # anything before it
+         previous_sentences.append(prev_sentence.copy())
+         # As we iterate through the list of sentences in the note, we add the tokens from the previous chunks
+         # to the queue. Since we have a queue, as soon as the capacity is exceeded we pop tokens from
+         # the queue
+         for sent_token in sent_tokens[:-1]:
+             for token in sent_token:
+                 prev_sentence.append(token)
+             # As soon as each sentence in the list is processed
+             # We add a copy of the current queue to a list - this list keeps track of the
+             # previous chunks for a sentence
+             previous_sentences.append(prev_sentence.copy())
+
+         return previous_sentences
+
+     def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
+         """
+         Go through all the sentences in the medical note and create a list of
+         next sentences. The output of this function will be a list of lists
+         where each index of the list contains the list of sentences present after
+         the sentence at that index in the medical note. For example next_sent[-1] will
+         be empty since there is no sentence after the last sentence in the note
+         next_sent[0] will be equal to sent[1:], that is the next sentence of the
+         first sentence will be the subsequent sentences. We make use of deque, where we
+         start to dequeue elements when it starts to exceed max_next_sentence_token. This
+         list of next sentences will be used to define the next chunks
+         Args:
+             sent_tokens (Sequence[str]): Sentences in the note and each
+                 element of the list contains a
+                 list of tokens in that sentence
+         Returns:
+             next_sentences (List[deque]): A list of deque objects where each index contains a list (queue)
+                 of next tokens (chunk) with respect to the sentence represented
+                 by that index in the note
+         """
+         # A list of next sentences is first created and reversed
+         next_sentences = list()
+         # Create a queue and specify the capacity of the queue
+         # Tokens will be popped from the queue when the capacity is exceeded
+         next_sentence = deque(maxlen=self._max_next_sentence_token)
+         # The first (which becomes the last chunk when we reverse this list) next chunk is empty since
+         # the last sentence in the note does not have
+         # anything after it
+         next_sentences.append(next_sentence.copy())
+         for sent_token in reversed(sent_tokens[1:]):
+             for token in reversed(sent_token):
+                 next_sentence.appendleft(token)
+             next_sentences.append(next_sentence.copy())
+         # The list is reversed - since we went through the sentences in the reverse order in
+         # the earlier steps
+         return [next_sent for next_sent in reversed(next_sentences)]
+
+     def get_sentences(
+             self,
+             sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
+             token_text_key: str = 'text',
+             label_key: str = 'label',
+             start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
+             end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
+             sub: bool = False
+     ) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
+         """
+         When we mention previous sentence and next sentence, we don't mean exactly one sentence
+         but rather a previous chunk and a next chunk. This can include one or more sentences and
+         it does not mean that the sentence has to be complete (it can be cut off in between) - hence a chunk
+         We iterate through all the tokenized sentences in the note. So the input is
+         a list of lists where the outer list represents the sentences in the note and the inner list
+         is a list of tokens in the sentence. It then returns a dataset where each sentence is
+         concatenated with the previous and the next sentence. This is done so that when we build a model
+         we can use the previous and next sentence to add context to the model. The weights and loss etc
+         will be computed and updated based on the current sentence. The previous and next sentence will
+         only be used to add context. We could have different sizes of previous and next chunks
+         depending on the position of the sentence etc. Since we split a note in several sentences which are
+         then used as training data.
+         ignore_label is used to differentiate between the current sentence and the previous and next
+         chunks. The chunks will have the label NA and the current sentence
+         will have the actual labels (DATE, AGE etc.) so that they can be distinguished.
+         If however we are building a dataset for predictions
+         the current sentence will have the default label O, but the next and previous chunks will still
+         have the label NA. However if the total length exceeds, tokens in the
+         previous and next chunks will be dropped to ensure that the total length is < max_tokens
+         The default chunk size ensures that the length of the chunks will be a minimum number of
+         tokens based on the value passed. For example, if default_chunk_size=10, the length
+         of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens
+         even after decreasing the sizes of the previous and next chunks, then we split this long
+         sentence into sub sentences and repeat the process described above.
+         Args:
+             sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence
+                 contains the tokens (dict) in that sentence
+                 the token dict object contains the
+                 token text, start, end etc
+             token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text
+                 key to extract the text of the token from the dictionary
+             label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key
+                 key to extract the label of the token from the dictionary. (if it does not have a label
+                 the default label will be assigned)
+             start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence with some
+                 pre-defined chunk
+             end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence with some
+                 pre-defined chunk
+             sub (bool): Whether the function is called to process sub-sentences (used when we are splitting
+                 long sentences into smaller sub sentences to keep sentence length < max_tokens
+         Returns:
+             (Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the
+                 returned sentences, where each sentence has the previous
+                 chunks and next chunks attached to it.
+         """
+         # Id num keeps track of the id of the sentence - that is the position the sentence occurs in
+         # the note. We keep the id of sub sentences the same as the sentence, so that the user
+         # knows that these sub sentences are chunked from a longer sentence.
+         # <SENT 0> <SENT 1>. Say length of sent 0 with the previous and next chunks is less than max_tokens
+         # we return sent 0 with id 0. For sent 1, say the length is longer, we split it into sub
+         # sentences - <SUB 1><SUB 2> - we return SUB 1, and SUB 2 with id 1 - so we know that it belongs
+         # to <SENT 1> in the note.
+         if not sub:
+             self._id_num = -1
+         # Initialize the object that will take all the sentences in the note and return
+         # a dataset where each row represents a sentence in the note. The sentence in each
+         # row will also contain a previous chunk and next chunk (tokens) that will act as context
+         # when training the model
+         # [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
+         # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
+         # provide context to the current sentence
+         # Get the previous sentences (chunks) for each sentence in the note
+         previous_sentences = self.get_previous_sentences(sent_tokens)
+         # Get the next sentences (chunks) for each sentence in the note
+         next_sentences = self.get_next_sentences(sent_tokens)
+         # For the note we are going to iterate through all the sentences in the note and
+         # concatenate each sentence with the previous and next chunks. (This forms the data that
+         # will be used for training/predictions) Each sentence with the concatenated chunks will be
+         # a training sample. We would do the same thing for getting predictions on a sentence as well
+         # The only difference would be the labels that are used. We would use the default label O for
+         # prediction and the annotated labels for training
+         if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
+             raise ValueError('Sentence length mismatch')
+         for index, (previous_sent, current_sent, next_sent) in enumerate(
+                 zip(previous_sentences, sent_tokens, next_sentences)):
+             sent_tokens_text = list()
+             sent_labels = list()
+             sent_toks = list()
+             # Get the tokens and labels for the current sentence
+             for token in current_sent:
+                 # We store this, if we need to process sub sentences when a sentence exceeds max_tokens
+                 sent_toks.append(token)
+                 sent_tokens_text.append(token[token_text_key])
+                 sent_labels.append(token[label_key])
+             # We check if the number of tokens in the current sentence + previous chunk
+             # + next chunk exceeds max tokens. If it does we start popping tokens from the previous and next chunks
+             # until the number of tokens is equal to max tokens
+             previous_sent_length = len(previous_sent)
+             current_sent_length = len(sent_tokens_text)
+             next_sent_length = len(next_sent)
+             total_length = previous_sent_length + current_sent_length + next_sent_length
+             # If the length of the current sentence plus the length of the previous and next
+             # chunks exceeds the max_tokens, start popping tokens from the previous and next
+             # chunks until either total length < max_tokens or the number of tokens in the previous and
+             # next chunks goes below the default chunk size
+             while total_length > self._max_tokens and \
+                     (next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
+                 if next_sent_length >= previous_sent_length:
+                     next_sent.pop()
+                     next_sent_length -= 1
+                     total_length -= 1
+                 elif previous_sent_length > next_sent_length:
+                     previous_sent.popleft()
+                     previous_sent_length -= 1
+                     total_length -= 1
+             # If this is not a sub sentence, increment the ID to
+             # indicate the processing of the next sentence of the note
+             # If it is a sub sentence, keep the ID the same, to indicate
+             # it belongs to a larger sentence
+             if not sub:
+                 self._id_num += 1
+             # If total length < max_tokens - process the sentence with the current sentence
+             # and add on the previous and next chunks and return
+             if total_length <= self._max_tokens:
+                 # Check if we want to add a pre-defined chunk for the first sentence in the note
+                 if index == 0 and start_chunk is not None:
+                     previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \
+                                            [prev_token[token_text_key] for prev_token in list(previous_sent)]
+                 else:
+                     previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)]
+                 # Check if we want to add a pre-defined chunk for the last sentence in the note
+                 if index == len(sent_tokens) - 1 and end_chunk is not None:
+                     next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \
+                                        [chunk[token_text_key] for chunk in end_chunk]
+                 else:
+                     next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)]
+                 previous_sent_length = len(previous_sent_tokens)
+                 next_sent_length = len(next_sent_tokens)
+                 # Store information about the current sentence - start and end pos etc
+                 # this can be used to distinguish from the next and previous chunks
+                 # current_sent_info = {'token_info':current_sent}
+                 # Assign a different label (the ignore label) to the chunks - since they are used only for context
+                 previous_sent_labels = list()
+                 next_sent_labels = list()
+                 if self._ignore_label == 'NA':
+                     previous_sent_labels = [self._ignore_label] * previous_sent_length
+                     next_sent_labels = [self._ignore_label] * next_sent_length
+                 elif self._ignore_label == 'label':
+                     if index == 0 and start_chunk is not None:
+                         previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \
+                                                [prev_token[label_key] for prev_token in list(previous_sent)]
+                     else:
+                         previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)]
+                     if index == len(sent_tokens) - 1 and end_chunk is not None:
+                         next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \
+                                            [chunk[label_key] for chunk in end_chunk]
+                     else:
+                         next_sent_labels = [next_token[label_key] for next_token in list(next_sent)]
+                 # Concatenate the chunks and the sentence
+                 # sent_tokens_text.append(token[token_text_key])
+                 tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
+                 labels_data = previous_sent_labels + sent_labels + next_sent_labels
+                 # Return processed sentences
+                 yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
+             # Process the sub sentences - we take a long sentence
+             # and split it into smaller chunks - and we recursively call the function on this list
+             # of smaller chunks - as mentioned before the smaller chunks (sub sentences) will have the
+             # same ID as the original sentence
+             else:
+                 # Store the smaller chunks - say <SENT1> is too long
+                 # <PREV CHUNK><SENT1><NEXT CHUNK>
+                 # We get chunk sent 1 - to <SUB1><SUB2><SUB3> and we pass this [<SUB1><SUB2><SUB3>] to the function
+                 # as a recursive call. This list is now processed as a smaller note that essentially belongs
+                 # to a sentence. But as you can see we did not pass <PREV CHUNK> & <NEXT CHUNK>, because
+                 # these are chunks that are not part of the current sentence, but they still need to be
+                 # included in the final output - and the work around is mentioned below
+                 # So that we have a previous chunk for <SUB1> and next chunk for <SUB3>
+                 # we include the previous_sent_tokens and next_sent_tokens as the start chunk
+                 # and the next chunk in the function call below
+                 # <PREV CHUNK><SUB1><NEXT SUB1>, id = x
+                 # <PREV SUB2><SUB2><NEXT SUB2>, id = x
+                 # <PREV SUB3><SUB3><NEXT CHUNK>, id = x
+                 sub_sentences = list()
+                 # Prefix the first sentence in these smaller chunks
+                 previous_sent_tokens = list(previous_sent)
+                 # Suffix the last sentence in these smaller chunks
+                 next_sent_tokens = list(next_sent)
+                 # Get chunks
+                 for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
+                     sub_sentences.append(chunk)
+                 # Process list of smaller chunks
+                 for sub_sent in self.get_sentences(
+                         sub_sentences,
+                         token_text_key,
+                         label_key,
+                         start_chunk=previous_sent_tokens,
+                         end_chunk=next_sent_tokens,
+                         sub=True
+                 ):
+                     yield sub_sent
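To illustrate the splitting of over-long sentences described above, the sketch below (not part of the commit; the tokens are made up) reproduces the chunker logic inline; get_sentences then wraps each resulting chunk with previous/next context and reuses the parent sentence's id.

# Equivalent to SentenceDataset.chunker(tokens, size): fixed-size slices of the token list.
tokens = [{'text': 'tok{}'.format(i)} for i in range(10)]
size = 4
chunks = [tokens[pos:pos + size] for pos in range(0, len(tokens), size)]
print([len(chunk) for chunk in chunks])  # [4, 4, 2]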
ner_datasets/dataset_creator.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
+ from typing import Iterable, Dict, List, Union, Optional, Sequence, NoReturn
5
+
6
+ from .dataset_builder import Dataset, SentenceDataset
7
+ from .preprocessing import PreprocessingLoader
8
+
9
+ random.seed(41)
10
+
11
+
12
+ class DatasetCreator(object):
13
+ """
14
+ Build a NER token classification dataset
15
+ For training we will build the dataset using the annotated spans (e.g from prodigy)
16
+ For predictions we will assign default labels.
17
+ The dataset is on a sentence level, i.e each note is split into sentences and the de-id
18
+ task is run on a sentence level. Even the predictions are run on a sentence level
19
+ The dataset would be something like:
20
+ Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
21
+ Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
22
+ Where the inner list represents the sentences - the tokens in the sentence and the respective
23
+ labels for each token. The labels depend on the notation
24
+ This script can also be used for predictions, the Labels will be filled with some
25
+ default value. This is done so that we can use the same script for building a dataset to train a model
26
+ and a dataset to obtain predictions using a model
27
+ Example:
28
+ Note: Bruce Wayne is a 60yo man. He lives in Gotham
29
+ Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
30
+ Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
31
+ Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
32
+ Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]
33
+ We can also create sentences that use previous/next chunks as context - in this case the dataset would
34
+ look something like this. (Assume we limit the size of the chunks to 3 tokens)
35
+ Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
36
+ Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
37
+ Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
38
+ Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
39
+ NA indicates that the token is only used for context
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ sentencizer: str,
45
+ tokenizer: str,
46
+ abbreviations: Optional[Sequence[str]] = None,
47
+ max_tokens: int = 128,
48
+ max_prev_sentence_token: int = 32,
49
+ max_next_sentence_token: int = 32,
50
+ default_chunk_size: int = 32,
51
+ ignore_label: str = 'NA'
52
+ ) -> NoReturn:
53
+ """
54
+ Initialize the sentencizer and tokenizer
55
+ Args:
56
+ sentencizer (str): Specify which sentencizer you want to use
57
+ tokenizer (str): Specify which tokenizer you want to use
58
+ abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split
59
+ - works only with the custom clinical tokenizer.
60
+ max_tokens (int): The maximum number of tokens allowed in a sentence/training example,
61
+ truncate if it exceeds.
62
+ max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
63
+ sentence/training example
64
+ max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
65
+ sentence/training example.
66
+ ignore_label (str): The label assigned to the previous and next chunks to distinguish
67
+ from the current sentence
68
+ """
69
+ self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer)
70
+ self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)
71
+ # Initialize the object that will be used to get the tokens and the sentences
72
+ self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)
73
+ # Initialize the object that will take all the sentences in the note and return
74
+ # a dataset where each row represents a sentence in the note. The sentence in each
75
+ # row will also contain a previous chunk and next chunk (tokens) that will act as context
76
+ # when training the model
77
+ # [ps1, ps2, ... ps-i], [cs1, cs2, ... cs-j], [ns1, ns2, ... ns-k] - as you can see the current sentence
78
+ # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
79
+ # provide context to the current sentence
80
+ self._sentence_dataset = SentenceDataset(
81
+ max_tokens=max_tokens,
82
+ max_prev_sentence_token=max_prev_sentence_token,
83
+ max_next_sentence_token=max_next_sentence_token,
84
+ default_chunk_size=default_chunk_size,
85
+ ignore_label=ignore_label
86
+ )
87
+
88
+ def create(
89
+ self,
90
+ input_file: str,
91
+ mode: str = 'predict',
92
+ notation: str = 'BIO',
93
+ token_text_key: str = 'text',
94
+ metadata_key: str = 'meta',
95
+ note_id_key: str = 'note_id',
96
+ label_key: str = 'labels',
97
+ span_text_key: str = 'spans'
98
+ ) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]:
99
+ """
100
+ This function is used to get the sentences that will be part of the NER dataset.
101
+ We check whether the note belongs to the desired dataset split. If it does,
102
+ we fix any spans that can cause token-span alignment errors. Then we extract
103
+ all the sentences in the note and the tokens in each sentence. Finally we
104
+ add some context tokens to the sentence if required. This function returns
105
+ an iterable that iterates through each of the processed sentences
106
+ Args:
107
+ input_file (str): Input jsonl file. Make sure the spans are in ascending order (based on start position)
108
+ mode (str): Dataset being built for train or predict.
109
+ notation (str): The NER labelling notation
110
+ token_text_key (str): The key where the note text and token text is present in the json object
111
+ metadata_key (str): The key where the note metadata is present in the json object
112
+ note_id_key (str): The key where the note id is present in the json object
113
+ label_key (str): The key where the token label will be stored in the json object
114
+ span_text_key (str): The key where the note spans is present in the json object
115
+ Returns:
116
+ (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): Iterate through the processed
117
+ sentences/training examples
118
+ """
119
+ # Go through the notes
120
+ for line in open(input_file, 'r'):
121
+ note = json.loads(line)
122
+ note_text = note[token_text_key]
123
+ note_id = note[metadata_key][note_id_key]
124
+ if mode == 'train':
125
+ note_spans = note[span_text_key]
126
+ # No spans in predict mode
127
+ elif mode == 'predict':
128
+ note_spans = None
129
+ else:
130
+ raise ValueError("Invalid mode - can only be train/predict")
131
+ # Store the list of tokens in the sentence
132
+ # Eventually this list will contain all the tokens in the note (split on the sentence level)
133
+ # Store the start and end positions of the sentence in the note. This can
134
+ # be used later to reconstruct the note from the sentences
135
+ # we also store the note_id for each sentence so that we can map it back
136
+ # to the note and therefore have all the sentences mapped back to the notes they belong to.
137
+ sent_tokens = [sent_tok for sent_tok in self._dataset.get_tokens(
138
+ text=note_text,
139
+ spans=note_spans,
140
+ notation=notation
141
+ )]
142
+ # The following loop goes through each sentence in the note and returns
143
+ # the current sentence and previous and next chunks that will be used for context
144
+ # The chunks will have a default label (e.g NA) to distinguish from the current sentence
145
+ # and so that we can ignore these chunks when calculating loss and updating weights
146
+ # during training
147
+ for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
148
+ sent_tokens=sent_tokens,
149
+ token_text_key=token_text_key,
150
+ label_key=label_key
151
+ ):
152
+ # Return the processed sentence. This sentence will then be used
153
+ # by the model
154
+ current_sent_info = ner_sentence['current_sent_info']
155
+ note_sent_info_store = {'start': current_sent_info[0]['start'],
156
+ 'end': current_sent_info[-1]['end'], 'note_id': note_id}
157
+ ner_sentence['note_sent_info'] = note_sent_info_store
158
+ yield ner_sentence
159
+
160
+
161
+ def main():
162
+ cli_parser = ArgumentParser(
163
+ description='configuration arguments provided at run time from the CLI',
164
+ formatter_class=ArgumentDefaultsHelpFormatter
165
+ )
166
+ cli_parser.add_argument(
167
+ '--input_file',
168
+ type=str,
169
+ required=True,
170
+ help='the jsonl file that contains the notes. spans need to be sorted in ascending order (based on start '
171
+ 'position) '
172
+ )
173
+ cli_parser.add_argument(
174
+ '--notation',
175
+ type=str,
176
+ default='BIO',
177
+ help='the notation we will be using for the label scheme'
178
+ )
179
+ cli_parser.add_argument(
180
+ '--max_tokens',
181
+ type=int,
182
+ default=128,
183
+ help='The max tokens that a given sentence (training/prediction example) in the note can have'
184
+ )
185
+ cli_parser.add_argument(
186
+ '--default_chunk_size',
187
+ type=int,
188
+ default=32,
189
+ help='the default chunk size for the previous and next chunks of a given sentence (training/prediction '
190
+ 'example) in the note'
191
+ )
192
+ cli_parser.add_argument(
193
+ '--max_prev_sentence_token',
194
+ type=int,
195
+ default=32,
196
+ help='the max chunk size for the previous chunk of a given sentence (training/prediction example) in the '
197
+ 'note'
198
+ )
199
+ cli_parser.add_argument(
200
+ '--max_next_sentence_token',
201
+ type=int,
202
+ default=32,
203
+ help='the max chunk size for the next chunk of a given sentence (training/prediction example) in the '
204
+ 'note'
205
+ )
206
+ cli_parser.add_argument(
207
+ '--mode',
208
+ type=str,
209
+ choices=['train', 'predict'],
210
+ required=True,
211
+ help='whether we are building the dataset for training or prediction'
212
+ )
213
+ cli_parser.add_argument(
214
+ '--sentencizer',
215
+ type=str,
216
+ required=True,
217
+ help='the sentencizer to use for splitting notes into sentences'
218
+ )
219
+ cli_parser.add_argument(
220
+ '--tokenizer',
221
+ type=str,
222
+ required=True,
223
+ help='the tokenizer to use for splitting text into tokens'
224
+ )
225
+ cli_parser.add_argument(
226
+ '--abbreviations',
227
+ type=str,
228
+ default=None,
229
+ help='file that will be used by clinical tokenizer to handle abbreviations'
230
+ )
231
+ cli_parser.add_argument(
232
+ '--ignore_label',
233
+ type=str,
234
+ default='NA',
235
+ help='whether to use the ignore label or not'
236
+ )
237
+ cli_parser.add_argument(
238
+ '--token_text_key',
239
+ type=str,
240
+ default='text',
241
+ help='the key where the note text is present in the json object'
242
+ )
243
+ cli_parser.add_argument(
244
+ '--metadata_key',
245
+ type=str,
246
+ default='meta',
247
+ help='the key where the note metadata is present in the json object'
248
+ )
249
+ cli_parser.add_argument(
250
+ '--note_id_key',
251
+ type=str,
252
+ default='note_id',
253
+ help='the key where the note id is present in the note metadata'
254
+ )
255
+ cli_parser.add_argument(
256
+ '--label_key',
257
+ type=str,
258
+ default='label',
259
+ help='the key where the note label for each token is present in the json object'
260
+ )
261
+ cli_parser.add_argument(
262
+ '--span_text_key',
263
+ type=str,
264
+ default='spans',
265
+ help='the key where the annotated spans of the note are present in the json object'
266
+ )
267
+ cli_parser.add_argument(
268
+ '--format',
269
+ type=str,
270
+ default='jsonl',
271
+ help='format to store the dataset in: jsonl or conll'
272
+ )
273
+ cli_parser.add_argument(
274
+ '--output_file',
275
+ type=str,
276
+ help='The file where the NER dataset will be stored'
277
+ )
278
+ args = cli_parser.parse_args()
279
+ dataset_creator = DatasetCreator(
280
+ sentencizer=args.sentencizer,
281
+ tokenizer=args.tokenizer,
282
+ # args.abbreviations is a file path (one abbreviation per line), while DatasetCreator
+ # expects a sequence of abbreviation strings - read the file if it was provided
+ abbreviations=[line.strip() for line in open(args.abbreviations)] if args.abbreviations else None,
283
+ max_tokens=args.max_tokens,
284
+ max_prev_sentence_token=args.max_prev_sentence_token,
285
+ max_next_sentence_token=args.max_next_sentence_token,
286
+ default_chunk_size=args.default_chunk_size,
287
+ ignore_label=args.ignore_label)
288
+ ner_notes = dataset_creator.create(
289
+ input_file=args.input_file,
290
+ mode=args.mode,
291
+ notation=args.notation,
292
+ token_text_key=args.token_text_key,
293
+ metadata_key=args.metadata_key,
294
+ note_id_key=args.note_id_key,
295
+ label_key=args.label_key,
296
+ span_text_key=args.span_text_key
297
+ )
298
+ # Store the NER dataset in the desired format
299
+ if args.format == 'jsonl':
300
+ # Write the dataset to the output file
301
+ with open(args.output_file, 'w') as file:
302
+ for ner_sentence in ner_notes:
303
+ file.write(json.dumps(ner_sentence) + '\n')
304
+ elif args.format == 'conll':
305
+ with open(args.output_file, 'w') as file:
306
+ for ner_sentence in ner_notes:
307
+ tokens = ner_sentence['tokens']
308
+ labels = ner_sentence['labels']
309
+ current_sent_info = ner_sentence['current_sent_info']
310
+ note_id = ner_sentence['note_sent_info']['note_id']
311
+ if len(tokens) != len(labels) or len(labels) != len(current_sent_info):
312
+ raise ValueError('Length mismatch')
313
+ for token, label, sent_info in zip(tokens, labels, current_sent_info):
314
+ sent_info['note_id'] = note_id
315
+ data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n'
316
+ file.write(data)
317
+ file.write('\n')
318
+
319
+
320
+ if __name__ == '__main__':
321
+
322
+ main()
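A minimal usage sketch of the DatasetCreator defined above, assuming the package is importable as ner_datasets and the en_core_sci_lg model is installed; the file paths are placeholders:

    import json
    from ner_datasets.dataset_creator import DatasetCreator

    creator = DatasetCreator(sentencizer='en_core_sci_lg', tokenizer='clinical')
    ner_sentences = creator.create(
        input_file='notes.jsonl',  # placeholder jsonl with note text under 'text' and 'meta.note_id'
        mode='predict',            # predict mode assigns default labels, no annotated spans needed
        notation='BIO'
    )
    with open('ner_dataset.jsonl', 'w') as output_file:
        for ner_sentence in ner_sentences:
            output_file.write(json.dumps(ner_sentence) + '\n')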
ner_datasets/dataset_splitter.py ADDED
@@ -0,0 +1,294 @@
1
+ import json
2
+ import random
3
+ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
+ from collections import Counter
5
+ from typing import NoReturn, List
6
+
7
+ from .distribution import NERDistribution, DatasetSplits, PrintDistribution
8
+
9
+ random.seed(41)
10
+
11
+
12
+ class DatasetSplitter(object):
13
+ """
14
+ Prepare dataset splits - training, validation & testing splits
15
+ Compute ner distributions in our dataset,
16
+ based on which we create and store a dictionary which will contain
17
+ information about which notes (in a dataset) belong to which split.
18
+ Based on this distribution and whether we want to keep certain notes
19
+ grouped (e.g by patient) we assign notes to a split, such that the
20
+ final ner type distribution in each split is similar.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ train_proportion: int = 70,
26
+ validation_proportion: int = 15,
27
+ test_proportion: int = 15
28
+ ) -> NoReturn:
29
+ """
30
+ Initialize the proportions of the splits.
31
+ Args:
32
+ train_proportion (int): Ratio of train dataset
33
+ validation_proportion (int): Ratio of validation dataset
34
+ test_proportion (int): Ratio of test dataset
35
+ """
36
+ self._train_proportion = train_proportion
37
+ self._validation_proportion = validation_proportion
38
+ self._test_proportion = test_proportion
39
+ self._split = None
40
+ self._lookup_split = dict()
41
+
42
+ def get_split(self, split: str) -> List[str]:
43
+ return [key for key in self._lookup_split[split].keys()]
44
+
45
+ def set_split(self, split: str) -> NoReturn:
46
+ """
47
+ Set the split that you are currently checking/processing.
48
+ Based on the split you can perform certain checks and
49
+ computation. Once the split is set, read the information
50
+ present in the split_info_path. Extract only the information
51
+ belonging to the split. Create a hash map where we have
52
+ the keys as the note_ids/patient ids that belong to the split. This hashmap
53
+ can then be used to check if a particular note belongs to this
54
+ split.
55
+ Args:
56
+ split (str): The split - one of train, validation or test
57
+ """
58
+ if split not in ['train', 'validation', 'test']:
59
+ raise ValueError('Invalid split')
60
+ self._split = split
61
+
62
+ def __update_split(self, key: str) -> NoReturn:
63
+ """
64
+ Update the hash map where we have
65
+ the keys (e.g note_id) that belong to the split. This hashmap
66
+ can then be used to check if a particular note belongs to this
67
+ split.
68
+ Args:
69
+ key (str): The key that identify the note belonging to the split
70
+ """
71
+ self._lookup_split[self._split][key] = 1
72
+
73
+ def check_note(self, key: str) -> bool:
74
+ """
75
+ Use the hash map populated by assign_splits
76
+ to check if the note (note_info) belongs to this split (train,
77
+ val, test etc). If it does, return true, else false
78
+ Args:
79
+ key (str): The key that identify the note belonging to the split
80
+ Returns:
81
+ (bool): True if the note belongs to the split, false otherwise
82
+ """
83
+ if self._split is None:
84
+ raise ValueError('Split not set')
85
+ if self._lookup_split[self._split].get(key, False):
86
+ return True
87
+ else:
88
+ return False
89
+
90
+ def assign_splits(
91
+ self,
92
+ input_file: str,
93
+ spans_key: str = 'spans',
94
+ metadata_key: str = 'meta',
95
+ group_key: str = 'note_id',
96
+ margin: float = 0.3
97
+ ) -> NoReturn:
98
+ """
99
+ Get the dataset splits - training, validation & testing splits
100
+ Based on the NER distribution and whether we want to keep certain
101
+ notes grouped (e.g by patient). The split assignments are stored internally
+ and can later be queried via set_split and check_note
+ to filter notes based on the splits.
104
+ Args:
105
+ input_file (str): The input file
106
+ spans_key (str): The key where the note spans are present
107
+ metadata_key (str): The key where the note metadata is present
108
+ group_key (str): The key where the note group (e.g note_id or patient id etc) is present.
109
+ This field is what the notes will be grouped by, and all notes belonging
110
+ to this grouping will be in the same split
111
+ margin (float): Margin of error when maintaining proportions in the splits
112
+ """
113
+ # Compute the distribution of NER types in the grouped notes.
114
+ # For example the distribution of NER types in all notes belonging to a
115
+ # particular patient
116
+ self._lookup_split = {
117
+ 'train': dict(),
118
+ 'validation': dict(),
119
+ 'test': dict()
120
+ }
121
+ ner_distribution = NERDistribution()
122
+ for line in open(input_file, 'r'):
123
+ note = json.loads(line)
124
+ key = note[metadata_key][group_key]
125
+ ner_distribution.update_distribution(spans=note[spans_key], key=key)
126
+ # Initialize the dataset splits object
127
+ dataset_splits = DatasetSplits(
128
+ ner_distribution=ner_distribution,
129
+ train_proportion=self._train_proportion,
130
+ validation_proportion=self._validation_proportion,
131
+ test_proportion=self._test_proportion,
132
+ margin=margin
133
+ )
134
+ # Check the note and assign it to a split
135
+ for line in open(input_file, 'r'):
136
+ note = json.loads(line)
137
+ key = note[metadata_key][group_key]
138
+ split = dataset_splits.get_split(key=key)
139
+ self.set_split(split)
140
+ self.__update_split(key)
141
+ return None
142
+
143
+
144
+ def main() -> NoReturn:
145
+ """
146
+ Prepare dataset splits - training, validation & testing splits
147
+ Compute ner distributions in our dataset. Based on this distribution
148
+ and whether we want to keep certain notes grouped (e.g by patient)
149
+ we assign notes to a split, such that the final ner type distribution
150
+ in each split is similar.
151
+ """
152
+ # Compute the distribution of NER types in the grouped notes.
153
+ # For example the distribution of NER types in all notes belonging to a
154
+ # particular patient
155
+ # The following code sets up the arguments to be passed via the CLI
156
+ cli_parser = ArgumentParser(
157
+ description='configuration arguments provided at run time from the CLI',
158
+ formatter_class=ArgumentDefaultsHelpFormatter
159
+ )
160
+ cli_parser.add_argument(
161
+ '--input_file',
162
+ type=str,
163
+ required=True,
164
+ help='the jsonl file that contains the notes'
165
+ )
166
+ cli_parser.add_argument(
167
+ '--spans_key',
168
+ type=str,
169
+ default='spans',
170
+ help='the key where the note spans is present in the json object'
171
+ )
172
+ cli_parser.add_argument(
173
+ '--metadata_key',
174
+ type=str,
175
+ default='meta',
176
+ help='the key where the note metadata is present in the json object'
177
+ )
178
+ cli_parser.add_argument(
179
+ '--group_key',
180
+ type=str,
181
+ default='note_id',
182
+ help='the key to group notes by in the json object'
183
+ )
184
+ cli_parser.add_argument(
185
+ '--train_proportion',
186
+ type=int,
187
+ default=70,
188
+ help='ratio of train dataset'
189
+ )
190
+ cli_parser.add_argument(
191
+ '--train_file',
192
+ type=str,
193
+ default=None,
194
+ help='The file to store the train data'
195
+ )
196
+ cli_parser.add_argument(
197
+ '--validation_proportion',
198
+ type=int,
199
+ default=15,
200
+ help='ratio of validation dataset'
201
+ )
202
+ cli_parser.add_argument(
203
+ '--validation_file',
204
+ type=str,
205
+ default=None,
206
+ help='The file to store the validation data'
207
+ )
208
+ cli_parser.add_argument(
209
+ '--test_proportion',
210
+ type=int,
211
+ default=15,
212
+ help='ratio of test dataset'
213
+ )
214
+ cli_parser.add_argument(
215
+ '--test_file',
216
+ type=str,
217
+ default=None,
218
+ help='The file to store the test data'
219
+ )
220
+ cli_parser.add_argument(
221
+ '--margin',
222
+ type=float,
223
+ default=0.3,
224
+ help='margin of error when maintaining proportions in the splits'
225
+ )
226
+ cli_parser.add_argument(
227
+ '--print_dist',
228
+ action='store_true',
229
+ help='whether to print the label distribution in the splits'
230
+ )
231
+ args = cli_parser.parse_args()
232
+ dataset_splitter = DatasetSplitter(
233
+ train_proportion=args.train_proportion,
234
+ validation_proportion=args.validation_proportion,
235
+ test_proportion=args.test_proportion
236
+ )
237
+ dataset_splitter.assign_splits(
238
+ input_file=args.input_file,
239
+ spans_key=args.spans_key,
240
+ metadata_key=args.metadata_key,
241
+ group_key=args.group_key,
242
+ margin=args.margin
243
+ )
244
+
245
+ if args.train_proportion > 0:
246
+ with open(args.train_file, 'w') as file:
247
+ for line in open(args.input_file, 'r'):
248
+ note = json.loads(line)
249
+ key = note[args.metadata_key][args.group_key]
250
+ dataset_splitter.set_split('train')
251
+ if dataset_splitter.check_note(key):
252
+ file.write(json.dumps(note) + '\n')
253
+
254
+ if args.validation_proportion > 0:
255
+ with open(args.validation_file, 'w') as file:
256
+ for line in open(args.input_file, 'r'):
257
+ note = json.loads(line)
258
+ key = note[args.metadata_key][args.group_key]
259
+ dataset_splitter.set_split('validation')
260
+ if dataset_splitter.check_note(key):
261
+ file.write(json.dumps(note) + '\n')
262
+
263
+ if args.test_proportion > 0:
264
+ with open(args.test_file, 'w') as file:
265
+ for line in open(args.input_file, 'r'):
266
+ note = json.loads(line)
267
+ key = note[args.metadata_key][args.group_key]
268
+ dataset_splitter.set_split('test')
269
+ if dataset_splitter.check_note(key):
270
+ file.write(json.dumps(note) + '\n')
271
+
272
+ if args.print_dist:
273
+ # Read the dataset splits file and compute the NER type distribution
274
+ key_counts = Counter()
275
+ ner_distribution = NERDistribution()
276
+ for line in open(args.input_file, 'r'):
277
+ note = json.loads(line)
278
+ key = note[args.metadata_key][args.group_key]
279
+ key_counts[key] += 1
280
+ ner_distribution.update_distribution(spans=note[args.spans_key], key=key)
281
+ print_distribution = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
282
+ train_splits = dataset_splitter.get_split('train')
283
+ validation_splits = dataset_splitter.get_split('validation')
284
+ test_splits = dataset_splitter.get_split('test')
285
+ all_splits = train_splits + validation_splits + test_splits
286
+ # Print distribution for each split
287
+ print_distribution.split_distribution(split='total', split_info=all_splits)
288
+ print_distribution.split_distribution(split='train', split_info=train_splits)
289
+ print_distribution.split_distribution(split='validation', split_info=validation_splits)
290
+ print_distribution.split_distribution(split='test', split_info=test_splits)
291
+
292
+
293
+ if __name__ == "__main__":
294
+ main()
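A brief usage sketch of the DatasetSplitter above; the input file and the patient_id grouping key are illustrative assumptions:

    from ner_datasets.dataset_splitter import DatasetSplitter

    splitter = DatasetSplitter(train_proportion=70, validation_proportion=15, test_proportion=15)
    # Group notes by patient so that all of a patient's notes land in the same split
    splitter.assign_splits(input_file='annotated_notes.jsonl', group_key='patient_id', margin=0.3)
    splitter.set_split('train')
    train_keys = splitter.get_split('train')
    print(len(train_keys), 'groups assigned to the train split')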
ner_datasets/distribution/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .dataset_splits import DatasetSplits
2
+ from .ner_distribution import NERDistribution
3
+ from .print_distribution import PrintDistribution
4
+ __all__=["DatasetSplits", "NERDistribution", "PrintDistribution"]
ner_datasets/distribution/dataset_splits.py ADDED
@@ -0,0 +1,218 @@
1
+ import random
2
+ from collections import Counter
3
+ from typing import NoReturn
4
+
5
+ from .ner_distribution import NERDistribution
6
+
7
+ random.seed(41)
8
+
9
+
10
+ class DatasetSplits(object):
11
+ """
12
+ Prepare dataset splits - training, validation & testing splits
13
+ Compute ner distributions in the dataset. Based on this we assign
14
+ notes to different splits and at the same time, we keep the distribution of
15
+ NER types in each split similar.
16
+ Keep track of the split information - which notes are present in which split.
17
+ The label distribution in each split, the number of notes in each split.
18
+ """
19
+
20
+ def __init__(
21
+ self,
22
+ ner_distribution: NERDistribution,
23
+ train_proportion: int,
24
+ validation_proportion: int,
25
+ test_proportion: int,
26
+ margin: float
27
+ ) -> NoReturn:
28
+ """
29
+ Maintain split information. Assign notes based on the proportion of
30
+ the splits, while keeping the label distribution in each split similar.
31
+ Keep track of the split information - which notes are present in which split.
32
+ The label distribution in each split, the number of notes in each split.
33
+ Keep track of the dataset splits and the counts in each split etc.
34
+ These will be used to assign the different notes to different
35
+ splits while keeping the proportion of ner similar in each split.
36
+ Get the maximum number of ner that can be present in the train,
37
+ validation and test split. The total count will be used to
38
+ calculate the current proportion of ner in the split. This can be used
39
+ to keep the proportion of ner types consistent among different splits
40
+ Args:
41
+ ner_distribution (NERDistribution): The NER distribution in the dataset
42
+ train_proportion (int): Ratio of train dataset
43
+ validation_proportion (int): Ratio of validation dataset
44
+ test_proportion (int): Ratio of test dataset
45
+ margin (float): Margin by which the label distribution can be exceeded in the split
46
+ """
47
+ self._ner_distribution = ner_distribution
48
+ # Compute the counts of NER types in the entire dataset
49
+ total_distribution = Counter()
50
+ for key, counts in ner_distribution.get_ner_distribution().items():
51
+ for label, count in counts.items():
52
+ total_distribution[label] += count
53
+ # Compute the percentages of NER types in the entire dataset
54
+ self._total_ner = sum(total_distribution.values())
55
+ self._label_dist_percentages = {
56
+ ner_type: float(count) / self._total_ner * 100 if self._total_ner else 0
57
+ for ner_type, count in total_distribution.items()
58
+ }
59
+ self._margin = margin
60
+ # The three splits
61
+ self._splits = ['train', 'validation', 'test']
62
+ self._split_weights = None
63
+ self._splits_info = None
64
+ # Keep track of the patient_ids that have been processed.
65
+ # Since a patient can have multiple notes and we already know the
66
+ # ner distribution for this patient across all the notes (i.e the ner types
67
+ # and count that appear in all the notes associated with this patient)
68
+ # We also keep all the notes associated with a patient in the same split
69
+ # So we check if adding all the notes associated with this patient will
70
+ # disturb the ner distribution (proportions) as mentioned before.
71
+ self._processed_keys = dict()
72
+ # Based on these proportions we compute train_ner_count, validation_ner_count, test_ner_count
73
+ # Say the proportion are 85, 10, 5
74
+ # The train split will have a maximum of 85% of the overall ner, validation will have 10 and test will 5
75
+ # That is if there are total count of all ner is 100, on splitting the datasets
76
+ # the train split will have a total of 85 ner, validation split will have a total of 10 ner and the
77
+ # test split will have a total of 5 ner
78
+ train_ner_count = int(train_proportion * self._total_ner / 100)
79
+ validation_ner_count = int(validation_proportion * self._total_ner / 100)
80
+ test_ner_count = int(test_proportion * self._total_ner / 100)
81
+ # So based on this, we check if adding a note keeps the balance in proportion or not
82
+ # If it does not, we check the splits given in the "remain" field of the dict (which is
83
+ # the 2 other splits
84
+ self._split_weights = [train_proportion, validation_proportion, test_proportion]
85
+ # Based on the split proportions, ner counts and ner distribution
86
+ # we need to split our dataset into train, validation and test split
87
+ # For each split we try and maintain the same distribution (proportions) between ner types
88
+ # that we computed from the entire dataset (given by - ner_distribution)
89
+ # If the entire dataset had AGE:50%, DATE:30%, LOC:20%, we want the same proportions
90
+ # in each of the train, validation and test splits
91
+ # So based on this, we check if adding a note keeps the balance in proportion or not
92
+ # If it does not, we check the splits given in the "remain" field of the dict (which is
93
+ # the 2 other splits
94
+ self._splits_info = {'train': {'remain': ['validation', 'test'],
95
+ 'total': train_ner_count,
96
+ 'remain_weights': [validation_proportion, test_proportion],
97
+ 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
98
+ 'validation': {'remain': ['train', 'test'],
99
+ 'total': validation_ner_count,
100
+ 'remain_weights': [train_proportion, test_proportion],
101
+ 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
102
+ 'test': {'remain': ['validation', 'train'],
103
+ 'total': test_ner_count,
104
+ 'remain_weights': [validation_proportion, train_proportion],
105
+ 'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()}}
106
+
107
+ def __set_split(self, split: str) -> NoReturn:
108
+ """
109
+ Set the split that you are currently checking/processing.
110
+ Based on the split you can perform certain checks and
111
+ computation for that split.
112
+ Args:
113
+ split (str): The split - train, validation or test
114
+ """
115
+ self._split = split
116
+
117
+ def __update_label_dist(self, distribution: Counter) -> NoReturn:
118
+ """
119
+ Once we have determined that a note can be added to the split we need to
120
+ update the current count of the ner types in the split. So we pass the ner counts
121
+ in the note that will be updated and update the counts of the ner types in the split.
122
+ Args:
123
+ distribution (Counter): Contains the ner type and it's counts (distribution)
124
+ """
125
+ self._splits_info[self._split]['label_dist'].update(distribution)
126
+
127
+ def __update_groups(self, note_group_key: str) -> NoReturn:
128
+ """
129
+ Once we have determined that a note can be added to the split, we append
130
+ to a list some distinct element of the note (e.g note_id). This list will
131
+ contain the note_ids of the notes that belong to this split.
132
+ Args:
133
+ note_group_key (str): Contains the note metadata - e.g note_id, institute etc
134
+ """
135
+ self._processed_keys[note_group_key] = self._split
136
+ self._splits_info[self._split]['groups'].append(note_group_key)
137
+
138
+ def __check_split(self, distribution: Counter) -> bool:
139
+ """
140
+ This function is used to check the resulting ner distribution in the split on adding this
141
+ note to the split. We check how the proportion of ner changes if this note is added to
142
+ the split. If the proportion exceeds the desired proportion then we return false
143
+ to indicate that adding this note will upset the ner distribution across splits, so we should
144
+ instead check adding this note to another split. If it does not upset the balance then we return
145
+ True, which means we can add this note to this split. The desired proportion of ner is passed
146
+ in the percentages argument - where we have the desired proportion for each ner type.
147
+ Args:
148
+ distribution (Counter): Contains the mapping between ner type and count
149
+ Returns:
150
+ (bool): True if the note can be added to the split, false otherwise
151
+ """
152
+ # Get the current ner types and counts in the split
153
+ split_label_dist = self._splits_info[self._split]['label_dist']
154
+ # Get the max ner count that can be present in the split
155
+ # This will be used to compute the ner proportions in the split
156
+ split_total = self._splits_info[self._split]['total']
157
+ # Check if the proportion of the split picked is zero
158
+ # and return False because we can't add any note to this split
159
+ if split_total == 0:
160
+ return False
161
+ for ner_type, count in distribution.items():
162
+ percentage = (split_label_dist.get(ner_type, 0) + count) / split_total * 100
163
+ # Check if the proportion on adding this note exceeds the desired proportion
164
+ # within the margin of error
165
+ # If it does return false
166
+ if percentage > self._label_dist_percentages[ner_type] + self._margin:
167
+ return False
168
+ return True
169
+
170
+ def get_split(self, key: str) -> str:
171
+ """
172
+ Assign a split to the note - based on the distribution of ner types in the note
173
+ and the distribution of ner types in the split. Essentially assign a note to a split
174
+ such that the distribution of ner types in each split is similar, once all notes have
175
+ been assigned to their respective splits.
176
+ Args:
177
+ key (str): The note id or patient id of the note (some grouping key)
178
+ Returns:
179
+ (str): The split
180
+ """
181
+ current_splits = self._splits
182
+ current_weights = self._split_weights
183
+ distribution = self._ner_distribution.get_group_distribution(key=key)
184
+ if self._processed_keys.get(key, False):
185
+ return self._processed_keys[key]
186
+ while True:
187
+ # Pick and set the split
188
+ check_split = random.choices(current_splits, current_weights)[0]
189
+ self.__set_split(check_split)
190
+ # Get the ner distribution for this particular patient (across all the notes associated
191
+ # with this patient) and check if the notes can be added to this split.
192
+ # The margin of error for the ner proportions. As we said above we try and keep the proportions
193
+ # across the splits the same, but we allow for some flexibility, so we can go +- the amount
194
+ # given by margin.
195
+ include = self.__check_split(distribution=distribution)
196
+ if include:
197
+ self.__update_groups(key)
198
+ self.__update_label_dist(distribution=distribution)
199
+ return check_split
200
+ else:
201
+ # Check the two other possible splits
202
+ if len(current_splits) == 3:
203
+ current_splits = self._splits_info[check_split]['remain']
204
+ current_weights = self._splits_info[check_split]['remain_weights']
205
+ # Check the one other possible split (when the one of the above two other split check returns false)
206
+ elif len(current_splits) == 2 and current_weights[1 - current_splits.index(check_split)] != 0:
207
+ index = current_splits.index(check_split)
208
+ current_splits = [current_splits[1 - index]]
209
+ current_weights = [100]
210
+ # If it can't be added to any split - choose a split randomly
211
+ else:
212
+ current_splits = self._splits
213
+ current_weights = self._split_weights
214
+ check_split = random.choices(current_splits, current_weights)[0]
215
+ self.__set_split(check_split)
216
+ self.__update_groups(key)
217
+ self.__update_label_dist(distribution=distribution)
218
+ return check_split
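To make the proportion check in __check_split concrete, here is a small worked example with made-up numbers:

    from collections import Counter

    # Overall dataset: 200 AGE and 100 DATE spans -> desired proportions of roughly 66.7% / 33.3%
    desired_percentages = {'AGE': 200 / 300 * 100, 'DATE': 100 / 300 * 100}
    margin = 0.3
    split_total = int(0.70 * 300)                         # train split capped at 70% of all spans
    split_label_dist = Counter({'AGE': 120, 'DATE': 60})  # spans already assigned to train
    candidate = Counter({'AGE': 10, 'DATE': 2})           # spans of the note/patient being checked

    can_add = True
    for ner_type, count in candidate.items():
        percentage = (split_label_dist[ner_type] + count) / split_total * 100
        if percentage > desired_percentages[ner_type] + margin:
            can_add = False
    print(can_add)  # True - adding this note keeps the train split close to the target proportions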
ner_datasets/distribution/ner_distribution.py ADDED
@@ -0,0 +1,54 @@
1
+ from collections import Counter, defaultdict
2
+ from typing import Sequence, Mapping, NoReturn
3
+
4
+
5
+ class NERDistribution(object):
6
+ """
7
+ Store the distribution of ner types based on some key.
8
+ That is we store the NER type distribution for some given key value and we update
9
+ the distribution when spans related to that key are passed
10
+ """
11
+
12
+ def __init__(self) -> NoReturn:
13
+ """
14
+ Initialize the NER type - count mapping
15
+ """
16
+ # Counter the captures the ner types and counts per patient/note_id in the dataset
17
+ # Depending on what we set the group_key as. Basically gather counts with respect
18
+ # to some grouping of the notes
19
+ # E.g - {{PATIENT 1: {AGE: 99, DATE: 55, ...}, {PATIENT 2: {AGE: 5, DATE: 9, ...} ... }
20
+ self._ner_distribution = defaultdict(Counter)
21
+
22
+ def update_distribution(self, spans: Sequence[Mapping[str, str]], key: str) -> NoReturn:
23
+ """
24
+ Update the distribution of ner types for the given key
25
+ Args:
26
+ spans (Sequence[Mapping[str, str]]): The list of spans in the note
27
+ key (str): The note id or patient id of the note (some grouping)
28
+ """
29
+ # Go through the spans in the note and compute the ner distribution
30
+ # Compute both the overall ner distribution and ner distribution per
31
+ # patient (i.e the ner types in all the notes associated with the patient)
32
+ if not self._ner_distribution.get(key, False):
33
+ self._ner_distribution[key] = Counter()
34
+ for span in spans:
35
+ self._ner_distribution[key][span['label']] += 1
36
+
37
+ def get_ner_distribution(self) -> defaultdict:
38
+ """
39
+ Return overall ner distribution. The NER type distribution for every key.
40
+ Returns:
41
+ ner_distribution (defaultdict(Counter)): Overall NER type distribution for all keys
42
+ """
43
+ return self._ner_distribution
44
+
45
+ def get_group_distribution(self, key: str) -> Counter:
46
+ """
47
+ Return the NER type distribution for the given key
48
+ Returns:
49
+ (Counter): ner distribution w.r.t some grouping (key)
50
+ """
51
+ if key in self._ner_distribution.keys():
52
+ return self._ner_distribution[key]
53
+ else:
54
+ raise ValueError('Key not found')
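A short usage sketch of NERDistribution; the keys and spans are made up:

    from ner_datasets.distribution.ner_distribution import NERDistribution

    ner_distribution = NERDistribution()
    ner_distribution.update_distribution(spans=[{'label': 'AGE'}, {'label': 'DATE'}], key='patient_1')
    ner_distribution.update_distribution(spans=[{'label': 'AGE'}], key='patient_2')
    print(ner_distribution.get_group_distribution(key='patient_1'))  # Counter({'AGE': 1, 'DATE': 1})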
ner_datasets/distribution/print_distribution.py ADDED
@@ -0,0 +1,49 @@
1
+ from collections import Counter
2
+ from typing import Sequence, NoReturn
3
+
4
+ from .ner_distribution import NERDistribution
5
+
6
+
7
+ class PrintDistribution(object):
8
+ """
9
+ This class is used to print the distribution of NER types
10
+ """
11
+
12
+ def __init__(self, ner_distribution: NERDistribution, key_counts: Counter) -> NoReturn:
13
+ """
14
+ Initialize
15
+ Args:
16
+ ner_distribution (NERDistribution): NERDistribution object that keeps track of the NER type distributions
17
+ key_counts (Counter): Number of keys/groups (e.g note_ids, patient ids etc)
18
+ """
19
+ self._ner_distribution = ner_distribution
20
+ self._key_counts = key_counts
21
+
22
+ def split_distribution(self, split: str, split_info: Sequence[str]) -> NoReturn:
23
+ """
24
+ Print NER type distribution
25
+ Args:
26
+ split (str): The dataset split
27
+ split_info (Sequence[str]): The keys belonging to that split
28
+ """
29
+ split_distribution = Counter()
30
+ number_of_notes = 0
31
+ for key in split_info:
32
+ number_of_notes += self._key_counts[key]
33
+ split_distribution.update(self._ner_distribution.get_group_distribution(key))
34
+ total_ner = sum(split_distribution.values())
35
+ percentages = {ner_type: float(count) / total_ner * 100 if total_ner else 0
36
+ for ner_type, count in split_distribution.items()}
37
+ print('{:^70}'.format('============ ' + split.upper() + ' NER Distribution ============='))
38
+ print('{:<20}{:<10}'.format('Number of Notes: ', number_of_notes))
39
+ print('{:<20}{:<10}\n'.format('Number of Groups: ', len(split_info)))
40
+ for ner_type, count in split_distribution.most_common():
41
+ print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
42
+ 'NER Type: ', ner_type,
43
+ 'Count: ', count,
44
+ 'Percentage: ', '{:0.2f}'.format(percentages[ner_type]))
45
+ )
46
+ print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
47
+ 'NER Type:', 'TOTALS', 'Count: ', total_ner, 'Percentage: ', '{:0.2f}'.format(100))
48
+ )
49
+ print('\n')
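PrintDistribution can be combined with NERDistribution to inspect a split; a minimal sketch with made-up data:

    from collections import Counter
    from ner_datasets.distribution import NERDistribution, PrintDistribution

    ner_distribution = NERDistribution()
    key_counts = Counter()
    for key, spans in [('patient_1', [{'label': 'AGE'}, {'label': 'DATE'}]),
                       ('patient_2', [{'label': 'AGE'}])]:
        key_counts[key] += 1
        ner_distribution.update_distribution(spans=spans, key=key)
    printer = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
    printer.split_distribution(split='train', split_info=['patient_1', 'patient_2'])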
ner_datasets/preprocessing/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .preprocessing_loader import PreprocessingLoader
2
+ __all__ = ["PreprocessingLoader"]
ner_datasets/preprocessing/preprocessing_loader.py ADDED
@@ -0,0 +1,63 @@
1
+ from typing import Union, Optional, Sequence
2
+
3
+ from .sentencizers import SpacySentencizer, NoteSentencizer
4
+ from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer
5
+
6
+
7
+ class PreprocessingLoader(object):
8
+
9
+ @staticmethod
10
+ def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
11
+ """
12
+ Initialize the sentencizer based on the requested type.
13
+ We can either use the sci-spacy (en_core_sci_lg or en_core_web_sm) or
14
+ consider the entire note as a single sentence.
15
+ Args:
16
+ sentencizer (str): Specify which sentencizer you want to use
17
+ Returns:
18
+ Union[SpacySentencizer, NoteSentencizer]: An object of the requested
19
+ sentencizer class
20
+ """
21
+ if sentencizer == 'en_core_sci_lg':
22
+ return SpacySentencizer(spacy_model='en_core_sci_lg')
23
+ elif sentencizer == 'en_core_web_sm':
24
+ return SpacySentencizer(spacy_model='en_core_web_sm')
25
+ elif sentencizer == 'note':
26
+ return NoteSentencizer()
27
+ else:
28
+ raise ValueError('Invalid sentencizer - does not exist')
29
+
30
+ @staticmethod
31
+ def get_tokenizer(
32
+ tokenizer: str,
33
+ abbreviations: Optional[Sequence[str]] = None,
34
+ ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
35
+ """
36
+ Initialize the tokenizer based on the CLI arguments
37
+ We can either use the default scipacy (en_core_sci_lg or en_core_web_sm)
38
+ or the modified scipacy (with regex rule) tokenizer.
39
+ It also supports the corenlp tokenizer
40
+ Args:
41
+ tokenizer (str): Specify which tokenizer you want to use
42
+ abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split - works only
43
+ with the custom clinical tokenizer
44
+ Returns:
45
+ Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class
46
+ """
47
+ if tokenizer == 'en_core_sci_lg':
48
+ return SpacyTokenizer(spacy_model='en_core_sci_lg')
49
+ elif tokenizer == 'en_core_web_sm':
50
+ return SpacyTokenizer(spacy_model='en_core_web_sm')
51
+ elif tokenizer == 'en':
52
+ return SpacyTokenizer(spacy_model='en')
53
+ elif tokenizer == 'corenlp':
54
+ return CoreNLPTokenizer()
55
+ elif tokenizer == 'clinical':
56
+ # Abbreviations - we won't split tokens that match these (e.g 18F-FDG)
57
+ return ClinicalSpacyTokenizer(spacy_model='en_core_sci_lg', abbreviations=abbreviations)
62
+ else:
63
+ raise ValueError('Invalid tokenizer - does not exist')
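A minimal sketch of how the loader might be used; it assumes the en_core_sci_lg scispacy model is installed and that the tokenizer's get_tokens returns dictionaries with a 'text' key, mirroring the sentencizer interface:

    from ner_datasets.preprocessing import PreprocessingLoader

    sentencizer = PreprocessingLoader.get_sentencizer(sentencizer='en_core_sci_lg')
    tokenizer = PreprocessingLoader.get_tokenizer(tokenizer='clinical', abbreviations=['b.i.d.', 'q.d.'])
    note = 'Bruce Wayne Jr is a 60yo man. He lives in Gotham'
    for sentence in sentencizer.get_sentences(note):
        tokens = [token['text'] for token in tokenizer.get_tokens(sentence['text'])]
        print(sentence['start'], sentence['end'], tokens)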
ner_datasets/preprocessing/sentencizers/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .note_sentencizer import NoteSentencizer
2
+ from .spacy_sentencizer import SpacySentencizer
3
+ __all__=["NoteSentencizer", "SpacySentencizer"]
ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py ADDED
@@ -0,0 +1,37 @@
1
+ from typing import Iterable, Dict, Union
2
+
3
+ import stanza
4
+
5
+
6
+ class MimicStanzaSentencizer(object):
7
+ """
8
+ This class is used to read text and split it into
9
+ sentences (and their start and end positions)
10
+ using the mimic stanza package
11
+ """
12
+
13
+ def __init__(self, package: str):
14
+ """
15
+ Initialize a mimic stanza model to read text and split it into
16
+ sentences.
17
+ Args:
18
+ package (str): Name of the mimic model
19
+ """
20
+ self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=True)
21
+
22
+ def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
23
+ """
24
+ Return an iterator that iterates through the sentences in the text
25
+ Args:
26
+ text (str): The text
27
+ Returns:
28
+ (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
29
+ the start position of the sentence in the entire text
30
+ and the end position of the sentence in the entire text
31
+ """
32
+ doc = self._nlp(text)
33
+ for sentence in doc.sentences:
34
+ yield {'text': sentence.text,
35
+ 'start': sentence.tokens[0].start_char,
36
+ 'end': sentence.tokens[-1].end_char,
37
+ 'last_token': sentence.tokens[-1].text}
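A usage sketch for the mimic stanza sentencizer; it assumes stanza is installed and the mimic package has been downloaded beforehand:

    import stanza
    from ner_datasets.preprocessing.sentencizers.mimic_stanza_sentencizer import MimicStanzaSentencizer

    stanza.download('en', package='mimic', processors='tokenize')  # one-time download
    sentencizer = MimicStanzaSentencizer(package='mimic')
    for sentence in sentencizer.get_sentences('Pt seen today. Follow up in 2 weeks.'):
        print(sentence['text'], sentence['start'], sentence['end'])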
ner_datasets/preprocessing/sentencizers/note_sentencizer.py ADDED
@@ -0,0 +1,33 @@
1
+ from typing import Iterable, Dict, Union
2
+
3
+
4
+ class NoteSentencizer(object):
5
+ """
6
+ This class is used to read text and split it into
7
+ sentences (and their start and end positions)
8
+ This class considers an entire note or text as
9
+ a single sentence
10
+ """
11
+
12
+ def __init__(self):
13
+ """
14
+ Nothing to be initialized
15
+ """
16
+
17
+ def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
18
+ """
19
+ Return an iterator that iterates through the sentences in the text.
20
+ In this case it just returns the text itself.
21
+ Args:
22
+ text (str): The text
23
+ Returns:
24
+ (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
25
+ the start position of the sentence in the entire text
26
+ and the end position of the sentence in the entire text
27
+ """
28
+ yield {
29
+ 'text': text,
30
+ 'start': 0,
31
+ 'end': len(text),
32
+ 'last_token': None
33
+ }
ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py ADDED
@@ -0,0 +1,37 @@
1
+ from typing import Iterable, Dict, Union
2
+
3
+ import spacy
4
+
5
+
6
+ class SpacySentencizer(object):
7
+ """
8
+ This class is used to read text and split it into
9
+ sentences (and their start and end positions)
10
+ using a spacy model
11
+ """
12
+
13
+ def __init__(self, spacy_model: str):
14
+ """
15
+ Initialize a spacy model to read text and split it into
16
+ sentences.
17
+ Args:
18
+ spacy_model (str): Name of the spacy model
19
+ """
20
+ self._nlp = spacy.load(spacy_model)
21
+
22
+ def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
23
+ """
24
+ Return an iterator that iterates through the sentences in the text
25
+ Args:
26
+ text (str): The text
27
+ Returns:
28
+ (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
29
+ the start position of the sentence in the entire text
30
+ and the end position of the sentence in the entire text
31
+ """
32
+ document = self._nlp(text)
33
+ for sentence in document.sents:
34
+ yield {'text': sentence.text,
35
+ 'start': sentence.start_char,
36
+ 'end': sentence.end_char,
37
+ 'last_token': None}
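The two sentencizers can be compared directly; a short sketch assuming en_core_web_sm is installed:

    from ner_datasets.preprocessing.sentencizers import NoteSentencizer, SpacySentencizer

    note = 'Bruce Wayne Jr is a 60yo man. He lives in Gotham'
    # NoteSentencizer treats the entire note as a single sentence
    print(list(NoteSentencizer().get_sentences(note)))
    # SpacySentencizer splits the note using the given spacy model
    sentencizer = SpacySentencizer(spacy_model='en_core_web_sm')
    for sentence in sentencizer.get_sentences(note):
        print(sentence['text'], sentence['start'], sentence['end'])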
ner_datasets/preprocessing/tokenizers/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .spacy_tokenizer import SpacyTokenizer
2
+ from .core_nlp_tokenizer import CoreNLPTokenizer
3
+ from .clinical_spacy_tokenizer import ClinicalSpacyTokenizer
4
+ __all__=["SpacyTokenizer", "CoreNLPTokenizer", "ClinicalSpacyTokenizer"]
ner_datasets/preprocessing/tokenizers/abbreviations/check.txt ADDED
1
+ sec.
2
+ secs.
3
+ Sec.
4
+ Secs.
5
+ fig.
6
+ figs.
7
+ Fig.
8
+ Figs.
9
+ eq.
10
+ eqs.
11
+ Eq.
12
+ Eqs.
13
+ no.
14
+ nos.
15
+ No.
16
+ Nos.
17
+ al.
18
+ gen.
19
+ sp.
20
+ nov.
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt ADDED
1
+ -ve
2
+ +ve
3
+ a.c.
4
+ a/g
5
+ b.i.d.
6
+ C&S
7
+ C/O
8
+ D/C
9
+ D&C
10
+ D and C
11
+ H&H
12
+ H&P
13
+ h.s.
14
+ H/O
15
+ h/o
16
+ I&D
17
+ M/H
18
+ N/V
19
+ O&P
20
+ O.D.
21
+ O.S.
22
+ O.U.
23
+
24
+ p.o.
25
+ p.r.n.
26
+ q.d.
27
+ q.i.d.
28
+ R/O
29
+ s/p
30
+ T&A
31
+ t.i.d.
32
+ u/a
33
+ u**
34
+ y.o.
35
+ F/u
36
+ Crohn's
37
+ R.N.
38
+ S/p
39
+ S/P
40
+ s/P
41
+ N/A
42
+ n/a
43
+ N/a
44
+ n/A
45
+ w/
46
+ Pt.
47
+ pt.
48
+ PT.
49
+ cf.
50
+ CF.
51
+ Cf.
52
+ dr.
53
+ DR.
54
+ Dr.
55
+ ft.
56
+ FT.
57
+ Ft.
58
+ lt.
59
+ LT.
60
+ Lt.
61
+ mr.
62
+ MR.
63
+ Mr.
64
+ ms.
65
+ MS.
66
+ Ms.
67
+ mt.
68
+ MT.
69
+ Mt.
70
+ mx.
71
+ MX.
72
+ Mx.
73
+ ph.
74
+ PH.
75
+ Ph.
76
+ rd.
77
+ RD.
78
+ Rd.
79
+ st.
80
+ ST.
81
+ St.
82
+ vs.
83
+ VS.
84
+ Vs.
85
+ wm.
86
+ WM.
87
+ Wm.
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt ADDED
1
+ +ve
2
+ x/12
3
+ x/40
4
+ x/52
5
+ x/7
6
+ 18F-FDG
7
+
8
+ 2/2
9
+ 3TC
10
+ 5-FU
11
+ 5-HIAA
12
+ 5-HT
13
+ 6MP
14
+ a.a.
15
+ A1C
16
+ Aa.
17
+ AAOx3
18
+ A/B
19
+ a.c.
20
+ AC&BC
21
+ ad.
22
+ part.
23
+ A+E
24
+ AF-AFP
25
+ a.h.
26
+ altern.
27
+ d.
28
+ Anti-
29
+ A&O
30
+ A/O
31
+ A&Ox3
32
+ A&Ox4
33
+ a.p.
34
+ A&P
35
+ A/P
36
+ applic.
37
+ aq.
38
+ bull.
39
+ calid.
40
+ dist.
41
+ gel.
42
+ ASC-H
43
+ ASC-US
44
+ A-T
45
+ AT-III
46
+ aur.
47
+ dextro.
48
+ aurist.
49
+ A&W
50
+ A/W
51
+ b.i.d.
52
+ b/l
53
+ bl.cult
54
+ B/O
55
+ BRCA1
56
+ BRCA2
57
+ C1
58
+ C2
59
+ c/b
60
+ CBC/DIFF
61
+ C/C/E
62
+ CCK-PZ
63
+ CHEM-7
64
+ CHEM-20
65
+ C/O
66
+ c/o
67
+ CO2
68
+ COX-1
69
+ COX-2
70
+ COX-3
71
+ C/S
72
+ C&S
73
+ C-section
74
+ C-spine
75
+ C-SSRS
76
+ c/a/p
77
+ c/w
78
+ D5
79
+ D25
80
+ D4T
81
+ D5W
82
+ D&C
83
+ D/C
84
+ D&E
85
+ DHEA-S
86
+ Di-Di
87
+ DM2
88
+ D/O
89
+ D/T
90
+ Ex-n
91
+ F/C
92
+ F/C/S
93
+ FEF25–75
94
+ FEV1
95
+ fl.oz.
96
+ FTA-ABS
97
+ F/U
98
+ G6PD
99
+ G-CSF
100
+ GM-CSF
101
+ H/A
102
+ HbA1c
103
+ HCO3
104
+ HDL-C
105
+ H&E
106
+ H/H
107
+ H&H
108
+ H&M
109
+ HMG-CoA
110
+ H-mole
111
+ H/O
112
+ H&P
113
+ H/oPI
114
+ h.s.
115
+ I131
116
+ ICD-10
117
+ I&D
118
+ IgG4-RD
119
+ IgG4-RKD
120
+ IgG4-ROD
121
+ IgG4-TIN
122
+ INF(-α/-β/-γ)
123
+ I&O
124
+ IV-DSA
125
+ L&D
126
+ LDL-C
127
+ L-DOPA
128
+ L/S
129
+ MC&S
130
+ M/E
131
+ MgSO4
132
+ MHA-TP
133
+ M&M
134
+ MMR-D
135
+ Mono-Di
136
+ Mono-Mono
137
+ MS-AFP
138
+ MSO4
139
+ MVo2
140
+ No.
141
+ rep.
142
+ n.s.
143
+ n/t
144
+ N&V
145
+ n/v
146
+ O2
147
+ OB-GYN
148
+ ob-gyne
149
+ O/E
150
+ O/N
151
+ O&P
152
+ P&A
153
+ PAI-1
154
+ PAPP-A
155
+ p.c.
156
+ PIG-A
157
+ PM&R
158
+ p.r.
159
+ Pt.
160
+ p.v.
161
+ P-Y
162
+ q2wk
163
+ q6h
164
+ q6°
165
+ q.a.d.
166
+ q.AM
167
+ q.d.
168
+ q.d.s.
169
+ q.h.
170
+ q.h.s.
171
+ q.i.d.
172
+ q.l.
173
+ q.m.t.
174
+ q.n.
175
+ q.n.s.
176
+ q.o.d.
177
+ q.o.h.
178
+ q.s.
179
+ q.v.
180
+ q.wk.
181
+ r/g/m
182
+ R&M
183
+ R/O
184
+ r/r/w
185
+ R/t
186
+ RT-PCR
187
+ S1
188
+ S2
189
+ S3
190
+ S4
191
+ S&O
192
+ S.D.
193
+ op.
194
+ SMA-6
195
+ SMA-7
196
+ s/p
197
+ spp.
198
+ Sp.
199
+ fl.
200
+ gr.
201
+ S/S
202
+ S/Sx
203
+ Staph.
204
+ Strep.
205
+ Strepto.
206
+ T&A
207
+ T&C
208
+ T&S
209
+ TAH-BSO
210
+ T2DM
211
+ T/F
212
+ T&H
213
+ Tib-Fib
214
+ TRF'd
215
+ TSHR-Ab
216
+ T.S.T.H.
217
+ U/A
218
+ U&E
219
+ U/O
220
+ V-fib
221
+ V/Q
222
+ WAIS-R
223
+ W/C
224
+ WISC-R
225
+ W/O
226
+ w/o
227
+ w/u
228
+ X-AFP
229
+ y/o
230
+ a.c.h.s.
231
+ ac&hs
232
+ a.d.
233
+ ad.
234
+ add.
235
+ lib.
236
+ admov.
237
+ us.
238
+ æq.
239
+ agit.
240
+ alt.
241
+ d.
242
+ dieb.
243
+ h.
244
+ hor.
245
+ a.m.
246
+ amp.
247
+ com.
248
+ dest.
249
+ ferv.
250
+ a.l.
251
+ a.s.
252
+ a.u.
253
+ b.d.s.
254
+ bib.
255
+ b.i.d.
256
+ b.d.
257
+ ind.
258
+ bol.
259
+ Ph.Br.
260
+ b.t.
261
+ bucc.
262
+ cap.
263
+ caps.
264
+ cap.
265
+ c.m.
266
+ c.m.s.
267
+ c.
268
+ cib.
269
+ c.c.
270
+ cf.
271
+ c.n.
272
+ cochl.
273
+ ampl.
274
+ infant.
275
+ mag.
276
+ mod.
277
+ parv.
278
+ colet.
279
+ comp.
280
+ contin.
281
+ cpt.
282
+ cr.
283
+ cuj.
284
+ c.v.
285
+ cyath.
286
+ vinos.
287
+ D5LR
288
+ D5NS
289
+ D5W
290
+ D10W
291
+ D10W
292
+ D/C
293
+ decoct.
294
+ det.
295
+ dil.
296
+ dim.
297
+ p.
298
+ æ.
299
+ disp.
300
+ div.
301
+ d.t.d.
302
+ elix.
303
+ e.m.p.
304
+ emuls.
305
+ exhib.
306
+ f.
307
+ f.h.
308
+ fl.
309
+ fld.
310
+ f.m.
311
+ pil.
312
+ f.s.a.
313
+ ft.
314
+ garg.
315
+ gutt.
316
+ habt.
317
+ decub.
318
+ intermed.
319
+ tert.
320
+ inj.
321
+ i.m.
322
+ inf.
323
+ i.v.
324
+ i.v.p.
325
+ lat.
326
+ dol.
327
+ lb.
328
+ l.c.d.
329
+ liq.
330
+ lot.
331
+ M.
332
+ m.
333
+ max.
334
+ m.d.u.
335
+ mg/dL
336
+ min.
337
+ mist.
338
+ mit.
339
+ mitt.
340
+ præscript.
341
+ neb.
342
+ noct.
343
+ n.p.o.
344
+ 1/2NS
345
+ o 2
346
+ o2
347
+ o.d.
348
+ o.m.
349
+ omn.
350
+ bih.
351
+ o.n.
352
+ o.s.
353
+ o.u.
354
+ p.c.h.s.
355
+ pc&hs
356
+ Ph.Br.
357
+ Ph.Eur.
358
+ Ph.Int.
359
+ pig./pigm.
360
+ p.m.
361
+ p.o.
362
+ ppt.
363
+ p.r.
364
+ p.r.n.
365
+ pt.
366
+ pulv.
367
+ p.v.
368
+ q.1
369
+ q.1°
370
+ q4PM
371
+ q.a.m.
372
+ q.d./q.1.d.
373
+ q.d.a.m.
374
+ q.d.p.m.
375
+ q.p.m.
376
+ q.q.
377
+ q.q.h.
378
+ a.d
379
+ rep.
380
+ rept.
381
+ R/L
382
+ s.
383
+ s.a.
384
+ sem.
385
+ s.i.d.
386
+ sig.
387
+ sing.
388
+ s.l.
389
+ sol.
390
+ s.o.s.
391
+ s.s.
392
+ st.
393
+ sum.
394
+ supp.
395
+ susp.
396
+ syr.
397
+ tab.
398
+ tal.
399
+ t.
400
+ t.d.s.
401
+ t.i.d.
402
+ t.d.
403
+ tinct.
404
+ t.i.w.
405
+ top.
406
+ tinc.
407
+ trit.
408
+ troch.
409
+ u.d.
410
+ ut.
411
+ dict.
412
+ ung.
413
+ vag.
414
+ w/a
415
+ w/f
416
+ y.o.
417
+ ADD-RT
418
+ A-T
419
+ PDD-NOS
420
+ Alzheimer's
421
+ Age-related
422
+ Aldosterone-producing
423
+ Alcohol-related
424
+ Ataxia-telangiectasia
425
+ Binswanger's
426
+ Becker's
427
+ Bloom's
428
+ Brown-Séquard
429
+ Crimean-Congo
430
+ Cerebro-oculo-facio-skeletal
431
+ Carbapenem-resistant
432
+ Drug-resistant
433
+ End-stage
434
+ Graft-versus-host
435
+ Huntington's
436
+ High-functioning
437
+ Hypoxanthine-guanine
438
+ Legionnaires'
439
+ Low-functioning
440
+ Multi-drug-resistant
441
+ Multi-infarct
442
+ Machado-Joseph
443
+ Maturity-onset
444
+ Multi-sensory
445
+ Obsessive-compulsive
446
+ Parkinson's
447
+ kinase-associated
448
+ Post-polio
449
+ Port-wine
450
+ Reye's
451
+ Sensory-based
452
+ Vitus's
453
+ Septo-optic
454
+ ST-elevation
455
+ Short-lasting
456
+ Urticaria-deafness-amyloidosis
457
+ Wilson's
458
+ drug-resistant
459
+ X-linked
ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py ADDED
@@ -0,0 +1,73 @@
1
+ import re
2
+ import spacy
3
+ from spacy.symbols import ORTH
4
+ from .spacy_tokenizer import SpacyTokenizer
5
+ from .utils import DateRegex, CleanRegex, ClinicalRegex
6
+
7
+
8
+ class ClinicalSpacyTokenizer(SpacyTokenizer):
9
+ """
10
+ This class is used to read text and return the tokens
11
+ present in the text (and their start and end positions)
12
+ """
13
+
14
+ def __init__(self, spacy_model, abbreviations,
15
+ split_multiple=True, split_temperature=True,
16
+ split_percentage=True):
17
+ """
18
+ Initialize a spacy model to read text and split it into
19
+ tokens.
20
+ Args:
21
+ spacy_model (str): Name of the spacy model
+ abbreviations (Optional[Sequence[str]]): Abbreviations added as tokenizer special cases
+ so they are not split into multiple tokens
+ split_multiple, split_temperature, split_percentage (bool): Flags forwarded to
+ ClinicalRegex.get_prefixes to control which prefix splitting rules are applied
22
+ """
23
+ super().__init__(spacy_model)
24
+ self._nlp.tokenizer.prefix_search = self.__get_prefix_regex(split_multiple, split_temperature,
25
+ split_percentage).search
26
+ self._nlp.tokenizer.infix_finditer = self.__get_infix_regex().finditer
27
+ self._nlp.tokenizer.suffix_search = self.__get_suffix_regex().search
28
+ new_rules = {}
29
+ for orth, exc in self._nlp.tokenizer.rules.items():
30
+ if re.search('((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[.]$)|(^(W|w)ed$)', orth):
31
+ continue
32
+ new_rules[orth] = exc
33
+ self._nlp.tokenizer.rules = new_rules
34
+ if abbreviations is not None:
35
+ for abbreviation in abbreviations:
36
+ special_case = [{ORTH: abbreviation}]
37
+ self._nlp.tokenizer.add_special_case(abbreviation, special_case)
38
+ # this matches any lower-case tokens - abstract this part out - whether to lowercase abbreviations or not
39
+ exclusions_uncased = {abbreviation.lower(): [{ORTH: abbreviation.lower()}] for abbreviation in
40
+ abbreviations}
41
+ for k, excl in exclusions_uncased.items():
42
+ try:
43
+ self._nlp.tokenizer.add_special_case(k, excl)
44
+ except Exception:
45
+ print('failed to add exception: {}'.format(k))
46
+
47
+ def __get_prefix_regex(self, split_multiple, split_temperature, split_percentage):
48
+
49
+ date_prefix = DateRegex.get_infixes()
50
+ clinical_prefix = ClinicalRegex.get_prefixes(split_multiple, split_temperature, split_percentage)
51
+ clean_prefix = CleanRegex.get_prefixes()
52
+ digit_infix = ClinicalRegex.get_infixes()
53
+ prefixes = clean_prefix + self._nlp.Defaults.prefixes + date_prefix + clinical_prefix + digit_infix
54
+ prefix_regex = spacy.util.compile_prefix_regex(prefixes)
55
+ return prefix_regex
56
+
57
+ def __get_suffix_regex(self):
58
+ clean_suffix = CleanRegex.get_suffixes()
59
+ suffixes = clean_suffix + self._nlp.Defaults.suffixes
60
+ suffix_regex = spacy.util.compile_suffix_regex(suffixes)
61
+ return suffix_regex
62
+
63
+ def __get_infix_regex(self):
64
+
65
+ date_infixes = DateRegex.get_infixes()
66
+ clean_infixes = CleanRegex.get_infixes()
67
+ digit_infix = ClinicalRegex.get_infixes()
68
+ infixes = self._nlp.Defaults.infixes + date_infixes + clean_infixes
69
+ infix_re = spacy.util.compile_infix_regex(infixes)
70
+ return infix_re
71
+
72
+ def get_nlp(self):
73
+ return self._nlp
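A minimal usage sketch for the tokenizer above (illustrative, not part of the commit): it assumes the en_core_web_sm spaCy model is installed and that abbreviations are read one per line from one of the bundled abbreviation files.

# Illustrative sketch: the spaCy model name and abbreviations file path are assumptions.
from ner_datasets.preprocessing.tokenizers.clinical_spacy_tokenizer import ClinicalSpacyTokenizer

with open('ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt') as f:
    abbreviations = [line.strip() for line in f if line.strip()]

tokenizer = ClinicalSpacyTokenizer('en_core_web_sm', abbreviations)
for token in tokenizer.get_tokens('Pt is a 75yo man, seen q.d. for follow-up.'):
    print(token)  # {'text': ..., 'start': ..., 'end': ...}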
ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py ADDED
@@ -0,0 +1,58 @@
1
+ import json
2
+ from typing import Iterable, Mapping, Dict, Union
3
+
4
+ from pycorenlp import StanfordCoreNLP
5
+
6
+
7
+ class CoreNLPTokenizer(object):
8
+ """
9
+ This class is used to read text and return the tokens
10
+ present in the text (and their start and end positions)
11
+ using core nlp tokenization
12
+ """
13
+
14
+ def __init__(self, port: int = 9000):
15
+ """
16
+ Connect to a core nlp server to read text and split it into
17
+ tokens using the core nlp annotators
18
+ Args:
19
+ port (int): The port to run the server on
20
+ """
21
+ self._core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(port))
22
+
23
+ def get_stanford_annotations(self, text: str, annotators: str = 'tokenize,ssplit,pos,lemma') -> Dict:
24
+ """
25
+ Use the core nlp server to annotate the text and return the
26
+ results as a json object
27
+ Args:
28
+ text (str): The text to annotate
29
+ annotators (str): The core nlp annotations to run on the text
30
+ Returns:
31
+ output (Dict): The core nlp results
32
+ """
33
+ output = self._core_nlp.annotate(text, properties={
34
+ "timeout": "50000",
35
+ "ssplit.newlineIsSentenceBreak": "two",
36
+ 'annotators': annotators,
37
+ 'outputFormat': 'json'
38
+ })
39
+ if type(output) is str:
40
+ output = json.loads(output, strict=False)
41
+ return output
42
+
43
+ def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
44
+ """
45
+ Return an iterable that iterates through the tokens in the text
46
+ Args:
47
+ text (str): The text to annotate
48
+ Returns:
49
+ (Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
50
+ the start position of the token in the entire text
51
+ and the end position of the token in the entire text
52
+ """
53
+ stanford_output = self.get_stanford_annotations(text)
54
+ for sentence in stanford_output['sentences']:
55
+ for token in sentence['tokens']:
56
+ yield {'text': token['originalText'],
57
+ 'start': token['characterOffsetBegin'],
58
+ 'end': token['characterOffsetEnd']}
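An illustrative usage sketch (not part of the commit): it assumes pycorenlp is installed and a Stanford CoreNLP server is already listening on localhost:9000.

# Illustrative sketch: requires a running Stanford CoreNLP server on port 9000.
from ner_datasets.preprocessing.tokenizers.core_nlp_tokenizer import CoreNLPTokenizer

tokenizer = CoreNLPTokenizer(port=9000)
for token in tokenizer.get_tokens('Seen in clinic on 12/01/2020.'):
    print(token['text'], token['start'], token['end'])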
ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py ADDED
@@ -0,0 +1,49 @@
1
+ import spacy
2
+ from typing import Tuple, Iterable, Mapping, Dict, Union
3
+
4
+
5
+ class SpacyTokenizer(object):
6
+ """
7
+ This class is used to read text and return the tokens
8
+ present in the text (and their start and end positions)
9
+ using spacy
10
+ """
11
+
12
+ def __init__(self, spacy_model: str):
13
+ """
14
+ Initialize a spacy model to read text and split it into
15
+ tokens.
16
+ Args:
17
+ spacy_model (str): Name of the spacy model
18
+ """
19
+ self._nlp = spacy.load(spacy_model)
20
+
21
+ @staticmethod
22
+ def __get_start_and_end_offset(token: spacy.tokens.Token) -> Tuple[int, int]:
23
+ """
24
+ Return the start position of the token in the entire text
25
+ and the end position of the token in the entire text
26
+ Args:
27
+ token (spacy.tokens.Token): The spacy token object
28
+ Returns:
29
+ start (int): the start position of the token in the entire text
30
+ end (int): the end position of the token in the entire text
31
+ """
32
+ start = token.idx
33
+ end = start + len(token)
34
+ return start, end
35
+
36
+ def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
37
+ """
38
+ Return an iterable that iterates through the tokens in the text
39
+ Args:
40
+ text (str): The text to annotate
41
+ Returns:
42
+ (Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
43
+ the start position of the token in the entire text
44
+ and the end position of the token in the entire text
45
+ """
46
+ document = self._nlp(text)
47
+ for token in document:
48
+ start, end = SpacyTokenizer.__get_start_and_end_offset(token)
49
+ yield {'text': token.text, 'start': start, 'end': end}
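For comparison, the plain spaCy tokenizer above can be exercised the same way (illustrative; assumes en_core_web_sm is installed). The offsets come from token.idx plus the token length.

# Illustrative sketch of the {'text', 'start', 'end'} output format.
from ner_datasets.preprocessing.tokenizers.spacy_tokenizer import SpacyTokenizer

tokenizer = SpacyTokenizer('en_core_web_sm')
print(list(tokenizer.get_tokens('Follow up in 2 weeks.')))
# [{'text': 'Follow', 'start': 0, 'end': 6}, {'text': 'up', 'start': 7, 'end': 9}, ...]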
ner_datasets/preprocessing/tokenizers/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .date_regex import DateRegex
2
+ from .clean_regex import CleanRegex
3
+ from .clinical_regex import ClinicalRegex
4
+ __all__=["DateRegex", "CleanRegex", "ClinicalRegex"]
ner_datasets/preprocessing/tokenizers/utils/clean_regex.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import List
2
+ class CleanRegex(object):
3
+ """
4
+ This class is used to define the regexes that will be used by the
5
+ spacy tokenizer rules. Mainly the regexes are used to clean up
6
+ tokens that have unwanted characters (e.g extra hyphens).
7
+ """
8
+ #Staff - 3
9
+ #Hosp - 4, 5
10
+ #Loc - 2
11
+ @staticmethod
12
+ def get_prefixes() -> List[str]:
13
+ """
14
+ This function is used to build the regex that will clean up dirty characters
15
+ present at the prefix position (start position) of a token. For example the token ---clean
16
+ has three hyphens that need to be split from the word clean. This regex
17
+ will be used by spacy to clean it up. This rule considers any character that is
18
+ not a letter or a digit as a dirty character
19
+ Examples: ----------------9/36, :63, -ESH
20
+ Returns:
21
+ (list): List of regexes to clean the prefix of the token
22
+ """
23
+ #Handles case 5 of HOSP
24
+ return ['((?P<prefix>([^a-zA-Z0-9.]))(?P=prefix)*)', '([.])(?!\d+(\W+|$))']
25
+
26
+ @staticmethod
27
+ def get_suffixes() -> List[str]:
28
+ """
29
+ This function is used to build the regex that will clean up dirty characters
30
+ present at the suffix position (end position) of a token. For example the token clean---
31
+ has three hyphens that need to be split from the word clean. This regex
32
+ will be used by spacy to clean it up. This rule considers any character that is
33
+ not a letter or a digit as a dirty character
34
+ Examples: FRANK^, regimen---------------, no)
35
+ Returns:
36
+ (list): List of regexes to clean the suffix of the token
37
+ """
38
+ return ['((?P<suffix>([^a-zA-Z0-9]))(?P=suffix)*)']
39
+
40
+ @staticmethod
41
+ def get_infixes() -> List[str]:
42
+ """
43
+ This function is used to build the regex that will clean up dirty characters
44
+ present at the infix position (in-between position) of a token. For example the token
45
+ clean---me has three hyphens that need to be split from the word clean and me. This regex
46
+ will be used by spacy to clean it up. This rule considers any character that is
47
+ not a letter or a digit as a dirty character
48
+ Examples: FRANK^08/30/76^UNDERWOOD, regimen---------------1/37
49
+ Returns:
50
+ (list): List of regexes to clean the infix of the token
51
+ """
52
+ #Handles case 3 of STAFF
53
+ #Handles case 4 of HOSP
54
+ #Handles case 2 of LOC
55
+ connector_clean = '\^|;|&#|([\(\)\[\]:="])'
56
+ #full_stop_clean = '(?<=[a-zA-Z])(\.)(?=([A-Z][A-Za-z]+)|[^a-zA-Z0-9_.]+)'
57
+ bracket_comma_clean = '(((?<=\d)[,)(](?=[a-zA-Z]+))|((?<=[a-zA-Z])[,)(](?=\w+)))'
58
+ #special_char_clean = '(?<=[a-zA-Z])(\W{3,}|[_]{3,})(?=[A-Za-z]+)'
59
+ special_char_clean = '(?<=[a-zA-Z])([_\W_]{3,})(?=[A-Za-z]+)'
60
+ #Sometimes when there is no space between a period and a comma - it becomes part of the same token
61
+ #e.g John.,M.D - we need to split this up.
62
+ comma_period_clean = '(?<=[a-zA-Z])(\.,)(?=[A-Za-z]+)'
63
+
64
+ return [connector_clean, bracket_comma_clean, special_char_clean, comma_period_clean]
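A small illustration of what the first prefix rule above matches; in the tokenizer these patterns are combined with spaCy's defaults via spacy.util.compile_prefix_regex rather than used on their own.

# Illustrative sketch: the first CleanRegex prefix pattern strips the repeated
# leading junk characters in a token such as '---clean'.
import re
from ner_datasets.preprocessing.tokenizers.utils.clean_regex import CleanRegex

prefix_pattern = re.compile(CleanRegex.get_prefixes()[0])
print(prefix_pattern.match('---clean').group(0))  # '---'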
ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py ADDED
@@ -0,0 +1,309 @@
1
+ from typing import List
2
+ class ClinicalRegex(object):
3
+ """
4
+ This class is used to define the regexes that will be used by the
5
+ spacy tokenizer rules. Mainly the regexes are used to clean up
6
+ tokens that have unwanted characters and typos (e.g missing spaces).
7
+ In the descriptions when we mention symbol we refer to any character
8
+ that is not a letter or a digit or underscore. The spacy tokenizer splits
9
+ the text by whitespace and applies these rules (along with some default rules)
10
+ to the individual tokens.
11
+ """
12
+ #Patient - 2, 3, 5
13
+ #Staff - 1, 2
14
+ #Hosp - 2, 3
15
+ #Loc - 1, 3
16
+ @staticmethod
17
+ def get_word_typo_prefix():
18
+ """
19
+ If token contains a typo. What we mean by a typo is when two tokens
20
+ that should be separate tokens are fused into one token because there
21
+ is a missing space.
22
+ Examples: JohnMarital Status - John is the name that is fused into the
23
+ token Marital because of a missing space.
24
+ The regex checks if we have a sequence of characters followed by another
25
+ sequence of characters that starts with a capital letter, followed by two or
26
+ more small letters, we assume this is a typo and split the tokens (two sequences) up.
27
+ If there is a symbol separating the two sequences, we ease the condition saying
28
+ the capital letter can be followed by two or more capital/small letters.
29
+ Returns:
30
+ (str): regex to clean tokens that are fused because of a missing space
31
+ """
32
+ #Handles cases 2 of PATIENT
33
+ #Handles cases 1 & 2 of STAFF
34
+ #Handles cases 2 & 3 of HOSP
35
+ #Handles cases 1 & 3 of LOC
36
+ #'(([a-z]+)|([A-Z]+)|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
37
+ return '(([a-z]+)|([A-Z]{2,})|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
38
+
39
+ @staticmethod
40
+ def get_word_symbol_digit_prefix() -> str:
41
+ """
42
+ If text is followed by one or more symbols and then followed by one or more digits
43
+ we make the assumption that the text is a separate token. Spacy will use this regex
44
+ to extract the text portion as one token and will then move on to
45
+ process the rest (symbol and tokens) based on the defined rules.
46
+ Examples: Yang(4986231) - "Yang" will become a separate token & "(4986231)" will
47
+ be processed as a new token
48
+ Returns:
49
+ (str): regex to clean text followed by symbols followed by digits
50
+ """
51
+ #Handles cases 3 & 5 of patient
52
+ return '([a-zA-Z]+)(?=\W+\d+)'
53
+
54
+ @staticmethod
55
+ def get_multiple_prefix(split_multiple: bool) -> str:
56
+ """
57
+ If text is of the format take it x2 times, this function
58
+ can be used to treat the entire thing as one token or
59
+ split into two separate tokens
60
+ Args:
61
+ split_multiple (bool): whether to treat it as one token or split them up
62
+ Returns:
63
+ (str): regex to either keep as one token or split into two
64
+ """
65
+ if(split_multiple):
66
+ return '([x])(?=(\d{1,2}$))'
67
+ else:
68
+ return '[x]\d{1,2}$'
69
+
70
+ @staticmethod
71
+ def get_pager_prefix():
72
+ return '([pXxPb])(?=(\d{4,}|\d+[-]\d+))'
73
+
74
+ @staticmethod
75
+ def get_age_word_prefix():
76
+ return '([MFmf])(?=\d{2,3}(\W+|$))'
77
+
78
+ @staticmethod
79
+ def get_id_prefix():
80
+ return '(ID|id|Id)(?=\d{3,})'
81
+
82
+ @staticmethod
83
+ def get_word_period_prefix():
84
+ return '((cf|CF|Cf|dr|DR|Dr|ft|FT|Ft|lt|LT|Lt|mr|MR|Mr|ms|MS|Ms|mt|MT|Mt|mx|MX|Mx|ph|PH|Ph|rd|RD|Rd|st|ST|St|vs|VS|Vs|wm|WM|Wm|[A-Za-z]{1})[.])(?=((\W+|$)))'
85
+
86
+ @staticmethod
87
+ def get_chemical_prefix():
88
+ #Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
89
+ """
90
+ There are certain chemicals, vitamins etc that should not be split. They
91
+ should be kept as a single token - for example the token "B12" in
92
+ "Vitamin B12". This regex checks if there is a single capital letter
93
+ followed by some digits (there can be a hyphen in between those digits)
94
+ then this most likely represents a token that should not be split
95
+ Returns:
96
+ (str): regex to keep vitamin/chemical names as a single token
97
+ """
98
+ #return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
99
+ return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
100
+
101
+ @staticmethod
102
+ def get_chemical_prefix_small():
103
+ #Vitamin B12 T9 or maybe codes like I48.9- should probably do \d{1,2} - limit arbitrary numbers
104
+ """
105
+ There are certain chemicals, vitamins etc that should not be split. They
106
+ should be kept as a single token - for example the token "B12" in
107
+ "Vitamin B12". This regex checks if there is a single capital letter
108
+ followed by some digits (there can be a hyphen in between those digits)
109
+ then this most likely represents a token that should not be split
110
+ Returns:
111
+ (str): regex to keep vitamin/chemical names as a single token
112
+ """
113
+ #return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
114
+ return '((\d)?[a-eg-ln-oq-wyz]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
115
+
116
+ @staticmethod
117
+ def get_instrument_prefix():
118
+ """
119
+ There are cases when there are tokens like L1-L2-L3, we want to keep these as one
120
+ single token. This regex matches groups of one or two capital letters followed by digits, joined by '-' or ':'.
121
+ Returns:
122
+ (str): regex to keep instrument-like tokens (e.g. L1-L2-L3) as a single token
123
+ """
124
+ return '([A-Z]{1,2}\d+(?P<instrument>[-:]+)[A-Z]{1,2}\d+((?P=instrument)[A-Z]{1,2}\d+)*)'
125
+
126
+ @staticmethod
127
+ def get_instrument_prefix_small():
128
+ """
129
+ There are cases when there are tokens like L1-L2-L3, we want to keep these as one
130
+ single token. This regex matches the same pattern with lowercase letters (e.g. l1-l2-l3).
131
+ Returns:
132
+ (str): regex to keep lowercase instrument-like tokens as a single token
133
+ """
134
+ return '([a-z]{1,2}\d+(?P<instrument_small>[-:]+)[a-z]{1,2}\d+((?P=instrument_small)[a-z]{1,2}\d+)*)'
135
+
136
+ #Handles Case 3, 4, 5 of MRN
137
+ #Handles Case 1, 2, 3 of PHONE
138
+ #Handles Case 7, 10 of AGE
139
+ #Handles Case 1 of IDNUM
140
+ #Handles Case 3, 5 of PATIENT
141
+ #Handles Case 7 of HOSP
142
+ #Handles Case 1 of General
143
+ @staticmethod
144
+ def get_age_typo_prefix():
145
+ """
146
+ There are cases when there is no space between the text and the age
147
+ Example: Plan88yo - we want Plan to be a separate token
148
+ Returns:
149
+ (str): regex to split the text portion from a trailing age expression
150
+ """
151
+ age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
152
+ '|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
153
+ '(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
154
+ return '([a-zA-Z]+)(?=((\d{1,3})' + age_suffix + '$))'
155
+
156
+ @staticmethod
157
+ def get_word_digit_split_prefix():
158
+ #Word followed by more than 3 digits - might not be part of the same token
159
+ #and could be a typo
160
+ #This need not be true - maybe we have an id like BFPI980801 - this will be split
161
+ #BFPI 980801 - but it might be okay to split - need to check
162
+ #([A-Z][a-z]{2,})(?=\d+)
163
+ return '([A-Z][a-z]{2,})(?=[A-Za-z]*\d+)'
164
+
165
+ @staticmethod
166
+ def get_word_digit_mix_prefix():
167
+ #Mix of letters and characters - most likely a typo if the
168
+ #following characters is a capital letter followed by more than
169
+ #2 small letters
170
+ #return '([A-Z]+\d+([A-Z]+(?!([a-z]{2,}))))(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
171
+ return '([A-Z]+\d+)(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
172
+
173
+ @staticmethod
174
+ def get_word_digit_mix_prefix_small():
175
+ #Mix of letters and characters - most likely a typo if the
176
+ #following characters is a capital letter followed by more than
177
+ #2 small letters
178
+ return '([a-z]+\d+)(?=(\W+|[A-Z][a-z]{2,}|[A-Z]{3,}))'
179
+
180
+ @staticmethod
181
+ def get_word_id_split_prefix():
182
+ return '([a-zA-Z]+)(?=(\d+[-./]+(\d+|$)))'
183
+
184
+ @staticmethod
185
+ def get_word_section_prefix():
186
+ #Fix JOHNID/CC - missing space from previous section - JOHN
187
+ return '([A-Za-z]+)(?=(((?P<slash>[/:]+)[A-Za-z]+)((?P=slash)[A-Za-z]+)*\W+\d+))'
188
+
189
+ @staticmethod
190
+ def get_colon_prefix():
191
+ #Split the tokens before and after the colon
192
+ #Does not split time - we make sure the token before the colon
193
+ #starts with a letter.
194
+ #Splits patterns like <CHAR 1>:<CHAR 2> where CHAR 1 starts with a
195
+ #letter and is followed by one more letters/digits
196
+ #CHAR 2 is a combination of letters/digits of length greater than 2
197
+ #This wont split time, but assumes that when the colon is present
198
+ #the entities on either side of the token are different tokens
199
+ #A:9 - not split - more likely this makes sense as a single token (could be a chemical)
200
+ return '([A-Za-z][A-Za-z0-9]+)(?=([:][A-Za-z0-9]{2,}))'
201
+
202
+ @staticmethod
203
+ def get_temperature_prefix(split_temperature):
204
+ if(split_temperature):
205
+ return '((\d+)|(\d+[.]\d+))(?=(\u00B0([FCK]{1}|$)))'
206
+ else:
207
+ return '(((\d+)|(\d+[.]\d+))\u00B0([FCK]{1}|$))|(\u00A9[FCK]{1})'
208
+
209
+ @staticmethod
210
+ def get_percentage_prefix(split_percentage):
211
+ """
212
+ If text is of the format take it 20% times, this function
213
+ can be used to treat the entire thing as one token or
214
+ split into two separate tokens
215
+ Args:
216
+ split_percentage (bool): whether to treat it as one token or split them up
217
+ Returns:
218
+ (str): regex to either keep as one token or split into two
219
+ """
220
+ if(split_percentage):
221
+ return '(((\d+)|(\d+[.]\d+)))(?=(%(\W+|$)))'
222
+ else:
223
+ return '(((\d+)|(\d+[.]\d+))%(\W+|$))'
224
+
225
+ @staticmethod
226
+ def get_value_range_prefixes():
227
+ #The following regex might not work on .4-.5 - no number before decimal point
228
+ #need to figure this out without breaking anything else
229
+ value_range_1 = '(\d{1})(?=([-]((\d{1,2}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
230
+ value_range_2 = '(\d{2})(?=([-]((\d{2,3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
231
+ value_range_3 = '(\d{3})(?=([-]((\d{3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
232
+ return value_range_1, value_range_2, value_range_3
233
+
234
+ @staticmethod
235
+ def get_year_range_prefix():
236
+ return '(\d{4})(?=([-](\d{4})([a-zA-Z]+|[\W]*$)))'
237
+
238
+ @staticmethod
239
+ def get_short_digit_id_prefix():
240
+ #4A, 3C etc
241
+ return '(\d{1,2}[A-EG-LN-WZ]{1}(?=(\W+|$)))'
242
+
243
+ #Handles Case 1, 2 of MRN
244
+ #Handles Case 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19 of AGE
245
+ #Handles Case 2, 3, 5 of IDNUM
246
+ #Handles Case 1 of HOSP
247
+ @staticmethod
248
+ def get_digit_symbol_word_prefix():
249
+ return '((\d+)|(\d+[.]\d+))(?=\W+[a-zA-Z]+)'
250
+
251
+ @staticmethod
252
+ def get_digit_age_split_prefix():
253
+ age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
254
+ '|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
255
+ '(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
256
+ return '((\d{1,3}))(?=(' + age_suffix + '\W*$))'
257
+
258
+ @staticmethod
259
+ def get_digit_word_short_prefix():
260
+ return '((\d+)|(\d+[.]\d+))([a-z]{1,2}|[A-Z]{1,2})(?=(\W*$))'
261
+
262
+ @staticmethod
263
+ def get_digit_word_typo_prefix():
264
+ return '((\d+)|(\d+[.]\d+))(?=[a-zA-Z]{1}[a-zA-Z\W]+)'
265
+
266
+ @staticmethod
267
+ def get_prefixes(split_multiple, split_temperature, split_percentage):
268
+ word_typo_prefix = ClinicalRegex.get_word_typo_prefix()
269
+ word_symbol_digit_prefix = ClinicalRegex.get_word_symbol_digit_prefix()
270
+ pager_prefix = ClinicalRegex.get_pager_prefix()
271
+ age_word_prefix = ClinicalRegex.get_age_word_prefix()
272
+ word_period_prefix = ClinicalRegex.get_word_period_prefix()
273
+ id_prefix = ClinicalRegex.get_id_prefix()
274
+ multiple_prefix = ClinicalRegex.get_multiple_prefix(split_multiple)
275
+ chemical_prefix = ClinicalRegex.get_chemical_prefix()
276
+ chemical_prefix_small = ClinicalRegex.get_chemical_prefix_small()
277
+ instrument_prefix = ClinicalRegex.get_instrument_prefix()
278
+ instrument_prefix_small = ClinicalRegex.get_instrument_prefix_small()
279
+ age_typo_prefix = ClinicalRegex.get_age_typo_prefix()
280
+ word_digit_split_prefix = ClinicalRegex.get_word_digit_split_prefix()
281
+ word_digit_mix_prefix = ClinicalRegex.get_word_digit_mix_prefix()
282
+ word_digit_mix_prefix_small = ClinicalRegex.get_word_digit_mix_prefix_small()
283
+ word_id_split_prefix = ClinicalRegex.get_word_id_split_prefix()
284
+ word_section_prefix = ClinicalRegex.get_word_section_prefix()
285
+ colon_prefix = ClinicalRegex.get_colon_prefix()
286
+ temperature_prefix = ClinicalRegex.get_temperature_prefix(split_temperature)
287
+ percentage_prefix = ClinicalRegex.get_percentage_prefix(split_percentage)
288
+ value_range_1, value_range_2, value_range_3 = ClinicalRegex.get_value_range_prefixes()
289
+ year_range_prefix = ClinicalRegex.get_year_range_prefix()
290
+ short_digit_id_prefix = ClinicalRegex.get_short_digit_id_prefix()
291
+ digit_symbol_word_prefix = ClinicalRegex.get_digit_symbol_word_prefix()
292
+ digit_age_split_prefix = ClinicalRegex.get_digit_age_split_prefix()
293
+ digit_word_short_prefix = ClinicalRegex.get_digit_word_short_prefix()
294
+ digit_word_typo_prefix = ClinicalRegex.get_digit_word_typo_prefix()
295
+
296
+ return [word_typo_prefix, word_symbol_digit_prefix, pager_prefix, age_word_prefix,\
297
+ word_period_prefix, id_prefix, multiple_prefix, chemical_prefix, chemical_prefix_small,\
298
+ instrument_prefix, instrument_prefix_small, age_typo_prefix, word_digit_split_prefix,\
299
+ word_id_split_prefix, word_digit_mix_prefix, word_digit_mix_prefix_small, \
300
+ word_section_prefix, colon_prefix, temperature_prefix,\
301
+ percentage_prefix, value_range_1, value_range_2, value_range_3, year_range_prefix,\
302
+ short_digit_id_prefix, digit_symbol_word_prefix, digit_age_split_prefix,\
303
+ digit_word_short_prefix, digit_word_typo_prefix]
304
+
305
+ @staticmethod
306
+ def get_infixes():
307
+ digit_infix = '(\d+(?P<sep>[-:]+)\d+((?P=sep)\d+)*)'
308
+ return [digit_infix, ]
309
+
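A quick, illustrative check of the missing-space rule described in get_word_typo_prefix above.

# Illustrative sketch: 'John' is split off so spaCy can tokenize 'Marital' separately.
import re
from ner_datasets.preprocessing.tokenizers.utils.clinical_regex import ClinicalRegex

typo_prefix = re.compile(ClinicalRegex.get_word_typo_prefix())
print(typo_prefix.match('JohnMarital').group(0))  # 'John'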
ner_datasets/preprocessing/tokenizers/utils/date_regex.py ADDED
@@ -0,0 +1,104 @@
1
+ class DateRegex(object):
2
+
3
+ @staticmethod
4
+ def __get_day_attributes():
5
+ # day of the month with optional suffix, such as 7th, 22nd
6
+ dd = '(([0-2]?[0-9]|3[01])(\s*)([sS][tT]|[nN][dD]|[rR][dD]|[tT][hH])?)'
7
+ # two-digit numeric day of the month
8
+ DD = '(0[0-9]|[1-2][0-9]|3[01])'
9
+
10
+ return dd, DD
11
+
12
+ @staticmethod
13
+ def __get_month_attributes():
14
+
15
+ m = \
16
+ '([jJ][aA][nN]([uU][aA][rR][yY])?|'+\
17
+ '[fF][eE][bB]([rR][uU][aA][rR][yY])?|'+\
18
+ '[mM][aA][rR]([cC][hH])?|'+\
19
+ '[aA][pP][rR]([iI][lL])?|'+\
20
+ '[mM][aA][yY]|'+\
21
+ '[jJ][uU][nN]([eE])?|'+\
22
+ '[jJ][uU][lL]([yY])?|'+\
23
+ '[aA][uU][gG]([uU][sS][tT])?|'+\
24
+ '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
25
+ '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
26
+ '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
27
+ '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
28
+ M = m
29
+
30
+ # numeric month
31
+ mm = '(0?[0-9]|1[0-2]|' + m + ')'
32
+
33
+ # two digit month
34
+ MM = '(0[0-9]|1[0-2]|' + m + ')'
35
+
36
+ return m, M, mm, MM
37
+
38
+ @staticmethod
39
+ def __get_year_attributes():
40
+
41
+ # two or four digit year
42
+ y = '([0-9]{4}|[0-9]{2})'
43
+
44
+ # two digit year
45
+ yy = '([0-9]{2})'
46
+
47
+ # four digit year
48
+ YY = '([0-9]{4})'
49
+
50
+ return y, yy, YY
51
+
52
+ @staticmethod
53
+ def __get_sep_attributes():
54
+
55
+ date_sep = '[-./]'
56
+ date_sep_optional = '[-./]*'
57
+ date_sep_no_full = '[-/]'
58
+
59
+ return date_sep, date_sep_optional, date_sep_no_full
60
+
61
+ #def get_week_attributes():
62
+ # w = \
63
+ # '([mM][oO][nN]([dD][aA][yY])?|'+\
64
+ # '[tT][uU][eE]([sS][dD][aA][yY])?|'+\
65
+ # '[wW][eE][dD]([nN][eE][sS][dD][aA][yY])?|'+\
66
+ # '[tT][hH][uU][gG]([uU][sS][tT])?|'+\
67
+ # '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
68
+ # '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
69
+ # '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
70
+ # '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
71
+
72
+ @staticmethod
73
+ def get_infixes():
74
+
75
+ dd, DD = DateRegex.__get_day_attributes()
76
+ m, M, mm, MM = DateRegex.__get_month_attributes()
77
+ y, yy, YY = DateRegex.__get_year_attributes()
78
+ date_sep, date_sep_optional, date_sep_no_full = DateRegex.__get_sep_attributes()
79
+
80
+ date_1 = y + '/' + mm + '/' + dd + '(?!([/]+|\d+))'
81
+ date_2 = y + '/' + dd + '/' + mm + '(?!([/]+|\d+))'
82
+ date_3 = dd + '/' + mm + '/' + y + '(?!([/]+|\d+))'
83
+ date_4 = mm + '/' + dd + '/' + y + '(?!([/]+|\d+))'
84
+ #Do I make this optional (date_sep_optional) - need to check
85
+ date_5 = y + date_sep + m + date_sep + dd + '(?!\d)'
86
+ date_6 = y + date_sep + dd + date_sep + m
87
+ date_7 = dd + date_sep + m + date_sep + y
88
+ date_8 = m + date_sep + dd + date_sep + y
89
+ date_9 = y + date_sep + m
90
+ date_10 = m + date_sep + y
91
+ date_11 = dd + date_sep + m
92
+ date_12 = m + date_sep + dd
93
+ date_13 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
94
+ date_14 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
95
+ date_15 = '(?<!([/]|\d))' + dd + '/' + y + '(?!([/]+|\d+))'
96
+ date_16 = '(?<!([/]|\d))' + mm + '/' + y + '(?!([/]+|\d+))'
97
+ date_17 = '(?<!([/]|\d))' + dd + '/' + mm + '(?!([/]+|\d+))'
98
+ date_18 = '(?<!([/]|\d))' + mm + '/' + dd + '(?!([/]+|\d+))'
99
+
100
+ date_infixes = [date_1, date_2, date_3, date_4, date_5, date_6,\
101
+ date_7, date_8, date_9, date_10, date_11, date_12,\
102
+ date_13, date_14, date_15, date_16, date_17, date_18]
103
+
104
+ return date_infixes
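An illustrative check that the date infix patterns above pick up a mm/dd/yyyy string as a single unit.

# Illustrative sketch: join the infix patterns into one alternation and search.
import re
from ner_datasets.preprocessing.tokenizers.utils.date_regex import DateRegex

date_pattern = re.compile('|'.join(DateRegex.get_infixes()))
print(date_pattern.search('Seen on 12/25/2020 for follow-up').group(0))  # '12/25/2020'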
ner_datasets/span_fixer.py ADDED
@@ -0,0 +1,380 @@
1
+ import re
2
+ import json
3
+ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
4
+ from typing import Iterable, Dict, List, Sequence, Union, Mapping, Tuple, NoReturn
5
+
6
+ from .preprocessing import PreprocessingLoader
7
+
8
+
9
+ class SpanFixer(object):
10
+ """
11
+ The tokens and spans may not align depending on the tokenizer used.
12
+ This class expands the span to cover the tokens, so we don't have a mismatch.
13
+ A mismatch is when a span_start will not coincide with some token_start or the span_end
14
+ will not coincide with some token_end. This class changes the span_start and span_end
15
+ so that the span_start will coincide with some token_start and the span_end
16
+ will coincide with some token_end - and we don't get any position mismatch errors while
17
+ building our dataset. This entire process involves updating span positions which can lead to duplicate
18
+ or overlapping spans, which then need to be removed.
19
+ E.g we have text: The patient is 75yo man
20
+ AGE Span: 75
21
+ Token: 75yo
22
+ As you can see the span is smaller than the token, which will lead to an error when
23
+ building the NER dataset.
24
+ To ensure this does not happen, we correct the span. We change the span from
25
+ 75 to 75yo -> So now AGE Span is 75yo instead of 75. This script essentially changes
26
+ the annotated spans to match the tokens. In an ideal case we wouldn't need this script
27
+ but since medical notes have many typos, this script becomes necessary to deal with
28
+ issues and changes that arise from different tokenizers.
29
+ Also sort the spans and convert the start and end keys of the spans to integers
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ sentencizer: str,
35
+ tokenizer: str,
36
+ ner_priorities: Mapping[str, int],
37
+ verbose: bool = True
38
+ ) -> NoReturn:
39
+ """
40
+ Initialize the sentencizer and tokenizer
41
+ Args:
42
+ sentencizer (str): The sentencizer to use for splitting text into sentences
43
+ tokenizer (str): The tokenizer to use for splitting text into tokens
44
+ ner_priorities (Mapping[str, int]): The priority when choosing which duplicates to remove.
45
+ Mapping that represents a priority for each NER type
46
+ verbose (bool): To print out warnings etc
47
+ """
48
+ self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer)
49
+ self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer)
50
+ self._ner_priorities = ner_priorities
51
+ self._verbose = verbose
52
+
53
+ def __get_token_positions(self, text: str) -> Tuple[Dict[int, int], Dict[int, int]]:
54
+ """
55
+ Get the start and end positions of all the tokens in the note.
56
+ Args:
57
+ text (str): The text present in the note
58
+ Returns:
59
+ token_start_positions (Mapping[int, int]): The start positions of all the tokens in the note
60
+ token_end_positions (Mapping[int, int]): The end positions of all the tokens in the note
61
+ """
62
+ token_start_positions = dict()
63
+ token_end_positions = dict()
64
+ for sentence in self._sentencizer.get_sentences(text):
65
+ offset = sentence['start']
66
+ for token in self._tokenizer.get_tokens(sentence['text']):
67
+ start = token['start'] + offset
68
+ end = token['end'] + offset
69
+ token_start_positions[start] = 1
70
+ token_end_positions[end] = 1
71
+ return token_start_positions, token_end_positions
72
+
73
+ def get_duplicates(
74
+ self,
75
+ spans: List[Dict[str, Union[str, int]]],
76
+ ) -> List[int]:
77
+ """
78
+ Return the indexes where there are duplicate/overlapping spans. A duplicate or
79
+ overlapping span is one where the same token can have two labels.
80
+ E.g:
81
+ Token: BWH^Bruce
82
+ This is a single token where BWH is the hospital label and Bruce is the Patient label
83
+ The fix_alignment function assigns this entire token the hospital label but it also
84
+ assigns this entire token the patient label. Since we have two labels for the same
85
+ token, we need to remove one of them.
86
+ We assign this entire token one label - either hospital label or the patient label
87
+ In this case we assign patient because of higher priority. So now we need to remove
88
+ the hospital label from the dataset (since it is essentially a duplicate label). This
89
+ script handles this case.
90
+ There are cases when two different labels match the same token partially
91
+ E.g
92
+ Text: JT/781-815-9090
93
+ Spans: JT - hospital, 781-815-9090 - Phone
94
+ Tokens: (Jt/781) & (- 815 - 9090)
95
+ As you can see the token JT/781 will be assigned the label in the fix_alignment function
96
+ but 781-815-9090 is also phone and the 781 portion is overlapped, and we need to resolve this.
97
+ In this script, we resolve it by treating JT/781 as one span (hospital) and
98
+ -815-9090 as another span (phone).
99
+ Args:
100
+ spans ([List[Dict[str, Union[str, int]]]): The NER spans in the note
101
+ Returns:
102
+ remove_spans (Sequence[int]): A list of indexes of the spans to remove
103
+ """
104
+ remove_spans = list()
105
+ prev_start = -1
106
+ prev_end = -1
107
+ prev_label = None
108
+ prev_index = None
109
+ spans.sort(key=lambda _span: (_span['start'], _span['end']))
110
+ for index, span in enumerate(spans):
111
+ current_start = span['start']
112
+ current_end = span['end']
113
+ current_label = span['label']
114
+ if type(current_start) != int or type(current_end) != int:
115
+ raise ValueError('The start and end keys of the span must be of type int')
116
+ # Check if the current span matches another span
117
+ # that is if this span covers the same tokens as the
118
+ # previous spans (but has a different label)
119
+ # Based on the priority, treat the span with the low
120
+ # priority label as a duplicate label and add it to the
121
+ # list of spans that need to be removed
122
+ if current_start == prev_start and current_end == prev_end:
123
+ if self._ner_priorities[current_label] > self._ner_priorities[prev_label]:
124
+ # Store index of the previous span if it has lower priority
125
+ remove_spans.append(prev_index)
126
+ # Reset span details
127
+ prev_start = current_start
128
+ prev_end = current_end
129
+ prev_index = index
130
+ prev_label = current_label
131
+ if self._verbose:
132
+ print('DUPLICATE: ', span)
133
+ print('REMOVED: ', spans[remove_spans[-1]])
134
+ elif self._ner_priorities[current_label] <= self._ner_priorities[prev_label]:
135
+ # Store current index of span if it has lower priority
136
+ remove_spans.append(index)
137
+ if self._verbose:
138
+ print('DUPLICATE: ', spans[prev_index])
139
+ print('REMOVED: ', spans[remove_spans[-1]])
140
+ # Check for overlapping span
141
+ elif current_start < prev_end:
142
+ # If the current span end matches the overlapping span end
143
+ # Remove the current span, since it is smaller
144
+ if current_end <= prev_end:
145
+ remove_spans.append(index)
146
+ if self._verbose:
147
+ print('DUPLICATE: ', spans[prev_index])
148
+ print('REMOVED: ', spans[remove_spans[-1]])
149
+ # If the current end is greater than the prev_end
150
+ # then we split it into two spans. We treat the previous span
151
+ # as one span and the end of the previous span to the end of the current span
152
+ # as another span.
153
+ elif current_end > prev_end:
154
+ # Create the new span - start=previous_span_end, end=current_span_end
155
+ overlap_length = spans[prev_index]['end'] - current_start
156
+ new_text = span['text'][overlap_length:]
157
+ # Remove extra spaces that may arise during this span separation
158
+ new_text = re.sub('^(\s+)', '', new_text, flags=re.DOTALL)
159
+ span['start'] = current_end - len(new_text)
160
+ span['text'] = new_text
161
+ if self._verbose:
162
+ print('OVERLAP: ', spans[prev_index])
163
+ print('UPDATED: ', span)
164
+ # Reset span details
165
+ prev_start = current_start
166
+ prev_end = current_end
167
+ prev_label = current_label
168
+ prev_index = index
169
+ # Reset span details
170
+ else:
171
+ prev_start = current_start
172
+ prev_end = current_end
173
+ prev_label = current_label
174
+ prev_index = index
175
+ return remove_spans
176
+
177
+ def fix_alignment(
178
+ self,
179
+ text: str,
180
+ spans: Sequence[Dict[str, Union[str, int]]]
181
+ ) -> Iterable[Dict[str, Union[str, int]]]:
182
+ """
183
+ Align the span and tokens. When the tokens and spans don't align, we change the
184
+ start and end positions of the spans so that they align with the tokens. This is
185
+ needed when a different tokenizer is used and the spans which are defined against
186
+ a different tokenizer don't line up with the new tokenizer. Also remove spaces present
187
+ at the start or end of the span.
188
+ E.g:
189
+ Token: BWH^Bruce
190
+ This is a single token where BWH is the hospital label and Bruce is the Patient label
191
+ The fix_alignment function assigns this entire token the hospital label but it also
192
+ assigns this entire token the patient label. This function basically expands the span
193
+ so that it matches the start and end positions of some token. By doing this it may create
194
+ overlapping and duplicate spans. As you can see it expands the patient label to match the
195
+ start of the token and it expands the hospital label to match the end of the token.
196
+ The duplicate/overlapping spans this creates are removed later by the get_duplicates function.
197
+ Args:
198
+ text (str): The text present in the note
199
+ spans ([Sequence[Dict[str, Union[str, int]]]): The NER spans in the note
200
+ Returns:
201
+ (Iterable[Dict[str, Union[str, int]]]): Iterable through the modified spans
202
+ """
203
+ # Get token start and end positions so that we can check if a span
204
+ # coincides with the start and end position of some token.
205
+ token_start_positions, token_end_positions = self.__get_token_positions(text)
206
+ for span in spans:
207
+ start = span['start']
208
+ end = span['end']
209
+ if type(start) != int or type(end) != int:
210
+ raise ValueError('The start and end keys of the span must be of type int')
211
+ if re.search('^\s', text[start:end]):
212
+ if self._verbose:
213
+ print('WARNING - space present in the start of the span')
214
+ start = start + 1
215
+ if re.search('(\s+)$', text[start:end], flags=re.DOTALL):
216
+ new_text = re.sub('(\s+)$', '', text[start:end], flags=re.DOTALL)
217
+ end = start + len(new_text)
218
+ # When a span does not coincide with the start and end position of some token
219
+ # it means there will be an error when building the ner dataset, we try and avoid
220
+ # that error by updating the spans itself, that is we expand the start/end positions
221
+ # of the spans so that it is aligned with the tokens.
222
+ while token_start_positions.get(start, False) is False:
223
+ start -= 1
224
+ while token_end_positions.get(end, False) is False:
225
+ end += 1
226
+ # Print what the old span was and what the new expanded span will look like
227
+ if self._verbose and (int(span['start']) != start or int(span['end']) != end):
228
+ print('OLD SPAN: ', text[int(span['start']):int(span['end'])])
229
+ print('NEW SPAN: ', text[start:end])
230
+ # Update the span with its new start and end positions
231
+ span['start'] = start
232
+ span['end'] = end
233
+ span['text'] = text[start:end]
234
+ yield span
235
+
236
+ def fix_note(
237
+ self,
238
+ text: str,
239
+ spans: Sequence[Dict[str, Union[str, int]]],
240
+ ) -> Iterable[Dict[str, Union[str, int]]]:
241
+ """
242
+ This function changes the span_start and span_end
243
+ so that the span_start will coincide with some token_start and the span_end
244
+ will coincide with some token_end and also removes duplicate/overlapping spans
245
+ that may arise when we change the span start and end positions. The resulting
246
+ spans from this function will always coincide with some token start and token
247
+ end, and hence will not have any token and span mismatch errors when building the
248
+ NER dataset. For more details and examples check the documentation of the
249
+ fix_alignment and get_duplicates functions.
250
+ Args:
251
+ text (str): The text present in the note
252
+ spans ([Sequence[Mapping[str, Union[str, int]]]): The NER spans in the note
253
+ Returns:
254
+ (Iterable[Mapping[str, Union[str, int]]]): Iterable through the fixed spans
255
+ """
256
+ # Fix span position alignment
257
+ spans = [span for span in self.fix_alignment(text=text, spans=spans)]
258
+ # Check for duplicate/overlapping spans
259
+ remove_spans = self.get_duplicates(spans=spans)
260
+ for index, span in enumerate(spans):
261
+ # Remove the duplicate/overlapping spans
262
+ if index not in remove_spans:
263
+ yield span
264
+
265
+ def fix(
266
+ self,
267
+ input_file: str,
268
+ text_key: str = 'text',
269
+ spans_key: str = 'spans'
270
+ ) -> Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]:
271
+ """
272
+ This function changes the span_start and span_end
273
+ so that the span_start will coincide with some token_start and the span_end
274
+ will coincide with some token_end and also removes duplicate/overlapping spans
275
+ that may arise when we change the span start and end positions. The resulting
276
+ spans from this function will always coincide with some token start and token
277
+ end, and hence will not have any token and span mismatch errors when building the
278
+ NER dataset. For more details and examples check the documentation of the
279
+ fix_alignment and get_duplicates functions. Fix spans that arise due to bad typos,
280
+ which are not fixed during tokenization. This essentially updates the spans so that
281
+ they line up with the start and end positions of tokens - so that there is no error
282
+ when we assign labels to tokens based on these spans
283
+ Args:
284
+ input_file (str): The file that contains the notes that we want to fix the token issues in
285
+ text_key (str): the key where the note & token text is present in the json object
286
+ spans_key (str): The key where the note spans are present in the json object
287
+ Returns:
288
+ (Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]): Iterable through the fixed
289
+ notes
290
+ """
291
+ for line in open(input_file, 'r'):
292
+ note = json.loads(line)
293
+ note[spans_key] = [span for span in self.fix_note(text=note[text_key], spans=note[spans_key])]
294
+ yield note
295
+
296
+
297
+ def main():
298
+ # The following code sets up the arguments to be passed via CLI or via a JSON file
299
+ cli_parser = ArgumentParser(
300
+ description='configuration arguments provided at run time from the CLI',
301
+ formatter_class=ArgumentDefaultsHelpFormatter
302
+ )
303
+ cli_parser.add_argument(
304
+ '--input_file',
305
+ type=str,
306
+ required=True,
307
+ help='the jsonl file that contains the notes'
308
+ )
309
+ cli_parser.add_argument(
310
+ '--sentencizer',
311
+ type=str,
312
+ required=True,
313
+ help='the sentencizer to use for splitting notes into sentences'
314
+ )
315
+ cli_parser.add_argument(
316
+ '--tokenizer',
317
+ type=str,
318
+ required=True,
319
+ help='the tokenizer to use for splitting text into tokens'
320
+ )
321
+ cli_parser.add_argument(
322
+ '--abbreviations_file',
323
+ type=str,
324
+ default=None,
325
+ help='file that will be used by clinical tokenizer to handle abbreviations'
326
+ )
327
+ cli_parser.add_argument(
328
+ '--ner_types',
329
+ nargs="+",
330
+ required=True,
331
+ help='the NER types'
332
+ )
333
+ cli_parser.add_argument(
334
+ '--ner_priorities',
335
+ nargs="+",
336
+ required=True,
337
+ help='the priorities for the NER types - the priority when choosing which duplicates to remove'
338
+ )
339
+ cli_parser.add_argument(
340
+ '--text_key',
341
+ type=str,
342
+ default='text',
343
+ help='the key where the note & token text is present in the json object'
344
+ )
345
+ cli_parser.add_argument(
346
+ '--spans_key',
347
+ type=str,
348
+ default='spans',
349
+ help='the key where the note spans are present in the json object'
350
+ )
351
+ cli_parser.add_argument(
352
+ '--output_file',
353
+ type=str,
354
+ required=True,
355
+ help='the output json file that will contain the new fixed spans'
356
+ )
357
+ args = cli_parser.parse_args()
358
+ # Mapping that represents a priority for each PHI type
359
+ # For example, the PATIENT type will have a higher priority as
360
+ # compared to STAFF.
361
+ if len(args.ner_types) == len(args.ner_priorities):
362
+ ner_priorities = {ner_type: int(priority) for ner_type, priority in zip(args.ner_types, args.ner_priorities)}
363
+ else:
364
+ raise ValueError('Length of ner_types and ner_priorities must be the same')
365
+ span_fixer = SpanFixer(
366
+ tokenizer=args.tokenizer,
367
+ sentencizer=args.sentencizer,
368
+ ner_priorities=ner_priorities
369
+ )
370
+ with open(args.output_file, 'w') as file:
371
+ for note in span_fixer.fix(
372
+ input_file=args.input_file,
373
+ text_key=args.text_key,
374
+ spans_key=args.spans_key
375
+ ):
376
+ file.write(json.dumps(note) + '\n')
377
+
378
+
379
+ if __name__ == '__main__':
380
+ main()
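An illustrative sketch of the span fix described in the class docstring above. The sentencizer and tokenizer names are placeholders that must be replaced with values supported by PreprocessingLoader, and the priority values are made up.

# Illustrative sketch: placeholder sentencizer/tokenizer names and made-up priorities.
from ner_datasets.span_fixer import SpanFixer

span_fixer = SpanFixer(
    sentencizer='<sentencizer name>',   # placeholder
    tokenizer='<tokenizer name>',       # placeholder
    ner_priorities={'AGE': 1, 'PATIENT': 2, 'STAFF': 1},
)
text = 'The patient is 75yo man'
spans = [{'start': 15, 'end': 17, 'label': 'AGE', 'text': '75'}]
for span in span_fixer.fix_note(text=text, spans=spans):
    print(span)  # the AGE span is widened to cover the whole token '75yo'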
ner_datasets/span_validation.py ADDED
@@ -0,0 +1,91 @@
1
+ import json
2
+ import random
3
+ from argparse import ArgumentParser
4
+ from typing import Union, NoReturn, Iterable, Dict, List
5
+
6
+ random.seed(41)
7
+
8
+
9
+ class SpanValidation(object):
10
+ """
11
+ This class is used to build a mapping between the note id
12
+ and the annotated spans in that note. This will be used during the
13
+ evaluation of the models. This is required to perform span level
14
+ evaluation.
15
+ """
16
+ @staticmethod
17
+ def get_spans(
18
+ input_file: str,
19
+ metadata_key: str = 'meta',
20
+ note_id_key: str = 'note_id',
21
+ spans_key: str = 'spans'
22
+ ):
23
+ """
24
+ Get a mapping between the note id
25
+ and the annotated spans in that note. This will mainly be used during the
26
+ evaluation of the models.
27
+ Args:
28
+ input_file (str): The input file
29
+ metadata_key (str): The key where the note metadata is present
30
+ note_id_key (str): The key where the note id is present
31
+ spans_key (str): The key that contains the annotated spans for a note dictionary
32
+ Returns:
33
+ (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
34
+ and contains the note id and annotated spans
35
+ for that note
36
+ """
37
+ # Read the input files (data source)
38
+ for line in open(input_file, 'r'):
39
+ note = json.loads(line)
40
+ note_id = note[metadata_key][note_id_key]
41
+ # Store the note_id and the annotated spans
42
+ note[spans_key].sort(key=lambda _span: (_span['start'], _span['end']))
43
+ yield {'note_id': note_id, 'note_spans': note[spans_key]}
44
+
45
+
46
+ def main() -> NoReturn:
47
+ cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
48
+ cli_parser.add_argument(
49
+ '--input_file',
50
+ type=str,
51
+ required=True,
52
+ help='the jsonl file that contains the notes'
53
+ )
54
+ cli_parser.add_argument(
55
+ '--metadata_key',
56
+ type=str,
57
+ default='meta',
58
+ help='the key where the note metadata is present in the json object'
59
+ )
60
+ cli_parser.add_argument(
61
+ '--note_id_key',
62
+ type=str,
63
+ default='note_id',
64
+ help='the key where the note id is present in the json object'
65
+ )
66
+ cli_parser.add_argument(
67
+ '--spans_key',
68
+ type=str,
69
+ default='spans',
70
+ help='the key where the annotated spans for the notes are present in the json object'
71
+ )
72
+ cli_parser.add_argument(
73
+ '--output_file',
74
+ type=str,
75
+ required=True,
76
+ help='the file where the note id and the corresponding spans for that note are to be saved'
77
+ )
78
+ args = cli_parser.parse_args()
79
+
80
+ # Write the dataset to the output file
81
+ with open(args.output_file, 'w') as file:
82
+ for span_info in SpanValidation.get_spans(
83
+ input_file=args.input_file,
84
+ metadata_key=args.metadata_key,
85
+ note_id_key=args.note_id_key,
86
+ spans_key=args.spans_key):
87
+ file.write(json.dumps(span_info) + '\n')
88
+
89
+
90
+ if __name__ == "__main__":
91
+ main()
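An illustrative sketch of how the same mapping can be produced outside the CLI (the file names are placeholders): each output line pairs a note_id with that note's sorted annotated spans for span-level evaluation.

# Illustrative sketch: writes one {'note_id': ..., 'note_spans': [...]} record per note.
import json
from ner_datasets.span_validation import SpanValidation

with open('validation_spans.jsonl', 'w') as out_file:
    for span_info in SpanValidation.get_spans(input_file='notes.jsonl'):
        out_file.write(json.dumps(span_info) + '\n')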
sequence_tagging/.DS_Store ADDED
Binary file (6.15 kB). View file
 
sequence_tagging/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .sequence_tagger import SequenceTagger
2
+ __all__ = ["SequenceTagger"]
sequence_tagging/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (267 Bytes). View file
 
sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc ADDED
Binary file (13.6 kB). View file
 
sequence_tagging/arguments/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .model_arguments import ModelArguments
2
+ from .evaluation_arguments import EvaluationArguments
3
+ from .data_training_arguments import DataTrainingArguments
4
+ __all__ = [
5
+ "ModelArguments",
6
+ "DataTrainingArguments",
7
+ "EvaluationArguments",
8
+ ]
sequence_tagging/arguments/data_training_arguments.py ADDED
@@ -0,0 +1,115 @@
1
+ from typing import Optional
2
+ from dataclasses import dataclass, field
3
+
4
+ @dataclass
5
+ class DataTrainingArguments:
6
+ """
7
+ Arguments pertaining to what data we are going to input our model for training and eval.
8
+ """
9
+ task_name: Optional[str] = field(
10
+ default="ner",
11
+ metadata={"help": "The name of the task (ner, pos...)."}
12
+ )
13
+ notation: str = field(
14
+ default="BIO",
15
+ metadata={"help": "NER notation e.g BIO"},
16
+ )
17
+ ner_types: Optional[str] = field(
18
+ default=None,
19
+ metadata={"help": "Pass a list of NER types"},
20
+ )
21
+ train_file: Optional[str] = field(
22
+ default=None,
23
+ metadata={"help": "The input training data file (a csv or JSON file)."}
24
+ )
25
+ validation_file: Optional[str] = field(
26
+ default=None,
27
+ metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
28
+ )
29
+ test_file: Optional[str] = field(
30
+ default=None,
31
+ metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
32
+ )
33
+ output_predictions_file: Optional[str] = field(
34
+ default=None,
35
+ metadata={"help": "A location where to write the output of the test data"},
36
+ )
37
+ text_column_name: Optional[str] = field(
38
+ default='tokens',
39
+ metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
40
+ )
41
+ label_column_name: Optional[str] = field(
42
+ default='labels',
43
+ metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
44
+ )
45
+ overwrite_cache: bool = field(
46
+ default=False,
47
+ metadata={"help": "Overwrite the cached training and evaluation sets"}
48
+ )
49
+ preprocessing_num_workers: Optional[int] = field(
50
+ default=None,
51
+ metadata={"help": "The number of processes to use for the preprocessing."},
52
+ )
53
+ pad_to_max_length: bool = field(
54
+ default=False,
55
+ metadata={
56
+ "help": "Whether to pad all samples to model maximum sentence length. "
57
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
58
+ "efficient on GPU but very bad for TPU."
59
+ },
60
+ )
61
+ truncation: bool = field(
62
+ default=True,
63
+ metadata={
64
+ "help": "Activates and controls truncation"
65
+ },
66
+ )
67
+ max_length: int = field(
68
+ default=512,
69
+ metadata={
70
+ "help": "Controls the maximum length to use by one of the truncation/padding parameters."
71
+ },
72
+ )
73
+ do_lower_case: bool = field(
74
+ default=False,
75
+ metadata={
76
+ "help": "Whether to lowercase the text"
77
+ },
78
+ )
79
+ max_train_samples: Optional[int] = field(
80
+ default=None,
81
+ metadata={
82
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
83
+ "value if set."
84
+ },
85
+ )
86
+ max_eval_samples: Optional[int] = field(
87
+ default=None,
88
+ metadata={
89
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
90
+ "value if set."
91
+ },
92
+ )
93
+ max_predict_samples: Optional[int] = field(
94
+ default=None,
95
+ metadata={
96
+ "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
97
+ "value if set."
98
+ },
99
+ )
100
+ label_all_tokens: bool = field(
101
+ default=False,
102
+ metadata={
103
+ "help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
104
+ "one (in which case the other tokens will have a padding index)."
105
+ },
106
+ )
107
+ return_entity_level_metrics: bool = field(
108
+ default=True,
109
+ metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
110
+ )
111
+ token_ignore_label: str = field(
112
+ default='NA',
113
+ metadata={"help": "The label that indicates where the tokens will be ignored in loss computation. Used for "
114
+ "indicating context tokens to the model"}
115
+ )
sequence_tagging/arguments/evaluation_arguments.py ADDED
@@ -0,0 +1,26 @@
1
+ from typing import Optional
2
+ from dataclasses import dataclass, field
3
+
4
+ @dataclass
5
+ class EvaluationArguments:
6
+ """
7
+ Arguments pertaining to the evaluation process.
8
+ """
9
+ model_eval_script: Optional[str] = field(
10
+ default=None,
11
+ metadata={"help": "The script that is used for evaluation"},
12
+ )
13
+ evaluation_mode: Optional[str] = field(
14
+ default=None,
15
+ metadata={"help": "Strict or default mode for sequence evaluation"},
16
+ )
17
+ validation_spans_file: Optional[str] = field(
18
+ default=None,
19
+ metadata={"help": "A span evaluation data file to evaluate on span level (json file). This will contain a "
20
+ "mapping between the note_ids and note spans"},
21
+ )
22
+ ner_type_maps: Optional[str] = field(
23
+ default=None,
24
+ metadata={"help": "List that contains the mappings from the original NER types to another set of NER "
25
+ "types. Used mainly for evaluation, to map NER token labels to another set of NER token labels"},
26
+ )
sequence_tagging/arguments/model_arguments.py ADDED
@@ -0,0 +1,43 @@
+ from typing import Optional
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     config_name: Optional[str] = field(
+         default=None,
+         metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     tokenizer_name: Optional[str] = field(
+         default=None,
+         metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+     )
+     model_revision: str = field(
+         default="main",
+         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+     )
+     use_auth_token: bool = field(
+         default=False,
+         metadata={
+             "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+                     "with private models)."
+         },
+     )
+     post_process: str = field(
+         default='argmax',
+         metadata={"help": "What post-processing to use on the model logits"},
+     )
+     threshold: Optional[float] = field(
+         default=None,
+         metadata={"help": "Threshold cutoff for softmax"},
+     )
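As a hedged usage sketch (not part of the commit): these dataclasses follow the standard huggingface pattern, so they could be parsed from the command line with transformers.HfArgumentParser; the CLI values below are placeholders chosen for illustration.

from transformers import HfArgumentParser, TrainingArguments

# ModelArguments and EvaluationArguments are the dataclasses added above;
# TrainingArguments comes from transformers and requires --output_dir.
parser = HfArgumentParser((ModelArguments, EvaluationArguments, TrainingArguments))
model_args, eval_args, training_args = parser.parse_args_into_dataclasses(
    args=['--model_name_or_path', 'bert-base-uncased', '--output_dir', 'run_output']
)
print(model_args.post_process, eval_args.evaluation_mode)  # 'argmax' None (the defaults)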
sequence_tagging/dataset_builder/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .ner_labels import NERLabels
+ from .ner_dataset import NERDataset
+ from .label_mapper import LabelMapper
+ from .dataset_tokenizer import DatasetTokenizer
+ __all__ = ["NERLabels", "NERDataset", "LabelMapper", "DatasetTokenizer"]
sequence_tagging/dataset_builder/dataset_tokenizer.py ADDED
@@ -0,0 +1,178 @@
+ from typing import Mapping, Sequence, List, Union, Optional, NoReturn
+ from datasets import Dataset
+ from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
+
+
+ class DatasetTokenizer(object):
+     """
+     The main goal of this class is to solve the problem described below.
+     Most of the comments have been copied from the huggingface webpage.
+     This class initializes a tokenizer with the desired parameters,
+     then tokenizes our dataset and aligns the tokens with the labels
+     while keeping in mind the problem & solution described below. We can use this
+     class for training and for predictions - we just assume the predictions dataset
+     will have a label column filled with some values (so this code can be re-used).
+     Now we arrive at a common obstacle with using pre-trained models for
+     token-level classification: many of the tokens in the dataset may not
+     be in the tokenizer vocabulary. BERT and many models like it use a method
+     called WordPiece tokenization, meaning that single words are split into multiple
+     tokens such that each token is likely to be in the vocabulary. For example,
+     the tokenizer would split the date (token) 2080 into the tokens ['208', '##0'].
+     This is a problem for us because we have exactly one tag per token (2080 -> B-DATE).
+     If the tokenizer splits a token into multiple sub-tokens, then we will end up with
+     a mismatch between our tokens and our labels (208, ##0) - two tokens but one label (B-DATE).
+     One way to handle this is to only train on the tag labels for the first sub-token of a
+     split token. We can do this in huggingface Transformers by setting the labels
+     we wish to ignore to -100. In the example above, if the label for 2080 is B-DATE
+     and say the id (from the label to id mapping) for B-DATE is 3, we would set the labels
+     of ['208', '##0'] to [3, -100]. This tells the model to ignore the tokens labelled with
+     -100 while updating the weights etc.
+     """
+
+     def __init__(
+             self,
+             tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
+             token_column: str,
+             label_column: str,
+             label_to_id: Mapping[str, int],
+             b_to_i_label: Sequence[int],
+             padding: Union[bool, str],
+             truncation: Union[bool, str],
+             is_split_into_words: bool,
+             max_length: Optional[int],
+             label_all_tokens: bool,
+             token_ignore_label: Optional[str]
+     ) -> NoReturn:
+         """
+         Set the tokenizer we are using to subword tokenize our dataset, the name of the
+         column that contains the pre-split tokens, the name of the column that contains
+         the labels for each token, and the label to id mapping.
+         Set the padding strategy of the input. Set whether to truncate the input tokens.
+         Indicate whether the input is pre-split into tokens. Set the max length of the
+         input tokens (post subword tokenization). This will be used in conjunction with truncation.
+         Set whether we want to label even the sub-tokens.
+         In the description above, for 2080 (B-DATE) -> [208, ##0]
+         we use [3, -100], which assumes the label of token 2080 is the one
+         predicted for 208. Alternatively, we can label both sub-tokens,
+         in which case it would be [3, 3] - so we would label 208 as DATE
+         and ##0 as DATE - and we would then have to figure out how to merge these
+         labels etc.
+         Args:
+             tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Tokenizer from huggingface
+             token_column (str): The column that contains the tokens in the dataset
+             label_column (str): The column that contains the labels in the dataset
+             label_to_id (Mapping[str, int]): The mapping between labels and IDs
+             b_to_i_label (Sequence[int]): The mapping from B- label ids to the corresponding I- label ids
+             padding (Union[bool, str]): Padding strategy
+             truncation (Union[bool, str]): Truncation strategy
+             is_split_into_words (bool): Is the input pre-split (tokenized)
+             max_length (Optional[int]): Max subword tokenized length for the model
+             label_all_tokens (bool): Whether to label sub words
+             token_ignore_label (str): The value of the token ignore label - we ignore these in the loss computation
+         """
+         self._tokenizer = tokenizer
+         self._token_column = token_column
+         self._label_column = label_column
+         self._label_to_id = label_to_id
+         self._b_to_i_label = b_to_i_label
+         # We can tell the tokenizer that we're dealing with ready-split tokens rather than full
+         # sentence strings by passing is_split_into_words=True.
+         # Set the following parameters using the kwargs
+         self._padding = padding
+         self._truncation = truncation
+         self._is_split_into_words = is_split_into_words
+         self._max_length = max_length
+         self._label_all_tokens = label_all_tokens
+         self._token_ignore_label = token_ignore_label
+         self._ignore_label = -100
+
+     def tokenize_and_align_labels(self, dataset: Dataset) -> Dataset:
+         """
+         This function reads the input dataset, runs the subword tokenization on the
+         pre-split tokens and then, as mentioned above, aligns the sub-tokens with the
+         labels and adds the ignore label. It will read the input - say
+         [60, year, old, in, 2080] - and will return the sub-tokens
+         [60, year, old, in, 208, ##0], some other information like token_type_ids etc.,
+         and the labels [0, 20, 20, 20, 3, -100] (0 corresponds to B-AGE, 20 corresponds to O
+         and 3 corresponds to B-DATE). This returned output serves as input for training the model
+         or for gathering predictions from a trained model.
+         Another important thing to note is that we have mentioned before that
+         we add chunks of tokens that appear before and after the current chunk for context. We would
+         also need to assign the label -100 (ignore_label) to these chunks, since we are using them
+         only to provide context. Basically, if a token has the label NA, we don't use it for
+         training or evaluation. For example the input would be something
+         like tokens: [James, Doe, 60, year, old, in, 2080, BWH, tomorrow, only],
+         labels: [NA, NA, B-AGE, O, O, O, B-DATE, NA, NA, NA]. NA represents the tokens used for context.
+         This function would return some tokenizer info (e.g. attention mask etc.), along with
+         the information that maps the tokens to the sub-tokens -
+         [James, Doe, 60, year, old, in, 208, ##0, BW, ##h, tomorrow, only]
+         and the labels - [-100, -100, 0, 20, 20, 20, 3, -100, -100, -100, -100, -100]
+         (if label_all_tokens was true, we would return [-100, -100, 0, 20, 20, 20, 3, 3, -100, -100, -100, -100]).
+         Args:
+             dataset (Dataset): The pre-split (tokenized) dataset that contains labels
+         Returns:
+             tokenized_inputs (Dataset): Subword tokenized and label aligned dataset
+         """
+         # Run the tokenizer - subword tokenization
+         tokenized_inputs = self._tokenizer(
+             dataset[self._token_column],
+             padding=self._padding,
+             truncation=self._truncation,
+             max_length=self._max_length,
+             is_split_into_words=self._is_split_into_words,
+         )
+         # Align the subwords and tokens
+         labels = [self.__get_labels(
+             labels,
+             tokenized_inputs.word_ids(batch_index=index)
+         ) for index, labels in enumerate(dataset[self._label_column])]
+         tokenized_inputs[self._label_column] = labels
+
+         return tokenized_inputs
+
+     def __get_labels(
+             self,
+             labels: Sequence[str],
+             word_ids: Sequence[int]
+     ) -> List[int]:
+         """
+         Go through the subword tokens - which are given as word_ids. Two different tokens
+         (2080 & John) will have different word_ids, but the subword tokens 208 & ##0 will
+         have the same word_id; we use this to align and assign the labels accordingly.
+         If the subword token belongs to [CLS] or [SEP], append the ignore label (-100) to the
+         list of labels. If the subword token (##0) belongs to the token 2080,
+         then the labels would be [3, -100] if label_all_tokens is false. Also, if the token
+         is used only for context (with label NA) it would get the value -100 for its label.
+         Args:
+             labels (Sequence[str]): The list of labels for the input (example)
+             word_ids (Sequence[int]): The word_ids after subword tokenization of the input
+         Returns:
+             label_ids (List[int]): The list of label ids for the input with the ignore label (-100) added
+                                    as required.
+         """
+         label_ids = list()
+         previous_word_idx = None
+         for word_idx in word_ids:
+             # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+             # ignored in the loss function.
+             if word_idx is None:
+                 label_ids.append(self._ignore_label)
+             # We set the label for the first token of each word.
+             elif word_idx != previous_word_idx:
+                 if labels[word_idx] == self._token_ignore_label:
+                     label_ids.append(self._ignore_label)
+                 else:
+                     label_ids.append(self._label_to_id[labels[word_idx]])
+             # For the other tokens in a word, we set the label to either the current label or -100, depending on
+             # the label_all_tokens flag.
+             else:
+                 if labels[word_idx] == self._token_ignore_label:
+                     label_ids.append(self._ignore_label)
+                 else:
+                     label_ids.append(
+                         self._b_to_i_label[self._label_to_id[labels[word_idx]]]
+                         if self._label_all_tokens else self._ignore_label
+                     )
+             previous_word_idx = word_idx
+         return label_ids
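To make the alignment behaviour above concrete, here is a minimal, hedged sketch (not part of the commit). The label set, column names, b_to_i_label mapping and the bert-base-uncased tokenizer are illustrative assumptions.

from datasets import Dataset
from transformers import AutoTokenizer

# Assumed BIO label set, sorted as NERLabels would sort it: B-AGE=0, B-DATE=1, I-AGE=2, I-DATE=3, O=4
label_to_id = {'B-AGE': 0, 'B-DATE': 1, 'I-AGE': 2, 'I-DATE': 3, 'O': 4}
# b_to_i_label[i] is the id of the I- tag corresponding to label id i (unchanged for I-/O tags)
b_to_i_label = [2, 3, 2, 3, 4]

dataset_tokenizer = DatasetTokenizer(
    tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased'),  # any fast tokenizer should work
    token_column='tokens',
    label_column='labels',
    label_to_id=label_to_id,
    b_to_i_label=b_to_i_label,
    padding=False,
    truncation=True,
    is_split_into_words=True,
    max_length=512,
    label_all_tokens=False,
    token_ignore_label='NA',
)
data = Dataset.from_dict({
    'tokens': [['James', 'Doe', '60', 'year', 'old', 'in', '2080']],
    'labels': [['NA', 'NA', 'B-AGE', 'O', 'O', 'O', 'B-DATE']],
})
tokenized = data.map(
    dataset_tokenizer.tokenize_and_align_labels,
    batched=True,
    remove_columns=data.column_names,
)
# Context tokens (NA) and trailing sub-tokens come back as -100
print(tokenized['labels'][0])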
sequence_tagging/dataset_builder/label_mapper.py ADDED
@@ -0,0 +1,87 @@
+ from typing import List, Sequence, Mapping, NoReturn, Dict, Union
+ from .ner_labels import NERLabels
+
+
+ class LabelMapper(object):
+     """
+     This class is used to map one set of NER labels to another set of NER labels.
+     For example, we might want to map all NER labels to binary HIPAA labels.
+     E.g.:
+     We change the token labels [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] to
+     [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O], or if we wanted binary I2B2 labels we map them to
+     [B-I2B2, O, O, U-I2B2, B-I2B2, L-I2B2, O, B-I2B2, I-I2B2, L-I2B2].
+     We do this mapping at the token and the span level. That is, if we have a span from, say, start=9, end=15
+     labelled as LOC, we map this label to HIPAA or I2B2. This class maps an existing set of labels to
+     another set of labels.
+     """
+
+     def __init__(
+             self,
+             notation: str,
+             ner_types: Sequence[str],
+             ner_types_maps: Sequence[str],
+             description: str
+     ) -> NoReturn:
+         """
+         Initialize the variables that will be used to map the NER labels and spans.
+         The ner_types and ner_types_maps should correspond to each other (one mapped type per NER type).
+         Args:
+             notation (str): The NER notation (e.g. BIO, BILOU)
+             ner_types (Sequence[str]): The original NER types
+             ner_types_maps (Sequence[str]): The NER type each original type is mapped to
+             description (str): A description of the mapping
+         """
+         self._description = description
+         self._types = list(set(ner_types_maps))
+         self._types.sort()
+         self._spans_map = {ner_type: ner_type_map for ner_type, ner_type_map in zip(ner_types, ner_types_maps)}
+         ner_labels = NERLabels(notation=notation, ner_types=ner_types)
+         self._ner_map = dict()
+         for label in ner_labels.get_label_list():
+             if label == 'O' or self._spans_map[label[2:]] == 'O':
+                 self._ner_map[label] = 'O'
+             else:
+                 self._ner_map[label] = label[0:2] + self._spans_map[label[2:]]
+
+     def map_sequence(self, tag_sequence: Sequence[str]) -> List[str]:
+         """
+         Map a sequence of NER labels to another set of NER labels.
+         E.g.: If we use a binary HIPAA mapping,
+         the sequence [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] will be mapped to
+         [B-HIPAA, O, O, U-HIPAA, B-HIPAA, L-HIPAA, O, O, O, O]
+         Args:
+             tag_sequence (Sequence[str]): A sequence of NER labels
+         Returns:
+             (List[str]): A mapped sequence of NER labels
+         """
+         # Look up each tag in the label map built in __init__
+         return [self._ner_map[tag] for tag in tag_sequence]
+
+     def map_spans(self, spans: Sequence[Mapping[str, Union[str, int]]]) -> Sequence[Dict[str, Union[str, int]]]:
+         """
+         Map a sequence of NER spans to another set of NER spans.
+         E.g.: If we use a binary HIPAA mapping,
+         the spans [{start:0, end:5, label: DATE}, {start:17, end:25, label: STAFF}, {start:43, end:54, label: PATIENT}]
+         will be mapped to [{start:0, end:5, label: HIPAA}, {start:17, end:25, label: O}, {start:43, end:54, label: HIPAA}]
+         Args:
+             spans (Sequence[Mapping[str, Union[str, int]]]): A sequence of NER spans
+         Returns:
+             (Sequence[Dict[str, Union[str, int]]]): A mapped sequence of NER spans
+         """
+         return [{'start': span['start'], 'end': span['end'], 'label': self._spans_map[span['label']]}
+                 for span in spans]
+
+     def get_ner_description(self) -> str:
+         """
+         Get the description of the NER label and span maps used.
+         Returns:
+             (str): A description of the label/span maps used
+         """
+         return self._description
+
+     def get_ner_types(self) -> List[str]:
+         """
+         Get the list of unique NER types in the mapped (target) label set,
+         e.g. for a binary HIPAA mapping: [B-HIPAA, I-HIPAA, ...] ---> [HIPAA, O]
+         Returns:
+             ner_types (List[str]): The list of unique NER types
+         """
+         return self._types
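A small illustrative sketch (not part of the commit) that mirrors the binary HIPAA example from the docstrings above; the type lists and the description string are made up for the example.

ner_types = ['AGE', 'DATE', 'LOC', 'STAFF']
ner_types_maps = ['HIPAA', 'HIPAA', 'HIPAA', 'O']   # STAFF is treated as non-PHI here
mapper = LabelMapper(
    notation='BILOU',
    ner_types=ner_types,
    ner_types_maps=ner_types_maps,
    description='binary HIPAA mapping',
)
print(mapper.map_sequence(['B-AGE', 'O', 'U-LOC', 'B-DATE', 'L-DATE', 'B-STAFF', 'L-STAFF']))
# ['B-HIPAA', 'O', 'U-HIPAA', 'B-HIPAA', 'L-HIPAA', 'O', 'O']
print(mapper.map_spans([{'start': 0, 'end': 5, 'label': 'DATE'}, {'start': 17, 'end': 25, 'label': 'STAFF'}]))
# [{'start': 0, 'end': 5, 'label': 'HIPAA'}, {'start': 17, 'end': 25, 'label': 'O'}]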
sequence_tagging/dataset_builder/ner_dataset.py ADDED
@@ -0,0 +1,102 @@
+ from typing import Sequence, Optional, NoReturn
+
+ from datasets import load_dataset, Dataset
+
+
+ class NERDataset(object):
+     """
+     This class is a wrapper around the huggingface datasets library.
+     It loads the train, validation and test datasets from the
+     train, validation and test files that are passed in, and provides
+     a get function to access each of the splits.
+     """
+
+     def __init__(
+             self,
+             train_file: Optional[Sequence[str]] = None,
+             validation_file: Optional[Sequence[str]] = None,
+             test_file: Optional[Sequence[str]] = None,
+             extension: str = 'json',
+             shuffle: bool = True,
+             seed: int = 41
+     ) -> NoReturn:
+         """
+         Load the train, validation and test datasets from the files passed. Read the files and convert
+         them into a huggingface dataset.
+         Args:
+             train_file (Optional[Sequence[str]]): The list of files that contain train data
+             validation_file (Optional[Sequence[str]]): The list of files that contain validation data
+             test_file (Optional[Sequence[str]]): The list of files that contain test data
+             extension (str): The format of the data files (e.g. json, csv) passed to load_dataset
+             shuffle (bool): Whether to shuffle the dataset
+             seed (int): Shuffle seed
+         """
+         self._datasets = NERDataset.__prepare_data(
+             train_file,
+             validation_file,
+             test_file,
+             extension,
+             shuffle,
+             seed
+         )
+
+     @staticmethod
+     def __prepare_data(
+             train_file: Optional[Sequence[str]],
+             validation_file: Optional[Sequence[str]],
+             test_file: Optional[Sequence[str]],
+             extension: str,
+             shuffle: bool,
+             seed: int
+     ) -> Dataset:
+         """
+         Get the train, validation and test datasets from the files passed. Read the files and convert
+         them into a huggingface dataset.
+         Args:
+             train_file (Optional[Sequence[str]]): The list of files that contain train data
+             validation_file (Optional[Sequence[str]]): The list of files that contain validation data
+             test_file (Optional[Sequence[str]]): The list of files that contain test data
+             extension (str): The format of the data files (e.g. json, csv) passed to load_dataset
+             shuffle (bool): Whether to shuffle the dataset
+             seed (int): Shuffle seed
+         Returns:
+             (Dataset): The huggingface dataset with train, validation, test splits (if included)
+         """
+         # Read the datasets (train, validation, test etc.)
+         data_files = {}
+         if train_file is not None:
+             data_files['train'] = train_file
+         if validation_file is not None:
+             data_files['validation'] = validation_file
+         if test_file is not None:
+             data_files['test'] = test_file
+         # Shuffle the dataset
+         if shuffle:
+             datasets = load_dataset(extension, data_files=data_files).shuffle(seed=seed)
+         else:
+             # Don't shuffle the dataset
+             datasets = load_dataset(extension, data_files=data_files)
+         return datasets
+
+     def get_train_dataset(self) -> Dataset:
+         """
+         Return the train dataset.
+         Returns:
+             (Dataset): The huggingface dataset - train split
+         """
+         return self._datasets['train']
+
+     def get_validation_dataset(self) -> Dataset:
+         """
+         Return the validation dataset.
+         Returns:
+             (Dataset): The huggingface dataset - validation split
+         """
+         return self._datasets['validation']
+
+     def get_test_dataset(self) -> Dataset:
+         """
+         Return the test dataset.
+         Returns:
+             (Dataset): The huggingface dataset - test split
+         """
+         return self._datasets['test']
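A hedged usage sketch (not part of the commit): loading JSON splits through the wrapper; the file names below are placeholders for whatever the dataset creation scripts produce.

ner_dataset = NERDataset(
    train_file=['train.json'],            # placeholder paths
    validation_file=['validation.json'],
    test_file=['test.json'],
    extension='json',
    shuffle=True,
    seed=41,
)
train_split = ner_dataset.get_train_dataset()
print(train_split.column_names, len(train_split))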
sequence_tagging/dataset_builder/ner_labels.py ADDED
@@ -0,0 +1,67 @@
+ from typing import Sequence, List, NoReturn, Dict
+
+
+ class NERLabels(object):
+     """
+     Prepare the labels that will be used by the model. Parse the NER types
+     and prepare the NER labels. For example, for the NER types [AGE, DATE]
+     it will create a list like this (for the BIO notation): [B-AGE, B-DATE, I-AGE, I-DATE, O].
+     These are the labels that will be assigned to the tokens based on the PHI type.
+     Say we had the following NER types: NAME, AGE, HOSP.
+     The NER labels in the BIO notation would be B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O.
+     This class creates a sorted list of the NER labels ([B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O])
+     based on the NER types (NAME, AGE, HOSP) that have been defined.
+     It also returns the label_to_id mapping and the id_to_label mapping,
+     e.g. label_to_id: {B-AGE: 0, B-HOSP: 1, B-NAME: 2, I-AGE: 3, I-HOSP: 4, I-NAME: 5, O: 6}.
+     This information is used during training, evaluation and prediction.
+     """
+
+     def __init__(self, notation: str, ner_types: Sequence[str]) -> NoReturn:
+         """
+         Initialize the notation that we are using for the NER task.
+         Args:
+             notation (str): The notation that will be used for the NER labels
+             ner_types (Sequence[str]): The list of NER categories
+         """
+         self._notation = notation
+         self._ner_types = ner_types
+
+     def get_label_list(self) -> List[str]:
+         """
+         Given the NER types, return the NER labels.
+         NER types [AGE, DATE] -> return a list like this (for the BIO notation): [B-AGE, B-DATE, I-AGE, I-DATE, O]
+         Returns:
+             ner_labels (List[str]): The list of NER labels based on the NER notation (e.g. BIO)
+         """
+         # Add the 'O' (Outside - non-PHI) label to the list
+         if 'O' not in self._ner_types:
+             ner_labels = ['O']
+         else:
+             ner_labels = list()
+         # Go through each type and prefix it based on the notation (e.g. B, I etc.)
+         for ner_type in self._ner_types:
+             for ner_tag in list(self._notation):
+                 if ner_tag != 'O':
+                     ner_labels.append(ner_tag + '-' + ner_type)
+         ner_labels.sort()
+         return ner_labels
+
+     def get_label_to_id(self) -> Dict[str, int]:
+         """
+         Return a label to id mapping.
+         Returns:
+             label_to_id (Dict[str, int]): label to id mapping
+         """
+         labels = self.get_label_list()
+         label_to_id = {label: index_id for index_id, label in enumerate(labels)}
+         return label_to_id
+
+     def get_id_to_label(self) -> Dict[int, str]:
+         """
+         Return an id to label mapping.
+         Returns:
+             id_to_label (Dict[int, str]): id to label mapping
+         """
+         labels = self.get_label_list()
+         id_to_label = {index_id: label for index_id, label in enumerate(labels)}
+         return id_to_label
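A short sketch (not part of the commit) reproducing the docstring example above for the BIO notation; the printed values follow from the sorting done in get_label_list.

ner_labels = NERLabels(notation='BIO', ner_types=['AGE', 'DATE'])
print(ner_labels.get_label_list())    # ['B-AGE', 'B-DATE', 'I-AGE', 'I-DATE', 'O']
print(ner_labels.get_label_to_id())   # {'B-AGE': 0, 'B-DATE': 1, 'I-AGE': 2, 'I-DATE': 3, 'O': 4}
print(ner_labels.get_id_to_label())   # {0: 'B-AGE', 1: 'B-DATE', 2: 'I-AGE', 3: 'I-DATE', 4: 'O'}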