prajwal967 committed
Commit: 584d7ba
Parent(s): d27326e
delete unused folders
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- deid/__init__.py +0 -2
- deid/text_deid.py +0 -307
- deid/utils.py +0 -43
- ner_datasets/__init__.py +0 -4
- ner_datasets/__pycache__/__init__.cpython-37.pyc +0 -0
- ner_datasets/dataset_builder/__init__.py +0 -3
- ner_datasets/dataset_builder/dataset.py +0 -119
- ner_datasets/dataset_builder/labels/__init__.py +0 -4
- ner_datasets/dataset_builder/labels/mismatch_error.py +0 -7
- ner_datasets/dataset_builder/labels/ner_predict_token_labels.py +0 -30
- ner_datasets/dataset_builder/labels/ner_token_labels.py +0 -156
- ner_datasets/dataset_builder/sentence_dataset.py +0 -355
- ner_datasets/dataset_creator.py +0 -322
- ner_datasets/dataset_splitter.py +0 -294
- ner_datasets/distribution/__init__.py +0 -4
- ner_datasets/distribution/dataset_splits.py +0 -218
- ner_datasets/distribution/ner_distribution.py +0 -54
- ner_datasets/distribution/print_distribution.py +0 -49
- ner_datasets/preprocessing/__init__.py +0 -2
- ner_datasets/preprocessing/preprocessing_loader.py +0 -64
- ner_datasets/preprocessing/sentencizers/__init__.py +0 -3
- ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py +0 -37
- ner_datasets/preprocessing/sentencizers/note_sentencizer.py +0 -33
- ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py +0 -37
- ner_datasets/preprocessing/tokenizers/__init__.py +0 -4
- ner_datasets/preprocessing/tokenizers/abbreviations/check.txt +0 -20
- ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt +0 -87
- ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt +0 -459
- ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py +0 -73
- ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py +0 -58
- ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py +0 -49
- ner_datasets/preprocessing/tokenizers/utils/__init__.py +0 -4
- ner_datasets/preprocessing/tokenizers/utils/clean_regex.py +0 -64
- ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py +0 -309
- ner_datasets/preprocessing/tokenizers/utils/date_regex.py +0 -104
- ner_datasets/span_fixer.py +0 -380
- ner_datasets/span_validation.py +0 -91
- sequence_tagging/.DS_Store +0 -0
- sequence_tagging/__init__.py +0 -2
- sequence_tagging/__pycache__/__init__.cpython-37.pyc +0 -0
- sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc +0 -0
- sequence_tagging/arguments/__init__.py +0 -8
- sequence_tagging/arguments/data_training_arguments.py +0 -115
- sequence_tagging/arguments/evaluation_arguments.py +0 -26
- sequence_tagging/arguments/model_arguments.py +0 -43
- sequence_tagging/dataset_builder/__init__.py +0 -5
- sequence_tagging/dataset_builder/dataset_tokenizer.py +0 -178
- sequence_tagging/dataset_builder/label_mapper.py +0 -87
- sequence_tagging/dataset_builder/ner_dataset.py +0 -102
- sequence_tagging/dataset_builder/ner_labels.py +0 -67
deid/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .text_deid import TextDeid
-__all__ = ["TextDeid"]
deid/text_deid.py
DELETED
@@ -1,307 +0,0 @@
-import json
-import re
-from argparse import ArgumentParser
-from typing import Sequence, List, Tuple, Mapping, Union, Any, Type
-
-import regex
-from seqeval.scheme import IOB1, IOB2, IOBES, BILOU, Entities
-
-from .utils import remove, replace_tag_type, replace_informative
-
-
-class TextDeid(object):
-
-    def __init__(self, notation, span_constraint):
-        self._span_constraint = span_constraint
-        if self._span_constraint == 'strict':
-            self._scheme = TextDeid.__get_scheme('IO')
-        elif self._span_constraint == 'super_strict':
-            self._scheme = TextDeid.__get_scheme('IO')
-        else:
-            self._scheme = TextDeid.__get_scheme(notation)
-
-    def decode(self, tokens, predictions):
-        if self._span_constraint == 'exact':
-            return predictions
-        elif self._span_constraint == 'strict':
-            return TextDeid.__get_relaxed_predictions(predictions)
-        elif self._span_constraint == 'super_strict':
-            return TextDeid.__get_super_relaxed_predictions(tokens, predictions)
-
-    def get_predicted_entities_positions(
-            self,
-            tokens: Sequence[Mapping[str, Union[str, int]]],
-            predictions: List[str],
-            suffix: bool
-    ) -> List[List[Union[Tuple[Union[str, int], Union[str, int]], Any]]]:
-        """
-        Use the seqeval get_entities method, which goes through the predictions and returns
-        where the span starts and ends. - [O, O, B-AGE, I-AGE, O, O] this will return
-        spans starts at token 2 and ends at token 3 - with type AGE. We then extract the
-        position of the token in the note (character position) - so we return that
-        this span starts at 32 and ends at 37. The function then returns a nested list
-        that contains a tuple of tag type and tag position (character positions).
-        Example: [[(3, 9), LOC], [(34, 41), PATIENT], ...]]
-        Args:
-            tokens (Sequence[Mapping[str, Union[str, int]]]): The list of tokens in the note
-            predictions (Sequence[str]): The list of predictions for the note
-            suffix (str): Whether the B, I etc is in the prefix or the suffix
-        Returns:
-            positions_info (List[Tuple[Tuple[int, int], str]])): List containing tuples of tag positions and tag type
-        """
-        positions_info = list()
-        entities = Entities(sequences=[predictions], scheme=self._scheme, suffix=suffix)
-        for entity_list in entities.entities:
-            for entity in entity_list:
-                position = (tokens[entity.start]['start'], tokens[entity.end - 1]['end'])
-                positions_info.append([position, entity.tag])
-        return positions_info
-
-    def run_deid(
-            self,
-            input_file,
-            predictions_file,
-            deid_strategy,
-            keep_age: bool = False,
-            metadata_key: str = 'meta',
-            note_id_key: str = 'note_id',
-            tokens_key: str = 'tokens',
-            predictions_key: str = 'predictions',
-            text_key: str = 'text'
-    ):
-        # Store note_id to note mapping
-        note_map = dict()
-        for line in open(input_file, 'r'):
-            note = json.loads(line)
-            note_id = note[metadata_key][note_id_key]
-            note_map[note_id] = note
-        # Go through note predictions and de identify the note accordingly
-        for line in open(predictions_file, 'r'):
-            note = json.loads(line)
-            # Get the text using the note_id for this note from the note_map dict
-            note_id = note[note_id_key]
-            # Get the note from the note_map dict
-            deid_note = note_map[note_id]
-            # Get predictions
-            predictions = self.decode(tokens=note[tokens_key], predictions=note[predictions_key])
-            # Get entities and their positions
-            entity_positions = self.get_predicted_entities_positions(
-                tokens=note[tokens_key],
-                predictions=predictions,
-                suffix=False
-            )
-            yield TextDeid.__get_deid_text(
-                deid_note=deid_note,
-                entity_positions=entity_positions,
-                deid_strategy=deid_strategy,
-                keep_age=keep_age,
-                text_key=text_key
-            )
-
-    @staticmethod
-    def __get_deid_text(
-            deid_note,
-            entity_positions,
-            deid_strategy,
-            keep_age: bool = False,
-            text_key: str = 'text'
-    ):
-        tag_mapping = TextDeid.__get_tag_mapping(deid_strategy=deid_strategy)
-        age_pattern = '((?<!\d+)([1-7]\d?)(?!\d+))|((?<!\d+)(8[0-8]?)(?!\d+))'
-        # Sort positions - store the last occurring tag first - i.e in descending order
-        # of start positions.
-        entity_positions.sort(key=lambda info: info[0][0], reverse=True)
-        # Get text and de identify it
-        note_text = deid_note[text_key]
-        deid_text = deid_note[text_key]
-        # Go through the entities and their positions and de identify the text
-        # Since we have the positions in sorted order (descending by start positions)
-        # we de identify the text from the end to the start - i.e back to front
-        for positions, tag in entity_positions:
-            start_pos, end_pos = positions
-            deid_tag = tag_mapping[tag]
-            age_unchanged = False
-            if tag == 'AGE' and keep_age:
-                span_text = note_text[start_pos:end_pos]
-                if regex.search(age_pattern, span_text, flags=regex.IGNORECASE):
-                    deid_tag = span_text
-                    age_unchanged = True
-                else:
-                    deid_tag = deid_tag
-            if deid_strategy == 'replace_informative' and not age_unchanged:
-                deid_text = deid_text[:start_pos] + deid_tag.format(note_text[start_pos:end_pos]) + deid_text[end_pos:]
-            else:
-                deid_text = deid_text[:start_pos] + deid_tag + deid_text[end_pos:]
-        deid_note['deid_text'] = regex.sub('[\n]+', '\n', regex.sub('[ \t\r\f\v]+', ' ', deid_text)).strip()
-        return deid_note
-
-    @staticmethod
-    def __get_tag_mapping(deid_strategy):
-        if deid_strategy == 'remove':
-            return remove()
-        elif deid_strategy == 'replace_tag_type':
-            return replace_tag_type()
-        elif deid_strategy == 'replace_informative':
-            return replace_informative()
-
-    @staticmethod
-    def __get_relaxed_predictions(predictions):
-        return ['I-' + prediction[2:] if '-' in prediction else prediction for prediction in predictions]
-
-    @staticmethod
-    def __get_super_relaxed_predictions(tokens, predictions):
-        # Super relaxed
-        # 360 Longwood Ave, OBI, Boston
-        # Tokens: ['360', 'Longwood', 'Ave', ',', 'OBI', ',', Boston[
-        # Predictions: [B-LOC, I-LOC, L-LOC, O, U-LOC, O, U-LOC]
-        # Relaxed: [I-LOC, I-LOC, I-LOC, O, I-LOC, O, I-LOC]
-        # Super relaxed: [I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC, I-LOC]
-        relaxed_predictions = TextDeid.__get_relaxed_predictions(predictions)
-        prev_type = None
-        replace_indexes = list()
-        super_relaxed_predictions = list()
-        for index, (token, prediction) in enumerate(zip(tokens, relaxed_predictions)):
-            super_relaxed_predictions.append(prediction)
-            # Check special characters that appear after a prediction
-            # we can assign the prediction label to this sequence of special characters
-            if prediction == 'O' and prev_type is not None:
-                # [a-zA-Z0-9]
-                if re.search('^(\W|_)+$', token['text'], flags=re.IGNORECASE | re.DOTALL):
-                    replace_indexes.append(index)
-                else:
-                    prev_type = None
-                    replace_indexes = list()
-            # Replace all the tokens identified above with the NER prediction type
-            # This is done only ig the current prediction type matches the previous type
-            elif prediction != 'O':
-                if prediction[2:] == prev_type and replace_indexes != []:
-                    for replace_index in replace_indexes:
-                        super_relaxed_predictions[replace_index] = 'I-' + prev_type
-                    # Reset list and previous type
-                    replace_indexes = list()
-                prev_type = prediction[2:]
-            else:
-                prev_type = None
-        return super_relaxed_predictions
-
-    @staticmethod
-    def __get_scheme(notation: str) -> Union[Type[IOB2], Type[IOBES], Type[BILOU], Type[IOB1]]:
-        """
-        Get the seqeval scheme based on the notation
-        Args:
-            notation (str): The NER notation
-        Returns:
-            (Union[IOB2, IOBES, BILOU, IOB1]): The seqeval scheme
-        """
-        if notation == 'BIO':
-            return IOB2
-        elif notation == 'BIOES':
-            return IOBES
-        elif notation == 'BILOU':
-            return BILOU
-        elif notation == 'IO':
-            return IOB1
-        else:
-            raise ValueError('Invalid Notation')
-
-
-def main():
-    # The following code sets up the arguments to be passed via CLI or via a JSON file
-    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
-    cli_parser.add_argument(
-        '--input_file',
-        type=str,
-        required=True,
-        help='the the jsonl file that contains the notes'
-    )
-    cli_parser.add_argument(
-        '--predictions_file',
-        type=str,
-        required=True,
-        help='the location where the predictions are'
-    )
-    cli_parser.add_argument(
-        '--span_constraint',
-        type=str,
-        required=True,
-        choices=['exact', 'strict', 'super_strict'],
-        help='whether we want to modify the predictions, make the process of removing phi more struct etc'
-    )
-    cli_parser.add_argument(
-        '--notation',
-        type=str,
-
-        required=True,
-        help='the NER notation in the predictions'
-    )
-    cli_parser.add_argument(
-        '--deid_strategy',
-        type=str,
-        required=True,
-        choices=['remove', 'replace_tag_type', 'replace_informative'],
-        help='The strategy '
-    )
-    cli_parser.add_argument(
-        '--keep_age',
-        action='store_true',
-        help='whether to keep ages below 89'
-    )
-    cli_parser.add_argument(
-        '--text_key',
-        type=str,
-        default='text',
-        help='the key where the note text is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--metadata_key',
-        type=str,
-        default='meta',
-        help='the key where the note metadata is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--note_id_key',
-        type=str,
-        default='note_id',
-        help='the key where the note id is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--tokens_key',
-        type=str,
-        default='tokens',
-        help='the key where the tokens for the notes are present in the json object'
-    )
-    cli_parser.add_argument(
-        '--predictions_key',
-        type=str,
-        default='predictions',
-        help='the key where the note predictions is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--output_file',
-        type=str,
-        required=True,
-        help='the location we would write the deid notes'
-    )
-    # Parse args
-    args = cli_parser.parse_args()
-    text_deid = TextDeid(notation=args.notation, span_constraint=args.span_constraint)
-    deid_notes = text_deid.run_deid(
-        input_file=args.input_file,
-        predictions_file=args.predictions_file,
-        deid_strategy=args.deid_strategy,
-        keep_age=args.keep_age,
-        metadata_key=args.metadata_key,
-        note_id_key=args.note_id_key,
-        tokens_key=args.tokens_key,
-        predictions_key=args.predictions_key,
-        text_key=args.text_key
-    )
-    # Write the dataset to the output file
-    with open(args.output_file, 'w') as file:
-        for deid_note in deid_notes:
-            file.write(json.dumps(deid_note) + '\n')
-
-
-if __name__ == "__main__":
-    # Get deid notes
-    main()
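For reference, the deleted TextDeid class above was normally driven through main(). A minimal programmatic sketch of the same flow, assuming a notes file whose JSON lines carry the text under 'text' and the note id under 'meta'/'note_id', and a predictions file keyed by 'note_id', 'tokens' and 'predictions' (all file paths below are hypothetical):

import json
from deid import TextDeid  # the deleted package exported TextDeid from its __init__

text_deid = TextDeid(notation='BIO', span_constraint='exact')
deid_notes = text_deid.run_deid(
    input_file='notes.jsonl',              # hypothetical path
    predictions_file='predictions.jsonl',  # hypothetical path
    deid_strategy='replace_informative',
)
with open('deid_notes.jsonl', 'w') as output_file:  # hypothetical path
    for deid_note in deid_notes:
        output_file.write(json.dumps(deid_note) + '\n')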
deid/utils.py
DELETED
@@ -1,43 +0,0 @@
-def remove():
-    return {'PATIENT': '',
-            'STAFF': '',
-            'AGE': '',
-            'DATE': '',
-            'PHONE': '',
-            'MRN': '',
-            'ID': '',
-            'EMAIL': '',
-            'PATORG': '',
-            'LOC': '',
-            'HOSP': '',
-            'OTHERPHI': ''}
-
-
-def replace_tag_type():
-    return {'PATIENT': 'PATIENT',
-            'STAFF': 'STAFF',
-            'AGE': 'AGE',
-            'DATE': 'DATE',
-            'PHONE': 'PHONE',
-            'MRN': 'MRN',
-            'ID': 'ID',
-            'EMAIL': 'EMAIL',
-            'PATORG': 'PATORG',
-            'LOC': 'LOCATION',
-            'HOSP': 'HOSPITAL',
-            'OTHERPHI': 'OTHERPHI'}
-
-
-def replace_informative():
-    return {'PATIENT': '<<PATIENT:{}>>',
-            'STAFF': '<<STAFF:{}>>',
-            'AGE': '<<AGE:{}>>',
-            'DATE': '<<DATE:{}>>',
-            'PHONE': '<<PHONE:{}>>',
-            'MRN': '<<MRN:{}>>',
-            'ID': '<<ID:{}>>',
-            'EMAIL': '<<EMAIL:{}>>',
-            'PATORG': '<<PATORG:{}>>',
-            'LOC': '<<LOCATION:{}>>',
-            'HOSP': '<<HOSPITAL:{}>>',
-            'OTHERPHI': '<<OTHERPHI:{}>>'}
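These mappings are consumed by TextDeid.__get_deid_text above: with the replace_informative strategy, each template is filled with the original span text via str.format. A small self-contained illustration of that substitution (the note text and offsets are made up for this example):

tag_mapping = {'DATE': '<<DATE:{}>>'}  # one entry from replace_informative()
note_text = 'Seen on 01/02/2020 at clinic.'
start_pos, end_pos = 8, 18  # character offsets of the DATE span
deid_text = (note_text[:start_pos]
             + tag_mapping['DATE'].format(note_text[start_pos:end_pos])
             + note_text[end_pos:])
# deid_text == 'Seen on <<DATE:01/02/2020>> at clinic.'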
ner_datasets/__init__.py
DELETED
@@ -1,4 +0,0 @@
-from .span_fixer import SpanFixer
-from .dataset_splitter import DatasetSplitter
-from .dataset_creator import DatasetCreator
-__all__ = ["SpanFixer", "DatasetSplitter", "DatasetCreator"]
ner_datasets/__pycache__/__init__.cpython-37.pyc
DELETED
Binary file (487 Bytes)
ner_datasets/dataset_builder/__init__.py
DELETED
@@ -1,3 +0,0 @@
-from .dataset import Dataset
-from .sentence_dataset import SentenceDataset
-__all__ = ["SentenceDataset", "Dataset"]
ner_datasets/dataset_builder/dataset.py
DELETED
@@ -1,119 +0,0 @@
-import random
-import re
-from typing import Iterable, Dict, Sequence, Union, Mapping, Optional, List
-
-from .labels import NERTokenLabels, NERPredictTokenLabels, MismatchError
-
-random.seed(41)
-
-
-class Dataset(object):
-    """
-    Build a NER token classification dataset. Each token should have a corresponding label
-    based on the annotated spans
-    For training we will build the dataset using the annotated spans (e.g from prodigy)
-    For predictions we will assign default labels. to keep the format of the dataset the same
-    The dataset is on a sentence level, i.e each note is split into sentences and the
-    task is run on a sentence level. Even the predictions are run on a sentence level
-    The dataset would be something like:
-    Tokens: [tok1, tok2, ... tok n]
-    Labels: [lab1, lab2, ... lab n]
-    For the prediction mode the labels would be: [default, default, default .... default]
-    This script can also be used for predictions, the Labels will be filled with some
-    default value. This is done so that we can use the same script for building a dataset to train a model
-    and a dataset to obtain predictions using a model
-    """
-
-    def __init__(
-            self,
-            sentencizer,
-            tokenizer
-    ):
-        """
-        Build a NER token classification dataset
-        For training we will build the dataset using the annotated spans (e.g from prodigy)
-        For predictions we will assign default labels.
-        The dataset is on a sentence level, i.e each note is split into sentences and the de-id
-        task is run on a sentence level. Even the predictions are run on a sentence level
-        The dataset would be something like:
-        Tokens: [tok1, tok2, ... tok n]
-        Labels: [lab1, lab2, ... lab n]
-        This script can also be used for predictions, the Labels will be filled with some
-        default value. This is done so that we can use the same script for building a dataset to train a model
-        and a dataset to obtain predictions using a model
-        Args:
-            sentencizer (Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]): The sentencizer to use for
-                                                                                            splitting notes into
-                                                                                            sentences
-            tokenizer (Union[ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer]): The tokenizer to use for
-                                                                                         splitting text into tokens
-        """
-        self._sentencizer = sentencizer
-        self._tokenizer = tokenizer
-
-    def get_tokens(
-            self,
-            text: str,
-            spans: Optional[List[Mapping[str, Union[str, int]]]] = None,
-            notation: str = 'BIO',
-            token_text_key: str = 'text',
-            label_key: str = 'label'
-    ) -> Iterable[Sequence[Dict[str, Union[str, int]]]]:
-        """
-        Get a nested list of tokens where the the inner list represents the tokens in the
-        sentence and the outer list will contain all the sentences in the note
-        Args:
-            text (str): The text present in the note
-            spans (Optional[List[Mapping[str, Union[str, int]]]]): The NER spans in the note. This will be none if
-                                                                   building the dataset for prediction
-            notation (str): The notation we will be using for the label scheme (e.g BIO, BILOU etc)
-            token_text_key (str): The key where the note text is present
-            label_key (str): The key where the note label for each token is present
-        Returns:
-            Iterable[Sequence[Dict[str, Union[str, int]]]]: Iterable that iterates through all the sentences
-                                                            and yields the list of tokens in each sentence
-        """
-        # Initialize the object that will be used to align tokens and spans based on the notation
-        # as mentioned earlier - this will be used only when mode is train - because we have
-        # access to labelled spans for the notes
-        if spans is None:
-            label_spans = NERPredictTokenLabels('O')
-        else:
-            label_spans = NERTokenLabels(spans=spans, notation=notation)
-        # Iterate through the sentences in the note
-        for sentence in self._sentencizer.get_sentences(text=text):
-            # This is used to determine the position of the tokens with respect to the entire note
-            offset = sentence['start']
-            # Keeps track of the tokens in the sentence
-            tokens = list()
-            for token in self._tokenizer.get_tokens(text=sentence['text']):
-                # Get the token position (start, end) in the note
-                token['start'] += offset
-                token['end'] += offset
-                if token[token_text_key].strip() in ['\n', '\t', ' ', ''] or token['start'] == token['end']:
-                    continue
-                # Shorten consecutive sequences of special characters, this can prevent BERT from truncating
-                # extremely long sentences - that could arise because of these characters
-                elif re.search('(\W|_){9,}', token[token_text_key]):
-                    print('WARNING - Shortening a long sequence of special characters from {} to 8'.format(
-                        len(token[token_text_key])))
-                    token[token_text_key] = re.sub('(?P<specchar>(\W|_)){8,}', '\g<specchar>' * 8,
-                                                   token[token_text_key])
-                elif len(token[token_text_key].split(' ')) != 1:
-                    print('WARNING - Token contains a space character - will be replaced with hyphen')
-                    token[token_text_key] = token[token_text_key].replace(' ', '-')
-                # Get the labels for each token based on the notation (BIO)
-                # In predict mode - the default label (e.g O) will be assigned
-                try:
-                    # Get the label for the token - based on the notation
-                    label = label_spans.get_labels(token=token)
-                    if label[2:] == 'OTHERISSUE':
-                        raise ValueError('Fix OTHERISSUE spans')
-                # Check if there is a token and span mismatch, i.e the token and span does not align
-                except MismatchError:
-                    print(token)
-                    raise ValueError('Token-Span mismatch')
-                token[label_key] = label
-                tokens.append(token)
-            if tokens:
-                yield tokens
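A rough usage sketch for the deleted Dataset class above. Dataset only calls get_sentences(text=...) on the sentencizer and get_tokens(text=...) on the tokenizer, so the two stand-in classes below (which are not part of the repository) satisfy that interface purely for illustration:

import re

from ner_datasets.dataset_builder import Dataset


class OneSentenceSentencizer:
    # Stand-in: treat the whole note as a single sentence
    def get_sentences(self, text):
        yield {'text': text, 'start': 0, 'end': len(text)}


class WhitespaceTokenizer:
    # Stand-in: whitespace tokenization with character offsets
    def get_tokens(self, text):
        for match in re.finditer(r'\S+', text):
            yield {'text': match.group(), 'start': match.start(), 'end': match.end()}


dataset = Dataset(sentencizer=OneSentenceSentencizer(), tokenizer=WhitespaceTokenizer())
spans = [{'start': 8, 'end': 18, 'label': 'DATE'}]
for sentence in dataset.get_tokens(text='Seen on 01/02/2020 at clinic.', spans=spans, notation='BIO'):
    print([(token['text'], token['label']) for token in sentence])
# [('Seen', 'O'), ('on', 'O'), ('01/02/2020', 'B-DATE'), ('at', 'O'), ('clinic.', 'O')]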
ner_datasets/dataset_builder/labels/__init__.py
DELETED
@@ -1,4 +0,0 @@
-from .mismatch_error import MismatchError
-from .ner_token_labels import NERTokenLabels
-from .ner_predict_token_labels import NERPredictTokenLabels
-__all__=["NERTokenLabels", "NERPredictTokenLabels", "MismatchError"]
ner_datasets/dataset_builder/labels/mismatch_error.py
DELETED
@@ -1,7 +0,0 @@
-# Exception thrown when there is a mismatch between a token and span
-# The token and spans don't line up due to a tokenization issue
-# E.g - 79M - span is AGE - 79, but token is 79M
-# There is a mismatch and an error will be thrown - that is the token does
-# not line up with the span
-class MismatchError(Exception):
-    pass
ner_datasets/dataset_builder/labels/ner_predict_token_labels.py
DELETED
@@ -1,30 +0,0 @@
-from typing import Mapping, Union, NoReturn
-
-
-class NERPredictTokenLabels(object):
-    """
-    Assign a default label while creating the dataset for prediction.
-    This is done since the sequence tagging code expects the input
-    file to contain a labels field, hence we assign a default label
-    to meet this requirement
-    """
-
-    def __init__(self, default_label: str) -> NoReturn:
-        """
-        Initialize the default label
-        Args:
-            default_label (str): Default label that will be used
-        """
-        # Keeps track of all the spans (list) in the text (note)
-        self._default_label = default_label
-
-    def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
-        """
-        Given a token, return the default label.
-        Args:
-            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
-                                                   in the text
-        Returns:
-            default_label (str): default label
-        """
-        return self._default_label
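In prediction mode the Dataset class above swaps in NERPredictTokenLabels so every token receives the same placeholder label; a minimal illustration:

from ner_datasets.dataset_builder.labels import NERPredictTokenLabels

label_spans = NERPredictTokenLabels('O')
token = {'text': 'Boston', 'start': 22, 'end': 28}
print(label_spans.get_labels(token=token))  # always the default label, here 'O'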
ner_datasets/dataset_builder/labels/ner_token_labels.py
DELETED
@@ -1,156 +0,0 @@
-from typing import Mapping, Union, Sequence, List
-from .mismatch_error import MismatchError
-
-
-class NERTokenLabels(object):
-    """
-    This class is used to align tokens with the spans
-    Each token is assigned one of the following labels
-    'B-LABEL', 'I-LABEL', 'O'. For example the text
-    360 Longwood Avenue is 2 tokens - [360, Longwood, Avenue]
-    and each token would be assigned the following labels
-    [B-LOC, I-LOC, I-LOC] (this would also depend on what
-    notation we are using). Generally the data after prodigy
-    annotation has all the tokens and all the spans.
-    We would have tokens:[tok1, tok2, ... tokn]
-    and spans:[span1:[tok1, tok2, tok3], span2:[tok7], ... span k]
-    This would be used to convert into the format we are using
-    which is assign the label to each token based on which span it
-    belongs to.
-    """
-
-    def __init__(
-            self,
-            spans: List[Mapping[str, Union[str, int]]],
-            notation: str
-    ):
-        """
-        Initialize variables that will be used to align tokens
-        and span labels. The spans variable will contain all the spans
-        in the note. Notation is whether we would like to use BIO, IO, BILOU,
-        when assigning the label to each token based on which span it belongs to.
-        Keep track of the total number of spans etc.
-        Args:
-            spans (Sequence[Mapping[str, Union[str, int]]]): List of all the spans in the text
-            notation (str): NER label notation
-        """
-        # Keeps track of all the spans (list) in the text (note)
-        self._spans = spans
-        for span in self._spans:
-            if type(span['start']) != int or type(span['end']) != int:
-                raise ValueError('The start and end keys of the span must be of type int')
-        self._spans.sort(key=lambda _span: (_span['start'], _span['end']))
-        # The current span is the first element of the list
-        self._current_span = 0
-        # Boolean variable that indicates whether the token is inside
-        # the span (I-LABEL)
-        self._inside = False
-        # Total number of spans
-        self._span_count = len(self._spans)
-        # Depending on the notation passed, we will return the label for
-        # the token accordingly
-        if notation == 'BIO':
-            self._prefix_single = 'B-'
-            self._prefix_begin = 'B-'
-            self._prefix_inside = 'I-'
-            self._prefix_end = 'I-'
-            self._prefix_outside = 'O'
-        elif notation == 'BIOES':
-            self._prefix_single = 'S-'
-            self._prefix_begin = 'B-'
-            self._prefix_inside = 'I-'
-            self._prefix_end = 'E-'
-            self._prefix_outside = 'O'
-        elif notation == 'BILOU':
-            self._prefix_single = 'U-'
-            self._prefix_begin = 'B-'
-            self._prefix_inside = 'I-'
-            self._prefix_end = 'L-'
-            self._prefix_outside = 'O'
-        elif notation == 'IO':
-            self._prefix_single = 'I-'
-            self._prefix_begin = 'I-'
-            self._prefix_inside = 'I-'
-            self._prefix_end = 'I-'
-            self._prefix_outside = 'O'
-
-    def __check_begin(self, token: Mapping[str, Union[str, int]]) -> str:
-        """
-        Given a token, return the label (B-LABEL) and check whether the token
-        covers the entire span or is a sub set of the span
-        Args:
-            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
-                                                   in the text
-        Returns:
-            (str): The label - 'B-LABEL'
-        """
-        # Set the inside flag to true to indicate that the next token that is checked
-        # will be checked to see if it belongs 'inside' the span
-        self._inside = True
-        if token['end'] > int(self._spans[self._current_span]['end']):
-            raise MismatchError('Span and Token mismatch - Begin Token extends longer than the span')
-        # If this token does not cover the entire span then we expect another token
-        # to be in the span and that token should be assigned the I-LABEL
-        elif token['end'] < int(self._spans[self._current_span]['end']):
-            return self._prefix_begin + self._spans[self._current_span]['label']
-        # If this token does cover the entire span then we set inside = False
-        # to indicate this span is complete and increment the current span
-        # to move onto the next span in the text
-        elif token['end'] == int(self._spans[self._current_span]['end']):
-            self._current_span += 1
-            self._inside = False
-            return self._prefix_single + self._spans[self._current_span - 1]['label']
-
-    def __check_inside(self, token: Mapping[str, Union[str, int]]) -> str:
-        """
-        Given a token, return the label (I-LABEL) and check whether the token
-        covers the entire span or is still inside the span.
-        Args:
-            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
-                                                   in the text
-        Returns:
-            (str): The label - 'I-LABEL'
-        """
-
-        if (token['start'] >= int(self._spans[self._current_span]['end'])
-                or token['end'] > int(self._spans[self._current_span]['end'])):
-            raise MismatchError('Span and Token mismatch - Inside Token starts after the span ends')
-        # If this token does not cover the entire span then we expect another token
-        # to be in the span and that token should be assigned the I-LABEL
-        elif token['end'] < int(self._spans[self._current_span]['end']):
-            return self._prefix_inside + self._spans[self._current_span]['label']
-        # If this token does cover the entire span then we set inside = False
-        # to indicate this span is complete and increment the current span
-        # to move onto the next span in the text
-        elif token['end'] == int(self._spans[self._current_span]['end']):
-            self._current_span += 1
-            self._inside = False
-            return self._prefix_end + self._spans[self._current_span - 1]['label']
-
-    def get_labels(self, token: Mapping[str, Union[str, int]]) -> str:
-        """
-        Given a token, return the label (B-LABEL, I-LABEL, O) based on
-        the spans present in the text & the desired notation.
-        Args:
-            token (Mapping[str, Union[str, int]]): Contains the token text, start and end position of the token
-                                                   in the text
-        Returns:
-            (str): One of the labels according to the notation - 'B-LABEL', 'I-LABEL', 'O'
-        """
-        # If we have iterated through all the spans in the text (note), all the tokens that
-        # come after the last span will be marked as 'O' - since they don't belong to any span
-        if self._current_span >= self._span_count:
-            return self._prefix_outside
-        # Check if the span can be assigned the B-LABEL
-        if token['start'] == int(self._spans[self._current_span]['start']):
-            return self.__check_begin(token)
-        # Check if the span can be assigned the I-LABEL
-        elif token['start'] > int(self._spans[self._current_span]['start']) and self._inside is True:
-            return self.__check_inside(token)
-        # Check if the token is outside a span
-        elif self._inside is False and (token['end'] <= int(self._spans[self._current_span]['start'])):
-            return self._prefix_outside
-        else:
-            raise MismatchError(
-                'Span and Token mismatch - the span and tokens don\'t line up. There might be a tokenization issue '
-                'that needs to be fixed')
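A short sketch of the alignment NERTokenLabels performs, using the 360 Longwood Avenue example from the class docstring (the token and span offsets below were computed for this illustrative string):

from ner_datasets.dataset_builder.labels import NERTokenLabels

tokens = [
    {'text': '360', 'start': 0, 'end': 3},
    {'text': 'Longwood', 'start': 4, 'end': 12},
    {'text': 'Avenue', 'start': 13, 'end': 19},
]
spans = [{'start': 0, 'end': 19, 'label': 'LOC'}]
label_spans = NERTokenLabels(spans=spans, notation='BIO')
print([label_spans.get_labels(token=token) for token in tokens])
# ['B-LOC', 'I-LOC', 'I-LOC']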
ner_datasets/dataset_builder/sentence_dataset.py
DELETED
@@ -1,355 +0,0 @@
|
|
1 |
-
from collections import deque
|
2 |
-
from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple
|
3 |
-
|
4 |
-
|
5 |
-
class SentenceDataset(object):
|
6 |
-
"""
|
7 |
-
When we mention previous sentence and next sentence, we don't mean exactly one sentence
|
8 |
-
but rather a previous chunk and a next chunk. This can include one or more sentences and
|
9 |
-
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
|
10 |
-
This class is used to build a dataset at the sentence
|
11 |
-
level. It takes as input all the tokenized sentences in the note. So the input is
|
12 |
-
a list of lists where the outer list represents the sentences in the note and the inner list
|
13 |
-
is a list of tokens in the sentence. It then returns a dataset where each sentence is
|
14 |
-
concatenated with the previous and a next chunk. This is done so that when we build a model
|
15 |
-
we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc
|
16 |
-
will be computed and updated based on the current sentence. The previous and next chunks will
|
17 |
-
only be used to add context. We could have different sizes of previous and next chunks
|
18 |
-
depending on the position of the sentence etc. Essentially we build a sentence level dataset
|
19 |
-
where we can also provide context to the sentence by including the previous and next chunks
|
20 |
-
"""
|
21 |
-
|
22 |
-
def __init__(
|
23 |
-
self,
|
24 |
-
max_tokens: int,
|
25 |
-
max_prev_sentence_token: int,
|
26 |
-
max_next_sentence_token: int,
|
27 |
-
default_chunk_size: int,
|
28 |
-
ignore_label: str
|
29 |
-
) -> NoReturn:
|
30 |
-
"""
|
31 |
-
Set the maximum token length a given training example (sentence level) can have.
|
32 |
-
That is the total length of the current sentence + previous chunk + next chunk
|
33 |
-
We also set the the maximum length of the previous and next chunks. That is how many
|
34 |
-
tokens can be in these chunks. However if the total length exceeds, tokens in the
|
35 |
-
previous and next chunks will be dropped to ensure that the total length is < max_tokens
|
36 |
-
The default chunk size ensures that the length of the chunks will be a minimum number of
|
37 |
-
tokens based on the value passed. For example is default_chunk_size=10, the length
|
38 |
-
of the previous chunks and next chunks will be at least 10 tokens.
|
39 |
-
Args:
|
40 |
-
max_tokens (int): maximum token length a given training example (sentence level) can have
|
41 |
-
max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence
|
42 |
-
(training/prediction example) in the note can have
|
43 |
-
max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence
|
44 |
-
(training/prediction example) in the note can have
|
45 |
-
default_chunk_size (int): the training example will always include a chunk of this length
|
46 |
-
as part of the previous and next chunks
|
47 |
-
ignore_label (str): The label assigned to the previous and next chunks to distinguish
|
48 |
-
from the current sentence
|
49 |
-
"""
|
50 |
-
self._id_num = None
|
51 |
-
self._max_tokens = max_tokens
|
52 |
-
self._max_prev_sentence_token = max_prev_sentence_token
|
53 |
-
self._max_next_sentence_token = max_next_sentence_token
|
54 |
-
self._default_chunk_size = default_chunk_size
|
55 |
-
self._ignore_label = ignore_label
|
56 |
-
|
57 |
-
@staticmethod
|
58 |
-
def chunker(
|
59 |
-
seq: Sequence[Mapping[str, Union[str, int]]],
|
60 |
-
size: int
|
61 |
-
) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
|
62 |
-
"""
|
63 |
-
Return chunks of the sequence. The size of each chunk will be based
|
64 |
-
on the value passed to the size argument.
|
65 |
-
Args:
|
66 |
-
seq (Sequence): maximum token length a given training example (sentence level) can have
|
67 |
-
size (int): The max chunk size for the chunks
|
68 |
-
Return:
|
69 |
-
(Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of
|
70 |
-
the input sequence chunked version of the sequence
|
71 |
-
|
72 |
-
"""
|
73 |
-
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
|
74 |
-
|
75 |
-
def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
|
76 |
-
"""
|
77 |
-
Go through all the sentences in the medical note and create a list of
|
78 |
-
previous sentences. The output of this function will be a list of chunks
|
79 |
-
where each index of the list contains the sentences (chunks) - (tokens) present before
|
80 |
-
the sentence at that index in the medical note. For example prev_sent[0] will
|
81 |
-
be empty since there is no sentence before the first sentence in the note
|
82 |
-
prev_sent[1] will be equal to sent[0], that is the previous sentence of the
|
83 |
-
second sentence will be the first sentence. We make use of deque, where we
|
84 |
-
start to deque elements when it start to exceed max_prev_sentence_token. This
|
85 |
-
list of previous sentences will be used to define the previous chunks
|
86 |
-
Args:
|
87 |
-
sent_tokens (Sequence[str]): Sentences in the note and
|
88 |
-
each element of the list contains a
|
89 |
-
list of tokens in that sentence
|
90 |
-
Returns:
|
91 |
-
previous_sentences (List[deque]): A list of deque objects where each index contains a
|
92 |
-
list (queue) of previous tokens (chunk) with respect
|
93 |
-
to the sentence represented by that index in the note
|
94 |
-
"""
|
95 |
-
previous_sentences = list()
|
96 |
-
# Create a queue and specify the capacity of the queue
|
97 |
-
# Tokens will be popped from the queue when the capacity is exceeded
|
98 |
-
prev_sentence = deque(maxlen=self._max_prev_sentence_token)
|
99 |
-
# The first previous chunk is empty since the first sentence in the note does not have
|
100 |
-
# anything before it
|
101 |
-
previous_sentences.append(prev_sentence.copy())
|
102 |
-
# As we iterate through the list of sentences in the not, we add the tokens from the previous chunks
|
103 |
-
# to the the queue. Since we have a queue, as soon as the capacity is exceeded we pop tokens from
|
104 |
-
# the queue
|
105 |
-
for sent_token in sent_tokens[:-1]:
|
106 |
-
for token in sent_token:
|
107 |
-
prev_sentence.append(token)
|
108 |
-
# As soon as each sentence in the list is processed
|
109 |
-
# We add a copy of the current queue to a list - this list keeps track of the
|
110 |
-
# previous chunks for a sentence
|
111 |
-
previous_sentences.append(prev_sentence.copy())
|
112 |
-
|
113 |
-
return previous_sentences
|
114 |
-
|
115 |
-
def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
|
116 |
-
"""
|
117 |
-
Go through all the sentences in the medical note and create a list of
|
118 |
-
next sentences. The output of this function will be a list of lists
|
119 |
-
where each index of the list contains the list of sentences present after
|
120 |
-
the sentence at that index in the medical note. For example next_sent[-] will
|
121 |
-
be empty since there is no sentence after the last sentence in the note
|
122 |
-
next_sent[0] will be equal to sent[1:], that is the next sentence of the
|
123 |
-
first sentence will be the subsequent sentences. We make use of deque, where we
|
124 |
-
start to deque elements when it start to exceed max_next_sentence_token. This
|
125 |
-
list of previous sentences will be used to define the previous chunks
|
126 |
-
Args:
|
127 |
-
sent_tokens (Sequence[str]): Sentences in the note and each
|
128 |
-
element of the list contains a
|
129 |
-
list of tokens in that sentence
|
130 |
-
Returns:
|
131 |
-
next_sentences (List[deque]): A list of deque objects where each index contains a list (queue)
|
132 |
-
of next tokens (chunk) with respect to the sentence represented
|
133 |
-
by that index in the note
|
134 |
-
"""
|
135 |
-
# A list of next sentences is first created and reversed
|
136 |
-
next_sentences = list()
|
137 |
-
# Create a queue and specify the capacity of the queue
|
138 |
-
# Tokens will be popped from the queue when the capacity is exceeded
|
139 |
-
next_sentence = deque(maxlen=self._max_next_sentence_token)
|
140 |
-
# The first (which becomes the last chunk when we reverse this list) next chunk is empty since
|
141 |
-
# the last sentence in the note does not have
|
142 |
-
# anything after it
|
143 |
-
next_sentences.append(next_sentence.copy())
|
144 |
-
for sent_token in reversed(sent_tokens[1:]):
|
145 |
-
for token in reversed(sent_token):
|
146 |
-
next_sentence.appendleft(token)
|
147 |
-
next_sentences.append(next_sentence.copy())
|
148 |
-
# The list is reversed - since we went through the sentences in the reverse order in
|
149 |
-
# the earlier steps
|
150 |
-
return [next_sent for next_sent in reversed(next_sentences)]
|
151 |
-
|
152 |
-
def get_sentences(
|
153 |
-
self,
|
154 |
-
sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
|
155 |
-
token_text_key: str = 'text',
|
156 |
-
label_key: str = 'label',
|
157 |
-
start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
|
158 |
-
end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
|
159 |
-
sub: bool = False
|
160 |
-
) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
|
161 |
-
"""
|
162 |
-
When we mention previous sentence and next sentence, we don't mean exactly one sentence
|
163 |
-
but rather a previous chunk and a next chunk. This can include one or more sentences and
|
164 |
-
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
|
165 |
-
We iterate through all the tokenized sentences in the note. So the input is
|
166 |
-
a list of lists where the outer list represents the sentences in the note and the inner list
|
167 |
-
is a list of tokens in the sentence. It then returns a dataset where each sentence is
|
168 |
-
concatenated with the previous and the next sentence. This is done so that when we build a model
|
169 |
-
we can use the previous and next sentence to add context to the model. The weights and loss etc
|
170 |
-
will be computed and updated based on the current sentence. The previous and next sentence will
|
171 |
-
only be used to add context. We could have different sizes of previous and next chunks
|
172 |
-
depending on the position of the sentence etc. Since we split a note in several sentences which are
|
173 |
-
then used as training data.
|
174 |
-
ignore_label is used to differentiate between the current sentence and the previous and next
|
175 |
-
chunks. The chunks will have the label NA so that and the current sentence
|
176 |
-
will have the label (DATE, AGE etc) so that they can be distinguished.
|
177 |
-
If however we are building a dataset for predictions
|
178 |
-
the current sentence will have the default label O, but the next and previous chunks will still
|
179 |
-
have the label NA. However if the total length exceeds, tokens in the
|
180 |
-
previous and next chunks will be dropped to ensure that the total length is < max_tokens
|
181 |
-
The default chunk size ensures that the length of the chunks will be a minimum number of
|
182 |
-
tokens based on the value passed. For example is default_chunk_size=10, the length
|
183 |
-
of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens
|
184 |
-
even after decreasing the sizes of the previous and next chunks, then we split this long
|
185 |
-
sentence into sub sentences and repeat the process described above.
|
186 |
-
Args:
|
187 |
-
sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence
|
188 |
-
contains the tokens (dict) in that sentence
|
189 |
-
the token dict object contains the
|
190 |
-
token text, start, end etc
|
191 |
-
token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text
|
192 |
-
key to extract the text of the token from the dictionary
|
193 |
-
label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key
|
194 |
-
key to extract the label of the token from the dictionary. (if it does not have a label
|
195 |
-
the default label will be assigned)
|
196 |
-
start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence of with some
|
197 |
-
pre-defined chunk
|
198 |
-
end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence of with some
|
199 |
-
pre-defined chunk
|
200 |
-
sub (bool): Whether the function is called to process sub-sentences (used when we are splitting
|
201 |
-
long sentences into smaller sub sentences to keep sentence length < max_tokens
|
202 |
-
Returns:
|
203 |
-
(Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the
|
204 |
-
returned sentences,
|
205 |
-
where each sentence
|
206 |
-
has the previous
|
207 |
-
chunks and next
|
208 |
-
chunks attached
|
209 |
-
to it.
|
210 |
-
"""
|
211 |
-
# Id num keeps track of the id of the sentence - that is the position the sentence occurs in
|
212 |
-
# the note. We keep the id of sub sentences the same as the sentence, so that the user
|
213 |
-
# knows that these sub sentences are chunked from a longer sentence.
|
214 |
-
# <SENT 0> <SENT 1>. Say length of sent 0 with the previous and next chunks is less than max_tokens
|
215 |
-
# we return sent 0 with id 0. For sent 1, say the length is longer, we split it into sub
|
216 |
-
# sentences - <SUB 1><SUB 2> - we return SUB 1, and SUB 2 with id 1 - so we know that it belongs
|
217 |
-
# to <SENT 1> in the note.
|
218 |
-
if not sub:
|
219 |
-
self._id_num = -1
|
220 |
-
# Initialize the object that will take all the sentences in the note and return
|
221 |
-
# a dataset where each row represents a sentence in the note. The sentence in each
|
222 |
-
# row will also contain a previous chunk and next chunk (tokens) that will act as context
|
223 |
-
# when training the mode
|
224 |
-
# [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
|
225 |
-
# which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
|
226 |
-
# provide context to the current sentence
|
227 |
-
# Get the previous sentences (chunks) for each sentence in the note
|
228 |
-
previous_sentences = self.get_previous_sentences(sent_tokens)
|
229 |
-
# Get the next sentences (chunks) for each sentence in the note
|
230 |
-
next_sentences = self.get_next_sentences(sent_tokens)
|
231 |
-
# For the note we are going to iterate through all the sentences in the note and
|
232 |
-
# concatenate each sentence with the previous and next chunks. (This forms the data that
|
233 |
-
# will be used for training/predictions) Each sentence with the concatenated chunks will be
|
234 |
-
# a training sample. We would do the same thing for getting predictions on a sentence as well
|
235 |
-
# The only difference would be the labels that are used. We would use the default label O for
|
236 |
-
# prediction and the annotated labels for prediction
|
237 |
-
if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
|
238 |
-
raise ValueError('Sentence length mismatch')
|
239 |
-
for index, (previous_sent, current_sent, next_sent) in enumerate(
|
240 |
-
zip(previous_sentences, sent_tokens, next_sentences)):
|
241 |
-
sent_tokens_text = list()
|
242 |
-
sent_labels = list()
|
243 |
-
sent_toks = list()
|
244 |
-
# Get the tokens and labels for the current sentence
|
245 |
-
for token in current_sent:
|
246 |
-
# We store this, if we need to process sub sentences when a sentence exceeds max_tokens
|
247 |
-
sent_toks.append(token)
|
248 |
-
sent_tokens_text.append(token[token_text_key])
|
249 |
-
sent_labels.append(token[label_key])
|
250 |
-
# We check if the number of tokens in teh current sentence + previous chunk
|
251 |
-
# + next chunk exceeds max tokens. If it does we start popping tokens from the previous and next chunks
|
252 |
-
# until the number of tokens is equal to max tokens
|
253 |
-
previous_sent_length = len(previous_sent)
|
254 |
-
current_sent_length = len(sent_tokens_text)
|
255 |
-
next_sent_length = len(next_sent)
|
256 |
-
total_length = previous_sent_length + current_sent_length + next_sent_length
|
257 |
-
# If the length of the current sentence plus the length of the previous and next
|
258 |
-
# chunks exceeds the max_tokens, start popping tokens from the previous and next
|
259 |
-
            # chunks until either total length < max_tokens or the number of tokens in the previous and
            # next chunks goes below the default chunk size
            while total_length > self._max_tokens and \
                    (next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
                if next_sent_length >= previous_sent_length:
                    next_sent.pop()
                    next_sent_length -= 1
                    total_length -= 1
                elif previous_sent_length > next_sent_length:
                    previous_sent.popleft()
                    previous_sent_length -= 1
                    total_length -= 1
            # If this is not a sub sentence, increment the ID to
            # indicate the processing of the next sentence of the note
            # If it is a sub sentence, keep the ID the same, to indicate
            # it belongs to a larger sentence
            if not sub:
                self._id_num += 1
            # If total length < max_tokens - process the sentence with the current sentence
            # and add on the previous and next chunks and return
            if total_length <= self._max_tokens:
                # Check if we want to add a pre-defined chunk for the first sentence in the note
                if index == 0 and start_chunk is not None:
                    previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \
                                           [prev_token[token_text_key] for prev_token in list(previous_sent)]
                else:
                    previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)]
                # Check if we want to add a pre-defined chunk for the last sentence in the note
                if index == len(sent_tokens) - 1 and end_chunk is not None:
                    next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \
                                       [chunk[token_text_key] for chunk in end_chunk]
                else:
                    next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)]
                previous_sent_length = len(previous_sent_tokens)
                next_sent_length = len(next_sent_tokens)
                # Store information about the current sentence - start and end pos etc
                # this can be used to distinguish from the next and previous chunks
                # current_sent_info = {'token_info':current_sent}
                # Assign a different label (the ignore label) to the chunks - since they are used only for context
                previous_sent_labels = list()
                next_sent_labels = list()
                if self._ignore_label == 'NA':
                    previous_sent_labels = [self._ignore_label] * previous_sent_length
                    next_sent_labels = [self._ignore_label] * next_sent_length
                elif self._ignore_label == 'label':
                    if index == 0 and start_chunk is not None:
                        previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \
                                               [prev_token[label_key] for prev_token in list(previous_sent)]
                    else:
                        previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)]
                    if index == len(sent_tokens) - 1 and end_chunk is not None:
                        next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \
                                           [chunk[label_key] for chunk in end_chunk]
                    else:
                        next_sent_labels = [next_token[label_key] for next_token in list(next_sent)]
                # Concatenate the chunks and the sentence
                # sent_tokens_text.append(token[token_text_key])
                tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
                labels_data = previous_sent_labels + sent_labels + next_sent_labels
                # Return processed sentences
                yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
            # Process the sub sentences - we take a long sentence
            # and split it into smaller chunks - and we recursively call the function on this list
            # of smaller chunks - as mentioned before the smaller chunks (sub sentences) will have the
            # same ID as the original sentence
            else:
                # Store the smaller chunks - say <SENT1> is too long
                # <PREV CHUNK><SENT1><NEXT CHUNK>
                # We get chunk sent 1 - to <SUB1><SUB2><SUB3> and we pass this [<SUB1><SUB2><SUB3>] to the function
                # as a recursive call. This list is now processed as a smaller note that essentially belongs
                # to a sentence. But as you can see we did not pass <PREV CHUNK> & <NEXT CHUNK>, because
                # these are chunks that are not part of the current sentence, but they still need to be
                # included in the final output - and the work around is mentioned below
                # So that we have a previous chunk for <SUB1> and next chunk for <SUB3>
                # we include the previous_sent_tokens and next_sent_tokens as the start chunk
                # and the next chunk in the function call below
                # <PREV CHUNK><SUB1><NEXT SUB1>, id = x
                # <PREV SUB2><SUB2><NEXT SUB2>, id = x
                # <PREV SUB3><SUB3><NEXT CHUNK>, id = x
                sub_sentences = list()
                # Prefix the first sentence in these smaller chunks
                previous_sent_tokens = list(previous_sent)
                # Suffix the last sentence in these smaller chunks
                next_sent_tokens = list(next_sent)
                # Get chunks
                for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
                    sub_sentences.append(chunk)
                # Process list of smaller chunks
                for sub_sent in self.get_sentences(
                        sub_sentences,
                        token_text_key,
                        label_key,
                        start_chunk=previous_sent_tokens,
                        end_chunk=next_sent_tokens,
                        sub=True
                ):
                    yield sub_sent
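The trimming loop at the top of this block pops tokens from whichever context deque is currently longer until the example fits within the token budget. The following is a minimal, standalone sketch of that idea only; the function name trim_context and its parameters are illustrative stand-ins, not part of the deleted class.

from collections import deque

def trim_context(previous_sent, next_sent, current_len, max_tokens, chunk_size):
    # Drop tokens from the longer context side until the total fits max_tokens,
    # or until both context chunks are already at/below the default chunk size.
    total = current_len + len(previous_sent) + len(next_sent)
    while total > max_tokens and (len(next_sent) > chunk_size or len(previous_sent) > chunk_size):
        if len(next_sent) >= len(previous_sent):
            next_sent.pop()          # trim from the end of the next-sentence context
        else:
            previous_sent.popleft()  # trim from the start of the previous-sentence context
        total -= 1
    return previous_sent, next_sent

prev, nxt = deque(range(40)), deque(range(50))
trim_context(prev, nxt, current_len=60, max_tokens=128, chunk_size=32)
print(len(prev), len(nxt))  # context shrinks until 60 + len(prev) + len(nxt) <= 128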
ner_datasets/dataset_creator.py
DELETED
@@ -1,322 +0,0 @@
import json
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from typing import Iterable, Dict, List, Union, Optional, Sequence, NoReturn

from .dataset_builder import Dataset, SentenceDataset
from .preprocessing import PreprocessingLoader

random.seed(41)


class DatasetCreator(object):
    """
    Build a NER token classification dataset.
    For training we will build the dataset using the annotated spans (e.g from prodigy).
    For predictions we will assign default labels.
    The dataset is on a sentence level, i.e each note is split into sentences and the de-id
    task is run on a sentence level. Even the predictions are run on a sentence level.
    The dataset would be something like:
    Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
    Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
    Where the inner list represents the sentences - the tokens in the sentence and the respective
    labels for each token. The labels depend on the notation.
    This script can also be used for predictions; the labels will be filled with some
    default value. This is done so that we can use the same script for building a dataset to train a model
    and a dataset to obtain predictions using a model.
    Example:
    Note: Bruce Wayne is a 60yo man. He lives in Gotham
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]
    We can also create sentences that use previous/next chunks as context - in this case the dataset would
    look something like this. (Assume we limit the size of the chunks to 3 tokens)
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
    NA represents that the token is used for context
    """

    def __init__(
            self,
            sentencizer: str,
            tokenizer: str,
            abbreviations: Optional[Sequence[str]] = None,
            max_tokens: int = 128,
            max_prev_sentence_token: int = 32,
            max_next_sentence_token: int = 32,
            default_chunk_size: int = 32,
            ignore_label: str = 'NA'
    ) -> NoReturn:
        """
        Initialize the sentencizer and tokenizer
        Args:
            sentencizer (str): Specify which sentencizer you want to use
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be split
                                                     - works only with the custom clinical tokenizer.
            max_tokens (int): The maximum number of tokens allowed in a sentence/training example,
                              truncate if it exceeds.
            max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
                                           sentence/training example
            max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
                                           sentence/training example.
            ignore_label (str): The label assigned to the previous and next chunks to distinguish
                                from the current sentence
        """
        self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer)
        self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)
        # Initialize the object that will be used to get the tokens and the sentences
        self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)
        # Initialize the object that will take all the sentences in the note and return
        # a dataset where each row represents a sentence in the note. The sentence in each
        # row will also contain a previous chunk and next chunk (tokens) that will act as context
        # when training the model
        # [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
        # which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
        # provide context to the current sentence
        self._sentence_dataset = SentenceDataset(
            max_tokens=max_tokens,
            max_prev_sentence_token=max_prev_sentence_token,
            max_next_sentence_token=max_next_sentence_token,
            default_chunk_size=default_chunk_size,
            ignore_label=ignore_label
        )

    def create(
            self,
            input_file: str,
            mode: str = 'predict',
            notation: str = 'BIO',
            token_text_key: str = 'text',
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            label_key: str = 'labels',
            span_text_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]:
        """
        This function is used to get the sentences that will be part of the NER dataset.
        We check whether the note belongs to the desired dataset split. If it does,
        we fix any spans that can cause token-span alignment errors. Then we extract
        all the sentences in the notes and the tokens in each sentence. Finally we
        add some context tokens to the sentence if required. This function returns
        an iterable that iterates through each of the processed sentences.
        Args:
            input_file (str): Input jsonl file. Make sure the spans are in ascending order (based on start position)
            mode (str): Dataset being built for train or predict.
            notation (str): The NER labelling notation
            token_text_key (str): The key where the note text and token text is present in the json object
            metadata_key (str): The key where the note metadata is present in the json object
            note_id_key (str): The key where the note id is present in the json object
            label_key (str): The key where the token label will be stored in the json object
            span_text_key (str): The key where the note spans are present in the json object
        Returns:
            (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): Iterate through the processed
                                                                                       sentences/training examples
        """
        # Go through the notes
        for line in open(input_file, 'r'):
            note = json.loads(line)
            note_text = note[token_text_key]
            note_id = note[metadata_key][note_id_key]
            if mode == 'train':
                note_spans = note[span_text_key]
            # No spans in predict mode
            elif mode == 'predict':
                note_spans = None
            else:
                raise ValueError("Invalid mode - can only be train/predict")
            # Store the list of tokens in the sentence
            # Eventually this list will contain all the tokens in the note (split on the sentence level)
            # Store the start and end positions of the sentence in the note. This can
            # be used later to reconstruct the note from the sentences
            # we also store the note_id for each sentence so that we can map it back
            # to the note and therefore have all the sentences mapped back to the notes they belong to.
            sent_tokens = [sent_tok for sent_tok in self._dataset.get_tokens(
                text=note_text,
                spans=note_spans,
                notation=notation
            )]
            # The following loop goes through each sentence in the note and returns
            # the current sentence and previous and next chunks that will be used for context
            # The chunks will have a default label (e.g NA) to distinguish from the current sentence
            # and so that we can ignore these chunks when calculating loss and updating weights
            # during training
            for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
                    sent_tokens=sent_tokens,
                    token_text_key=token_text_key,
                    label_key=label_key
            ):
                # Return the processed sentence. This sentence will then be used
                # by the model
                current_sent_info = ner_sentence['current_sent_info']
                note_sent_info_store = {'start': current_sent_info[0]['start'],
                                        'end': current_sent_info[-1]['end'], 'note_id': note_id}
                ner_sentence['note_sent_info'] = note_sent_info_store
                yield ner_sentence


def main():
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes. spans need to be sorted in ascending order (based on start '
             'position)'
    )
    cli_parser.add_argument(
        '--notation',
        type=str,
        default='BIO',
        help='the notation we will be using for the label scheme'
    )
    cli_parser.add_argument(
        '--max_tokens',
        type=int,
        default=128,
        help='The max tokens that a given sentence (training/prediction example) in the note can have'
    )
    cli_parser.add_argument(
        '--default_chunk_size',
        type=int,
        default=32,
        help='the default chunk size for the previous and next chunks for a given sentence (training/prediction '
             'example) in the note can have'
    )
    cli_parser.add_argument(
        '--max_prev_sentence_token',
        type=int,
        default=32,
        help='the max chunk size for the previous chunks for a given sentence (training/prediction example) in the '
             'note can have'
    )
    cli_parser.add_argument(
        '--max_next_sentence_token',
        type=int,
        default=32,
        help='the max chunk size for the next chunks for a given sentence (training/prediction example) in the note '
             'can have'
    )
    cli_parser.add_argument(
        '--mode',
        type=str,
        choices=['train', 'predict'],
        required=True,
        help='whether we are building the dataset for training or prediction'
    )
    cli_parser.add_argument(
        '--sentencizer',
        type=str,
        required=True,
        help='the sentencizer to use for splitting notes into sentences'
    )
    cli_parser.add_argument(
        '--tokenizer',
        type=str,
        required=True,
        help='the tokenizer to use for splitting text into tokens'
    )
    cli_parser.add_argument(
        '--abbreviations',
        type=str,
        default=None,
        help='file that will be used by clinical tokenizer to handle abbreviations'
    )
    cli_parser.add_argument(
        '--ignore_label',
        type=str,
        default='NA',
        help='whether to use the ignore label or not'
    )
    cli_parser.add_argument(
        '--token_text_key',
        type=str,
        default='text',
        help='the key where the note text is present in the json object'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    cli_parser.add_argument(
        '--label_key',
        type=str,
        default='label',
        help='the key where the note label for each token is present in the json object'
    )
    cli_parser.add_argument(
        '--span_text_key',
        type=str,
        default='spans',
        help='the key where the note annotated spans are present in the json object'
    )
    cli_parser.add_argument(
        '--format',
        type=str,
        default='jsonl',
        help='format to store the dataset in: jsonl or conll'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        help='The file where the NER dataset will be stored'
    )
    args = cli_parser.parse_args()
    dataset_creator = DatasetCreator(
        sentencizer=args.sentencizer,
        tokenizer=args.tokenizer,
        abbreviations=args.abbreviations,
        max_tokens=args.max_tokens,
        max_prev_sentence_token=args.max_prev_sentence_token,
        max_next_sentence_token=args.max_next_sentence_token,
        default_chunk_size=args.default_chunk_size,
        ignore_label=args.ignore_label)
    ner_notes = dataset_creator.create(
        input_file=args.input_file,
        mode=args.mode,
        notation=args.notation,
        token_text_key=args.token_text_key,
        metadata_key=args.metadata_key,
        note_id_key=args.note_id_key,
        label_key=args.label_key,
        span_text_key=args.span_text_key
    )
    # Store the NER dataset in the desired format
    if args.format == 'jsonl':
        # Write the dataset to the output file
        with open(args.output_file, 'w') as file:
            for ner_sentence in ner_notes:
                file.write(json.dumps(ner_sentence) + '\n')
    elif args.format == 'conll':
        with open(args.output_file, 'w') as file:
            for ner_sentence in ner_notes:
                tokens = ner_sentence['tokens']
                labels = ner_sentence['labels']
                current_sent_info = ner_sentence['current_sent_info']
                note_id = ner_sentence['note_sent_info']['note_id']
                if len(tokens) != len(labels) or len(labels) != len(current_sent_info):
                    raise ValueError('Length mismatch')
                for token, label, sent_info in zip(tokens, labels, current_sent_info):
                    sent_info['note_id'] = note_id
                    data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n'
                    file.write(data)
                file.write('\n')


if __name__ == '__main__':

    main()
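For reference, the programmatic entry point that main() wraps looks roughly like the sketch below. It is a minimal example, not part of the deleted file: the import path assumes the package is importable as ner_datasets, and notes.jsonl / ner_notes.jsonl are placeholder file names.

import json
from ner_datasets.dataset_creator import DatasetCreator

# Sentencizer/tokenizer names are ones accepted by PreprocessingLoader (shown later in this diff).
dataset_creator = DatasetCreator(
    sentencizer='en_core_sci_sm',
    tokenizer='clinical',
    max_tokens=128,
    max_prev_sentence_token=32,
    max_next_sentence_token=32,
    default_chunk_size=32,
    ignore_label='NA',
)
with open('ner_notes.jsonl', 'w') as out:
    for ner_sentence in dataset_creator.create(input_file='notes.jsonl', mode='predict'):
        out.write(json.dumps(ner_sentence) + '\n')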
ner_datasets/dataset_splitter.py
DELETED
@@ -1,294 +0,0 @@
import json
import random
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from collections import Counter
from typing import NoReturn, List

from .distribution import NERDistribution, DatasetSplits, PrintDistribution

random.seed(41)


class DatasetSplitter(object):
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute ner distributions in our dataset, based on which we create and store
    a dictionary which will contain information about which notes (in a dataset)
    belong to which split.
    Based on this distribution and whether we want to keep certain notes
    grouped (e.g by patient) we assign notes to a split, such that the
    final ner type distribution in each split is similar.
    """

    def __init__(
            self,
            train_proportion: int = 70,
            validation_proportion: int = 15,
            test_proportion: int = 15
    ) -> NoReturn:
        """
        Initialize the proportions of the splits.
        Args:
            train_proportion (int): Ratio of train dataset
            validation_proportion (int): Ratio of validation dataset
            test_proportion (int): Ratio of test dataset
        """
        self._train_proportion = train_proportion
        self._validation_proportion = validation_proportion
        self._test_proportion = test_proportion
        self._split = None
        self._lookup_split = dict()

    def get_split(self, split: str) -> List[str]:
        return [key for key in self._lookup_split[split].keys()]

    def set_split(self, split: str) -> NoReturn:
        """
        Set the split that you are currently checking/processing.
        Based on the split you can perform certain checks and
        computation. Once the split is set, read the information
        present in the split_info_path. Extract only the information
        belonging to the split. Create a hash map where we have
        the keys as the note_ids/patient ids that belong to the split. This hashmap
        can then be used to check if a particular note belongs to this
        split.
        Args:
            split (str): The split - train, test etc (depends on how you named it)
        """
        if split not in ['train', 'validation', 'test']:
            raise ValueError('Invalid split')
        self._split = split

    def __update_split(self, key: str) -> NoReturn:
        """
        Update the hash map where we have
        the keys (e.g note_id) that belong to the split. This hashmap
        can then be used to check if a particular note belongs to this
        split.
        Args:
            key (str): The key that identifies the note belonging to the split
        """
        self._lookup_split[self._split][key] = 1

    def check_note(self, key: str) -> bool:
        """
        Use the hash map created in the __get_i2b2_filter_map function
        to check if the note (note_info) belongs to this split (train,
        val, test etc). If it does, return true, else false
        Args:
            key (str): The key that identifies the note belonging to the split
        Returns:
            (bool): True if the note belongs to the split, false otherwise
        """
        if self._split is None:
            raise ValueError('Split not set')
        if self._lookup_split[self._split].get(key, False):
            return True
        else:
            return False

    def assign_splits(
            self,
            input_file: str,
            spans_key: str = 'spans',
            metadata_key: str = 'meta',
            group_key: str = 'note_id',
            margin: float = 0.3
    ) -> NoReturn:
        """
        Get the dataset splits - training, validation & testing splits
        Based on the NER distribution and whether we want to keep certain
        notes grouped (e.g by patient). Return an iterable that contains
        a tuple that contains the note_id and the split. This can be used
        to filter notes based on the splits.
        Args:
            input_file (str): The input file
            spans_key (str): The key where the note spans are present
            metadata_key (str): The key where the note metadata is present
            group_key (str): The key where the note group (e.g note_id or patient id etc) is present.
                             This field is what the notes will be grouped by, and all notes belonging
                             to this grouping will be in the same split
            margin (float): Margin of error when maintaining proportions in the splits
        """
        # Compute the distribution of NER types in the grouped notes.
        # For example the distribution of NER types in all notes belonging to a
        # particular patient
        self._lookup_split = {
            'train': dict(),
            'validation': dict(),
            'test': dict()
        }
        ner_distribution = NERDistribution()
        for line in open(input_file, 'r'):
            note = json.loads(line)
            key = note[metadata_key][group_key]
            ner_distribution.update_distribution(spans=note[spans_key], key=key)
        # Initialize the dataset splits object
        dataset_splits = DatasetSplits(
            ner_distribution=ner_distribution,
            train_proportion=self._train_proportion,
            validation_proportion=self._validation_proportion,
            test_proportion=self._test_proportion,
            margin=margin
        )
        # Check the note and assign it to a split
        for line in open(input_file, 'r'):
            note = json.loads(line)
            key = note[metadata_key][group_key]
            split = dataset_splits.get_split(key=key)
            self.set_split(split)
            self.__update_split(key)
        return None


def main() -> NoReturn:
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute ner distributions in our dataset. Based on this distribution
    and whether we want to keep certain notes grouped (e.g by patient)
    we assign notes to a split, such that the final ner type distribution
    in each split is similar.
    """
    # Compute the distribution of NER types in the grouped notes.
    # For example the distribution of NER types in all notes belonging to a
    # particular patient
    # The following code sets up the arguments to be passed via CLI or via a JSON file
    cli_parser = ArgumentParser(
        description='configuration arguments provided at run time from the CLI',
        formatter_class=ArgumentDefaultsHelpFormatter
    )
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--spans_key',
        type=str,
        default='spans',
        help='the key where the note spans are present in the json object'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--group_key',
        type=str,
        default='note_id',
        help='the key to group notes by in the json object'
    )
    cli_parser.add_argument(
        '--train_proportion',
        type=int,
        default=70,
        help='ratio of train dataset'
    )
    cli_parser.add_argument(
        '--train_file',
        type=str,
        default=None,
        help='The file to store the train data'
    )
    cli_parser.add_argument(
        '--validation_proportion',
        type=int,
        default=15,
        help='ratio of validation dataset'
    )
    cli_parser.add_argument(
        '--validation_file',
        type=str,
        default=None,
        help='The file to store the validation data'
    )
    cli_parser.add_argument(
        '--test_proportion',
        type=int,
        default=15,
        help='ratio of test dataset'
    )
    cli_parser.add_argument(
        '--test_file',
        type=str,
        default=None,
        help='The file to store the test data'
    )
    cli_parser.add_argument(
        '--margin',
        type=float,
        default=0.3,
        help='margin of error when maintaining proportions in the splits'
    )
    cli_parser.add_argument(
        '--print_dist',
        action='store_true',
        help='whether to print the label distribution in the splits'
    )
    args = cli_parser.parse_args()
    dataset_splitter = DatasetSplitter(
        train_proportion=args.train_proportion,
        validation_proportion=args.validation_proportion,
        test_proportion=args.test_proportion
    )
    dataset_splitter.assign_splits(
        input_file=args.input_file,
        spans_key=args.spans_key,
        metadata_key=args.metadata_key,
        group_key=args.group_key,
        margin=args.margin
    )

    if args.train_proportion > 0:
        with open(args.train_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('train')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.validation_proportion > 0:
        with open(args.validation_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('validation')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.test_proportion > 0:
        with open(args.test_file, 'w') as file:
            for line in open(args.input_file, 'r'):
                note = json.loads(line)
                key = note[args.metadata_key][args.group_key]
                dataset_splitter.set_split('test')
                if dataset_splitter.check_note(key):
                    file.write(json.dumps(note) + '\n')

    if args.print_dist:
        # Read the dataset splits file and compute the NER type distribution
        key_counts = Counter()
        ner_distribution = NERDistribution()
        for line in open(args.input_file, 'r'):
            note = json.loads(line)
            key = note[args.metadata_key][args.group_key]
            key_counts[key] += 1
            ner_distribution.update_distribution(spans=note[args.spans_key], key=key)
        print_distribution = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
        train_splits = dataset_splitter.get_split('train')
        validation_splits = dataset_splitter.get_split('validation')
        test_splits = dataset_splitter.get_split('test')
        all_splits = train_splits + validation_splits + test_splits
        # Print distribution for each split
        print_distribution.split_distribution(split='total', split_info=all_splits)
        print_distribution.split_distribution(split='train', split_info=train_splits)
        print_distribution.split_distribution(split='validation', split_info=validation_splits)
        print_distribution.split_distribution(split='test', split_info=test_splits)


if __name__ == "__main__":
    main()
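The same flow can be driven from Python rather than the CLI. The sketch below is illustrative only: the import path assumes the module is importable as ner_datasets.dataset_splitter, and the file names are placeholders.

import json
from ner_datasets.dataset_splitter import DatasetSplitter

splitter = DatasetSplitter(train_proportion=70, validation_proportion=15, test_proportion=15)
splitter.assign_splits(input_file='notes.jsonl', group_key='note_id', margin=0.3)

# Write one output file per split, mirroring what main() does above.
for split, path in [('train', 'train.jsonl'), ('validation', 'validation.jsonl'), ('test', 'test.jsonl')]:
    splitter.set_split(split)
    with open(path, 'w') as out:
        for line in open('notes.jsonl', 'r'):
            note = json.loads(line)
            if splitter.check_note(note['meta']['note_id']):
                out.write(json.dumps(note) + '\n')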
ner_datasets/distribution/__init__.py
DELETED
@@ -1,4 +0,0 @@
from .dataset_splits import DatasetSplits
from .ner_distribution import NERDistribution
from .print_distribution import PrintDistribution
__all__=["DatasetSplits", "NERDistribution", "PrintDistribution"]
ner_datasets/distribution/dataset_splits.py
DELETED
@@ -1,218 +0,0 @@
import random
from collections import Counter
from typing import NoReturn

from .ner_distribution import NERDistribution

random.seed(41)


class DatasetSplits(object):
    """
    Prepare dataset splits - training, validation & testing splits.
    Compute ner distributions in the dataset. Based on this we assign
    notes to different splits and at the same time, we keep the distribution of
    NER types in each split similar.
    Keep track of the split information - which notes are present in which split,
    the label distribution in each split, the number of notes in each split.
    """

    def __init__(
            self,
            ner_distribution: NERDistribution,
            train_proportion: int,
            validation_proportion: int,
            test_proportion: int,
            margin: float
    ) -> NoReturn:
        """
        Maintain split information. Assign notes based on the proportion of
        the splits, while keeping the label distribution in each split similar.
        Keep track of the split information - which notes are present in which split,
        the label distribution in each split, the number of notes in each split.
        Keep track of the dataset splits and the counts in each split etc.
        These will be used to assign the different notes to different
        splits while keeping the proportion of ner similar in each split.
        Get the maximum number of ner that can be present in the train,
        validation and test split. The total count will be used to
        calculate the current proportion of ner in the split. This can be used
        to keep the proportion of ner types consistent among different splits
        Args:
            ner_distribution (NERDistribution): The NER distribution in the dataset
            train_proportion (int): Ratio of train dataset
            validation_proportion (int): Ratio of validation dataset
            test_proportion (int): Ratio of test dataset
            margin (float): Margin by which the label distribution can be exceeded in the split
        """
        self._ner_distribution = ner_distribution
        # Compute the counts of NER types in the entire dataset
        total_distribution = Counter()
        for key, counts in ner_distribution.get_ner_distribution().items():
            for label, count in counts.items():
                total_distribution[label] += count
        # Compute the percentages of NER types in the entire dataset
        self._total_ner = sum(total_distribution.values())
        self._label_dist_percentages = {
            ner_type: float(count) / self._total_ner * 100 if self._total_ner else 0
            for ner_type, count in total_distribution.items()
        }
        self._margin = margin
        # The three splits
        self._splits = ['train', 'validation', 'test']
        self._split_weights = None
        self._splits_info = None
        # Keep track of the patient_ids that have been processed.
        # Since a patient can have multiple notes and we already know the
        # ner distribution for this patient across all the notes (i.e the ner types
        # and count that appear in all the notes associated with this patient)
        # We also keep all the notes associated with a patient in the same split
        # So we check if adding all the notes associated with this patient will
        # disturb the ner distribution (proportions) as mentioned before.
        self._processed_keys = dict()
        # Based on these proportions we compute train_ner_count, validation_ner_count, test_ner_count
        # Say the proportions are 85, 10, 5
        # The train split will have a maximum of 85% of the overall ner, validation will have 10 and test will 5
        # That is, if the total count of all ner is 100, on splitting the datasets
        # the train split will have a total of 85 ner, validation split will have a total of 10 ner and the
        # test split will have a total of 5 ner
        train_ner_count = int(train_proportion * self._total_ner / 100)
        validation_ner_count = int(validation_proportion * self._total_ner / 100)
        test_ner_count = int(test_proportion * self._total_ner / 100)
        # So based on this, we check if adding a note keeps the balance in proportion or not
        # If it does not, we check the splits given in the "remain" field of the dict (which is
        # the 2 other splits
        self._split_weights = [train_proportion, validation_proportion, test_proportion]
        # Based on the split proportions, ner counts and ner distribution
        # we need to split our dataset into train, validation and test split
        # For each split we try and maintain the same distribution (proportions) between ner types
        # that we computed from the entire dataset (given by - ner_distribution)
        # If the entire dataset had AGE:50%, DATE:30%, LOC:20%, we want the same proportions
        # in each of the train, validation and test splits
        # So based on this, we check if adding a note keeps the balance in proportion or not
        # If it does not, we check the splits given in the "remain" field of the dict (which is
        # the 2 other splits
        self._splits_info = {'train': {'remain': ['validation', 'test'],
                                       'total': train_ner_count,
                                       'remain_weights': [validation_proportion, test_proportion],
                                       'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
                             'validation': {'remain': ['train', 'test'],
                                            'total': validation_ner_count,
                                            'remain_weights': [train_proportion, test_proportion],
                                            'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()},
                             'test': {'remain': ['validation', 'train'],
                                      'total': test_ner_count,
                                      'remain_weights': [validation_proportion, train_proportion],
                                      'groups': list(), 'number_of_notes': 0, 'label_dist': Counter()}}

    def __set_split(self, split: str) -> NoReturn:
        """
        Set the split that you are currently checking/processing.
        Based on the split you can perform certain checks and
        computation for that split.
        Args:
            split (str): The split - train, validation or test
        """
        self._split = split

    def __update_label_dist(self, distribution: Counter) -> NoReturn:
        """
        Once we have determined that a note can be added to the split we need to
        update the current count of the ner types in the split. So we pass the ner counts
        in the note that will be updated and update the counts of the ner types in the split.
        Args:
            distribution (Counter): Contains the ner type and its counts (distribution)
        """
        self._splits_info[self._split]['label_dist'].update(distribution)

    def __update_groups(self, note_group_key: str) -> NoReturn:
        """
        Once we have determined that a note can be added to the split, we append
        to a list some distinct element of the note (e.g note_id). This list will
        contain the note_ids of the notes that belong to this split.
        Args:
            note_group_key (str): Contains the note metadata - e.g note_id, institute etc
        """
        self._processed_keys[note_group_key] = self._split
        self._splits_info[self._split]['groups'].append(note_group_key)

    def __check_split(self, distribution: Counter) -> bool:
        """
        This function is used to check the resulting ner distribution in the split on adding this
        note to the split. We check how the proportion of ner changes if this note is added to
        the split. If the proportion exceeds the desired proportion then we return false
        to indicate that adding this note will upset the ner distribution across splits, so we should
        instead check adding this note to another split. If it does not upset the balance then we return
        True, which means we can add this note to this split. The desired proportion of ner is passed
        in the percentages argument - where we have the desired proportion for each ner type.
        Args:
            distribution (Counter): Contains the mapping between ner type and count
        Returns:
            (bool): True if the note can be added to the split, false otherwise
        """
        # Get the current ner types and counts in the split
        split_label_dist = self._splits_info[self._split]['label_dist']
        # Get the max ner count that can be present in the split
        # This will be used to compute the ner proportions in the split
        split_total = self._splits_info[self._split]['total']
        # Check if the proportion of the split picked is zero
        # and return False because we cant add any note to this split
        if split_total == 0:
            return False
        for ner_type, count in distribution.items():
            percentage = (split_label_dist.get(ner_type, 0) + count) / split_total * 100
            # Check if the proportion on adding this note exceeds the desired proportion
            # within the margin of error
            # If it does return false
            if percentage > self._label_dist_percentages[ner_type] + self._margin:
                return False
        return True

    def get_split(self, key: str) -> str:
        """
        Assign a split to the note - based on the distribution of ner types in the note
        and the distribution of ner types in the split. Essentially assign a note to a split
        such that the distribution of ner types in each split is similar, once all notes have
        been assigned to their respective splits.
        Args:
            key (str): The note id or patient id of the note (some grouping key)
        Returns:
            (str): The split
        """
        current_splits = self._splits
        current_weights = self._split_weights
        distribution = self._ner_distribution.get_group_distribution(key=key)
        if self._processed_keys.get(key, False):
            return self._processed_keys[key]
        while True:
            # Pick and set the split
            check_split = random.choices(current_splits, current_weights)[0]
            self.__set_split(check_split)
            # Get the ner distribution for this particular patient (across all the notes associated
            # with this patient) and check if the notes can be added to this split.
            # The margin of error for the ner proportions. As we said above we try and keep the proportions
            # across the splits the same, but we allow for some flexibility, so we can go +- the amount
            # given by margin.
            include = self.__check_split(distribution=distribution)
            if include:
                self.__update_groups(key)
                self.__update_label_dist(distribution=distribution)
                return check_split
            else:
                # Check the two other possible splits
                if len(current_splits) == 3:
                    current_splits = self._splits_info[check_split]['remain']
                    current_weights = self._splits_info[check_split]['remain_weights']
                # Check the one other possible split (when one of the above two other split checks returns false)
                elif len(current_splits) == 2 and current_weights[1 - current_splits.index(check_split)] != 0:
                    index = current_splits.index(check_split)
                    current_splits = [current_splits[1 - index]]
                    current_weights = [100]
                # If it can't be added to any split - choose a split randomly
                else:
                    current_splits = self._splits
                    current_weights = self._split_weights
                    check_split = random.choices(current_splits, current_weights)[0]
                    self.__set_split(check_split)
                    self.__update_groups(key)
                    self.__update_label_dist(distribution=distribution)
                    return check_split
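The acceptance test in __check_split is just a proportion comparison. The self-contained arithmetic below reproduces it for one hypothetical split; all numbers are made up for illustration and are not taken from any real dataset.

# Overall dataset: 50% AGE, 30% DATE, 20% LOC; the train split may hold at most 700 entities.
label_dist_percentages = {'AGE': 50.0, 'DATE': 30.0, 'LOC': 20.0}
split_total = 700
margin = 0.3

split_label_dist = {'AGE': 345, 'DATE': 200, 'LOC': 140}   # entities already in the split
note_distribution = {'AGE': 12, 'DATE': 3}                 # entities in the candidate note/group

def can_add(note_distribution):
    for ner_type, count in note_distribution.items():
        percentage = (split_label_dist.get(ner_type, 0) + count) / split_total * 100
        if percentage > label_dist_percentages[ner_type] + margin:
            return False
    return True

print(can_add(note_distribution))  # False: AGE would reach (345 + 12) / 700 = 51.0%, above 50% + 0.3 margin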
ner_datasets/distribution/ner_distribution.py
DELETED
@@ -1,54 +0,0 @@
from collections import Counter, defaultdict
from typing import Sequence, Mapping, NoReturn


class NERDistribution(object):
    """
    Store the distribution of ner types based on some key.
    That is, we store the NER type distribution for some given key value and we update
    the distribution when spans related to that key are passed
    """

    def __init__(self) -> NoReturn:
        """
        Initialize the NER type - count mapping
        """
        # Counter that captures the ner types and counts per patient/note_id in the dataset
        # Depending on what we set the group_key as. Basically gather counts with respect
        # to some grouping of the notes
        # E.g - {{PATIENT 1: {AGE: 99, DATE: 55, ...}, {PATIENT 2: {AGE: 5, DATE: 9, ...} ... }
        self._ner_distribution = defaultdict(Counter)

    def update_distribution(self, spans: Sequence[Mapping[str, str]], key: str) -> NoReturn:
        """
        Update the distribution of ner types for the given key
        Args:
            spans (Sequence[Mapping[str, str]]): The list of spans in the note
            key (str): The note id or patient id of the note (some grouping)
        """
        # Go through the spans in the note and compute the ner distribution
        # Compute both the overall ner distribution and ner distribution per
        # patient (i.e the ner types in all the notes associated with the patient)
        if not self._ner_distribution.get(key, False):
            self._ner_distribution[key] = Counter()
        for span in spans:
            self._ner_distribution[key][span['label']] += 1

    def get_ner_distribution(self) -> defaultdict:
        """
        Return overall ner distribution. The NER type distribution for every key.
        Returns:
            ner_distribution (defaultdict(Counter)): Overall NER type distribution for all keys
        """
        return self._ner_distribution

    def get_group_distribution(self, key: str) -> Counter:
        """
        Return the NER type distribution for the given key
        Returns:
            (Counter): ner distribution w.r.t some grouping (key)
        """
        if key in self._ner_distribution.keys():
            return self._ner_distribution[key]
        else:
            raise ValueError('Key not found')
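A short, self-contained example of how this class is fed and queried; the spans and keys below are invented purely for illustration.

from ner_datasets.distribution import NERDistribution

ner_distribution = NERDistribution()
# Each span only needs a 'label' field for counting purposes.
ner_distribution.update_distribution(spans=[{'label': 'AGE'}, {'label': 'DATE'}], key='patient-1')
ner_distribution.update_distribution(spans=[{'label': 'AGE'}], key='patient-2')

print(ner_distribution.get_group_distribution('patient-1'))  # Counter({'AGE': 1, 'DATE': 1})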
ner_datasets/distribution/print_distribution.py
DELETED
@@ -1,49 +0,0 @@
from collections import Counter
from typing import Sequence, NoReturn

from .ner_distribution import NERDistribution


class PrintDistribution(object):
    """
    This class is used to print the distribution of NER types
    """

    def __init__(self, ner_distribution: NERDistribution, key_counts: Counter) -> NoReturn:
        """
        Initialize
        Args:
            ner_distribution (NERDistribution): NERDistribution object that keeps track of the NER type distributions
            key_counts (Counter): Number of keys/groups (e.g note_ids, patient ids etc)
        """
        self._ner_distribution = ner_distribution
        self._key_counts = key_counts

    def split_distribution(self, split: str, split_info: Sequence[str]) -> NoReturn:
        """
        Print NER type distribution
        Args:
            split (str): The dataset split
            split_info (Sequence[str]): The keys belonging to that split
        """
        split_distribution = Counter()
        number_of_notes = 0
        for key in split_info:
            number_of_notes += self._key_counts[key]
            split_distribution.update(self._ner_distribution.get_group_distribution(key))
        total_ner = sum(split_distribution.values())
        percentages = {ner_type: float(count) / total_ner * 100 if total_ner else 0
                       for ner_type, count in split_distribution.items()}
        print('{:^70}'.format('============ ' + split.upper() + ' NER Distribution ============='))
        print('{:<20}{:<10}'.format('Number of Notes: ', number_of_notes))
        print('{:<20}{:<10}\n'.format('Number of Groups: ', len(split_info)))
        for ner_type, count in split_distribution.most_common():
            print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
                'NER Type: ', ner_type,
                'Count: ', count,
                'Percentage: ', '{:0.2f}'.format(percentages[ner_type]))
            )
        print('{:<10}{:<10}{:<5}{:<10}{:<5}{:<10}'.format(
            'NER Type:', 'TOTALS', 'Count: ', total_ner, 'Percentage: ', '{:0.2f}'.format(100))
        )
        print('\n')
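Continuing the NERDistribution example above, a minimal sketch of how the report is produced; the keys and counts are illustrative only.

from collections import Counter
from ner_datasets.distribution import NERDistribution, PrintDistribution

ner_distribution = NERDistribution()
ner_distribution.update_distribution(spans=[{'label': 'AGE'}, {'label': 'DATE'}], key='patient-1')
key_counts = Counter({'patient-1': 2})  # patient-1 contributes two notes to the dataset

printer = PrintDistribution(ner_distribution=ner_distribution, key_counts=key_counts)
printer.split_distribution(split='train', split_info=['patient-1'])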
ner_datasets/preprocessing/__init__.py
DELETED
@@ -1,2 +0,0 @@
from .preprocessing_loader import PreprocessingLoader
__all__ = ["PreprocessingLoader"]
ner_datasets/preprocessing/preprocessing_loader.py
DELETED
@@ -1,64 +0,0 @@
from typing import Union, Optional, Sequence

from .sentencizers import SpacySentencizer, NoteSentencizer
from .tokenizers import ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer


class PreprocessingLoader(object):

    @staticmethod
    def get_sentencizer(sentencizer: str) -> Union[SpacySentencizer, NoteSentencizer]:
        """
        Initialize the sentencizer based on the CLI arguments.
        We can either use the sci-spacy (en_core_sci_lg or en_core_web_sm) models or
        consider the entire note as a single sentence.
        Args:
            sentencizer (str): Specify which sentencizer you want to use
        Returns:
            Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]: An object of the requested
                                                                              sentencizer class
        """
        if sentencizer == 'en_core_sci_lg':
            return SpacySentencizer(spacy_model='en_core_sci_lg')
        elif sentencizer == 'en_core_sci_sm':
            return SpacySentencizer(spacy_model='en_core_sci_sm')
        elif sentencizer == 'en_core_web_sm':
            return SpacySentencizer(spacy_model='en_core_web_sm')
        elif sentencizer == 'note':
            return NoteSentencizer()
        else:
            raise ValueError('Invalid sentencizer - does not exist')

    @staticmethod
    def get_tokenizer(
            tokenizer: str,
            abbreviations: Optional[Sequence[str]] = None,
    ) -> Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]:
        """
        Initialize the tokenizer based on the CLI arguments.
        We can either use the default scispacy (en_core_sci_lg or en_core_web_sm)
        or the modified scispacy (with regex rule) tokenizer.
        It also supports the corenlp tokenizer
        Args:
            tokenizer (str): Specify which tokenizer you want to use
            abbreviations (Optional[str]): A list of abbreviations for which tokens will not be split - works only
                                           when used with the custom clinical tokenizer
        Returns:
            Union[SpacyTokenizer, ClinicalSpacyTokenizer, CoreNLPTokenizer]: An object of the requested tokenizer class
        """
        if tokenizer == 'en_core_sci_lg':
            return SpacyTokenizer(spacy_model='en_core_sci_lg')
        elif tokenizer == 'en_core_web_sm':
            return SpacyTokenizer(spacy_model='en_core_web_sm')
        elif tokenizer == 'en':
            return SpacyTokenizer(spacy_model='en')
        elif tokenizer == 'corenlp':
            return CoreNLPTokenizer()
        elif tokenizer == 'clinical':
            # Abbreviations - we won't split tokens that match these (e.g 18F-FDG)
            if abbreviations is None:
                return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
            else:
                return ClinicalSpacyTokenizer(spacy_model='en_core_sci_sm', abbreviations=abbreviations)
        else:
            raise ValueError('Invalid tokenizer - does not exist')
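A minimal sketch of the factory in use; it assumes the named spaCy/scispaCy models are installed in the environment and that the package is importable as ner_datasets.

from ner_datasets.preprocessing import PreprocessingLoader

sentencizer = PreprocessingLoader.get_sentencizer(sentencizer='en_core_sci_sm')
tokenizer = PreprocessingLoader.get_tokenizer(tokenizer='clinical', abbreviations=None)

for sentence in sentencizer.get_sentences('Bruce Wayne is a 60yo man. He lives in Gotham.'):
    print(sentence['start'], sentence['end'], sentence['text'])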
ner_datasets/preprocessing/sentencizers/__init__.py
DELETED
@@ -1,3 +0,0 @@
from .note_sentencizer import NoteSentencizer
from .spacy_sentencizer import SpacySentencizer
__all__=["NoteSentencizer", "SpacySentencizer"]
ner_datasets/preprocessing/sentencizers/mimic_stanza_sentencizer.py
DELETED
@@ -1,37 +0,0 @@
from typing import Iterable, Dict, Union

import stanza


class MimicStanzaSentencizer(object):
    """
    This class is used to read text and split it into
    sentences (and their start and end positions)
    using the mimic stanza package
    """

    def __init__(self, package: str):
        """
        Initialize a mimic stanza model to read text and split it into
        sentences.
        Args:
            package (str): Name of the mimic model
        """
        self._nlp = stanza.Pipeline('en', package=package, processors='tokenize', use_gpu=True)

    def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
        """
        Return an iterator that iterates through the sentences in the text
        Args:
            text (str): The text
        Returns:
            (Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence,
                                                    the start position of the sentence in the entire text
                                                    and the end position of the sentence in the entire text
        """
        doc = self._nlp(text)
        for sentence in doc.sentences:
            yield {'text': sentence.text,
                   'start': sentence.tokens[0].start_char,
                   'end': sentence.tokens[-1].end_char,
                   'last_token': sentence.tokens[-1].text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/sentencizers/note_sentencizer.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
from typing import Iterable, Dict, Union
|
2 |
-
|
3 |
-
|
4 |
-
class NoteSentencizer(object):
|
5 |
-
"""
|
6 |
-
This class is used to read text and split it into
|
7 |
-
sentences (and their start and end positions)
|
8 |
-
This class considers an entire note or text as
|
9 |
-
a single sentence
|
10 |
-
"""
|
11 |
-
|
12 |
-
def __init__(self):
|
13 |
-
"""
|
14 |
-
Nothing to be initialized
|
15 |
-
"""
|
16 |
-
|
17 |
-
def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
18 |
-
"""
|
19 |
-
Return an iterator that iterates through the sentences in the text.
|
20 |
-
In this case it just returns the text itself.
|
21 |
-
Args:
|
22 |
-
text (str): The text
|
23 |
-
Returns:
|
24 |
-
(Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
|
25 |
-
the start position of the sentence in the entire text
|
26 |
-
and the end position of the sentence in the entire text
|
27 |
-
"""
|
28 |
-
yield {
|
29 |
-
'text': text,
|
30 |
-
'start': 0,
|
31 |
-
'end': len(text),
|
32 |
-
'last_token': None
|
33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/sentencizers/spacy_sentencizer.py
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
from typing import Iterable, Dict, Union
|
2 |
-
|
3 |
-
import spacy
|
4 |
-
|
5 |
-
|
6 |
-
class SpacySentencizer(object):
|
7 |
-
"""
|
8 |
-
This class is used to read text and split it into
|
9 |
-
sentences (and their start and end positions)
|
10 |
-
using a spacy model
|
11 |
-
"""
|
12 |
-
|
13 |
-
def __init__(self, spacy_model: str):
|
14 |
-
"""
|
15 |
-
Initialize a spacy model to read text and split it into
|
16 |
-
sentences.
|
17 |
-
Args:
|
18 |
-
spacy_model (str): Name of the spacy model
|
19 |
-
"""
|
20 |
-
self._nlp = spacy.load(spacy_model)
|
21 |
-
|
22 |
-
def get_sentences(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
23 |
-
"""
|
24 |
-
Return an iterator that iterates through the sentences in the text
|
25 |
-
Args:
|
26 |
-
text (str): The text
|
27 |
-
Returns:
|
28 |
-
(Iterable[Dict[str, Union[str, int]]]): Yields a dictionary that contains the text of the sentence
|
29 |
-
the start position of the sentence in the entire text
|
30 |
-
and the end position of the sentence in the entire text
|
31 |
-
"""
|
32 |
-
document = self._nlp(text)
|
33 |
-
for sentence in document.sents:
|
34 |
-
yield {'text': sentence.text,
|
35 |
-
'start': sentence.start_char,
|
36 |
-
'end': sentence.end_char,
|
37 |
-
'last_token': None}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from .spacy_tokenizer import SpacyTokenizer
|
2 |
-
from .core_nlp_tokenizer import CoreNLPTokenizer
|
3 |
-
from .clinical_spacy_tokenizer import ClinicalSpacyTokenizer
|
4 |
-
__all__=["SpacyTokenizer", "CoreNLPTokenizer", "ClinicalSpacyTokenizer"]
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/abbreviations/check.txt
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
sec.
|
2 |
-
secs.
|
3 |
-
Sec.
|
4 |
-
Secs.
|
5 |
-
fig.
|
6 |
-
figs.
|
7 |
-
Fig.
|
8 |
-
Figs.
|
9 |
-
eq.
|
10 |
-
eqs.
|
11 |
-
Eq.
|
12 |
-
Eqs.
|
13 |
-
no.
|
14 |
-
nos.
|
15 |
-
No.
|
16 |
-
Nos.
|
17 |
-
al.
|
18 |
-
gen.
|
19 |
-
sp.
|
20 |
-
nov.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_curated.txt
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
-ve
|
2 |
-
+ve
|
3 |
-
a.c.
|
4 |
-
a/g
|
5 |
-
b.i.d.
|
6 |
-
C&S
|
7 |
-
C/O
|
8 |
-
D/C
|
9 |
-
D&C
|
10 |
-
D and C
|
11 |
-
H&H
|
12 |
-
H&P
|
13 |
-
h.s.
|
14 |
-
H/O
|
15 |
-
h/o
|
16 |
-
I&D
|
17 |
-
M/H
|
18 |
-
N/V
|
19 |
-
O&P
|
20 |
-
O.D.
|
21 |
-
O.S.
|
22 |
-
O.U.
|
23 |
-
p¯
|
24 |
-
p.o.
|
25 |
-
p.r.n.
|
26 |
-
q.d.
|
27 |
-
q.i.d.
|
28 |
-
R/O
|
29 |
-
s/p
|
30 |
-
T&A
|
31 |
-
t.i.d.
|
32 |
-
u/a
|
33 |
-
u**
|
34 |
-
y.o.
|
35 |
-
F/u
|
36 |
-
Crohn's
|
37 |
-
R.N.
|
38 |
-
S/p
|
39 |
-
S/P
|
40 |
-
s/P
|
41 |
-
N/A
|
42 |
-
n/a
|
43 |
-
N/a
|
44 |
-
n/A
|
45 |
-
w/
|
46 |
-
Pt.
|
47 |
-
pt.
|
48 |
-
PT.
|
49 |
-
cf.
|
50 |
-
CF.
|
51 |
-
Cf.
|
52 |
-
dr.
|
53 |
-
DR.
|
54 |
-
Dr.
|
55 |
-
ft.
|
56 |
-
FT.
|
57 |
-
Ft.
|
58 |
-
lt.
|
59 |
-
LT.
|
60 |
-
Lt.
|
61 |
-
mr.
|
62 |
-
MR.
|
63 |
-
Mr.
|
64 |
-
ms.
|
65 |
-
MS.
|
66 |
-
Ms.
|
67 |
-
mt.
|
68 |
-
MT.
|
69 |
-
Mt.
|
70 |
-
mx.
|
71 |
-
MX.
|
72 |
-
Mx.
|
73 |
-
ph.
|
74 |
-
PH.
|
75 |
-
Ph.
|
76 |
-
rd.
|
77 |
-
RD.
|
78 |
-
Rd.
|
79 |
-
st.
|
80 |
-
ST.
|
81 |
-
St.
|
82 |
-
vs.
|
83 |
-
VS.
|
84 |
-
Vs.
|
85 |
-
wm.
|
86 |
-
WM.
|
87 |
-
Wm.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/abbreviations/medical_abbreviations_wiki.txt
DELETED
@@ -1,459 +0,0 @@
|
|
1 |
-
+ve
|
2 |
-
x/12
|
3 |
-
x/40
|
4 |
-
x/52
|
5 |
-
x/7
|
6 |
-
18F-FDG
|
7 |
-
2°
|
8 |
-
2/2
|
9 |
-
3TC
|
10 |
-
5-FU
|
11 |
-
5-HIAA
|
12 |
-
5-HT
|
13 |
-
6MP
|
14 |
-
a.a.
|
15 |
-
A1C
|
16 |
-
Aa.
|
17 |
-
AAOx3
|
18 |
-
A/B
|
19 |
-
a.c.
|
20 |
-
AC&BC
|
21 |
-
ad.
|
22 |
-
part.
|
23 |
-
A+E
|
24 |
-
AF-AFP
|
25 |
-
a.h.
|
26 |
-
altern.
|
27 |
-
d.
|
28 |
-
Anti-
|
29 |
-
A&O
|
30 |
-
A/O
|
31 |
-
A&Ox3
|
32 |
-
A&Ox4
|
33 |
-
a.p.
|
34 |
-
A&P
|
35 |
-
A/P
|
36 |
-
applic.
|
37 |
-
aq.
|
38 |
-
bull.
|
39 |
-
calid.
|
40 |
-
dist.
|
41 |
-
gel.
|
42 |
-
ASC-H
|
43 |
-
ASC-US
|
44 |
-
A-T
|
45 |
-
AT-III
|
46 |
-
aur.
|
47 |
-
dextro.
|
48 |
-
aurist.
|
49 |
-
A&W
|
50 |
-
A/W
|
51 |
-
b.i.d.
|
52 |
-
b/l
|
53 |
-
bl.cult
|
54 |
-
B/O
|
55 |
-
BRCA1
|
56 |
-
BRCA2
|
57 |
-
C1
|
58 |
-
C2
|
59 |
-
c/b
|
60 |
-
CBC/DIFF
|
61 |
-
C/C/E
|
62 |
-
CCK-PZ
|
63 |
-
CHEM-7
|
64 |
-
CHEM-20
|
65 |
-
C/O
|
66 |
-
c/o
|
67 |
-
CO2
|
68 |
-
COX-1
|
69 |
-
COX-2
|
70 |
-
COX-3
|
71 |
-
C/S
|
72 |
-
C&S
|
73 |
-
C-section
|
74 |
-
C-spine
|
75 |
-
C-SSRS
|
76 |
-
c/a/p
|
77 |
-
c/w
|
78 |
-
D5
|
79 |
-
D25
|
80 |
-
D4T
|
81 |
-
D5W
|
82 |
-
D&C
|
83 |
-
D/C
|
84 |
-
D&E
|
85 |
-
DHEA-S
|
86 |
-
Di-Di
|
87 |
-
DM2
|
88 |
-
D/O
|
89 |
-
D/T
|
90 |
-
Ex-n
|
91 |
-
F/C
|
92 |
-
F/C/S
|
93 |
-
FEF25–75
|
94 |
-
FEV1
|
95 |
-
fl.oz.
|
96 |
-
FTA-ABS
|
97 |
-
F/U
|
98 |
-
G6PD
|
99 |
-
G-CSF
|
100 |
-
GM-CSF
|
101 |
-
H/A
|
102 |
-
HbA1c
|
103 |
-
HCO3
|
104 |
-
HDL-C
|
105 |
-
H&E
|
106 |
-
H/H
|
107 |
-
H&H
|
108 |
-
H&M
|
109 |
-
HMG-CoA
|
110 |
-
H-mole
|
111 |
-
H/O
|
112 |
-
H&P
|
113 |
-
H/oPI
|
114 |
-
h.s.
|
115 |
-
I131
|
116 |
-
ICD-10
|
117 |
-
I&D
|
118 |
-
IgG4-RD
|
119 |
-
IgG4-RKD
|
120 |
-
IgG4-ROD
|
121 |
-
IgG4-TIN
|
122 |
-
INF(-α/-β/-γ)
|
123 |
-
I&O
|
124 |
-
IV-DSA
|
125 |
-
L&D
|
126 |
-
LDL-C
|
127 |
-
L-DOPA
|
128 |
-
L/S
|
129 |
-
MC&S
|
130 |
-
M/E
|
131 |
-
MgSO4
|
132 |
-
MHA-TP
|
133 |
-
M&M
|
134 |
-
MMR-D
|
135 |
-
Mono-Di
|
136 |
-
Mono-Mono
|
137 |
-
MS-AFP
|
138 |
-
MSO4
|
139 |
-
MVo2
|
140 |
-
No.
|
141 |
-
rep.
|
142 |
-
n.s.
|
143 |
-
n/t
|
144 |
-
N&V
|
145 |
-
n/v
|
146 |
-
O2
|
147 |
-
OB-GYN
|
148 |
-
ob-gyne
|
149 |
-
O/E
|
150 |
-
O/N
|
151 |
-
O&P
|
152 |
-
P&A
|
153 |
-
PAI-1
|
154 |
-
PAPP-A
|
155 |
-
p.c.
|
156 |
-
PIG-A
|
157 |
-
PM&R
|
158 |
-
p.r.
|
159 |
-
Pt.
|
160 |
-
p.v.
|
161 |
-
P-Y
|
162 |
-
q2wk
|
163 |
-
q6h
|
164 |
-
q6°
|
165 |
-
q.a.d.
|
166 |
-
q.AM
|
167 |
-
q.d.
|
168 |
-
q.d.s.
|
169 |
-
q.h.
|
170 |
-
q.h.s.
|
171 |
-
q.i.d.
|
172 |
-
q.l.
|
173 |
-
q.m.t.
|
174 |
-
q.n.
|
175 |
-
q.n.s.
|
176 |
-
q.o.d.
|
177 |
-
q.o.h.
|
178 |
-
q.s.
|
179 |
-
q.v.
|
180 |
-
q.wk.
|
181 |
-
r/g/m
|
182 |
-
R&M
|
183 |
-
R/O
|
184 |
-
r/r/w
|
185 |
-
R/t
|
186 |
-
RT-PCR
|
187 |
-
S1
|
188 |
-
S2
|
189 |
-
S3
|
190 |
-
S4
|
191 |
-
S&O
|
192 |
-
S.D.
|
193 |
-
op.
|
194 |
-
SMA-6
|
195 |
-
SMA-7
|
196 |
-
s/p
|
197 |
-
spp.
|
198 |
-
Sp.
|
199 |
-
fl.
|
200 |
-
gr.
|
201 |
-
S/S
|
202 |
-
S/Sx
|
203 |
-
Staph.
|
204 |
-
Strep.
|
205 |
-
Strepto.
|
206 |
-
T&A
|
207 |
-
T&C
|
208 |
-
T&S
|
209 |
-
TAH-BSO
|
210 |
-
T2DM
|
211 |
-
T/F
|
212 |
-
T&H
|
213 |
-
Tib-Fib
|
214 |
-
TRF'd
|
215 |
-
TSHR-Ab
|
216 |
-
T.S.T.H.
|
217 |
-
U/A
|
218 |
-
U&E
|
219 |
-
U/O
|
220 |
-
V-fib
|
221 |
-
V/Q
|
222 |
-
WAIS-R
|
223 |
-
W/C
|
224 |
-
WISC-R
|
225 |
-
W/O
|
226 |
-
w/o
|
227 |
-
w/u
|
228 |
-
X-AFP
|
229 |
-
y/o
|
230 |
-
a.c.h.s.
|
231 |
-
ac&hs
|
232 |
-
a.d.
|
233 |
-
ad.
|
234 |
-
add.
|
235 |
-
lib.
|
236 |
-
admov.
|
237 |
-
us.
|
238 |
-
æq.
|
239 |
-
agit.
|
240 |
-
alt.
|
241 |
-
d.
|
242 |
-
dieb.
|
243 |
-
h.
|
244 |
-
hor.
|
245 |
-
a.m.
|
246 |
-
amp.
|
247 |
-
com.
|
248 |
-
dest.
|
249 |
-
ferv.
|
250 |
-
a.l.
|
251 |
-
a.s.
|
252 |
-
a.u.
|
253 |
-
b.d.s.
|
254 |
-
bib.
|
255 |
-
b.i.d.
|
256 |
-
b.d.
|
257 |
-
ind.
|
258 |
-
bol.
|
259 |
-
Ph.Br.
|
260 |
-
b.t.
|
261 |
-
bucc.
|
262 |
-
cap.
|
263 |
-
caps.
|
264 |
-
cap.
|
265 |
-
c.m.
|
266 |
-
c.m.s.
|
267 |
-
c.
|
268 |
-
cib.
|
269 |
-
c.c.
|
270 |
-
cf.
|
271 |
-
c.n.
|
272 |
-
cochl.
|
273 |
-
ampl.
|
274 |
-
infant.
|
275 |
-
mag.
|
276 |
-
mod.
|
277 |
-
parv.
|
278 |
-
colet.
|
279 |
-
comp.
|
280 |
-
contin.
|
281 |
-
cpt.
|
282 |
-
cr.
|
283 |
-
cuj.
|
284 |
-
c.v.
|
285 |
-
cyath.
|
286 |
-
vinos.
|
287 |
-
D5LR
|
288 |
-
D5NS
|
289 |
-
D5W
|
290 |
-
D10W
|
291 |
-
D10W
|
292 |
-
D/C
|
293 |
-
decoct.
|
294 |
-
det.
|
295 |
-
dil.
|
296 |
-
dim.
|
297 |
-
p.
|
298 |
-
æ.
|
299 |
-
disp.
|
300 |
-
div.
|
301 |
-
d.t.d.
|
302 |
-
elix.
|
303 |
-
e.m.p.
|
304 |
-
emuls.
|
305 |
-
exhib.
|
306 |
-
f.
|
307 |
-
f.h.
|
308 |
-
fl.
|
309 |
-
fld.
|
310 |
-
f.m.
|
311 |
-
pil.
|
312 |
-
f.s.a.
|
313 |
-
ft.
|
314 |
-
garg.
|
315 |
-
gutt.
|
316 |
-
habt.
|
317 |
-
decub.
|
318 |
-
intermed.
|
319 |
-
tert.
|
320 |
-
inj.
|
321 |
-
i.m.
|
322 |
-
inf.
|
323 |
-
i.v.
|
324 |
-
i.v.p.
|
325 |
-
lat.
|
326 |
-
dol.
|
327 |
-
lb.
|
328 |
-
l.c.d.
|
329 |
-
liq.
|
330 |
-
lot.
|
331 |
-
M.
|
332 |
-
m.
|
333 |
-
max.
|
334 |
-
m.d.u.
|
335 |
-
mg/dL
|
336 |
-
min.
|
337 |
-
mist.
|
338 |
-
mit.
|
339 |
-
mitt.
|
340 |
-
præscript.
|
341 |
-
neb.
|
342 |
-
noct.
|
343 |
-
n.p.o.
|
344 |
-
1/2NS
|
345 |
-
o 2
|
346 |
-
o2
|
347 |
-
o.d.
|
348 |
-
o.m.
|
349 |
-
omn.
|
350 |
-
bih.
|
351 |
-
o.n.
|
352 |
-
o.s.
|
353 |
-
o.u.
|
354 |
-
p.c.h.s.
|
355 |
-
pc&hs
|
356 |
-
Ph.Br.
|
357 |
-
Ph.Eur.
|
358 |
-
Ph.Int.
|
359 |
-
pig./pigm.
|
360 |
-
p.m.
|
361 |
-
p.o.
|
362 |
-
ppt.
|
363 |
-
p.r.
|
364 |
-
p.r.n.
|
365 |
-
pt.
|
366 |
-
pulv.
|
367 |
-
p.v.
|
368 |
-
q.1
|
369 |
-
q.1°
|
370 |
-
q4PM
|
371 |
-
q.a.m.
|
372 |
-
q.d./q.1.d.
|
373 |
-
q.d.a.m.
|
374 |
-
q.d.p.m.
|
375 |
-
q.p.m.
|
376 |
-
q.q.
|
377 |
-
q.q.h.
|
378 |
-
a.d
|
379 |
-
rep.
|
380 |
-
rept.
|
381 |
-
R/L
|
382 |
-
s.
|
383 |
-
s.a.
|
384 |
-
sem.
|
385 |
-
s.i.d.
|
386 |
-
sig.
|
387 |
-
sing.
|
388 |
-
s.l.
|
389 |
-
sol.
|
390 |
-
s.o.s.
|
391 |
-
s.s.
|
392 |
-
st.
|
393 |
-
sum.
|
394 |
-
supp.
|
395 |
-
susp.
|
396 |
-
syr.
|
397 |
-
tab.
|
398 |
-
tal.
|
399 |
-
t.
|
400 |
-
t.d.s.
|
401 |
-
t.i.d.
|
402 |
-
t.d.
|
403 |
-
tinct.
|
404 |
-
t.i.w.
|
405 |
-
top.
|
406 |
-
tinc.
|
407 |
-
trit.
|
408 |
-
troch.
|
409 |
-
u.d.
|
410 |
-
ut.
|
411 |
-
dict.
|
412 |
-
ung.
|
413 |
-
vag.
|
414 |
-
w/a
|
415 |
-
w/f
|
416 |
-
y.o.
|
417 |
-
ADD-RT
|
418 |
-
A-T
|
419 |
-
PDD-NOS
|
420 |
-
Alzheimer's
|
421 |
-
Age-related
|
422 |
-
Aldosterone-producing
|
423 |
-
Alcohol-related
|
424 |
-
Ataxia-telangiectasia
|
425 |
-
Binswanger's
|
426 |
-
Becker's
|
427 |
-
Bloom's
|
428 |
-
Brown-Séquard
|
429 |
-
Crimean-Congo
|
430 |
-
Cerebro-oculo-facio-skeletal
|
431 |
-
Carbapenem-resistant
|
432 |
-
Drug-resistant
|
433 |
-
End-stage
|
434 |
-
Graft-versus-host
|
435 |
-
Huntington's
|
436 |
-
High-functioning
|
437 |
-
Hypoxanthine-guanine
|
438 |
-
Legionnaires'
|
439 |
-
Low-functioning
|
440 |
-
Multi-drug-resistant
|
441 |
-
Multi-infarct
|
442 |
-
Machado-Joseph
|
443 |
-
Maturity-onset
|
444 |
-
Multi-sensory
|
445 |
-
Obsessive-compulsive
|
446 |
-
Parkinson's
|
447 |
-
kinase-associated
|
448 |
-
Post-polio
|
449 |
-
Port-wine
|
450 |
-
Reye's
|
451 |
-
Sensory-based
|
452 |
-
Vitus's
|
453 |
-
Septo-optic
|
454 |
-
ST-elevation
|
455 |
-
Short-lasting
|
456 |
-
Urticaria-deafness-amyloidosis
|
457 |
-
Wilson's
|
458 |
-
drug-resistant
|
459 |
-
X-linked
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/clinical_spacy_tokenizer.py
DELETED
@@ -1,73 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import spacy
|
3 |
-
from spacy.symbols import ORTH
|
4 |
-
from .spacy_tokenizer import SpacyTokenizer
|
5 |
-
from .utils import DateRegex, CleanRegex, ClinicalRegex
|
6 |
-
|
7 |
-
|
8 |
-
class ClinicalSpacyTokenizer(SpacyTokenizer):
|
9 |
-
"""
|
10 |
-
This class is used to read text and return the tokens
|
11 |
-
present in the text (and their start and end positions)
|
12 |
-
"""
|
13 |
-
|
14 |
-
def __init__(self, spacy_model, abbreviations,
|
15 |
-
split_multiple=True, split_temperature=True,
|
16 |
-
split_percentage=True):
|
17 |
-
"""
|
18 |
-
Initialize a spacy model to read text and split it into
|
19 |
-
tokens.
|
20 |
-
Args:
|
21 |
-
spacy_model (str): Name of the spacy model
|
22 |
-
"""
|
23 |
-
super().__init__(spacy_model)
|
24 |
-
self._nlp.tokenizer.prefix_search = self.__get_prefix_regex(split_multiple, split_temperature,
|
25 |
-
split_percentage).search
|
26 |
-
self._nlp.tokenizer.infix_finditer = self.__get_infix_regex().finditer
|
27 |
-
self._nlp.tokenizer.suffix_search = self.__get_suffix_regex().search
|
28 |
-
new_rules = {}
|
29 |
-
for orth, exc in self._nlp.tokenizer.rules.items():
|
30 |
-
if re.search('((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)[.]$)|(^(W|w)ed$)', orth):
|
31 |
-
continue
|
32 |
-
new_rules[orth] = exc
|
33 |
-
self._nlp.tokenizer.rules = new_rules
|
34 |
-
if (abbreviations != None):
|
35 |
-
for abbreviation in abbreviations:
|
36 |
-
special_case = [{ORTH: abbreviation}]
|
37 |
-
self._nlp.tokenizer.add_special_case(abbreviation, special_case)
|
38 |
-
# this matches any lower case tokens - abstract this part out - whetehr to lowercase abbreviations ro not
|
39 |
-
exclusions_uncased = {abbreviation.lower(): [{ORTH: abbreviation.lower()}] for abbreviation in
|
40 |
-
abbreviations}
|
41 |
-
for k, excl in exclusions_uncased.items():
|
42 |
-
try:
|
43 |
-
self._nlp.tokenizer.add_special_case(k, excl)
|
44 |
-
except:
|
45 |
-
print('failed to add exception: {}'.format(k))
|
46 |
-
|
47 |
-
def __get_prefix_regex(self, split_multiple, split_temperature, split_percentage):
|
48 |
-
|
49 |
-
date_prefix = DateRegex.get_infixes()
|
50 |
-
clinical_prefix = ClinicalRegex.get_prefixes(split_multiple, split_temperature, split_percentage)
|
51 |
-
clean_prefix = CleanRegex.get_prefixes()
|
52 |
-
digit_infix = ClinicalRegex.get_infixes()
|
53 |
-
prefixes = clean_prefix + self._nlp.Defaults.prefixes + date_prefix + clinical_prefix + digit_infix
|
54 |
-
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
|
55 |
-
return prefix_regex
|
56 |
-
|
57 |
-
def __get_suffix_regex(self):
|
58 |
-
clean_suffix = CleanRegex.get_suffixes()
|
59 |
-
suffixes = clean_suffix + self._nlp.Defaults.suffixes
|
60 |
-
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
|
61 |
-
return suffix_regex
|
62 |
-
|
63 |
-
def __get_infix_regex(self):
|
64 |
-
|
65 |
-
date_infixes = DateRegex.get_infixes()
|
66 |
-
clean_infixes = CleanRegex.get_infixes()
|
67 |
-
digit_infix = ClinicalRegex.get_infixes()
|
68 |
-
infixes = self._nlp.Defaults.infixes + date_infixes + clean_infixes
|
69 |
-
infix_re = spacy.util.compile_infix_regex(infixes)
|
70 |
-
return infix_re
|
71 |
-
|
72 |
-
def get_nlp(self):
|
73 |
-
return self._nlp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/core_nlp_tokenizer.py
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
from typing import Iterable, Mapping, Dict, Union
|
3 |
-
|
4 |
-
from pycorenlp import StanfordCoreNLP
|
5 |
-
|
6 |
-
|
7 |
-
class CoreNLPTokenizer(object):
|
8 |
-
"""
|
9 |
-
This class is used to read text and return the tokens
|
10 |
-
present in the text (and their start and end positions)
|
11 |
-
using core nlp tokenization
|
12 |
-
"""
|
13 |
-
|
14 |
-
def __init__(self, port: int = 9000):
|
15 |
-
"""
|
16 |
-
Initialize a core nlp server to read text and split it into
|
17 |
-
tokens using the core nlp annotators
|
18 |
-
Args:
|
19 |
-
port (int): The port to run the server on
|
20 |
-
"""
|
21 |
-
self._core_nlp = StanfordCoreNLP('http://localhost:{0}'.format(port))
|
22 |
-
|
23 |
-
def get_stanford_annotations(self, text: str, annotators: str = 'tokenize,ssplit,pos,lemma') -> Dict:
|
24 |
-
"""
|
25 |
-
Use the core nlp server to annotate the text and return the
|
26 |
-
results as a json object
|
27 |
-
Args:
|
28 |
-
text (str): The text to annotate
|
29 |
-
annotators (str): The core nlp annotations to run on the text
|
30 |
-
Returns:
|
31 |
-
output (Dict): The core nlp results
|
32 |
-
"""
|
33 |
-
output = self._core_nlp.annotate(text, properties={
|
34 |
-
"timeout": "50000",
|
35 |
-
"ssplit.newlineIsSentenceBreak": "two",
|
36 |
-
'annotators': annotators,
|
37 |
-
'outputFormat': 'json'
|
38 |
-
})
|
39 |
-
if type(output) is str:
|
40 |
-
output = json.loads(output, strict=False)
|
41 |
-
return output
|
42 |
-
|
43 |
-
def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
44 |
-
"""
|
45 |
-
Return an iterable that iterates through the tokens in the text
|
46 |
-
Args:
|
47 |
-
text (str): The text to annotate
|
48 |
-
Returns:
|
49 |
-
(Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
|
50 |
-
the start position of the token in the entire text
|
51 |
-
and the end position of the token in the entire text
|
52 |
-
"""
|
53 |
-
stanford_output = self.get_stanford_annotations(text)
|
54 |
-
for sentence in stanford_output['sentences']:
|
55 |
-
for token in sentence['tokens']:
|
56 |
-
yield {'text': token['originalText'],
|
57 |
-
'start': token['characterOffsetBegin'],
|
58 |
-
'end': token['characterOffsetEnd']}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/spacy_tokenizer.py
DELETED
@@ -1,49 +0,0 @@
|
|
1 |
-
import spacy
|
2 |
-
from typing import Tuple, Iterable, Mapping, Dict, Union
|
3 |
-
|
4 |
-
|
5 |
-
class SpacyTokenizer(object):
|
6 |
-
"""
|
7 |
-
This class is used to read text and return the tokens
|
8 |
-
present in the text (and their start and end positions)
|
9 |
-
using spacy
|
10 |
-
"""
|
11 |
-
|
12 |
-
def __init__(self, spacy_model: str):
|
13 |
-
"""
|
14 |
-
Initialize a spacy model to read text and split it into
|
15 |
-
tokens.
|
16 |
-
Args:
|
17 |
-
spacy_model (str): Name of the spacy model
|
18 |
-
"""
|
19 |
-
self._nlp = spacy.load(spacy_model)
|
20 |
-
|
21 |
-
@staticmethod
|
22 |
-
def __get_start_and_end_offset(token: spacy.tokens.Token) -> Tuple[int, int]:
|
23 |
-
"""
|
24 |
-
Return the start position of the token in the entire text
|
25 |
-
and the end position of the token in the entire text
|
26 |
-
Args:
|
27 |
-
token (spacy.tokens.Token): The spacy token object
|
28 |
-
Returns:
|
29 |
-
start (int): the start position of the token in the entire text
|
30 |
-
end (int): the end position of the token in the entire text
|
31 |
-
"""
|
32 |
-
start = token.idx
|
33 |
-
end = start + len(token)
|
34 |
-
return start, end
|
35 |
-
|
36 |
-
def get_tokens(self, text: str) -> Iterable[Dict[str, Union[str, int]]]:
|
37 |
-
"""
|
38 |
-
Return an iterable that iterates through the tokens in the text
|
39 |
-
Args:
|
40 |
-
text (str): The text to annotate
|
41 |
-
Returns:
|
42 |
-
(Iterable[Mapping[str, Union[str, int]]]): Yields a dictionary that contains the text of the token
|
43 |
-
the start position of the token in the entire text
|
44 |
-
and the end position of the token in the entire text
|
45 |
-
"""
|
46 |
-
document = self._nlp(text)
|
47 |
-
for token in document:
|
48 |
-
start, end = SpacyTokenizer.__get_start_and_end_offset(token)
|
49 |
-
yield {'text': token.text, 'start': start, 'end': end}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/utils/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from .date_regex import DateRegex
|
2 |
-
from .clean_regex import CleanRegex
|
3 |
-
from .clinical_regex import ClinicalRegex
|
4 |
-
__all__=["DateRegex", "CleanRegex", "ClinicalRegex"]
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/utils/clean_regex.py
DELETED
@@ -1,64 +0,0 @@
|
|
1 |
-
from typing import List
|
2 |
-
class CleanRegex(object):
|
3 |
-
"""
|
4 |
-
This class is used to define the regexes that will be used by the
|
5 |
-
spacy tokenizer rules. Mainly the regexes are used to clean up
|
6 |
-
tokens that have unwanted characters (e.g extra hyphens).
|
7 |
-
"""
|
8 |
-
#Staff - 3
|
9 |
-
#Hosp - 4, 5
|
10 |
-
#Loc - 2
|
11 |
-
@staticmethod
|
12 |
-
def get_prefixes() -> List[str]:
|
13 |
-
"""
|
14 |
-
This function is used to build the regex that will clean up dirty characters
|
15 |
-
present at the prefix position (start position) of a token. For example the token ---clean
|
16 |
-
has three hyphens that need to be split from the word clean. This regex
|
17 |
-
will be used by spacy to clean it up. This rule considers any characters that is
|
18 |
-
not a letter or a digit as dirty characters
|
19 |
-
Examples: ----------------9/36, :63, -ESH
|
20 |
-
Returns:
|
21 |
-
(list): List of regexes to clean the prefix of the token
|
22 |
-
"""
|
23 |
-
#Handles case 5 of HOSP
|
24 |
-
return ['((?P<prefix>([^a-zA-Z0-9.]))(?P=prefix)*)', '([.])(?!\d+(\W+|$))']
|
25 |
-
|
26 |
-
@staticmethod
|
27 |
-
def get_suffixes() -> List[str]:
|
28 |
-
"""
|
29 |
-
This function is used to build the regex that will clean up dirty characters
|
30 |
-
present at the suffix position (end position) of a token. For example the token clean---
|
31 |
-
has three hyphens that need to be split from the word clean. This regex
|
32 |
-
will be used by spacy to clean it up. This rule considers any characters that is
|
33 |
-
not a letter or a digit as dirty characters
|
34 |
-
Examples: FRANK^, regimen---------------, no)
|
35 |
-
Returns:
|
36 |
-
(list): List of regexes to clean the suffix of the token
|
37 |
-
"""
|
38 |
-
return ['((?P<suffix>([^a-zA-Z0-9]))(?P=suffix)*)']
|
39 |
-
|
40 |
-
@staticmethod
|
41 |
-
def get_infixes() -> List[str]:
|
42 |
-
"""
|
43 |
-
This function is used to build the regex that will clean up dirty characters
|
44 |
-
present at the infix position (in-between position) of a token. For example the token
|
45 |
-
clean---me has three hyphens that need to be split from the word clean and me. This regex
|
46 |
-
will be used by spacy to clean it up. This rule considers any characters that is
|
47 |
-
not a letter or a digit as dirty characters
|
48 |
-
Examples: FRANK^08/30/76^UNDERWOOD, regimen---------------1/37
|
49 |
-
Returns:
|
50 |
-
(list): List of regexes to clean the infix of the token
|
51 |
-
"""
|
52 |
-
#Handles case 3 of STAFF
|
53 |
-
#Handles case 4 of HOSP
|
54 |
-
#Handles case 2 of LOC
|
55 |
-
connector_clean = '\^|;|&#|([\(\)\[\]:="])'
|
56 |
-
#full_stop_clean = '(?<=[a-zA-Z])(\.)(?=([A-Z][A-Za-z]+)|[^a-zA-Z0-9_.]+)'
|
57 |
-
bracket_comma_clean = '(((?<=\d)[,)(](?=[a-zA-Z]+))|((?<=[a-zA-Z])[,)(](?=\w+)))'
|
58 |
-
#special_char_clean = '(?<=[a-zA-Z])(\W{3,}|[_]{3,})(?=[A-Za-z]+)'
|
59 |
-
special_char_clean = '(?<=[a-zA-Z])([_\W_]{3,})(?=[A-Za-z]+)'
|
60 |
-
#Sometimes when there is no space between a period and a comma - it becomes part of the same token
|
61 |
-
#e.g John.,M.D - we need to split this up.
|
62 |
-
comma_period_clean = '(?<=[a-zA-Z])(\.,)(?=[A-Za-z]+)'
|
63 |
-
|
64 |
-
return [connector_clean, bracket_comma_clean, special_char_clean, comma_period_clean]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/utils/clinical_regex.py
DELETED
@@ -1,309 +0,0 @@
|
|
1 |
-
from typing import List
|
2 |
-
class ClinicalRegex(object):
|
3 |
-
"""
|
4 |
-
This class is used to define the regexes that will be used by the
|
5 |
-
spacy tokenizer rules. Mainly the regexes are used to clean up
|
6 |
-
tokens that have unwanted characters and typos (e.g missing spaces).
|
7 |
-
In the descriptions when we mention symbol we refer to any character
|
8 |
-
that is not a letter or a digit or underscore. The spacy tokenizer splits
|
9 |
-
the text by whitespace and applies these rules (along with some default rules)
|
10 |
-
to the indiviudal tokens.
|
11 |
-
"""
|
12 |
-
#Patient - 2, 3, 5
|
13 |
-
#Staff - 1, 2
|
14 |
-
#Hosp - 2, 3
|
15 |
-
#Loc - 1, 3
|
16 |
-
@staticmethod
|
17 |
-
def get_word_typo_prefix():
|
18 |
-
"""
|
19 |
-
If token contains a typo. What we mean by a typo is when two tokens
|
20 |
-
that should be separate tokens are fused into one token because there
|
21 |
-
is a missing space.
|
22 |
-
Examples: JohnMarital Status - John is the name that is fused into the
|
23 |
-
token Marital because of a missing space.
|
24 |
-
The regex checks if we have a sequence of characters followed by another
|
25 |
-
sequence of characters that starts with a capital letter, followed by two or
|
26 |
-
more small letters, we assume this is a typo and split the tokens (two sequences) up.
|
27 |
-
If there is a symbol separating the two sequences, we ease the condition saying
|
28 |
-
the Cpaital letter can be followed by two or more capital/small letters.
|
29 |
-
Returns:
|
30 |
-
(str): regex to clean tokens that are fused because of a missing space
|
31 |
-
"""
|
32 |
-
#Handles cases 2 of PATIENT
|
33 |
-
#Handles cases 1 & 2 of STAFF
|
34 |
-
#Handles cases 2 & 3 of HOSP
|
35 |
-
#Handles cases 1 & 3 of LOC
|
36 |
-
#'(([a-z]+)|([A-Z]+)|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
|
37 |
-
return '(([a-z]+)|([A-Z]{2,})|([A-Z][a-z]+))(?=(([-./]*[A-Z][a-z]{2,})|([-./]+[A-Z][a-zA-Z]{2,})))'
|
38 |
-
|
39 |
-
@staticmethod
|
40 |
-
def get_word_symbol_digit_prefix() -> str:
|
41 |
-
"""
|
42 |
-
If text is followed by one or more symbols and then followed by one or more digits
|
43 |
-
we make the assumption that the text is a seperate token. Spacy will use this regex
|
44 |
-
to extract the text portion as one token and will then move on to
|
45 |
-
process the rest (symbol and tokens) based on the defined rules.
|
46 |
-
Examples: Yang(4986231) - "Yang" will become a seperate token & "(4986231)" will
|
47 |
-
be processed as new token
|
48 |
-
Returns:
|
49 |
-
(str): regex to clean text followed by symbols followed by digits
|
50 |
-
"""
|
51 |
-
#Handles cases 3 & 5 of patient
|
52 |
-
return '([a-zA-Z]+)(?=\W+\d+)'
|
53 |
-
|
54 |
-
@staticmethod
|
55 |
-
def get_multiple_prefix(split_multiple: bool) -> str:
|
56 |
-
"""
|
57 |
-
If text is of the format take it x2 times, this function
|
58 |
-
can be used to treat the entire thing as one token or
|
59 |
-
split into two seperate tokens
|
60 |
-
Args:
|
61 |
-
split_multiple (bool): whether to treat it as one token or split them up
|
62 |
-
Returns:
|
63 |
-
(str): regex to either keep as one token or split into two
|
64 |
-
"""
|
65 |
-
if(split_multiple):
|
66 |
-
return '([x])(?=(\d{1,2}$))'
|
67 |
-
else:
|
68 |
-
return '[x]\d{1,2}$'
|
69 |
-
|
70 |
-
@staticmethod
|
71 |
-
def get_pager_prefix():
|
72 |
-
return '([pXxPb])(?=(\d{4,}|\d+[-]\d+))'
|
73 |
-
|
74 |
-
@staticmethod
|
75 |
-
def get_age_word_prefix():
|
76 |
-
return '([MFmf])(?=\d{2,3}(\W+|$))'
|
77 |
-
|
78 |
-
@staticmethod
|
79 |
-
def get_id_prefix():
|
80 |
-
return '(ID|id|Id)(?=\d{3,})'
|
81 |
-
|
82 |
-
@staticmethod
|
83 |
-
def get_word_period_prefix():
|
84 |
-
return '((cf|CF|Cf|dr|DR|Dr|ft|FT|Ft|lt|LT|Lt|mr|MR|Mr|ms|MS|Ms|mt|MT|Mt|mx|MX|Mx|ph|PH|Ph|rd|RD|Rd|st|ST|St|vs|VS|Vs|wm|WM|Wm|[A-Za-z]{1})[.])(?=((\W+|$)))'
|
85 |
-
|
86 |
-
@staticmethod
|
87 |
-
def get_chemical_prefix():
|
88 |
-
#Vitamin B12 T9 or maybe codes like I48.9- should probaly do \d{1,2} - limit arbitary numbers
|
89 |
-
"""
|
90 |
-
There are certain chemicals, vitamins etc that should not be split. They
|
91 |
-
should be kept as a single token - for example the token "B12" in
|
92 |
-
"Vitamin B12". This regex checks if there is a single capital letter
|
93 |
-
followed by some digits (there can be a hyphen in between those digits)
|
94 |
-
then this most likely represents a token that should not be split
|
95 |
-
Returns:
|
96 |
-
(str): regex to keep vitamin/chemical names as a single token
|
97 |
-
"""
|
98 |
-
#return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
|
99 |
-
return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
|
100 |
-
|
101 |
-
@staticmethod
|
102 |
-
def get_chemical_prefix_small():
|
103 |
-
#Vitamin B12 T9 or maybe codes like I48.9- should probaly do \d{1,2} - limit arbitary numbers
|
104 |
-
"""
|
105 |
-
There are certain chemicals, vitamins etc that should not be split. They
|
106 |
-
should be kept as a single token - for example the token "B12" in
|
107 |
-
"Vitamin B12". This regex checks if there is a single capital letter
|
108 |
-
followed by some digits (there can be a hyphen in between those digits)
|
109 |
-
then this most likely represents a token that should not be split
|
110 |
-
Returns:
|
111 |
-
(str): regex to keep vitamin/chemical names as a single token
|
112 |
-
"""
|
113 |
-
#return '((\d)?[A-EG-LN-OQ-WYZ]{1}\d+([.]\d+)?(-\d{1,2})*)(?=(([\(\)\[\]:="])|\W*$))'
|
114 |
-
return '((\d)?[a-eg-ln-oq-wyz]{1}\d+([.]\d+)?(-\d+)*)(?=(([\(\)\[\]:="])|\W*$))'
|
115 |
-
|
116 |
-
@staticmethod
|
117 |
-
def get_instrument_prefix():
|
118 |
-
"""
|
119 |
-
There are cases when there are tokens like L1-L2-L3, we want to keep these as one
|
120 |
-
single token. This regex checks if there is a capital letter
|
121 |
-
Returns:
|
122 |
-
(str): regex to keep vitamin/chemical names as a single token
|
123 |
-
"""
|
124 |
-
return '([A-Z]{1,2}\d+(?P<instrument>[-:]+)[A-Z]{1,2}\d+((?P=instrument)[A-Z]{1,2}\d+)*)'
|
125 |
-
|
126 |
-
@staticmethod
|
127 |
-
def get_instrument_prefix_small():
|
128 |
-
"""
|
129 |
-
There are cases when there are tokens like L1-L2-L3, we want to keep these as one
|
130 |
-
single token. This regex checks if there is a capital letter
|
131 |
-
Returns:
|
132 |
-
(str): regex to keep vitamin/chemical names as a single token
|
133 |
-
"""
|
134 |
-
return '([a-z]{1,2}\d+(?P<instrument_small>[-:]+)[a-z]{1,2}\d+((?P=instrument_small)[a-z]{1,2}\d+)*)'
|
135 |
-
|
136 |
-
#Handles Case 3, 4, 5 of MRN
|
137 |
-
#Handles Case 1, 2, 3 of PHONE
|
138 |
-
#Handles Case 7, 10 of AGE
|
139 |
-
#Handles Case 1 of IDNUM
|
140 |
-
#Handles Case 3, 5 of PATIENT
|
141 |
-
#Handles Case 7 of HOSP
|
142 |
-
#Handles Case 1 of General
|
143 |
-
@staticmethod
|
144 |
-
def get_age_typo_prefix():
|
145 |
-
"""
|
146 |
-
There are cases when there is no space between the text and the age
|
147 |
-
Example: Plan88yo - we want Plan to be a seperate token
|
148 |
-
Returns:
|
149 |
-
(str):
|
150 |
-
"""
|
151 |
-
age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
|
152 |
-
'|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
|
153 |
-
'(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
|
154 |
-
return '([a-zA-Z]+)(?=((\d{1,3})' + age_suffix + '$))'
|
155 |
-
|
156 |
-
@staticmethod
|
157 |
-
def get_word_digit_split_prefix():
|
158 |
-
#Word followed by more than 3 digits - might not be part of the same token
|
159 |
-
#and could be a typo
|
160 |
-
#This need not be true - maybe we have an id like BFPI980801 - this will be split
|
161 |
-
#BFPI 980801 - but it might be okay to split - need to check
|
162 |
-
#([A-Z][a-z]{2,})(?=\d+)
|
163 |
-
return '([A-Z][a-z]{2,})(?=[A-Za-z]*\d+)'
|
164 |
-
|
165 |
-
@staticmethod
|
166 |
-
def get_word_digit_mix_prefix():
|
167 |
-
#Mix of letters and characters - most likely a typo if the
|
168 |
-
#following characters is a capital letter followed by more than
|
169 |
-
#2 small letters
|
170 |
-
#return '([A-Z]+\d+([A-Z]+(?!([a-z]{2,}))))(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
|
171 |
-
return '([A-Z]+\d+)(?=(\W+|([A-Z][a-z]{2,})|[a-z]{3,}))'
|
172 |
-
|
173 |
-
@staticmethod
|
174 |
-
def get_word_digit_mix_prefix_small():
|
175 |
-
#Mix of letters and characters - most likely a typo if the
|
176 |
-
#following characters is a capital letter followed by more than
|
177 |
-
#2 small letters
|
178 |
-
return '([a-z]+\d+)(?=(\W+|[A-Z][a-z]{2,}|[A-Z]{3,}))'
|
179 |
-
|
180 |
-
@staticmethod
|
181 |
-
def get_word_id_split_prefix():
|
182 |
-
return '([a-zA-Z]+)(?=(\d+[-./]+(\d+|$)))'
|
183 |
-
|
184 |
-
@staticmethod
|
185 |
-
def get_word_section_prefix():
|
186 |
-
#Fix JOHNID/CC - missing space from previous section - JOHN
|
187 |
-
return '([A-Za-z]+)(?=(((?P<slash>[/:]+)[A-Za-z]+)((?P=slash)[A-Za-z]+)*\W+\d+))'
|
188 |
-
|
189 |
-
@staticmethod
|
190 |
-
def get_colon_prefix():
|
191 |
-
#Split tokens before and after the token
|
192 |
-
#Does not split time - we make sure the token ebfore the colon
|
193 |
-
#starts with a letter.
|
194 |
-
#Splits patterns like <CHAR 1>:<CHAR 2> where CHAR 1 starts with a
|
195 |
-
#letter and is followed by one more letters/digits
|
196 |
-
#CHAR 2 is a combination of letters/digits of length greater than 2
|
197 |
-
#This wont split time, but assumes that when the colon is present
|
198 |
-
#the entities on either side of the token are different tokens
|
199 |
-
#A:9 - not split - more likely this makes sense as a single token (could be a chemical)
|
200 |
-
return '([A-Za-z][A-Za-z0-9]+)(?=([:][A-Za-z0-9]{2,}))'
|
201 |
-
|
202 |
-
@staticmethod
|
203 |
-
def get_temperature_prefix(split_temperature):
|
204 |
-
if(split_temperature):
|
205 |
-
return '((\d+)|(\d+[.]\d+))(?=(\u00B0([FCK]{1}|$)))'
|
206 |
-
else:
|
207 |
-
return '(((\d+)|(\d+[.]\d+))\u00B0([FCK]{1}|$))|(\u00A9[FCK]{1})'
|
208 |
-
|
209 |
-
@staticmethod
|
210 |
-
def get_percentage_prefix(split_percentage):
|
211 |
-
"""
|
212 |
-
If text is of the format take it 20% times, this function
|
213 |
-
can be used to treat the entire thing as one token or
|
214 |
-
split into two seperate tokens
|
215 |
-
Args:
|
216 |
-
split_percentage (bool): whether to treat it as one token or split them up
|
217 |
-
Returns:
|
218 |
-
(str): regex to either keep as one token or split into two
|
219 |
-
"""
|
220 |
-
if(split_percentage):
|
221 |
-
return '(((\d+)|(\d+[.]\d+)))(?=(%(\W+|$)))'
|
222 |
-
else:
|
223 |
-
return '(((\d+)|(\d+[.]\d+))%(\W+|$))'
|
224 |
-
|
225 |
-
@staticmethod
|
226 |
-
def get_value_range_prefixes():
|
227 |
-
#The following regex might not work on .4-.5 - no number before decimal point
|
228 |
-
#need to figure this out without breaking anything else
|
229 |
-
value_range_1 = '(\d{1})(?=([-]((\d{1,2}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
230 |
-
value_range_2 = '(\d{2})(?=([-]((\d{2,3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
231 |
-
value_range_3 = '(\d{3})(?=([-]((\d{3}|(\d+)[.](\d+)))([a-zA-Z]+|[\W]*$)))'
|
232 |
-
return value_range_1, value_range_2, value_range_3
|
233 |
-
|
234 |
-
@staticmethod
|
235 |
-
def get_year_range_prefix():
|
236 |
-
return '(\d{4})(?=([-](\d{4})([a-zA-Z]+|[\W]*$)))'
|
237 |
-
|
238 |
-
@staticmethod
|
239 |
-
def get_short_digit_id_prefix():
|
240 |
-
#4A, 3C etc
|
241 |
-
return '(\d{1,2}[A-EG-LN-WZ]{1}(?=(\W+|$)))'
|
242 |
-
|
243 |
-
#Handles Case 1, 2 of MRN
|
244 |
-
#Handles Case 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19 of AGE
|
245 |
-
#Handles Case 2, 3, 5 of IDNUM
|
246 |
-
#Handles Case 1 of HOSP
|
247 |
-
@staticmethod
|
248 |
-
def get_digit_symbol_word_prefix():
|
249 |
-
return '((\d+)|(\d+[.]\d+))(?=\W+[a-zA-Z]+)'
|
250 |
-
|
251 |
-
@staticmethod
|
252 |
-
def get_digit_age_split_prefix():
|
253 |
-
age_suffix = '(([yY][eE][aA][rR]|[yY][oO]' + \
|
254 |
-
'|[yY][rR]|[yY]\.[oO]|[yY]/[oO]|[fF]|[mM]|[yY])' + \
|
255 |
-
'(-)*([o|O][l|L][d|D]|[f|F]|[m|M]|[o|O])?)'
|
256 |
-
return '((\d{1,3}))(?=(' + age_suffix + '\W*$))'
|
257 |
-
|
258 |
-
@staticmethod
|
259 |
-
def get_digit_word_short_prefix():
|
260 |
-
return '((\d+)|(\d+[.]\d+))([a-z]{1,2}|[A-Z]{1,2})(?=(\W*$))'
|
261 |
-
|
262 |
-
@staticmethod
|
263 |
-
def get_digit_word_typo_prefix():
|
264 |
-
return '((\d+)|(\d+[.]\d+))(?=[a-zA-Z]{1}[a-zA-Z\W]+)'
|
265 |
-
|
266 |
-
@staticmethod
|
267 |
-
def get_prefixes(split_multiple, split_temperature, split_percentage):
|
268 |
-
word_typo_prefix = ClinicalRegex.get_word_typo_prefix()
|
269 |
-
word_symbol_digit_prefix = ClinicalRegex.get_word_symbol_digit_prefix()
|
270 |
-
pager_prefix = ClinicalRegex.get_pager_prefix()
|
271 |
-
age_word_prefix = ClinicalRegex.get_age_word_prefix()
|
272 |
-
word_period_prefix = ClinicalRegex.get_word_period_prefix()
|
273 |
-
id_prefix = ClinicalRegex.get_id_prefix()
|
274 |
-
multiple_prefix = ClinicalRegex.get_multiple_prefix(split_multiple)
|
275 |
-
chemical_prefix = ClinicalRegex.get_chemical_prefix()
|
276 |
-
chemical_prefix_small = ClinicalRegex.get_chemical_prefix_small()
|
277 |
-
instrument_prefix = ClinicalRegex.get_instrument_prefix()
|
278 |
-
instrument_prefix_small = ClinicalRegex.get_instrument_prefix_small()
|
279 |
-
age_typo_prefix = ClinicalRegex.get_age_typo_prefix()
|
280 |
-
word_digit_split_prefix = ClinicalRegex.get_word_digit_split_prefix()
|
281 |
-
word_digit_mix_prefix = ClinicalRegex.get_word_digit_mix_prefix()
|
282 |
-
word_digit_mix_prefix_small = ClinicalRegex.get_word_digit_mix_prefix_small()
|
283 |
-
word_id_split_prefix = ClinicalRegex.get_word_id_split_prefix()
|
284 |
-
word_section_prefix = ClinicalRegex.get_word_section_prefix()
|
285 |
-
colon_prefix = ClinicalRegex.get_colon_prefix()
|
286 |
-
temperature_prefix = ClinicalRegex.get_temperature_prefix(split_temperature)
|
287 |
-
percentage_prefix = ClinicalRegex.get_percentage_prefix(split_percentage)
|
288 |
-
value_range_1, value_range_2, value_range_3 = ClinicalRegex.get_value_range_prefixes()
|
289 |
-
year_range_prefix = ClinicalRegex.get_year_range_prefix()
|
290 |
-
short_digit_id_prefix = ClinicalRegex.get_short_digit_id_prefix()
|
291 |
-
digit_symbol_word_prefix = ClinicalRegex.get_digit_symbol_word_prefix()
|
292 |
-
digit_age_split_prefix = ClinicalRegex.get_digit_age_split_prefix()
|
293 |
-
digit_word_short_prefix = ClinicalRegex.get_digit_word_short_prefix()
|
294 |
-
digit_word_typo_prefix = ClinicalRegex.get_digit_word_typo_prefix()
|
295 |
-
|
296 |
-
return [word_typo_prefix, word_symbol_digit_prefix, pager_prefix, age_word_prefix,\
|
297 |
-
word_period_prefix, id_prefix, multiple_prefix, chemical_prefix, chemical_prefix_small,\
|
298 |
-
instrument_prefix, instrument_prefix_small, age_typo_prefix, word_digit_split_prefix,\
|
299 |
-
word_id_split_prefix, word_digit_mix_prefix, word_digit_mix_prefix_small, \
|
300 |
-
word_section_prefix, colon_prefix, temperature_prefix,\
|
301 |
-
percentage_prefix, value_range_1, value_range_2, value_range_3, year_range_prefix,\
|
302 |
-
short_digit_id_prefix, digit_symbol_word_prefix, digit_age_split_prefix,\
|
303 |
-
digit_word_short_prefix, digit_word_typo_prefix]
|
304 |
-
|
305 |
-
@staticmethod
|
306 |
-
def get_infixes():
|
307 |
-
digit_infix = '(\d+(?P<sep>[-:]+)\d+((?P=sep)\d+)*)'
|
308 |
-
return [digit_infix, ]
|
309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/preprocessing/tokenizers/utils/date_regex.py
DELETED
@@ -1,104 +0,0 @@
|
|
1 |
-
class DateRegex(object):
|
2 |
-
|
3 |
-
@staticmethod
|
4 |
-
def __get_day_attributes():
|
5 |
-
# day of the month with optional suffix, such as 7th, 22nd
|
6 |
-
dd = '(([0-2]?[0-9]|3[01])(\s*)([sS][tT]|[nN][dD]|[rR][dD]|[tT][hH])?)'
|
7 |
-
# two-digit numeric day of the month
|
8 |
-
DD = '(0[0-9]|[1-2][0-9]|3[01])'
|
9 |
-
|
10 |
-
return dd, DD
|
11 |
-
|
12 |
-
@staticmethod
|
13 |
-
def __get_month_attributes():
|
14 |
-
|
15 |
-
m = \
|
16 |
-
'([jJ][aA][nN]([uU][aA][rR][yY])?|'+\
|
17 |
-
'[fF][eE][bB]([rR][uU][aA][rR][yY])?|'+\
|
18 |
-
'[mM][aA][rR]([cC][hH])?|'+\
|
19 |
-
'[aA][pP][rR]([iI][lL])?|'+\
|
20 |
-
'[mM][aA][yY]|'+\
|
21 |
-
'[jJ][uU][nN]([eE])?|'+\
|
22 |
-
'[jJ][uU][lL]([yY])?|'+\
|
23 |
-
'[aA][uU][gG]([uU][sS][tT])?|'+\
|
24 |
-
'[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
|
25 |
-
'[oO][cC][tT]([oO][bB][eE][rR])?|'+\
|
26 |
-
'[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
|
27 |
-
'[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
|
28 |
-
M = m
|
29 |
-
|
30 |
-
# numeric month
|
31 |
-
mm = '(0?[0-9]|1[0-2]|' + m + ')'
|
32 |
-
|
33 |
-
# two digit month
|
34 |
-
MM = '(0[0-9]|1[0-2]|' + m + ')'
|
35 |
-
|
36 |
-
return m, M, mm, MM
|
37 |
-
|
38 |
-
@staticmethod
|
39 |
-
def __get_year_attributes():
|
40 |
-
|
41 |
-
# two or four digit year
|
42 |
-
y = '([0-9]{4}|[0-9]{2})'
|
43 |
-
|
44 |
-
# two digit year
|
45 |
-
yy = '([0-9]{2})'
|
46 |
-
|
47 |
-
# four digit year
|
48 |
-
YY = '([0-9]{4})'
|
49 |
-
|
50 |
-
return y, yy, YY
|
51 |
-
|
52 |
-
@staticmethod
|
53 |
-
def __get_sep_attributes():
|
54 |
-
|
55 |
-
date_sep = '[-./]'
|
56 |
-
date_sep_optional = '[-./]*'
|
57 |
-
date_sep_no_full = '[-/]'
|
58 |
-
|
59 |
-
return date_sep, date_sep_optional, date_sep_no_full
|
60 |
-
|
61 |
-
#def get_week_attributes():
|
62 |
-
# w = \
|
63 |
-
# '([mM][oO][nN]([dD][aA][yY])?|'+\
|
64 |
-
# '[tT][uU][eE]([sS][dD][aA][yY])?|'+\
|
65 |
-
# '[wW][eE][dD]([nN][eE][sS][dD][aA][yY])?|'+\
|
66 |
-
# '[tT][hH][uU][gG]([uU][sS][tT])?|'+\
|
67 |
-
# '[sS][eE][pP]([tT][eE][mM][bB][eE][rR])?|'+\
|
68 |
-
# '[oO][cC][tT]([oO][bB][eE][rR])?|'+\
|
69 |
-
# '[nN][oO][vV]([eE][mM][bB][eE][rR])?|'+\
|
70 |
-
# '[dD][eE][cC]([eE][mM][bB][eE][rR])?)'
|
71 |
-
|
72 |
-
@staticmethod
|
73 |
-
def get_infixes():
|
74 |
-
|
75 |
-
dd, DD = DateRegex.__get_day_attributes()
|
76 |
-
m, M, mm, MM = DateRegex.__get_month_attributes()
|
77 |
-
y, yy, YY = DateRegex.__get_year_attributes()
|
78 |
-
date_sep, date_sep_optional, date_sep_no_full = DateRegex.__get_sep_attributes()
|
79 |
-
|
80 |
-
date_1 = y + '/' + mm + '/' + dd + '(?!([/]+|\d+))'
|
81 |
-
date_2 = y + '/' + dd + '/' + mm + '(?!([/]+|\d+))'
|
82 |
-
date_3 = dd + '/' + mm + '/' + y + '(?!([/]+|\d+))'
|
83 |
-
date_4 = mm + '/' + dd + '/' + y + '(?!([/]+|\d+))'
|
84 |
-
#Do I make this optional (date_sep_optional) - need to check
|
85 |
-
date_5 = y + date_sep + m + date_sep + dd + '(?!\d)'
|
86 |
-
date_6 = y + date_sep + dd + date_sep + m
|
87 |
-
date_7 = dd + date_sep + m + date_sep + y
|
88 |
-
date_8 = m + date_sep + dd + date_sep + y
|
89 |
-
date_9 = y + date_sep + m
|
90 |
-
date_10 = m + date_sep + y
|
91 |
-
date_11 = dd + date_sep + m
|
92 |
-
date_12 = m + date_sep + dd
|
93 |
-
date_13 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
|
94 |
-
date_14 = '(?<!([/]|\d))' + y + '/' + dd + '(?!([/]+|\d+))'
|
95 |
-
date_15 = '(?<!([/]|\d))' + dd + '/' + y + '(?!([/]+|\d+))'
|
96 |
-
date_16 = '(?<!([/]|\d))' + mm + '/' + y + '(?!([/]+|\d+))'
|
97 |
-
date_17 = '(?<!([/]|\d))' + dd + '/' + mm + '(?!([/]+|\d+))'
|
98 |
-
date_18 = '(?<!([/]|\d))' + mm + '/' + dd + '(?!([/]+|\d+))'
|
99 |
-
|
100 |
-
date_infixes = [date_1, date_2, date_3, date_4, date_5, date_6,\
|
101 |
-
date_7, date_8, date_9, date_10, date_11, date_12,\
|
102 |
-
date_13, date_14, date_15, date_16, date_17, date_18]
|
103 |
-
|
104 |
-
return date_infixes
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ner_datasets/span_fixer.py
DELETED
@@ -1,380 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import json
|
3 |
-
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
4 |
-
from typing import Iterable, Dict, List, Sequence, Union, Mapping, Tuple, NoReturn
|
5 |
-
|
6 |
-
from .preprocessing import PreprocessingLoader
|
7 |
-
|
8 |
-
|
9 |
-
class SpanFixer(object):
|
10 |
-
"""
|
11 |
-
The tokens and spans may not align depending on the tokenizer used.
|
12 |
-
This class either expands the span to cover the tokens, so we don't have a mismatch.
|
13 |
-
A mismatch is when a span_start will not coincide with some token_start or the span_end
|
14 |
-
will not coincide with some token_end. This class changes the span_start and span_end
|
15 |
-
so that the span_start will coincide with some token_start and the span_end
|
16 |
-
will coincide with some token_end - and we don't get any position mismatch errors while
|
17 |
-
building our dataset. This entire process involves updating span positions which can lead to duplicate
|
18 |
-
or overlapping spans, which then need to be removed.
|
19 |
-
E.g we have text: The patient is 75yo man
|
20 |
-
AGE Span: 75
|
21 |
-
Token: 75yo
|
22 |
-
As you can see the span is smaller than the token, which will lead to an error when
|
23 |
-
building the NER dataset.
|
24 |
-
To ensure this does not happen, we correct the span. We change the span from
|
25 |
-
75 to 75yo -> So now AGE Span is 75yo instead of 75. This script essentially changes
|
26 |
-
the annotated spans to match the tokens. In an ideal case we wouldn't need this script
|
27 |
-
but since medical notes have many typos, this script becomes necessary to deal with
|
28 |
-
issues and changes that arise from different tokenizers.
|
29 |
-
Also sort the spans and convert the start and end keys of the spans to integers
|
30 |
-
"""
|
31 |
-
|
32 |
-
def __init__(
|
33 |
-
self,
|
34 |
-
sentencizer: str,
|
35 |
-
tokenizer: str,
|
36 |
-
ner_priorities: Mapping[str, int],
|
37 |
-
verbose: bool = True
|
38 |
-
) -> NoReturn:
|
39 |
-
"""
|
40 |
-
Initialize the sentencizer and tokenizer
|
41 |
-
Args:
|
42 |
-
sentencizer (str): The sentencizer to use for splitting text into sentences
|
43 |
-
tokenizer (str): The tokenizer to use for splitting text into tokens
|
44 |
-
ner_priorities (Mapping[str, int]): The priority when choosing which duplicates to remove.
|
45 |
-
Mapping that represents a priority for each NER type
|
46 |
-
verbose (bool): To print out warnings etc
|
47 |
-
"""
|
48 |
-
self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer)
|
49 |
-
self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer)
|
50 |
-
self._ner_priorities = ner_priorities
|
51 |
-
self._verbose = verbose
|
52 |
-
|
53 |
-
def __get_token_positions(self, text: str) -> Tuple[Dict[int, int], Dict[int, int]]:
|
54 |
-
"""
|
55 |
-
Get the start and end positions of all the tokens in the note.
|
56 |
-
Args:
|
57 |
-
text (str): The text present in the note
|
58 |
-
Returns:
|
59 |
-
token_start_positions (Mapping[int, int]): The start positions of all the tokens in the note
|
60 |
-
token_end_positions (Mapping[int, int]): The end positions of all the tokens in the note
|
61 |
-
"""
|
62 |
-
token_start_positions = dict()
|
63 |
-
token_end_positions = dict()
|
64 |
-
for sentence in self._sentencizer.get_sentences(text):
|
65 |
-
offset = sentence['start']
|
66 |
-
for token in self._tokenizer.get_tokens(sentence['text']):
|
67 |
-
start = token['start'] + offset
|
68 |
-
end = token['end'] + offset
|
69 |
-
token_start_positions[start] = 1
|
70 |
-
token_end_positions[end] = 1
|
71 |
-
return token_start_positions, token_end_positions
|
72 |
-
|
73 |
-
def get_duplicates(
|
74 |
-
self,
|
75 |
-
spans: List[Dict[str, Union[str, int]]],
|
76 |
-
) -> List[int]:
|
77 |
-
"""
|
78 |
-
Return the indexes where there are duplicate/overlapping spans. A duplicate or
|
79 |
-
span is one where the same token can have two labels.
|
80 |
-
E.g:
|
81 |
-
Token: BWH^Bruce
|
82 |
-
This is a single token where BWH is the hospital label and Bruce is the Patient label
|
83 |
-
The fix_alignment function assigns this entre token the hospital label but it also
|
84 |
-
assigns this entire token the patient label. Since we have two labels for the same
|
85 |
-
token, we need to remove one of them.
|
86 |
-
We assign this entire token one label - either hospital label or the patient label
|
87 |
-
In this case we assign patient because of higher priority. So now we need to remove
|
88 |
-
the hospital label from the dataset (since it is essentially a duplicate label). This
|
89 |
-
script handles this case.
|
90 |
-
There are cases when two different labels match the same token partially
|
91 |
-
E.g
|
92 |
-
Text: JT/781-815-9090
|
93 |
-
Spans: JT - hospital, 781-815-9090 - Phone
|
94 |
-
Tokens: (Jt/781) & (- 815 - 9090)
|
95 |
-
-        As you can see, the token JT/781 will be assigned a label in the fix_alignment function,
-        but 781-815-9090 is also a phone number and the 781 portion is overlapped, and we need to resolve this.
-        In this script, we resolve it by treating JT/781 as one span (hospital) and
-        -815-9090 as another span (phone).
-        Args:
-            spans (List[Dict[str, Union[str, int]]]): The NER spans in the note
-        Returns:
-            remove_spans (Sequence[int]): A list of indexes of the spans to remove
-        """
-        remove_spans = list()
-        prev_start = -1
-        prev_end = -1
-        prev_label = None
-        prev_index = None
-        spans.sort(key=lambda _span: (_span['start'], _span['end']))
-        for index, span in enumerate(spans):
-            current_start = span['start']
-            current_end = span['end']
-            current_label = span['label']
-            if type(current_start) != int or type(current_end) != int:
-                raise ValueError('The start and end keys of the span must be of type int')
-            # Check if the current span matches another span,
-            # that is, if this span covers the same tokens as the
-            # previous span (but has a different label).
-            # Based on the priority, treat the span with the lower
-            # priority label as a duplicate and add it to the
-            # list of spans that need to be removed.
-            if current_start == prev_start and current_end == prev_end:
-                if self._ner_priorities[current_label] > self._ner_priorities[prev_label]:
-                    # Store index of the previous span if it has lower priority
-                    remove_spans.append(prev_index)
-                    # Reset span details
-                    prev_start = current_start
-                    prev_end = current_end
-                    prev_index = index
-                    prev_label = current_label
-                    if self._verbose:
-                        print('DUPLICATE: ', span)
-                        print('REMOVED: ', spans[remove_spans[-1]])
-                elif self._ner_priorities[current_label] <= self._ner_priorities[prev_label]:
-                    # Store current index of span if it has lower priority
-                    remove_spans.append(index)
-                    if self._verbose:
-                        print('DUPLICATE: ', spans[prev_index])
-                        print('REMOVED: ', spans[remove_spans[-1]])
-            # Check for overlapping span
-            elif current_start < prev_end:
-                # If the current span end matches the overlapping span end,
-                # remove the current span, since it is smaller
-                if current_end <= prev_end:
-                    remove_spans.append(index)
-                    if self._verbose:
-                        print('DUPLICATE: ', spans[prev_index])
-                        print('REMOVED: ', spans[remove_spans[-1]])
-                # If the current end is greater than the prev_end
-                # then we split it into two spans. We treat the previous span
-                # as one span and the end of the previous span to the end of the current span
-                # as another span.
-                elif current_end > prev_end:
-                    # Create the new span - start=previous_span_end, end=current_span_end
-                    overlap_length = spans[prev_index]['end'] - current_start
-                    new_text = span['text'][overlap_length:]
-                    # Remove extra spaces that may arise during this span separation
-                    new_text = re.sub('^(\s+)', '', new_text, flags=re.DOTALL)
-                    span['start'] = current_end - len(new_text)
-                    span['text'] = new_text
-                    if self._verbose:
-                        print('OVERLAP: ', spans[prev_index])
-                        print('UPDATED: ', span)
-                    # Reset span details
-                    prev_start = current_start
-                    prev_end = current_end
-                    prev_label = current_label
-                    prev_index = index
-            # Reset span details
-            else:
-                prev_start = current_start
-                prev_end = current_end
-                prev_label = current_label
-                prev_index = index
-        return remove_spans
-
-    def fix_alignment(
-            self,
-            text: str,
-            spans: Sequence[Dict[str, Union[str, int]]]
-    ) -> Iterable[Dict[str, Union[str, int]]]:
-        """
-        Align the spans and tokens. When the tokens and spans don't align, we change the
-        start and end positions of the spans so that they align with the tokens. This is
-        needed when a different tokenizer is used and the spans, which are defined against
-        a different tokenizer, don't line up with the new tokenizer. Also remove spaces present
-        at the start or end of the span.
-        E.g:
-        Token: BWH^Bruce
-        This is a single token where BWH is the hospital label and Bruce is the patient label.
-        The fix_alignment function assigns this entire token the hospital label, but it also
-        assigns this entire token the patient label. This function basically expands a span
-        so that it matches the start and end positions of some token. By doing this it may create
-        overlapping and duplicate spans: it expands the patient label to match the
-        start of the token and it expands the hospital label to match the end of the token.
-        Args:
-            text (str): The text present in the note
-            spans (Sequence[Dict[str, Union[str, int]]]): The NER spans in the note
-        Returns:
-            (Iterable[Dict[str, Union[str, int]]]): Iterable through the modified spans
-        """
-        # Get token start and end positions so that we can check if a span
-        # coincides with the start and end position of some token.
-        token_start_positions, token_end_positions = self.__get_token_positions(text)
-        for span in spans:
-            start = span['start']
-            end = span['end']
-            if type(start) != int or type(end) != int:
-                raise ValueError('The start and end keys of the span must be of type int')
-            if re.search('^\s', text[start:end]):
-                if self._verbose:
-                    print('WARNING - space present in the start of the span')
-                start = start + 1
-            if re.search('(\s+)$', text[start:end], flags=re.DOTALL):
-                new_text = re.sub('(\s+)$', '', text[start:end], flags=re.DOTALL)
-                end = start + len(new_text)
-            # When a span does not coincide with the start and end position of some token,
-            # it means there will be an error when building the NER dataset. We try and avoid
-            # that error by updating the spans themselves, that is, we expand the start/end positions
-            # of the spans so that they are aligned with the tokens.
-            while token_start_positions.get(start, False) is False:
-                start -= 1
-            while token_end_positions.get(end, False) is False:
-                end += 1
-            # Print what the old span was and what the new expanded span will look like
-            if self._verbose and (int(span['start']) != start or int(span['end']) != end):
-                print('OLD SPAN: ', text[int(span['start']):int(span['end'])])
-                print('NEW SPAN: ', text[start:end])
-            # Update the span with its new start and end positions
-            span['start'] = start
-            span['end'] = end
-            span['text'] = text[start:end]
-            yield span
-
-    def fix_note(
-            self,
-            text: str,
-            spans: Sequence[Dict[str, Union[str, int]]],
-    ) -> Iterable[Dict[str, Union[str, int]]]:
-        """
-        This function changes the span_start and span_end
-        so that the span_start will coincide with some token_start and the span_end
-        will coincide with some token_end, and also removes duplicate/overlapping spans
-        that may arise when we change the span start and end positions. The resulting
-        spans from this function will always coincide with some token start and token
-        end, and hence will not have any token and span mismatch errors when building the
-        NER dataset. For more details and examples check the documentation of the
-        fix_alignment and get_duplicates functions.
-        Args:
-            text (str): The text present in the note
-            spans (Sequence[Mapping[str, Union[str, int]]]): The NER spans in the note
-        Returns:
-            (Iterable[Mapping[str, Union[str, int]]]): Iterable through the fixed spans
-        """
-        # Fix span position alignment
-        spans = [span for span in self.fix_alignment(text=text, spans=spans)]
-        # Check for duplicate/overlapping spans
-        remove_spans = self.get_duplicates(spans=spans)
-        for index, span in enumerate(spans):
-            # Remove the duplicate/overlapping spans
-            if index not in remove_spans:
-                yield span
-
-    def fix(
-            self,
-            input_file: str,
-            text_key: str = 'text',
-            spans_key: str = 'spans'
-    ) -> Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]:
-        """
-        This function changes the span_start and span_end
-        so that the span_start will coincide with some token_start and the span_end
-        will coincide with some token_end, and also removes duplicate/overlapping spans
-        that may arise when we change the span start and end positions. The resulting
-        spans from this function will always coincide with some token start and token
-        end, and hence will not have any token and span mismatch errors when building the
-        NER dataset. For more details and examples check the documentation of the
-        fix_alignment and get_duplicates functions. Fix spans that arise due to bad typos,
-        which are not fixed during tokenization. This essentially updates the spans so that
-        they line up with the start and end positions of tokens - so that there is no error
-        when we assign labels to tokens based on these spans.
-        Args:
-            input_file (str): The file that contains the notes that we want to fix the token issues in
-            text_key (str): The key where the note & token text is present in the json object
-            spans_key (str): The key where the note spans are present in the json object
-        Returns:
-            (Iterable[Dict[str, Union[str, Dict[str, str], List[Dict[str, str]]]]]): Iterable through the fixed
-            notes
-        """
-        for line in open(input_file, 'r'):
-            note = json.loads(line)
-            note[spans_key] = [span for span in self.fix_note(text=note[text_key], spans=note[spans_key])]
-            yield note
-
-
-def main():
-    # The following code sets up the arguments to be passed via CLI or via a JSON file
-    cli_parser = ArgumentParser(
-        description='configuration arguments provided at run time from the CLI',
-        formatter_class=ArgumentDefaultsHelpFormatter
-    )
-    cli_parser.add_argument(
-        '--input_file',
-        type=str,
-        required=True,
-        help='the jsonl file that contains the notes'
-    )
-    cli_parser.add_argument(
-        '--sentencizer',
-        type=str,
-        required=True,
-        help='the sentencizer to use for splitting notes into sentences'
-    )
-    cli_parser.add_argument(
-        '--tokenizer',
-        type=str,
-        required=True,
-        help='the tokenizer to use for splitting text into tokens'
-    )
-    cli_parser.add_argument(
-        '--abbreviations_file',
-        type=str,
-        default=None,
-        help='file that will be used by the clinical tokenizer to handle abbreviations'
-    )
-    cli_parser.add_argument(
-        '--ner_types',
-        nargs="+",
-        required=True,
-        help='the NER types'
-    )
-    cli_parser.add_argument(
-        '--ner_priorities',
-        nargs="+",
-        required=True,
-        help='the priorities for the NER types - the priority when choosing which duplicates to remove'
-    )
-    cli_parser.add_argument(
-        '--text_key',
-        type=str,
-        default='text',
-        help='the key where the note & token text is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--spans_key',
-        type=str,
-        default='spans',
-        help='the key where the note spans are present in the json object'
-    )
-    cli_parser.add_argument(
-        '--output_file',
-        type=str,
-        required=True,
-        help='the output json file that will contain the new fixed spans'
-    )
-    args = cli_parser.parse_args()
-    # Mapping that represents a priority for each PHI type
-    # For example, the PATIENT type will have a higher priority as
-    # compared to STAFF.
-    if len(args.ner_types) == len(args.ner_priorities):
-        ner_priorities = {ner_type: priority for ner_type, priority in zip(args.ner_types, args.ner_priorities)}
-    else:
-        raise ValueError('Length of ner_types and ner_priorities must be the same')
-    span_fixer = SpanFixer(
-        tokenizer=args.tokenizer,
-        sentencizer=args.sentencizer,
-        ner_priorities=ner_priorities
-    )
-    with open(args.output_file, 'w') as file:
-        for note in span_fixer.fix(
-                input_file=args.input_file,
-                text_key=args.text_key,
-                spans_key=args.spans_key
-        ):
-            file.write(json.dumps(note) + '\n')
-
-
-if __name__ == '__main__':
-    main()

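
For reference, a minimal sketch of how the SpanFixer class from the deleted span_fixer.py module could be driven programmatically (mirroring its own main() above). The tokenizer/sentencizer identifiers, the priority values and the file names below are illustrative assumptions, not values taken from this repository.

# Hedged sketch: programmatic use of the deleted SpanFixer class.
# The tokenizer/sentencizer names, priorities and file paths are assumptions.
import json

from ner_datasets.span_fixer import SpanFixer  # module removed in this commit

span_fixer = SpanFixer(
    tokenizer='clinical',             # assumed tokenizer name
    sentencizer='en_core_sci_sm',     # assumed sentencizer name
    ner_priorities={'PATIENT': 2, 'STAFF': 1},  # larger value appears to win when spans collide
)

# fix() streams notes from a JSONL file, realigns span boundaries to token
# boundaries and drops duplicate/overlapping spans.
with open('notes_fixed.jsonl', 'w') as out:
    for note in span_fixer.fix(input_file='notes.jsonl', text_key='text', spans_key='spans'):
        out.write(json.dumps(note) + '\n')
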
ner_datasets/span_validation.py
DELETED
@@ -1,91 +0,0 @@
-import json
-import random
-from argparse import ArgumentParser
-from typing import Union, NoReturn, Iterable, Dict, List
-
-random.seed(41)
-
-
-class SpanValidation(object):
-    """
-    This class is used to build a mapping between the note id
-    and the annotated spans in that note. This will be used during the
-    evaluation of the models. This is required to perform span level
-    evaluation.
-    """
-    @staticmethod
-    def get_spans(
-            input_file: str,
-            metadata_key: str = 'meta',
-            note_id_key: str = 'note_id',
-            spans_key: str = 'spans'
-    ):
-        """
-        Get a mapping between the note id
-        and the annotated spans in that note. This will mainly be used during the
-        evaluation of the models.
-        Args:
-            input_file (str): The input file
-            metadata_key (str): The key where the note metadata is present
-            note_id_key (str): The key where the note id is present
-            spans_key (str): The key that contains the annotated spans for a note dictionary
-        Returns:
-            (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
-            and contains the note id and annotated spans for that note
-        """
-        # Read the input files (data source)
-        for line in open(input_file, 'r'):
-            note = json.loads(line)
-            note_id = note[metadata_key][note_id_key]
-            # Store the note_id and the annotated spans
-            note[spans_key].sort(key=lambda _span: (_span['start'], _span['end']))
-            yield {'note_id': note_id, 'note_spans': note[spans_key]}
-
-
-def main() -> NoReturn:
-    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
-    cli_parser.add_argument(
-        '--input_file',
-        type=str,
-        required=True,
-        help='the jsonl file that contains the notes'
-    )
-    cli_parser.add_argument(
-        '--metadata_key',
-        type=str,
-        default='meta',
-        help='the key where the note metadata is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--note_id_key',
-        type=str,
-        default='note_id',
-        help='the key where the note id is present in the json object'
-    )
-    cli_parser.add_argument(
-        '--spans_key',
-        type=str,
-        default='spans',
-        help='the key where the annotated spans for the notes are present in the json object'
-    )
-    cli_parser.add_argument(
-        '--output_file',
-        type=str,
-        required=True,
-        help='the file where the note id and the corresponding spans for that note are to be saved'
-    )
-    args = cli_parser.parse_args()
-
-    # Write the dataset to the output file
-    with open(args.output_file, 'w') as file:
-        for span_info in SpanValidation.get_spans(
-                input_file=args.input_file,
-                metadata_key=args.metadata_key,
-                note_id_key=args.note_id_key,
-                spans_key=args.spans_key):
-            file.write(json.dumps(span_info) + '\n')
-
-
-if __name__ == "__main__":
-    main()

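
A minimal sketch of how the SpanValidation helper could be used before this deletion, matching its own main() above. The file names are placeholders, not paths from this repository.

# Hedged sketch: building the note_id -> spans mapping with the deleted
# SpanValidation helper. 'annotations.jsonl' and 'note_spans.jsonl' are placeholders.
import json

from ner_datasets.span_validation import SpanValidation  # module removed in this commit

with open('note_spans.jsonl', 'w') as out:
    for span_info in SpanValidation.get_spans(
            input_file='annotations.jsonl',
            metadata_key='meta',
            note_id_key='note_id',
            spans_key='spans'):
        # span_info is {'note_id': ..., 'note_spans': [...]}, with spans sorted by start/end
        out.write(json.dumps(span_info) + '\n')
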
sequence_tagging/.DS_Store
DELETED
Binary file (6.15 kB)
sequence_tagging/__init__.py
DELETED
@@ -1,2 +0,0 @@
-from .sequence_tagger import SequenceTagger
-__all__ = ["SequenceTagger"]

sequence_tagging/__pycache__/__init__.cpython-37.pyc
DELETED
Binary file (267 Bytes)
sequence_tagging/__pycache__/sequence_tagger.cpython-37.pyc
DELETED
Binary file (13.6 kB)
sequence_tagging/arguments/__init__.py
DELETED
@@ -1,8 +0,0 @@
-from .model_arguments import ModelArguments
-from .evaluation_arguments import EvaluationArguments
-from .data_training_arguments import DataTrainingArguments
-__all__ = [
-    "ModelArguments",
-    "DataTrainingArguments",
-    "EvaluationArguments",
-]

sequence_tagging/arguments/data_training_arguments.py
DELETED
@@ -1,115 +0,0 @@
-from typing import Optional
-from dataclasses import dataclass, field
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-    task_name: Optional[str] = field(
-        default="ner",
-        metadata={"help": "The name of the task (ner, pos...)."}
-    )
-    notation: str = field(
-        default="BIO",
-        metadata={"help": "NER notation e.g BIO"},
-    )
-    ner_types: Optional[str] = field(
-        default=None,
-        metadata={"help": "Pass a list of NER types"},
-    )
-    train_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "The input training data file (a csv or JSON file)."}
-    )
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
-    )
-    test_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
-    )
-    output_predictions_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "A location where to write the output of the test data"},
-    )
-    text_column_name: Optional[str] = field(
-        default='tokens',
-        metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
-    )
-    label_column_name: Optional[str] = field(
-        default='labels',
-        metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
-    )
-    overwrite_cache: bool = field(
-        default=False,
-        metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-    pad_to_max_length: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to pad all samples to model maximum sentence length. "
-                    "If False, will pad the samples dynamically when batching to the maximum length in the batch. "
-                    "More efficient on GPU but very bad for TPU."
-        },
-    )
-    truncation: bool = field(
-        default=True,
-        metadata={
-            "help": "Activates and controls truncation"
-        },
-    )
-    max_length: int = field(
-        default=512,
-        metadata={
-            "help": "Controls the maximum length to use by one of the truncation/padding parameters."
-        },
-    )
-    do_lower_case: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to lowercase the text"
-        },
-    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-                    "value if set."
-        },
-    )
-    max_eval_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
-                    "value if set."
-        },
-    )
-    max_predict_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
-                    "value if set."
-        },
-    )
-    label_all_tokens: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to put the label for one word on all tokens generated by that word or just on the "
-                    "first one (in which case the other tokens will have a padding index)."
-        },
-    )
-    return_entity_level_metrics: bool = field(
-        default=True,
-        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
-    )
-    token_ignore_label: str = field(
-        default='NA',
-        metadata={"help": "The label that indicates where the tokens will be ignored in loss computation. Used for "
-                          "indicating context tokens to the model"}
-    )

sequence_tagging/arguments/evaluation_arguments.py
DELETED
@@ -1,26 +0,0 @@
-from typing import Optional
-from dataclasses import dataclass, field
-
-@dataclass
-class EvaluationArguments:
-    """
-    Arguments pertaining to the evaluation process.
-    """
-    model_eval_script: Optional[str] = field(
-        default=None,
-        metadata={"help": "The script that is used for evaluation"},
-    )
-    evaluation_mode: Optional[str] = field(
-        default=None,
-        metadata={"help": "Strict or default mode for sequence evaluation"},
-    )
-    validation_spans_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "A span evaluation data file to evaluate on span level (json file). This will contain a "
-                          "mapping between the note_ids and note spans"},
-    )
-    ner_type_maps: Optional[str] = field(
-        default=None,
-        metadata={"help": "List that contains the mappings between the original NER types and another set of NER "
-                          "types. Used mainly for evaluation, to map NER token labels to another set of NER token "
-                          "labels"},
-    )

sequence_tagging/arguments/model_arguments.py
DELETED
@@ -1,43 +0,0 @@
-from typing import Optional
-from dataclasses import dataclass, field
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None,
-        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-    model_revision: str = field(
-        default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-    )
-    use_auth_token: bool = field(
-        default=False,
-        metadata={
-            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
-                    "with private models)."
-        },
-    )
-    post_process: str = field(
-        default='argmax',
-        metadata={"help": "What post processing to use on the model logits"},
-    )
-    threshold: Optional[float] = field(
-        default=None,
-        metadata={"help": "Threshold cutoff for softmax"},
-    )

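
The three dataclasses above (ModelArguments, DataTrainingArguments, EvaluationArguments) look like they were meant to be consumed by transformers' HfArgumentParser alongside TrainingArguments, as in the standard Hugging Face example scripts. The snippet below is a plausible sketch of that wiring under that assumption, not code taken from this repository.

# Hedged sketch: parsing the argument dataclasses with HfArgumentParser.
from transformers import HfArgumentParser, TrainingArguments

from sequence_tagging.arguments import (
    ModelArguments,
    DataTrainingArguments,
    EvaluationArguments,
)

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, EvaluationArguments, TrainingArguments))
model_args, data_args, eval_args, training_args = parser.parse_args_into_dataclasses()

# Each dataclass carries its own slice of the configuration.
print(model_args.model_name_or_path, data_args.notation, eval_args.evaluation_mode)
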
sequence_tagging/dataset_builder/__init__.py
DELETED
@@ -1,5 +0,0 @@
-from .ner_labels import NERLabels
-from .ner_dataset import NERDataset
-from .label_mapper import LabelMapper
-from .dataset_tokenizer import DatasetTokenizer
-__all__ = ["NERLabels", "NERDataset", "LabelMapper", "DatasetTokenizer"]

sequence_tagging/dataset_builder/dataset_tokenizer.py
DELETED
@@ -1,178 +0,0 @@
-from typing import Mapping, Sequence, List, Union, Optional, NoReturn
-from datasets import Dataset
-from transformers import PreTrainedTokenizerFast, PreTrainedTokenizer
-
-
-class DatasetTokenizer(object):
-    """
-    The main goal of this class is to solve the problem described below.
-    Most of the comments have been copied from the huggingface webpage.
-    What this class does is initialize a tokenizer with the desired parameters
-    and then tokenize our dataset and align the tokens with the labels
-    while keeping in mind the problem & solution described below. We can use this
-    both to train and for predictions - we just assume the predictions dataset
-    will have a label column filled with some values (so this code can be re-used).
-    Now we arrive at a common obstacle with using pre-trained models for
-    token-level classification: many of the tokens in the dataset may not
-    be in the tokenizer vocabulary. Bert and many models like it use a method
-    called WordPiece Tokenization, meaning that single words are split into multiple
-    tokens such that each token is likely to be in the vocabulary. For example,
-    the tokenizer would split the date (token) 2080 into the tokens ['208', '##0'].
-    This is a problem for us because we have exactly one tag per token (2080 -> B-DATE).
-    If the tokenizer splits a token into multiple sub-tokens, then we will end up with
-    a mismatch between our tokens and our labels (208, 0) - two tokens but one label (B-DATE).
-    One way to handle this is to only train on the tag labels for the first subtoken of a
-    split token. We can do this in huggingface Transformers by setting the labels
-    we wish to ignore to -100. In the example above, if the label for 2080 is B-DATE
-    and say the id (from the label to id mapping) for B-DATE is 3, we would set the labels
-    of ['208', '##0'] to [3, -100]. This tells the model to ignore the tokens labelled with
-    -100 while updating the weights etc.
-    """
-
-    def __init__(
-            self,
-            tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizer],
-            token_column: str,
-            label_column: str,
-            label_to_id: Mapping[str, int],
-            b_to_i_label: Sequence[int],
-            padding: Union[bool, str],
-            truncation: Union[bool, str],
-            is_split_into_words: bool,
-            max_length: Optional[int],
-            label_all_tokens: bool,
-            token_ignore_label: Optional[str]
-    ) -> NoReturn:
-        """
-        Set the tokenizer we are using to subword tokenize our dataset, the name of the
-        column that contains the pre-split tokens, the name of the column that contains
-        the labels for each token, and the label to id mapping.
-        Set the padding strategy of the input. Set whether to truncate the input tokens.
-        Indicate whether the input is pre-split into tokens. Set the max length of the
-        input tokens (post subword tokenization). This will be used in conjunction with truncation.
-        Set whether we want to label even the sub tokens.
-        In the description above we say for 2080 (B-DATE) - [208, ##0]
-        we do [3, -100] - which says assume the label of token 2080 is the one
-        predicted for 208, or we can label both sub tokens,
-        in which case it would be [3, 3] - so we would label 208 as DATE
-        and ##0 as DATE - and then we would have to figure out how to merge these
-        labels etc.
-        Args:
-            tokenizer (Union[PreTrainedTokenizerFast, PreTrainedTokenizer]): Tokenizer from huggingface
-            token_column (str): The column that contains the tokens in the dataset
-            label_column (str): The column that contains the labels in the dataset
-            label_to_id (Mapping[str, int]): The mapping between labels and IDs
-            b_to_i_label (Sequence[int]): For each label id, the id of the corresponding I- label
-            padding (Union[bool, str]): Padding strategy
-            truncation (Union[bool, str]): Truncation strategy
-            is_split_into_words (bool): Is the input pre-split (tokenized)
-            max_length (Optional[int]): Max subword tokenized length for the model
-            label_all_tokens (bool): Whether to label sub words
-            token_ignore_label (str): The value of the token ignore label - we ignore these in the loss computation
-        """
-        self._tokenizer = tokenizer
-        self._token_column = token_column
-        self._label_column = label_column
-        self._label_to_id = label_to_id
-        self._b_to_i_label = b_to_i_label
-        # We can tell the tokenizer that we're dealing with ready-split tokens rather than full
-        # sentence strings by passing is_split_into_words=True.
-        # Set the following parameters using the kwargs
-        self._padding = padding
-        self._truncation = truncation
-        self._is_split_into_words = is_split_into_words
-        self._max_length = max_length
-        self._label_all_tokens = label_all_tokens
-        self._token_ignore_label = token_ignore_label
-        self._ignore_label = -100
-
-    def tokenize_and_align_labels(self, dataset: Dataset) -> Dataset:
-        """
-        This function is the one that is used to read the input dataset.
-        Run the subword tokenization on the pre-split tokens and then,
-        as mentioned above, align the subtokens and labels and add the ignore
-        label. This will read the input - say [60, year, old, in, 2080] -
-        and will return the subtokens - [60, year, old, in, 208, ##0],
-        some other information like token_type_ids etc,
-        and the labels [0, 20, 20, 20, 3, -100] (0 corresponds to B-AGE, 20 corresponds to O
-        and 3 corresponds to B-DATE). This returned input serves as input for training the model
-        or for gathering predictions from a trained model.
-        Another important thing to note is that we have mentioned before that
-        we add chunks of tokens that appear before and after the current chunk for context. We would
-        also need to assign the label -100 (ignore_label) to these chunks, since we are using them
-        only to provide context. Basically if a token has the label NA, we don't use it for
-        training or evaluation. For example the input would be something
-        like tokens: [James, Doe, 60, year, old, in, 2080, BWH, tomorrow, only],
-        labels: [NA, NA, B-AGE, O, O, O, B-DATE, NA, NA, NA]. NA represents the tokens used for context.
-        This function would return some tokenizer info (e.g attention mask etc), along with
-        the information that maps the tokens to the subtokens -
-        [James, Doe, 60, year, old, in, 208, ##0, BW, ##h, tomorrow, only]
-        and the labels - [-100, -100, 0, 20, 20, 20, 3, -100, -100, -100, -100, -100]
-        (if label_all_tokens was true, we would return [-100, -100, 0, 20, 20, 20, 3, 3, -100, -100, -100, -100]).
-        Args:
-            dataset (Dataset): The pre-split (tokenized) dataset that contains labels
-        Returns:
-            tokenized_inputs (Dataset): Subword tokenized and label aligned dataset
-        """
-        # Run the tokenizer - subword tokenization
-        tokenized_inputs = self._tokenizer(
-            dataset[self._token_column],
-            padding=self._padding,
-            truncation=self._truncation,
-            max_length=self._max_length,
-            is_split_into_words=self._is_split_into_words,
-        )
-        # Align the subwords and tokens
-        labels = [self.__get_labels(
-            labels,
-            tokenized_inputs.word_ids(batch_index=index)
-        ) for index, labels in enumerate(dataset[self._label_column])]
-        tokenized_inputs[self._label_column] = labels
-
-        return tokenized_inputs
-
-    def __get_labels(
-            self,
-            labels: Sequence[str],
-            word_ids: Sequence[int]
-    ) -> List[int]:
-        """
-        Go through the subword tokens - which are given as word_ids. Two different tokens,
-        2080 & John, will have different word_ids, but the subword tokens 208 & ##0 will
-        have the same word_id; we use this to align and assign the labels accordingly.
-        If the subword tokens belong to [CLS] or [SEP], append the ignore label (-100) to the
-        list of labels. If the subword token (##0) belongs to a token - 2080 -
-        then the labels would be [3, -100] if label_all_tokens is false. Also, if the token
-        is used only for context (with label NA), it would get the value -100 for its label.
-        Args:
-            labels (Sequence[str]): The list of labels for the input (example)
-            word_ids (Sequence[int]): The word_ids after subword tokenization of the input
-        Returns:
-            label_ids (List[int]): The list of label ids for the input with the ignore label (-100) added
-            as required.
-        """
-        label_ids = list()
-        previous_word_idx = None
-        for word_idx in word_ids:
-            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
-            # ignored in the loss function.
-            if word_idx is None:
-                label_ids.append(self._ignore_label)
-            # We set the label for the first token of each word.
-            elif word_idx != previous_word_idx:
-                if labels[word_idx] == self._token_ignore_label:
-                    label_ids.append(self._ignore_label)
-                else:
-                    label_ids.append(self._label_to_id[labels[word_idx]])
-            # For the other tokens in a word, we set the label to either the current label or -100, depending on
-            # the label_all_tokens flag.
-            else:
-                if labels[word_idx] == self._token_ignore_label:
-                    label_ids.append(self._ignore_label)
-                else:
-                    label_ids.append(
-                        self._b_to_i_label[self._label_to_id[labels[word_idx]]]
-                        if self._label_all_tokens else self._ignore_label
-                    )
-            previous_word_idx = word_idx
-        return label_ids

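
A minimal sketch of how DatasetTokenizer could be wired up with a fast Hugging Face tokenizer and a toy dataset to reproduce the label-alignment behaviour described in the docstrings above. The model name, label set, b_to_i_label construction and toy example are assumptions for illustration.

# Hedged sketch: subword tokenization + label alignment with the deleted DatasetTokenizer.
from datasets import Dataset
from transformers import AutoTokenizer

from sequence_tagging.dataset_builder import DatasetTokenizer, NERLabels

ner_labels = NERLabels(notation='BIO', ner_types=['AGE', 'DATE'])
label_to_id = ner_labels.get_label_to_id()  # {'B-AGE': 0, 'B-DATE': 1, 'I-AGE': 2, 'I-DATE': 3, 'O': 4}
label_list = ner_labels.get_label_list()
# For each label id, the id of the matching I- label (used when label_all_tokens=True)
b_to_i_label = [
    label_to_id['I' + label[1:]] if label.startswith('B-') else idx
    for idx, label in enumerate(label_list)
]

dataset_tokenizer = DatasetTokenizer(
    tokenizer=AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=True),  # assumed model
    token_column='tokens',
    label_column='labels',
    label_to_id=label_to_id,
    b_to_i_label=b_to_i_label,
    padding=False,
    truncation=True,
    is_split_into_words=True,
    max_length=512,
    label_all_tokens=False,
    token_ignore_label='NA',
)

toy = Dataset.from_dict({
    'tokens': [['60', 'year', 'old', 'in', '2080']],
    'labels': [['B-AGE', 'O', 'O', 'O', 'B-DATE']],
})
# tokenize_and_align_labels works on batches, so it is applied with batched=True
encoded = toy.map(dataset_tokenizer.tokenize_and_align_labels, batched=True)
print(encoded[0]['labels'])  # sub-tokens of a split word carry -100 unless label_all_tokens=True
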
sequence_tagging/dataset_builder/label_mapper.py
DELETED
@@ -1,87 +0,0 @@
-from typing import List, Sequence, Mapping, Optional, NoReturn, Dict, Union
-from .ner_labels import NERLabels
-
-
-class LabelMapper(object):
-    """
-    This class is used to map one set of NER labels to another set of NER labels.
-    For example, we might want to map all NER labels to binary HIPAA labels.
-    E.g:
-    We change the token labels - [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] to
-    [B-HIPAA, O, O, U-HIPAA, B-HIPAA, I-HIPAA, O, O, O, O] or, if we wanted binary I2B2 labels, we map it to
-    [B-I2B2, O, O, U-I2B2, B-I2B2, L-I2B2, O, B-I2B2, I-I2B2, L-I2B2].
-    We do this mapping at the token and the span level. That is, if we have a span from, say, start=9, end=15
-    labelled as LOC, we map this label to HIPAA or I2B2. This class maps an existing set of labels to
-    another set of labels.
-    """
-
-    def __init__(
-            self,
-            notation,
-            ner_types: Sequence[str],
-            ner_types_maps: Sequence[str],
-            description: str
-    ) -> NoReturn:
-        """
-        Initialize the variables that will be used to map the NER labels and spans.
-        The ner_map and spans_map should correspond to each other and contain the same NER types.
-        Args:
-            notation: The NER notation (e.g BIO)
-            ner_types (Sequence[str]): The original NER types
-            ner_types_maps (Sequence[str]): The NER type each original type is mapped to
-            description (str): A description of the mapping
-        """
-        self._description = description
-        self._types = list(set(ner_types_maps))
-        self._types.sort()
-        self._spans_map = {ner_type: ner_type_map for ner_type, ner_type_map in zip(ner_types, ner_types_maps)}
-        ner_labels = NERLabels(notation=notation, ner_types=ner_types)
-        self._ner_map = dict()
-        for label in ner_labels.get_label_list():
-            if label == 'O' or self._spans_map[label[2:]] == 'O':
-                self._ner_map[label] = 'O'
-            else:
-                self._ner_map[label] = label[0:2] + self._spans_map[label[2:]]
-
-    def map_sequence(self, tag_sequence: Sequence[str]) -> List[str]:
-        """
-        Map a sequence of NER labels to another set of NER labels.
-        E.g: If we use a binary HIPAA mapping,
-        this sequence [B-AGE, O, O, U-LOC, B-DATE, L-DATE, O, B-STAFF, I-STAFF, L-STAFF] will be mapped to
-        [B-HIPAA, O, O, U-HIPAA, B-HIPAA, I-HIPAA, O, O, O, O].
-        Return the original sequence if no mapping is used (i.e the maps are == None).
-        Args:
-            tag_sequence (Sequence[str]): A sequence of NER labels
-        Returns:
-            (List[str]): A mapped sequence of NER labels
-        """
-        # Return the original sequence if no mapping is used
-        return [self._ner_map[tag] for tag in tag_sequence]
-
-    def map_spans(self, spans: Sequence[Mapping[str, Union[str, int]]]) -> Sequence[Dict[str, Union[str, int]]]:
-        """
-        Map a sequence of NER spans to another set of NER spans.
-        E.g: If we use a binary HIPAA mapping,
-        the spans: [{start:0, end:5, label: DATE}, {start:17, end:25, label: STAFF}, {start:43, end:54, label: PATIENT}]
-        will be mapped to: [{start:0, end:5, label: HIPAA}, {start:17, end:25, label: O}, {start:43, end:54, label: HIPAA}].
-        Return the original list of spans if no mapping is used (i.e the maps are == None).
-        Args:
-            spans (Sequence[Mapping[str, Union[str, int]]]): A sequence of NER spans
-        Returns:
-            (Sequence[Mapping[str, Union[str, int]]]): A mapped sequence of NER spans
-        """
-        return [{'start': span['start'], 'end': span['end'], 'label': self._spans_map[span['label']]}
-                for span in spans]
-
-    def get_ner_description(self) -> str:
-        """
-        Get the description of the NER label and span maps used.
-        Returns:
-            (str): A description of the label/span maps used
-        """
-        return self._description
-
-    def get_ner_types(self) -> List[str]:
-        """
-        Get the PHI types back from the list of NER labels
-        [B-AGE, I-AGE, B-DATE, I-DATE ..] ---> [AGE, DATE, ...]
-        Returns:
-            ner_types (List[str]): The list of unique NER types
-        """
-        return self._types

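
A minimal sketch of using LabelMapper to collapse fine-grained PHI types into a binary HIPAA scheme, as its docstrings describe. The type lists and the choice to map STAFF to 'O' are illustrative assumptions.

# Hedged sketch: mapping fine-grained NER labels to a binary HIPAA scheme.
from sequence_tagging.dataset_builder import LabelMapper

ner_types = ['AGE', 'DATE', 'LOC', 'STAFF', 'PATIENT']
# STAFF mapped to 'O' (assumed not protected in this illustrative scheme), the rest to HIPAA
ner_types_maps = ['HIPAA', 'HIPAA', 'HIPAA', 'O', 'HIPAA']

mapper = LabelMapper(
    notation='BIO',
    ner_types=ner_types,
    ner_types_maps=ner_types_maps,
    description='binary HIPAA mapping (illustrative)',
)

print(mapper.map_sequence(['B-AGE', 'O', 'I-DATE', 'B-STAFF']))
# -> ['B-HIPAA', 'O', 'I-HIPAA', 'O']
print(mapper.map_spans([{'start': 0, 'end': 5, 'label': 'DATE'}]))
# -> [{'start': 0, 'end': 5, 'label': 'HIPAA'}]
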
sequence_tagging/dataset_builder/ner_dataset.py
DELETED
@@ -1,102 +0,0 @@
-from typing import Sequence, Optional, NoReturn
-
-from datasets import load_dataset, Dataset
-
-
-class NERDataset(object):
-    """
-    This class is a wrapper around the huggingface datasets library.
-    It maintains the train, validation and test datasets based on the
-    train, validation and test files passed, by loading the dataset object
-    from the files, and provides a get function to access each of the datasets.
-    """
-
-    def __init__(
-            self,
-            train_file: Optional[Sequence[str]] = None,
-            validation_file: Optional[Sequence[str]] = None,
-            test_file: Optional[Sequence[str]] = None,
-            extension: str = 'json',
-            shuffle: bool = True,
-            seed: int = 41
-    ) -> NoReturn:
-        """
-        Load the train, validation and test datasets from the files passed. Read the files and convert
-        them into a huggingface dataset.
-        Args:
-            train_file (Optional[Sequence[str]]): The list of files that contain train data
-            validation_file (Optional[Sequence[str]]): The list of files that contain validation data
-            test_file (Optional[Sequence[str]]): The list of files that contain test data
-            extension (str): The format of the data files (e.g json, csv)
-            shuffle (bool): Whether to shuffle the dataset
-            seed (int): Shuffle seed
-        """
-        self._datasets = NERDataset.__prepare_data(
-            train_file,
-            validation_file,
-            test_file,
-            extension,
-            shuffle,
-            seed
-        )
-
-    @staticmethod
-    def __prepare_data(
-            train_file: Optional[Sequence[str]],
-            validation_file: Optional[Sequence[str]],
-            test_file: Optional[Sequence[str]],
-            extension: str,
-            shuffle: bool,
-            seed: int
-    ) -> Dataset:
-        """
-        Get the train, validation and test datasets from the files passed. Read the files and convert
-        them into a huggingface dataset.
-        Args:
-            train_file (Optional[Sequence[str]]): The list of files that contain train data
-            validation_file (Optional[Sequence[str]]): The list of files that contain validation data
-            test_file (Optional[Sequence[str]]): The list of files that contain test data
-            extension (str): The format of the data files (e.g json, csv)
-            shuffle (bool): Whether to shuffle the dataset
-            seed (int): Shuffle seed
-        Returns:
-            (Dataset): The huggingface dataset with train, validation, test splits (if included)
-        """
-        # Read the datasets (train, validation, test etc).
-        data_files = {}
-        if train_file is not None:
-            data_files['train'] = train_file
-        if validation_file is not None:
-            data_files['validation'] = validation_file
-        if test_file is not None:
-            data_files['test'] = test_file
-        # Shuffle the dataset
-        if shuffle:
-            datasets = load_dataset(extension, data_files=data_files).shuffle(seed=seed)
-        else:
-            # Don't shuffle the dataset
-            datasets = load_dataset(extension, data_files=data_files)
-        return datasets
-
-    def get_train_dataset(self) -> Dataset:
-        """
-        Return the train dataset.
-        Returns:
-            (Dataset): The huggingface dataset - train split
-        """
-        return self._datasets['train']
-
-    def get_validation_dataset(self) -> Dataset:
-        """
-        Return the validation dataset.
-        Returns:
-            (Dataset): The huggingface dataset - validation split
-        """
-        return self._datasets['validation']
-
-    def get_test_dataset(self) -> Dataset:
-        """
-        Return the test dataset.
-        Returns:
-            (Dataset): The huggingface dataset - test split
-        """
-        return self._datasets['test']

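
A minimal sketch of loading splits through the NERDataset wrapper above. The file names are placeholders.

# Hedged sketch: loading train/validation/test JSON files through the deleted NERDataset wrapper.
from sequence_tagging.dataset_builder import NERDataset

ner_dataset = NERDataset(
    train_file=['train.json'],            # placeholder paths
    validation_file=['validation.json'],
    test_file=['test.json'],
    extension='json',
    shuffle=True,
    seed=41,
)

train_split = ner_dataset.get_train_dataset()
print(train_split.column_names, len(train_split))
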
sequence_tagging/dataset_builder/ner_labels.py
DELETED
@@ -1,67 +0,0 @@
-from typing import Sequence, List, NoReturn, Dict
-
-
-class NERLabels(object):
-    """
-    Prepare the labels that will be used by the model. Parse the NER types
-    and prepare the NER labels. For example - NER types: [AGE, DATE] -
-    it will create a list like this (for BIO notation): [B-AGE, I-AGE, B-DATE, I-DATE, O].
-    These are the labels that will be assigned to the tokens based on the PHI type.
-    Say we had the following NER types: NAME, AGE, HOSP.
-    The NER labels in the BIO notation would be B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O.
-    This script creates a list of the NER labels ([B-AGE, B-HOSP, B-NAME, I-AGE, I-HOSP, I-NAME, O])
-    based on the NER types (NAME, AGE, HOSP) that have been defined. Labels are sorted.
-    The script also returns the number of labels, the label_to_id mapping and the id_to_label mapping.
-    label_to_id mapping: {B-AGE: 0, B-HOSP: 1, B-NAME: 2, I-AGE: 3, I-HOSP: 4, I-NAME: 5, O: 6}
-    This information will be used during training, evaluation and prediction.
-    """
-
-    def __init__(self, notation: str, ner_types: Sequence[str]) -> NoReturn:
-        """
-        Initialize the notation that we are using for the NER task.
-        Args:
-            notation (str): The notation that will be used for the NER labels
-            ner_types (Sequence[str]): The list of NER categories
-        """
-        self._notation = notation
-        self._ner_types = ner_types
-
-    def get_label_list(self) -> List[str]:
-        """
-        Given the NER types, return the NER labels.
-        NER types: [AGE, DATE] -> return a list like this (for BIO notation): [B-AGE, I-AGE, B-DATE, I-DATE, O]
-        Returns:
-            ner_labels (List[str]): The list of NER labels based on the NER notation (e.g BIO)
-        """
-        # Add the 'O' (Outside - non-PHI) label to the list
-        if 'O' not in self._ner_types:
-            ner_labels = ['O']
-        else:
-            ner_labels = list()
-        # Go through each label and prefix it based on the notation (e.g - B, I etc)
-        for ner_type in self._ner_types:
-            for ner_tag in list(self._notation):
-                if ner_tag != 'O':
-                    ner_labels.append(ner_tag + '-' + ner_type)
-        ner_labels.sort()
-        return ner_labels
-
-    def get_label_to_id(self) -> Dict[str, int]:
-        """
-        Return a label to id mapping.
-        Returns:
-            label_to_id (Dict[str, int]): label to id mapping
-        """
-        labels = self.get_label_list()
-        label_to_id = {label: index_id for index_id, label in enumerate(labels)}
-        return label_to_id
-
-    def get_id_to_label(self) -> Dict[int, str]:
-        """
-        Return an id to label mapping.
-        Returns:
-            id_to_label (Dict[int, str]): id to label mapping
-        """
-        labels = self.get_label_list()
-        id_to_label = {index_id: label for index_id, label in enumerate(labels)}
-        return id_to_label

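
A minimal sketch of what the NERLabels helper produces for a small, assumed set of PHI types, following its own docstring example.

# Hedged sketch: generating BIO labels and the label/id mappings with NERLabels.
from sequence_tagging.dataset_builder import NERLabels

ner_labels = NERLabels(notation='BIO', ner_types=['AGE', 'DATE', 'HOSP'])
print(ner_labels.get_label_list())
# -> ['B-AGE', 'B-DATE', 'B-HOSP', 'I-AGE', 'I-DATE', 'I-HOSP', 'O']
print(ner_labels.get_label_to_id())
# -> {'B-AGE': 0, 'B-DATE': 1, 'B-HOSP': 2, 'I-AGE': 3, 'I-DATE': 4, 'I-HOSP': 5, 'O': 6}
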