Gurveer05 committed on
Commit
bf1f674
•
1 Parent(s): 60e7251

Added pred func

config.yaml ADDED
@@ -0,0 +1,96 @@
+ # Project-wide configuration settings
+
+ # Variables for train-test-split
+ TRAIN_SIZE: 0.7
+
+ # General parameters
+ max_len: 1000
+ num_tissues: 8
+ expressed_threshold: 0.1
+ random_seed: 766
+
+ dnabert:
+   max_seq_len: 512
+   kmer: 6
+   test_size: 0.2
+ tokenizer:
+   vocab_size: 5000
+ data:
+   max_seq_len: 1000
+   test_size: 0.2
+   num_labels: 8
+ training:
+   pretrain:
+     num_train_epochs: 3
+     per_device_train_batch_size: 64
+     per_device_eval_batch_size: 64
+     fp16: true
+     logging_steps: 50
+     eval_steps: 200
+     save_steps: 100
+     save_total_limit: 20
+     gradient_accumulation_steps: 25
+     learning_rate: 1.e-4
+     weight_decay: 0
+     adam_epsilon: 1.e-8
+     max_grad_norm: 10
+     warmup_steps: 50
+     optimizer: "lamb"
+     scheduler: "linear"
+     mlm_prob: 0.15
+   finetune:
+     # num_train_epochs: 10
+     num_train_epochs: 3
+     per_device_train_batch_size: 64
+     per_device_eval_batch_size: 8
+     fp16: true
+     logging_steps: 50
+     eval_steps: 500
+     save_steps: 500
+     save_total_limit: 10
+     # gradient_accumulation_steps: 1
+     gradient_accumulation_steps: 10
+     eval_accumulation_steps: 64
+     learning_rate: 1.e-3
+     # learning_rate: 1.e-1
+     # lr: 1.e-3
+     betas:
+       - 0.9
+       - 0.999
+     eps: 1.e-8
+     weight_decay: 0
+     adam_epsilon: 1.e-8
+     max_grad_norm: 10
+     warmup_steps: 200
+     num_cooldown_steps: 2000
+     optimizer: "lamb"
+     # optimizer: "adamw"
+     # scheduler: "delay"
+     scheduler: "constant"
+     # num_param_groups: 0
+     # param_group_size: 2  # Except for the classification head, which has param_group_size == 1
+     delay_size: 0
+ models:
+   roberta-base:
+     num_attention_heads: 6
+     num_hidden_layers: 6
+     type_vocab_size: 1
+     block_size: 258
+     max_tokenized_len: 256
+   roberta-lm: {}
+   roberta-pred: {}
+   roberta-pred-mean-pool:
+     hidden_dropout_prob: 0.2
+     output_mode: "regression"
+     # For sparse (bce + mse) loss
+     # output_mode: "sparse"
+     threshold: 1
+     alpha: 0.1
+   dnabert-base:
+     block_size: 512
+     max_tokenized_len: 510
+   dnabert-lm: {}
+   dnabert-pred: {}
+   dnabert-pred-mean-pool:
+     hidden_dropout_prob: 0.2
+     output_mode: "regression"
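These nested settings are what module/config.py loads and module/utils.py indexes into. As a quick illustration, a minimal sketch of reading a few of the values above with PyYAML (assuming the file is opened from the repository root):

import yaml

# Load the project configuration shown above
with open("config.yaml") as f:
    settings = yaml.full_load(f)

# Nested lookups mirror the YAML indentation
print(settings["data"]["num_labels"])                                # 8
print(settings["training"]["finetune"]["learning_rate"])             # 0.001 (parsed from 1.e-3)
print(settings["models"]["roberta-pred-mean-pool"]["output_mode"])   # regression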
data/.gitkeep ADDED
File without changes
data/test.txt ADDED
@@ -0,0 +1 @@
+ CTCAAGCTGAGCAGTGGGTTTGCTCTGGAGGGGAAGCTCAACGGTGGCGACAAGGAAGAATCTGCTTGCGAGGCGAGCCCTGACGCCGCTGATAGCGACCAAAGGTGGATTAAACAACCCATTTCATCATTCTTCTTCCTTGTTAGTTATGATTCCCACGCTTGCCTTTCATGAATCATGATCCTATATGTATATTGATATTAATCAGTTCTAGAAAGTTCAACAACATTTGAGCATGTCAAAACCTGATCGTTGCCTGTTCCATGTCAACAGTGGATTATAACACGTGCAAATGTAGCTATTTGTGTGAGAAGACGTGTGATCGACTCTTTTTTTATATAGATAGCATTGAGATCAACTGTTTGTATATATCTTGTCATAACATTTTTACTTCGTAGCAACGTACGAGCGTTCACCTATTTGTATATAAGTTATCATGATATTTATAAGTTACCGTTGCAACGCACGGACACTCACCTAGTATAGTTTATGTATTACAGTACTAGGAGCCCTAGGCTTCCAATAACTAGAAAAAGTCCTGGTCAGTCGAACCAAACCACAATCCGACGTATACATTCTGGTTCCCCCACGCCCCCATCCGTTCGATTCA
models/.gitkeep ADDED
File without changes
{byte-level-bpe-tokenizer → models/byte-level-bpe-tokenizer}/merges.txt RENAMED
File without changes
{byte-level-bpe-tokenizer → models/byte-level-bpe-tokenizer}/vocab.json RENAMED
File without changes
{transformer → models/transformer}/language-model/config.json RENAMED
File without changes
{transformer → models/transformer}/language-model/pytorch_model.bin RENAMED
File without changes
{transformer → models/transformer}/language-model/training_args.bin RENAMED
File without changes
{transformer → models/transformer}/prediction-model/config.json RENAMED
File without changes
{transformer → models/transformer}/prediction-model/pytorch_model.bin RENAMED
File without changes
{transformer → models/transformer}/prediction-model/training_args.bin RENAMED
File without changes
module/.gitkeep ADDED
File without changes
module/__pycache__/config.cpython-311.pyc ADDED
Binary file (1.78 kB)
module/__pycache__/dataio.cpython-311.pyc ADDED
Binary file (6.98 kB)
module/__pycache__/metrics.cpython-311.pyc ADDED
Binary file (3.01 kB)
module/__pycache__/models.cpython-311.pyc ADDED
Binary file (17.7 kB)
module/__pycache__/transformers_utility.cpython-311.pyc ADDED
Binary file (4.02 kB)
module/__pycache__/utils.cpython-311.pyc ADDED
Binary file (12.6 kB)
module/config.py ADDED
@@ -0,0 +1,53 @@
+ from pathlib import Path
+ import yaml
+ import random
+
+ import numpy as np
+ import torch
+
+
+ root = Path(__file__).parent.parent
+ data = root / 'data'
+ models = root / 'models'
+ notebooks = root / 'notebooks'
+ scripts = root / 'scripts'
+ output = root / 'output'
+ docs = root / 'docs'
+
+ # Data specific paths
+ data_raw = data / 'raw'
+ data_processed = data / 'processed'
+ data_final = data / 'final'
+
+ # Location of tools
+ libs = root / 'libs'
+ samtools = libs / 'samtools'
+ bedtools = libs / 'bedtools'
+ dnabert = root / 'DNABERT'
+
+ # Locations of specific files
+ bpe_tokenizer = data_final / 'tokenizer' / 'maize_bpe_full.tokenizer.json'
+
+ # Loading settings
+ settings = yaml.full_load((root / 'config.yaml').open('r'))
+
+ # Setting random seeds across the whole project
+ random_seed = settings['random_seed']
+ random.seed(random_seed)
+ np.random.seed(random_seed)
+ torch.manual_seed(random_seed)
+
+
+ def reload_settings():
+     global settings
+     settings = yaml.full_load((root / 'config.yaml').open('r'))
+
+
+ # Tissue names; prediction.py uses these as the output column headers
+ tissues = [
+     'tassel',
+     'base',
+     'anther',
+     'middle',
+     'ear',
+     'shoot',
+     'tip',
+     'root'
+ ]
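Because paths, settings, and seeds are resolved at import time, downstream code only needs to import the module. A small usage sketch (assuming the repository root is on sys.path so that `module` is importable):

from module import config

# Paths are pathlib.Path objects anchored at the repository root
print(config.models / "transformer" / "prediction-model")

# Settings come straight from config.yaml; the random seeds were already applied on import
print(config.settings["random_seed"])   # 766
print(config.tissues)                   # ['tassel', 'base', ..., 'root']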
module/dataio.py ADDED
@@ -0,0 +1,138 @@
+ """ Utilities for reading and writing data files.
+ """
+ import multiprocessing as mp
+ import os
+ from pathlib import PosixPath
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+ from datasets import load_dataset
+ from torch.utils.data import Dataset
+
+ from transformers import (
+     DataCollatorForLanguageModeling,
+     PreTrainedTokenizer,
+     default_data_collator,
+ )
+
+ from . import config
+
+ # To avoid huggingface warning
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ UBUNTU_ROOT = str(config.root)
+
+ def load_datasets(
+     tokenizer: PreTrainedTokenizer,
+     train_data: Union[str, PosixPath],
+     eval_data: Optional[Union[str, PosixPath]] = None,
+     test_data: Optional[Union[str, PosixPath]] = None,
+     file_type: str = "csv",
+     delimiter: str = "\t",
+     seq_key: str = "sequence",
+     shuffle: bool = True,
+     filter_empty: bool = False,
+     n_workers: int = mp.cpu_count(),
+     **kwargs,
+ ) -> Dataset:
+     """Load and cache data using the Huggingface datasets library.
+
+     Args:
+         tokenizer (PreTrainedTokenizer): tokenizer to apply to the sequences
+         train_data (Union[str, PosixPath]): location of training data
+         eval_data (Union[str, PosixPath], optional): location of evaluation data. Defaults to None.
+         test_data (Union[str, PosixPath], optional): location of test data. Defaults to None.
+         file_type (str, optional): type of file. Possible values are 'text' and 'csv'. Defaults to 'csv'.
+         delimiter (str, optional): Defaults to '\t'.
+         seq_key (str, optional): column name of the sequence data. Can be 'sequence', 'seq', or 'text'. Defaults to 'sequence'.
+         shuffle (bool, optional): whether to shuffle the dataset. Defaults to True.
+         filter_empty (bool, optional): whether to filter out empty sequences. Defaults to False.
+             NOTE: This completes an additional iteration, which can be time-consuming.
+             Only enable if you have reason to believe that preprocessing steps will
+             result in empty sequences.
+         transformation (str, optional): type of transformation to apply.
+             Options are 'log', 'boxcox'. Defaults to None.
+         log_offset (Union[float, int]): value to offset gene expression values
+             by before log transforming. Defaults to 1.
+         preprocessor (BaseEstimator): preprocessor for the Yeo-Johnson transformation.
+         tissue_subset (Union[str, int, list], optional): tissues to subset labels to.
+             Defaults to None.
+         nshards (int, optional): number of shards to divide data into, only
+             keeping the first. Defaults to None.
+         threshold (float, optional): filter out rows where all labels are
+             below `threshold`. OR if `discretize` is True, see `discretize`.
+             Defaults to None.
+         discretize (bool, optional): set gene expression values below
+             `threshold` to 0, above `threshold` to 1.
+         kmer (int, optional): whether to run the kmer flip experiment and if so,
+             how large kmers to flip. Defaults to None.
+         n_workers (int, optional): number of processes to use for preprocessing.
+             Defaults to `mp.cpu_count()` (number of available CPUs).
+         position_buckets (Tuple[int], optional): the different buckets for the bucketed
+             positional importance experiment
+
+     Returns:
+         Dataset
+     """
+     data_files = {"train": str(train_data)}
+     if eval_data:
+         data_files["eval"] = str(eval_data)
+     if test_data:
+         data_files["test"] = str(test_data)
+     if file_type == "csv":
+         kwargs.update({"delimiter": delimiter})
+     datasets = load_dataset(file_type, data_files=data_files, **kwargs)
+     # Tokenizing
+     preprocess_fn = make_preprocess_function(tokenizer, seq_key=seq_key)
+     # print("Tokenizing...")
+     datasets = datasets.map(preprocess_fn, batched=True, num_proc=n_workers)
+     if filter_empty:
+         datasets = datasets.filter(filter_empty_sequence)
+     if shuffle:
+         seed = config.settings["random_seed"]
+         datasets = datasets.shuffle(seeds={"train": seed, "eval": seed, "test": seed})
+     return datasets
+
+
+ def make_preprocess_function(tokenizer, seq_key: str = "sequence") -> callable:
+     """Make a preprocessing function that selects the appropriate column and
+     tokenizes it.
+
+     Args:
+         tokenizer (PreTrainedTokenizer): tokenizer to apply to each sequence
+         seq_key (str, optional): column name of the text data. Defaults to 'sequence'.
+
+     Returns:
+         callable: preprocessing function
+     """
+
+     def preprocess_function(examples):
+         if seq_key:
+             seqs = examples[seq_key]
+         else:
+             seqs = examples
+         return tokenizer(
+             seqs,
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+             padding="max_length",
+         )
+
+     return preprocess_function
+
+ def filter_empty_sequence(example: dict) -> bool:
+     """Filter out empty sequences."""
+     # sum(example['attention_mask']) gives the number of tokens, including SOS and EOS
+     return sum(example["attention_mask"]) > 2
+
+ def load_data_collator(model_type: str, tokenizer=None, mlm_prob=None):
+     if model_type == "language-model":
+         assert (
+             tokenizer is not None
+         ), "tokenizer must not be None if model is type language-model"
+         assert (
+             mlm_prob is not None
+         ), "mlm_prob must not be None if model is type language-model"
+
+         return DataCollatorForLanguageModeling(
+             tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob
+         )
+     else:
+         return default_data_collator
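For context, a hedged sketch of how `load_datasets` is called for plain sequence files in this commit (the tokenizer directory is the one renamed to models/byte-level-bpe-tokenizer above; `model_max_length` is set here only to keep padding bounded):

from transformers import RobertaTokenizerFast
from module import config, dataio

tokenizer = RobertaTokenizerFast.from_pretrained(
    str(config.models / "byte-level-bpe-tokenizer"), model_max_length=256
)

# data/test.txt is a one-column text file: each line is a raw DNA sequence
datasets = dataio.load_datasets(
    tokenizer,
    config.data / "test.txt",
    seq_key="text",
    file_type="text",
    shuffle=False,
)
print(datasets["train"].column_names)  # e.g. ['text', 'input_ids', 'attention_mask']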
module/metrics.py ADDED
@@ -0,0 +1,45 @@
+ """Reusable metrics functions for evaluating models
+ """
+ import multiprocessing as mp
+ from typing import List
+
+ import torch
+ from torch.utils.data import DataLoader
+ from transformers import default_data_collator
+ from tqdm import tqdm
+
+ def get_predictions(
+     model: torch.nn.Module,
+     dataset: torch.utils.data.Dataset,
+ ) -> List:
+     """Compute model predictions for `dataset`.
+
+     Args:
+         model (torch.nn.Module): Model to evaluate
+         dataset (torch.utils.data.Dataset): Dataset to get predictions for
+
+     Returns:
+         List: predicted values for each example, rounded to 4 decimal places
+     """
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     model.eval()
+     loader = DataLoader(
+         dataset,
+         batch_size=64,
+         collate_fn=default_data_collator,
+         drop_last=False,
+         num_workers=mp.cpu_count(),
+     )
+     pred_labels = []
+     for batch in tqdm(loader):
+         inputs = {k: batch[k].to(device) for k in ["attention_mask", "input_ids"]}
+         with torch.no_grad():
+             outputs = model(**inputs)
+         del inputs  # to free up space on GPU
+         logits = outputs[0]
+         # Collect one row of rounded predictions per example in the batch
+         pred_labels.extend([[round(e, 4) for e in row] for row in logits.cpu().tolist()])
+
+     return pred_labels
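To make the expected inputs concrete, here is a toy, self-contained sketch (dummy model and data, not part of this commit): `get_predictions` only needs a dataset whose items carry "input_ids" and "attention_mask" and a model whose first output is a logits tensor:

import torch
from module import metrics

class DummyModel(torch.nn.Module):
    def forward(self, input_ids=None, attention_mask=None):
        # Return a (logits,) tuple with one value per "tissue"
        return (torch.zeros(input_ids.shape[0], 8),)

# A list of feature dicts is enough for DataLoader + default_data_collator
dataset = [
    {"input_ids": torch.zeros(16, dtype=torch.long),
     "attention_mask": torch.ones(16, dtype=torch.long)}
    for _ in range(4)
]

preds = metrics.get_predictions(DummyModel(), dataset)
print(len(preds), len(preds[0]))  # 4 rows, 8 values each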
module/models.py ADDED
@@ -0,0 +1,441 @@
+ """
+ Modified HuggingFace transformer model classes
+ """
+ from typing import Tuple
+
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import BCELoss, BCEWithLogitsLoss, MSELoss, PoissonNLLLoss, KLDivLoss
+
+ from transformers import BertConfig, BertModel, RobertaConfig, RobertaModel
+ from transformers import BertPreTrainedModel
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from transformers import RobertaPreTrainedModel
+
+
+ class RobertaMeanPoolConfig(RobertaConfig):
+     model_type = "roberta"
+
+     def __init__(
+         self,
+         output_mode="regression",
+         freeze_base=True,
+         start_token_idx=0,
+         end_token_idx=1,
+         threshold=1,
+         alpha=0.5,
+         log_offset=1,
+         batch_norm=False,
+         **kwargs,
+     ):
+         """Constructs RobertaConfig."""
+         super().__init__(**kwargs)
+         self.output_mode = output_mode
+         self.freeze_base = freeze_base
+         self.start_token_idx = start_token_idx
+         self.end_token_idx = end_token_idx
+         self.threshold = threshold
+         self.alpha = alpha
+         self.log_offset = log_offset
+         self.batch_norm = batch_norm
+
+
+ class ClassificationHeadMeanPool(nn.Module):
+     """Head for sentence-level classification tasks.
+
+     Modifications:
+     1. Using mean-pooling over tokens instead of CLS token
+     2. Multi-output regression
+     """
+
+     def __init__(self, config: RobertaMeanPoolConfig):
+         super().__init__()
+         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dense2 = nn.Linear(config.hidden_size, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+         self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+         self.start_token_idx = config.start_token_idx
+         self.end_token_idx = config.end_token_idx
+         self.batch_norm = (
+             nn.BatchNorm1d(config.hidden_size) if config.batch_norm else None
+         )
+         if self.batch_norm is not None:
+             print("Using batch_norm")
+
+     def forward(self, features, attention_mask=None, input_ids=None, **kwargs):
+         x = self.embed(features, attention_mask, input_ids, **kwargs)
+         x = self.out_proj(x)
+         return x
+
+     def embed(self, features, attention_mask=None, input_ids=None, **kwargs):
+         attention_mask[input_ids == self.start_token_idx] = 0
+         attention_mask[input_ids == self.end_token_idx] = 0
+         x = torch.sum(features * attention_mask.unsqueeze(2), dim=1) / torch.sum(
+             attention_mask, dim=1, keepdim=True
+         )  # Mean pooling over non-padding tokens
+
+         x = self.dropout(x)
+         x = self.dense(x)
+         x = torch.tanh(x)
+         x = self.dropout(x)
+
+         # Batchnorm
+         x = self.normalize(x)
+
+         # Second linear layer
+         x = self.dense2(x)
+         x = torch.tanh(x)
+         return x
+
+     def normalize(self, x: torch.Tensor) -> torch.Tensor:
+         if self.batch_norm is not None:
+             return self.batch_norm(x)
+         return x
+
+
+ class ClassificationHeadMeanPoolSparse(nn.Module):
+     """Classification head that predicts binary outcome (expressed/not)
+     and real-valued gene expression values.
+     """
+
+     def __init__(self, config):
+         super().__init__()
+         self.classification_head = ClassificationHeadMeanPool(config)
+         self.regression_head = ClassificationHeadMeanPool(config)
+
+     def forward(
+         self, features, attention_mask=None, input_ids=None, **kwargs
+     ) -> Tuple[torch.Tensor]:
+         """Compute binarized logits and real-valued gene expressions for each tissue.
+
+         Args:
+             features (torch.Tensor): outputs of RoBERTa
+             attention_mask (Optional[torch.Tensor]): attention mask for sentence
+             input_ids (Optional[torch.Tensor]): original sequence inputs
+
+         Returns:
+             (torch.Tensor): classification logits (whether gene is expressed/not for tissue)
+             (torch.Tensor): gene expression value predictions (real-valued)
+         """
+         # Consider using .clone().detach()
+         attention_mask_copy = attention_mask.clone()
+         return (
+             self.classification_head(
+                 features, attention_mask=attention_mask, input_ids=input_ids, **kwargs
+             ),
+             self.regression_head(
+                 features,
+                 attention_mask=attention_mask_copy,
+                 input_ids=input_ids,
+                 **kwargs,
+             ),
+         )
+
+
+ class SparseMSELoss(nn.Module):
+     """Custom loss function that takes in two inputs:
+     1. Predicted logits for whether gene is expressed (1) or not (0)
+     2. Real-valued log-TPM values for gene expression predictions.
+     """
+
+     def __init__(self, threshold: float = 1, alpha: float = 0.5):
+         """
+         Args:
+             threshold (float): any value below this threshold (in natural
+                 scale, NOT log-scale) is considered "not expressed"
+             alpha (float): parameter controlling the importance of classification
+                 in the overall loss. alpha == 1 means this is identical to
+                 classification. alpha == 0 means this is identical to regression.
+         """
+         super().__init__()
+         self.threshold = np.log(threshold)
+         self.alpha = alpha
+         self.mse = MSELoss()
+         self.bce = BCEWithLogitsLoss()
+
+     def forward(self, logits: Tuple[torch.Tensor], labels: torch.Tensor):
+         classification_outputs, regression_outputs = logits
+         binarized_labels = (labels >= self.threshold).float()
+
+         mse_loss = self.mse(regression_outputs, labels)
+         bce_loss = self.bce(classification_outputs, binarized_labels)
+
+         # Weight the losses by the logits
+         # the mse loss should be weighted by the probability of being expressed
+         # the bce loss should be weighted by the probability of not being expressed
+
+         loss = self.alpha * bce_loss + (1 - self.alpha) * mse_loss
+         return loss
+
+
+ class ZeroInflatedNegativeBinomialNLL(nn.Module):
+     """Custom loss function that calculates the negative log-likelihood
+     according to a zero-inflated negative binomial model.
+     """
+
+     pass
+
+
+ # -------------------------------------- #
+ #                                        #
+ # ---------- Modified RoBERTa ---------- #
+ #                                        #
+ # -------------------------------------- #
+
+
+ class RobertaForSequenceClassificationMeanPool(RobertaPreTrainedModel):
+     """RobertaForSequenceClassification using the modified mean-pooling
+     classification head (`ClassificationHeadMeanPool`) and, optionally,
+     the sparse (classification + regression) head.
+     """
+
+     _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+     def __init__(self, config: RobertaMeanPoolConfig):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.output_mode = config.output_mode or "regression"
+         self.roberta = RobertaModel(config, add_pooling_layer=False)
+         self.threshold = config.threshold
+         self.alpha = config.alpha
+         self.log_offset = config.log_offset
+
+         if self.output_mode == "sparse":
+             self.classifier = ClassificationHeadMeanPoolSparse(config)
+         else:
+             self.classifier = ClassificationHeadMeanPool(config)
+
+         self.init_weights()
+
+     def forward(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         r"""
+         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+             Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+             config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         outputs = self.roberta(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = outputs[0]
+         logits = self.classifier(
+             sequence_output, attention_mask=attention_mask, input_ids=input_ids
+         )
+
+         loss = None
+         if labels is not None:
+             if self.output_mode == "regression":
+                 loss_fct = MSELoss()
+             elif self.output_mode == "sparse":
+                 loss_fct = SparseMSELoss(threshold=self.threshold, alpha=self.alpha)
+             elif self.output_mode == "classification":
+                 loss_fct = BCEWithLogitsLoss()
+             elif self.output_mode == "poisson":
+                 loss_fct = PoissonNLLLoss()
+
+             loss = loss_fct(
+                 logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)
+             )
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def embed(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         """Embed sequences by running the `forward` method up to the dense layer of the classifier"""
+         outputs = self.roberta(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         sequence_output = outputs[0]
+         embeddings = self.classifier.embed(
+             sequence_output, attention_mask=attention_mask, input_ids=input_ids
+         )
+         return embeddings
+
+     def get_tissue_embeddings(self):
+         return self.classifier.out_proj.weight.detach()
+
+     def predict(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         logits = self.forward(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )[0]
+         if self.output_mode == "sparse":
+             binary_logits, pred_values = logits
+             # Convert logits to binary predictions
+             binary_preds = binary_logits < 0
+             # return binary_preds * pred_values
+             pred_values[binary_preds] = np.log(self.log_offset)
+             return pred_values
+         return logits
+
+
+ # -------------------------------------- #
+ #                                        #
+ # ----------- Modified BERT ------------ #
+ #                                        #
+ # -------------------------------------- #
+
+
+ class BertMeanPoolConfig(BertConfig):
+     model_type = "bert"
+
+     def __init__(
+         self, output_mode="regression", start_token_idx=2, end_token_idx=3, **kwargs
+     ):
+         """Constructs BertConfig."""
+         super().__init__(**kwargs)
+         self.output_mode = output_mode
+         self.start_token_idx = start_token_idx
+         self.end_token_idx = end_token_idx
+
+
+ class BertForSequenceClassificationMeanPool(BertPreTrainedModel):
+     def __init__(self, config):
+         super().__init__(config)
+         self.num_labels = config.num_labels
+         self.output_mode = config.output_mode or "regression"
+         self.bert = BertModel(config)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+         self.classifier = ClassificationHeadMeanPool(config)
+
+         self.init_weights()
+
+     def forward(
+         self,
+         input_ids=None,
+         attention_mask=None,
+         token_type_ids=None,
+         position_ids=None,
+         head_mask=None,
+         inputs_embeds=None,
+         labels=None,
+         output_attentions=None,
+         output_hidden_states=None,
+         return_dict=None,
+     ):
+         r"""
+         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+             Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+             config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+             If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+         """
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         outputs = self.bert(
+             input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids,
+             position_ids=position_ids,
+             head_mask=head_mask,
+             inputs_embeds=inputs_embeds,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         pooled_output = outputs[0]
+
+         pooled_output = self.dropout(pooled_output)
+         logits = self.classifier(
+             pooled_output, attention_mask=attention_mask, input_ids=input_ids
+         )
+
+         loss = None
+         if labels is not None:
+             if self.output_mode == "regression":
+                 # We are doing regression
+                 loss_fct = MSELoss()
+                 loss = loss_fct(logits.view(-1), labels.view(-1))
+             else:
+                 loss_fct = BCELoss()
+                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+         if not return_dict:
+             output = (logits,) + outputs[2:]
+             return ((loss,) + output) if loss is not None else output
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
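A brief shape-check sketch for the mean-pool RoBERTa head (the tiny config below is chosen only to keep the example fast; real hyperparameters come from config.yaml):

import torch
from module.models import RobertaMeanPoolConfig, RobertaForSequenceClassificationMeanPool

config = RobertaMeanPoolConfig(
    vocab_size=100,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
    max_position_embeddings=66,
    num_labels=8,
    output_mode="regression",
)
model = RobertaForSequenceClassificationMeanPool(config)

# Token ids are sampled above the special-token range so the mean-pooling mask stays non-empty
input_ids = torch.randint(4, 100, (2, 16))
attention_mask = torch.ones_like(input_ids)
out = model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
print(out.logits.shape)  # torch.Size([2, 8]): one prediction per tissue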
module/transformers_utility.py ADDED
@@ -0,0 +1,90 @@
+ from pathlib import PosixPath
+ from typing import Union, Optional
+
+ from transformers import (
+     RobertaConfig,
+     RobertaTokenizerFast,
+     RobertaForMaskedLM,
+     RobertaForSequenceClassification,
+ )
+
+ from .models import (
+     RobertaMeanPoolConfig,
+     RobertaForSequenceClassificationMeanPool,
+ )
+
+ RobertaSettings = dict(
+     padding_side='left'
+ )
+
+
+ MODELS = {
+     "roberta-lm": (RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM, RobertaSettings),
+     "roberta-pred": (RobertaConfig, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaSettings),
+     "roberta-pred-mean-pool": (RobertaMeanPoolConfig, RobertaTokenizerFast, RobertaForSequenceClassificationMeanPool, RobertaSettings)
+ }
+
+
+ def load_model(model_name: str,
+                tokenizer_dir: Union[str, PosixPath],
+                max_tokenized_len: int = 254,
+                pretrained_model: Union[str, PosixPath] = None,
+                k: Optional[int] = None,
+                do_lower_case: Optional[bool] = None,
+                padding_side: Optional[str] = 'left',
+                **config_settings) -> tuple:
+     """Load specified model, config, and tokenizer.
+
+     Args:
+         model_name (str): Name of model. Acceptable options are
+             - 'roberta-lm',
+             - 'roberta-pred',
+             - 'roberta-pred-mean-pool'
+         tokenizer_dir (Union[str, PosixPath]): Directory containing tokenizer
+             files: merges.txt and vocab.json
+         max_tokenized_len (int, optional): Maximum tokenized length,
+             not including SOS and EOS. Defaults to 254.
+         pretrained_model (Union[str, PosixPath], optional): path to saved
+             pretrained RoBERTa transformer model. Defaults to None.
+         k (Optional[int], optional): Size of kmers (for DNABERT model). Defaults to None.
+         do_lower_case (bool, optional): Whether to convert all inputs to lower case. Defaults to None.
+         padding_side (str, optional): Which side to pad on. Defaults to 'left'.
+
+     Returns:
+         tuple: config_obj, tokenizer, model
+     """
+     config_settings = config_settings or {}
+     max_position_embeddings = max_tokenized_len + 2  # To include SOS and EOS
+     config_class, tokenizer_class, model_class, tokenizer_settings = MODELS[model_name]
+
+     kwargs = dict(
+         max_len=max_tokenized_len,
+         truncate=True,
+         padding="max_length",
+         **tokenizer_settings
+     )
+     if k is not None:
+         kwargs.update(dict(k=k))
+     if do_lower_case is not None:
+         kwargs.update(dict(do_lower_case=do_lower_case))
+     if padding_side is not None:
+         kwargs.update(dict(padding_side=padding_side))
+
+     tokenizer = tokenizer_class.from_pretrained(str(tokenizer_dir), **kwargs)
+     # Avoid the string 'None' being stored as the model path when no checkpoint is given
+     name_or_path = str(pretrained_model) if pretrained_model else ''
+     config_obj = config_class(
+         vocab_size=len(tokenizer),
+         max_position_embeddings=max_position_embeddings,
+         name_or_path=name_or_path,
+         output_hidden_states=True,
+         **config_settings
+     )
+     if pretrained_model:
+         # print(f"Loading from pretrained model {pretrained_model}")
+         model = model_class.from_pretrained(
+             str(pretrained_model), config=config_obj)
+     else:
+         print("Loading untrained model")
+         model = model_class(config=config_obj)
+     model.resize_token_embeddings(len(tokenizer))
+     return config_obj, tokenizer, model
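A hedged usage sketch for `load_model` (the paths refer to the directories added/renamed in this commit, and the keyword settings would normally come from config.yaml via utils.get_model_settings):

from module import config, transformers_utility as tr

config_obj, tokenizer, model = tr.load_model(
    "roberta-pred-mean-pool",
    config.models / "byte-level-bpe-tokenizer",
    max_tokenized_len=256,
    pretrained_model=config.models / "transformer" / "prediction-model",
    num_labels=8,
    hidden_dropout_prob=0.2,
    output_mode="regression",
)
print(type(model).__name__)  # RobertaForSequenceClassificationMeanPool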
module/utils.py ADDED
@@ -0,0 +1,264 @@
+ import os
+ import sys
+ import wget
+ import requests
+ import re
+ import argparse
+ from types import GeneratorType, ModuleType
+ from typing import Union, Tuple
+ import subprocess
+ from pathlib import PosixPath, Path
+ import importlib as im
+ import json
+ import pickle
+
+ import pandas as pd
+ import numpy as np
+ from IPython.display import display
+ import torch
+ from tqdm import tqdm
+ from sklearn.metrics import r2_score
+
+ from .config import settings, output, data_final, models
+
+ def preprocess_genex(genex_data: pd.DataFrame, settings: dict) -> pd.DataFrame:
+     if settings["data"].get("preprocess", False):
+         preproc_dict = settings["data"]["preprocess"]
+         preproc_type = preproc_dict["type"]
+         if preproc_type == "log":
+             delta = preproc_dict["delta"]
+             df_preprocessed = genex_data.applymap(lambda x: np.log(x + delta))
+         elif preproc_type == "binary":
+             thresh = preproc_dict["threshold"]
+             df_preprocessed = genex_data.applymap(lambda x: float(x > thresh))
+         elif preproc_type == "ceiling":
+             ceiling = preproc_dict["ceiling"]
+             df_preprocessed = genex_data.applymap(lambda x: min(ceiling, x))
+         else:
+             df_preprocessed = genex_data
+         return df_preprocessed
+     else:
+         return genex_data
+
+ def get_args(
+     data_dir=data_final / "transformer" / "seq",
+     train_data="all_seqs_train.txt",
+     eval_data=None,
+     test_data="all_seqs_test.txt",
+     output_dir=models / "transformer" / "language-model",
+     model_name=None,
+     pretrained_model=None,
+     tokenizer_dir=None,
+     log_offset=None,
+     preprocessor=None,
+     filter_empty=False,
+     hyperparam_search_metrics=None,
+     hyperparam_search_trials=None,
+     transformation=None,
+     output_mode=None,
+ ) -> argparse.Namespace:
+     """Use Python's ArgumentParser to create a namespace from (optional) user input
+
+     Args:
+         data_dir ([type], optional): Base location of data files. Defaults to data_final/'transformer'/'seq'.
+         train_data (str, optional): Name of train data file in `data_dir`. Defaults to 'all_seqs_train.txt'.
+         test_data (str, optional): Name of test data file in `data_dir`. Defaults to 'all_seqs_test.txt'.
+         output_dir ([type], optional): Location to save trained model. Defaults to models/'transformer'/'language-model'.
+         model_name (Union[str, PosixPath], optional): Name of model
+         pretrained_model (Union[str, PosixPath], optional): path to config and weights for huggingface pretrained model.
+         tokenizer_dir (Union[str, PosixPath], optional): path to config files for huggingface pretrained tokenizer.
+         filter_empty (bool, optional): Whether to filter out empty sequences.
+             Necessary for kmer-based models; takes additional time.
+         hyperparam_search_metrics (Union[list, str], optional): metrics for hyperparameter search.
+         hyperparam_search_trials (int, optional): number of trials to run hyperparameter search.
+         transformation (str, optional): how to transform data. Defaults to None.
+         output_mode (str, optional): default output mode for model and data transformation. Defaults to None.
+
+     Returns:
+         argparse.Namespace: parsed arguments
+     """
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "-w",
+         "--warmstart",
+         action="store_true",
+         help="Whether to start with a saved checkpoint",
+         default=False,
+     )
+     parser.add_argument("--num-embeddings", type=int, default=-1)
+     parser.add_argument(
+         "--data-dir",
+         type=str,
+         default=str(data_dir),
+         help="Directory containing train/eval data. Defaults to data/final/transformer/seq",
+     )
+     parser.add_argument(
+         "--train-data",
+         type=str,
+         default=train_data,
+         help="Name of training data file. Will be added to the end of `--data-dir`.",
+     )
+     parser.add_argument(
+         "--eval-data",
+         type=str,
+         default=eval_data,
+         help="Name of eval data file. Will be added to the end of `--data-dir`.",
+     )
+     parser.add_argument(
+         "--test-data",
+         type=str,
+         default=test_data,
+         help="Name of test data file. Will be added to the end of `--data-dir`.",
+     )
+     parser.add_argument("--output-dir", type=str, default=str(output_dir))
+     parser.add_argument(
+         "--model-name",
+         type=str,
+         help='Name of model. Supported values are "roberta-lm", "roberta-pred", "roberta-pred-mean-pool", "dnabert-lm", "dnabert-pred", "dnabert-pred-mean-pool"',
+         default=model_name,
+     )
+     parser.add_argument(
+         "--pretrained-model",
+         type=str,
+         help="Directory containing config.json and pytorch_model.bin files for loading pretrained huggingface model",
+         default=(str(pretrained_model) if pretrained_model else None),
+     )
+     parser.add_argument(
+         "--tokenizer-dir",
+         type=str,
+         help="Directory containing necessary files to instantiate pretrained tokenizer.",
+         default=str(tokenizer_dir),
+     )
+     parser.add_argument(
+         "--log-offset",
+         type=float,
+         help="Offset to apply to gene expression values before log transform",
+         default=log_offset,
+     )
+     parser.add_argument(
+         "--preprocessor",
+         type=str,
+         help="Path to pickled preprocessor file",
+         default=preprocessor,
+     )
+     parser.add_argument(
+         "--filter-empty",
+         help="Whether to filter out empty sequences.",
+         default=filter_empty,
+         action="store_true",
+     )
+     parser.add_argument(
+         "--tissue-subset", default=None, help="Subset of tissues to use", nargs="*"
+     )
+     parser.add_argument("--hyperparameter-search", action="store_true", default=False)
+     parser.add_argument("--ntrials", default=hyperparam_search_trials, type=int)
+     parser.add_argument("--metrics", default=hyperparam_search_metrics, nargs="*")
+     parser.add_argument("--direction", type=str, default="minimize")
+     parser.add_argument(
+         "--nshards",
+         type=int,
+         default=None,
+         help="Number of shards to divide data into; only the first is kept.",
+     )
+     parser.add_argument(
+         "--nshards-eval",
+         type=int,
+         default=None,
+         help="Number of shards to divide eval data into.",
+     )
+     parser.add_argument(
+         "--threshold",
+         type=float,
+         default=None,
+         help="Minimum value for filtering gene expression values.",
+     )
+     parser.add_argument(
+         "--transformation",
+         type=str,
+         default=transformation,
+         help='How to transform the data. Options are "log", "boxcox"',
+     )
+     parser.add_argument(
+         "--freeze-base",
+         action="store_true",
+         help="Freeze the pretrained base of the model",
+     )
+     parser.add_argument(
+         "--output-mode",
+         type=str,
+         help='Output mode for model: {"regression", "classification"}',
+         default=output_mode,
+     )
+     parser.add_argument(
+         "--learning-rate",
+         type=float,
+         help="Learning rate for training. Default None",
+         default=None,
+     )
+     parser.add_argument(
+         "--num-train-epochs",
+         type=int,
+         help="Number of epochs to train for",
+         default=None,
+     )
+     parser.add_argument(
+         "--search-metric",
+         type=str,
+         help="Metric to optimize in hyperparameter search",
+         default=None,
+     )
+     parser.add_argument("--batch-norm", action="store_true", default=False)
+     args = parser.parse_args()
+
+     if args.pretrained_model and not args.pretrained_model.startswith("/"):
+         args.pretrained_model = str(Path.cwd() / args.pretrained_model)
+
+     args.data_dir = Path(args.data_dir)
+     args.output_dir = Path(args.output_dir)
+
+     args.train_data = _get_fpath_if_not_none(args.data_dir, args.train_data)
+     args.eval_data = _get_fpath_if_not_none(args.data_dir, args.eval_data)
+     args.test_data = _get_fpath_if_not_none(args.data_dir, args.test_data)
+
+     args.preprocessor = Path(args.preprocessor) if args.preprocessor else None
+
+     if args.tissue_subset is not None:
+         if isinstance(args.tissue_subset, (int, str)):
+             args.tissue_subset = [args.tissue_subset]
+         args.tissue_subset = [
+             int(t) if t.isnumeric() else t for t in args.tissue_subset
+         ]
+     return args
+
+ def get_model_settings(
+     settings: dict, args: dict = None, model_name: str = None
+ ) -> dict:
+     """Get the appropriate model settings from the dictionary `settings`."""
+     if model_name is None:
+         model_name = args.model_name
+     base_model_name = model_name.split("-")[0] + "-base"
+     base_model_settings = settings["models"].get(base_model_name, {})
+     model_settings = settings["models"].get(model_name, {})
+     data_settings = settings["data"]
+     settings = dict(**base_model_settings, **model_settings, **data_settings)
+
+     if args is not None:
+         if args.output_mode:
+             settings["output_mode"] = args.output_mode
+         if args.tissue_subset is not None:
+             settings["num_labels"] = len(args.tissue_subset)
+         if args.batch_norm:
+             settings["batch_norm"] = args.batch_norm
+
+     return settings
+
+ def _get_fpath_if_not_none(
+     dirpath: PosixPath, fpath: PosixPath
+ ) -> Union[None, PosixPath]:
+     if fpath:
+         return dirpath / fpath
+     return None
+
+ def load_pickle(path: PosixPath) -> object:
+     with path.open("rb") as f:
+         obj = pickle.load(f)
+     return obj
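The two helpers above are meant to be used together from a script: `get_args` supplies defaults that can be overridden on the command line, and `get_model_settings` merges the matching base-model, model, and data sections of config.yaml. A sketch (script-style, since get_args reads sys.argv; run it without extra flags):

from module import config, utils

args = utils.get_args(
    model_name="roberta-pred-mean-pool",
    tokenizer_dir=config.models / "byte-level-bpe-tokenizer",
    pretrained_model=config.models / "transformer" / "prediction-model",
)
settings = utils.get_model_settings(config.settings, args)

# Merged dict: roberta-base block + roberta-pred-mean-pool block + data block
print(settings["max_tokenized_len"], settings["num_labels"], settings["output_mode"])
# 256 8 regression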
prediction.py ADDED
@@ -0,0 +1,58 @@
+ from module import config, transformers_utility as tr, utils, metrics, dataio
+ from prettytable import PrettyTable
+
+ table = PrettyTable()
+ table.field_names = config.tissues
+ TOKENIZER_DIR = config.models / "byte-level-bpe-tokenizer"
+ PRETRAINED_MODEL = config.models / "transformer" / "prediction-model"
+ DATA_DIR = config.data
+
+ def load_model(args, settings):
+     return tr.load_model(
+         args.model_name,
+         args.tokenizer_dir,
+         pretrained_model=args.pretrained_model,
+         log_offset=args.log_offset,
+         **settings,
+     )
+
+ def main(TEST_DATA):
+     args = utils.get_args(
+         data_dir=DATA_DIR,
+         train_data=TEST_DATA,
+         test_data=TEST_DATA,
+         pretrained_model=PRETRAINED_MODEL,
+         tokenizer_dir=TOKENIZER_DIR,
+         model_name="roberta-pred-mean-pool",
+     )
+
+     settings = utils.get_model_settings(config.settings, args)
+     if args.output_mode:
+         settings["output_mode"] = args.output_mode
+     if args.tissue_subset is not None:
+         settings["num_labels"] = len(args.tissue_subset)
+
+     print("Loading model...")
+     config_obj, tokenizer, model = load_model(args, settings)
+
+     print("Loading data...")
+     datasets = dataio.load_datasets(
+         tokenizer,
+         args.train_data,
+         eval_data=args.eval_data,
+         test_data=args.test_data,
+         seq_key="text",
+         file_type="text",
+         filter_empty=args.filter_empty,
+         shuffle=False,
+     )
+     # `train_data` points at TEST_DATA above, so the "train" split holds the sequences to predict on
+     dataset_test = datasets["train"]
+
+     print("Getting predictions:")
+     preds = metrics.get_predictions(model, dataset_test)
+     for e in preds:
+         table.add_row(e)
+     print(table)
+
+ if __name__ == "__main__":
+     main("test.txt")
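prediction.py ties the new pieces together; as a final note on invocation, main() still goes through utils.get_args, so it reads sys.argv and should be run from the repository root without unrelated command-line arguments:

# Scores every sequence in data/test.txt and prints one PrettyTable row per sequence,
# with one column per tissue name in config.tissues
import prediction

prediction.main("test.txt")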