# File size: 40,744 Bytes

from typing import Callable, Dict, Optional, Union, Tuple

import copy
import math
import multiprocessing
import os

import torch
import torch.nn as nn
import transformers

from .misc import ContextualModelConfig

def load_embedder_and_tokenizer(name: str) -> Tuple[
        transformers.PreTrainedModel,
        transformers.PreTrainedTokenizer
]:
    """Load the backbone model and matching tokenizer selected by `name`.

    `name` is either one of the short aliases handled below or a model
    identifier that is forwarded to `from_pretrained` unchanged.

    Returns:
        A `(model, tokenizer)` pair. For several families only a sub-module
        of the loaded checkpoint (`.bert`, `.encoder`, `.decoder`) is returned.
    """
    auto_tokenizer = transformers.AutoTokenizer

    # Masked-LM checkpoints: keep only the module stored under `.bert`.
    if name.startswith("nomic") or name == "bert-base-uncased":
        masked_lm = transformers.AutoModelForMaskedLM.from_pretrained(
            name, trust_remote_code=True
        )
        return masked_lm.bert, auto_tokenizer.from_pretrained(name)

    # GTR: only the encoder half of the checkpoint is used.
    if name in ("gtr-base", "gtr_base"):
        hub_id = "sentence-transformers/gtr-t5-base"
        model = transformers.AutoModel.from_pretrained(hub_id).encoder
        return model, auto_tokenizer.from_pretrained(hub_id)

    # Pile-T5: pick the encoder or the decoder of the same checkpoint.
    if name in ("pile-t5-base-encoder", "pile-t5-base-decoder"):
        hub_id = "EleutherAI/pile-t5-base"
        full_model = transformers.AutoModel.from_pretrained(hub_id)
        if name == "pile-t5-base-encoder":
            model = full_model.encoder
        else:
            model = full_model.decoder
        tokenizer = auto_tokenizer.from_pretrained(hub_id)
        # Reuse EOS as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        return model, tokenizer

    # Causal LMs (GPT-2 / LLaMA naming patterns).
    if name.startswith(("gpt2", "meta-llama")) or "Llama" in name:
        # Previously-tried options, left disabled: torch_dtype=torch.bfloat16,
        # device_map="auto".
        model = transformers.AutoModelForCausalLM.from_pretrained(
            name,
            attn_implementation="flash_attention_2",
            low_cpu_mem_usage=True,
        )
        # NOTE(review): this sets an attribute on the model object; padding
        # side is normally a tokenizer setting - confirm this is intended.
        model.padding_side = "right"
        tokenizer = auto_tokenizer.from_pretrained(name)
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.add_eos_token = True
        return model, tokenizer

    # Fallback: treat `name` as a generic model identifier.
    # (A BetterTransformer conversion used to be applied here; it is disabled.)
    model = transformers.AutoModel.from_pretrained(name, trust_remote_code=True)
    return model, auto_tokenizer.from_pretrained(name)
def get_world_size() -> int:
    """Return the number of distributed workers, or 1 outside distributed runs.

    The process-group state is checked explicitly instead of relying on the
    exception raised by `torch.distributed.get_world_size()`. This also covers
    PyTorch builds without distributed support, where `torch.distributed`
    exposes no API other than `is_available()`.
    """
    dist = torch.distributed
    if not (dist.is_available() and dist.is_initialized()):
        return 1
    return dist.get_world_size()


def get_rank() -> int:
    """Return this worker's distributed rank, or 0 outside distributed runs.

    The process-group state is checked explicitly instead of relying on the
    exception raised by `torch.distributed.get_rank()`. This also covers
    PyTorch builds without distributed support, where `torch.distributed`
    exposes no API other than `is_available()`.
    """
    dist = torch.distributed
    if not (dist.is_available() and dist.is_initialized()):
        return 0
    return dist.get_rank()
    
def gather(t: torch.Tensor) -> torch.Tensor:
    """Concatenate `t` from every worker along dim 0.

    With a single worker `t` is returned untouched. Otherwise a 0-dim tensor
    is first promoted to shape (1,), every worker's copy is collected with
    `torch.distributed.all_gather`, and the slot belonging to this rank is
    replaced by the local tensor itself before concatenation.

    Why not `torch.distributed.nn.all_gather`: it scales by world size since
    the reduce op is SUM (https://github.com/pytorch/pytorch/issues/58005);
    it should only be used if a `local_loss` is implemented, like
    https://github.com/mlfoundations/open_clip/issues/616
    """
    n_workers = get_world_size()
    if n_workers == 1:
        return t

    local = t.unsqueeze(0) if t.ndim == 0 else t

    buffers = [torch.empty_like(local) for _ in range(n_workers)]
    torch.distributed.all_gather(buffers, local)
    # Put the original local tensor back in place of its gathered copy.
    buffers[get_rank()] = local
    return torch.cat(buffers, dim=0)


def gather_sum(t: torch.Tensor) -> torch.Tensor:
    """Return the element-wise sum of `t` across all workers.

    With a single worker `t` is returned untouched. Otherwise a 0-dim tensor
    is first promoted to shape (1,), every worker's copy is collected with
    `torch.distributed.all_gather`, and the copies are stacked and summed
    over the worker axis.

    Why not `torch.distributed.nn.all_gather`: it scales by world size since
    the reduce op is SUM (https://github.com/pytorch/pytorch/issues/58005);
    it should only be used if a `local_loss` is implemented, like
    https://github.com/mlfoundations/open_clip/issues/616
    """
    n_workers = get_world_size()
    if n_workers == 1:
        return t

    local = t.unsqueeze(0) if t.ndim == 0 else t

    per_worker = [torch.empty_like(local) for _ in range(n_workers)]
    torch.distributed.all_gather(per_worker, local)
    # Sum across workers.
    return torch.stack(per_worker, dim=0).sum(dim=0)


def get_num_proc() -> int:
    """Return the number of CPU workers this rank may use (always >= 1).

    The CPUs visible to this process are split evenly between the
    distributed workers. The result is clamped to 1 so that a machine with
    fewer CPUs than ranks never yields a process count of 0.
    """
    world_size: int = get_world_size()
    try:
        # os.sched_getaffinity respects schedulers, unlike cpu_count(), but it's only available
        # on some Unix platforms, so we support both!
        available_cpus = len(os.sched_getaffinity(0))  # type: ignore[attr-defined]
    except AttributeError:
        available_cpus = multiprocessing.cpu_count()
    return max(1, available_cpus // world_size)


def torch_main_worker_finish_first(func: Callable):
    """Decorator: run `func` on rank 0 first, then on every other rank.

    Without an initialized process group the wrapped function is simply
    called once. With one, rank 0 runs `func` while the other ranks wait at
    a barrier; then the other ranks run it, and all ranks meet at a second
    barrier before returning. Each rank returns its own call's result.

    The wrapper keeps the metadata (`__name__`, `__doc__`, ...) of `func`.
    """
    import functools  # local import so the file's import block is untouched

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        dist = torch.distributed
        # Need to support non-DDP runs, and builds without distributed support.
        ddp_enabled = dist.is_available() and dist.is_initialized()
        is_main_worker = (not ddp_enabled) or dist.get_rank() == 0
        # Run on main worker first.
        if is_main_worker:
            result = func(*args, **kwargs)
        # Then everyone waits.
        if ddp_enabled:
            dist.barrier()
        # Run on other workers now.
        if not is_main_worker:
            result = func(*args, **kwargs)
        # Now everyone waits again.
        if ddp_enabled:
            dist.barrier()
        return result

    return wrapper


def print0(*args, **kwargs) -> None:
    """`print` that only emits output on the worker whose rank is 0."""
    if get_rank() != 0:
        return
    print(*args, **kwargs)


def verify_ddp_weights_equal(model: torch.nn.Module, atol: float = 1e-5) -> None:
    """Assert that parameters and gradients match across all workers.

    For every named parameter that has a gradient, the parameter and then
    its gradient are gathered from all workers and compared element-wise
    against the first worker's copy with absolute tolerance `atol`.
    Parameters without a gradient are skipped (and reported on rank 0).
    The whole check is skipped when the world size exceeds 8.

    Raises:
        AssertionError: if any parameter or gradient differs by `atol` or more.
    """
    # Unwrap a wrapper that exposes the real model as `.module`.
    model = getattr(model, "module", model)

    world_size = get_world_size()
    if world_size > 8:
        print0(f"[verify_ddp_weights_equal] Skipping with world_size={world_size} ⚠️")
        return

    def _abs_diffs_to_first_rank(tensor: torch.Tensor) -> torch.Tensor:
        # One flattened row per worker, compared against row 0.
        per_rank = gather(tensor).reshape((world_size, -1))
        return (per_rank[None, 0, :] - per_rank).abs()

    for name, param in model.named_parameters():
        if param is None:
            continue
        if param.grad is None:
            print0(f"[verify_ddp_weights_equal] Skipping param [{name}] with no grad")
            continue

        # Weights first, then gradients: every worker must issue the
        # collective calls in the same order.
        weight_diffs = _abs_diffs_to_first_rank(param)
        assert (weight_diffs < atol).all(), f"❌ param [{name}] not equal - got max_absolute_diff={weight_diffs.max()}"

        grad_diffs = _abs_diffs_to_first_rank(param.grad)
        assert (grad_diffs < atol).all(), f"❌ param [{name}] grad not equal - got max_absolute_diff={grad_diffs.max()}"

    print0("[verify_ddp_weights_equal] Verified DDP parameter correctness ✅")
    


def mean_pool_3d(
    hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Masked mean over dim 2 of a (B, T, S, D) tensor, giving (B, T, D).

    `attention_mask` is broadcast over the last dimension of `hidden_states`
    and summed over dim 2 to obtain the token count of each (B, T) row.
    Rows whose mask is entirely zero are filled with the plain (unmasked)
    mean of all S * T positions of the same batch element.
    """
    B, T, S, D = hidden_states.shape

    # Number of unmasked positions per (batch, chunk), kept broadcastable.
    token_counts = attention_mask.sum(dim=2)[..., None]
    masked_sums = (hidden_states * attention_mask[..., None]).sum(dim=2)
    chunk_means = masked_sums / (token_counts + 1e-9)

    # fix for gradient flow: fill empty rows with the mean of the rest of the sequence
    whole_sequence_mean = hidden_states.reshape((B, S * T, D)).mean(dim=1, keepdim=True)
    fallback = whole_sequence_mean.expand(-1, T, -1)

    pooled_outputs = torch.where(token_counts > 0, chunk_means, fallback)
    assert pooled_outputs.shape == (B, T, D)
    return pooled_outputs

def mean_pool(
    hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Masked mean over dim 1 of a (B, S, D) tensor, giving (B, D).

    Positions where `attention_mask` is zero contribute nothing to the sum;
    the divisor is the per-row mask total plus a tiny epsilon, so a row with
    an all-zero mask pools to zeros rather than dividing by zero.
    """
    B, _S, D = hidden_states.shape

    weights = attention_mask.unsqueeze(-1)
    token_counts = attention_mask.sum(dim=1, keepdim=True)
    pooled_outputs = (hidden_states * weights).sum(dim=1) / (token_counts + 1e-20)

    assert pooled_outputs.shape == (B, D)
    return pooled_outputs


def mean_pool_weighted(
    hidden_states: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Position-weighted mean over dim 1 of a (B, S, D) tensor, giving (B, D).

    Each unmasked token is weighted by its running index among the unmasked
    tokens of its row, so later tokens count more:
    mask [0,1,1,1,0,0] -> weights [0,1,2,3,0,0].

    The weights are built out-of-place, so the caller's `attention_mask` is
    left unmodified. Rows with an all-zero mask pool to zeros instead of NaN.
    """
    B, _S, D = hidden_states.shape  # also enforces a 3-D input
    weights = attention_mask * attention_mask.cumsum(dim=1)
    weighted_sum = torch.sum(hidden_states * weights.unsqueeze(-1).float(), dim=1)
    weight_total = weights.sum(dim=1, keepdim=True).float()
    # Only a zero denominator is replaced; every other row is unaffected.
    weight_total = torch.where(
        weight_total == 0, torch.ones_like(weight_total), weight_total
    )
    return weighted_sum / weight_total


def slice_sparse_tensor_rows(t: torch.sparse.Tensor, min_row: int, max_row: int) -> torch.sparse.Tensor:
    """Return rows [min_row, max_row) of a 2-D sparse COO tensor.

    The result has shape (max_row - min_row, t.shape[1]) and is coalesced.
    Row indices of the kept entries are shifted by `min_row` so that they
    are relative to the start of the slice; without the shift any slice
    with `min_row > 0` would carry indices outside the declared size.

    Raises:
        AssertionError: if `min_row >= max_row`.
    """
    assert min_row < max_row, f"can't slice from row {min_row} to {max_row}"
    t = t.coalesce()
    all_idxs = t.indices()
    row_idxs = all_idxs[0]
    index_mask = (min_row <= row_idxs) & (row_idxs < max_row)

    num_rows = (max_row - min_row)
    num_cols = t.shape[1]

    # Boolean-mask indexing returns a fresh tensor, so the in-place shift
    # below cannot touch the indices of `t`.
    idxs = all_idxs[:, index_mask]
    idxs[0] -= min_row
    vals = t.values()[index_mask]
    return torch.sparse_coo_tensor(idxs, vals, size=(num_rows, num_cols)).coalesce()


def slice_tensor_rows(t: torch.Tensor, min_row: int, max_row: int) -> torch.Tensor:
    """Return rows [min_row, max_row) of `t`, for dense or sparse tensors.

    Dense tensors are sliced directly; sparse tensors are delegated to
    `slice_sparse_tensor_rows`.
    """
    if not t.is_sparse:
        return t[min_row:max_row]
    return slice_sparse_tensor_rows(t=t, min_row=min_row, max_row=max_row)


@torch.no_grad()
def maxsim(
    X: torch.Tensor, y: torch.Tensor,
    maximize: bool, chunk_size: int = 8_000,
    debug_mem_usage: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
    """Row-wise max (or min) of `X @ y`, computed in chunks and across workers.

    The rows of `X` are split into one contiguous range per distributed
    worker. Each worker walks its own range in chunks of at most
    `chunk_size` rows, multiplies the chunk with `y`, and records the
    extreme value and its column index. Per-worker results are written into
    zero-initialised buffers of length `X.shape[0]` and combined with
    `gather_sum`, so every row must be written by exactly one worker.

    Args:
        X: dense or sparse matrix with one row per sample.
        y: matrix multiplied from the right; `y.shape[1]` is the number of
            candidate columns.
        maximize: take the row-wise maximum if True, otherwise the minimum.
        chunk_size: maximum number of rows per matmul.
        debug_mem_usage: print CUDA memory info and shapes for every chunk.

    Returns:
        `(values, indices)`, each of shape `(X.shape[0],)`.
    """
    device = X.device
    n_samples = X.shape[0]

    max_sim_v = torch.zeros(n_samples, device=device, dtype=X.dtype)
    max_sim_i = torch.zeros(n_samples, device=device, dtype=torch.int64)

    # TODO: Implement faster max (without going to dense tensors).
    # TODO: Use multiple GPUs.
    rank = get_rank()
    world_size = get_world_size()

    worker_worklist_size = int(math.ceil(n_samples / world_size))
    # Clamp this worker's range to the data: with ceil-division the last
    # ranks could otherwise start or end past the final row.
    splits_start_idx = min(worker_worklist_size * rank, n_samples)
    splits_end_idx = min(worker_worklist_size * (rank + 1), n_samples)

    for start in range(splits_start_idx, splits_end_idx, chunk_size):
        # Never run past this worker's own range: rows handled by two
        # workers would be added together by `gather_sum` below.
        end = min(start + chunk_size, splits_end_idx)
        sub_x = slice_tensor_rows(X, start, end)
        if debug_mem_usage:
            print(f"[maxsim] step {start} cuda mem free/total = {torch.cuda.mem_get_info()}")
            print("[maxsim] sub_x.shape:", sub_x.shape, "//", "y.shape:", y.shape)
        sub_sim = (sub_x @ y).to_dense()  # TODO – Implement sparse max here to save mem!
        if maximize:
            sub_max_sim_v, sub_max_sim_i = sub_sim.max(dim=-1)
        else:
            sub_max_sim_v, sub_max_sim_i = sub_sim.min(dim=-1)
        del sub_sim
        del sub_x
        torch.cuda.empty_cache()  # needs to happen after maxsim for some reason.
        max_sim_v[start:end] = sub_max_sim_v
        max_sim_i[start:end] = sub_max_sim_i

    # Combine the disjoint per-worker slices.
    max_sim_v = gather_sum(max_sim_v)
    max_sim_i = gather_sum(max_sim_i)
    k = y.shape[1]

    assert max_sim_v.shape == (n_samples,)
    assert max_sim_i.shape == (n_samples,)
    if n_samples > 0:
        # Column indices are 0-based, so the largest valid index is k - 1.
        assert max_sim_i.min() >= 0
        assert max_sim_i.max() < k

    return max_sim_v, max_sim_i


def forward_batched(
        model: torch.nn.Module,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        batch_size: int,
        dataset_input_ids: Optional[torch.Tensor] = None,
        dataset_attention_mask: Optional[torch.Tensor] = None,
        **second_stage_model_kwargs,
) -> torch.Tensor:
    """Run `model` over the inputs in slices of `batch_size` rows.

    A wrapper exposing the real model as `.module` is unwrapped first.

    * If the model has a `first_stage_model` attribute it is treated as a
      two-stage model: the dataset inputs are embedded with
      `first_stage_model` in batches, 3-D dataset inputs are averaged over
      their leading dimension, and `second_stage_model` is then run on the
      main inputs in batches, receiving the dataset embeddings and any
      extra keyword arguments.
    * Otherwise the model itself is called on each batch of the main inputs
      with the extra keyword arguments.

    Returns:
        The per-batch outputs concatenated along dim 0.

    Raises:
        ValueError: if `batch_size` is not positive, or if a two-stage model
            is given without `dataset_input_ids` / `dataset_attention_mask`.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if hasattr(model, "module"):
        model = model.module

    if not hasattr(model, "first_stage_model"):
        outputs = [
            model(
                input_ids=input_ids[i:i + batch_size],
                attention_mask=attention_mask[i:i + batch_size],
                **second_stage_model_kwargs,
            )
            for i in range(0, len(input_ids), batch_size)
        ]
        return torch.cat(outputs, dim=0)

    if dataset_input_ids is None or dataset_attention_mask is None:
        raise ValueError(
            "dataset_input_ids and dataset_attention_mask are required "
            "for a model with a first_stage_model"
        )

    # Support pooling over 3D dataset_input_ids inputs.
    if len(dataset_input_ids.shape) == 2:
        dataset_input_ids = dataset_input_ids[None]
        dataset_attention_mask = dataset_attention_mask[None]

    n_dataset_rows = dataset_input_ids.shape[1]
    per_view_embeddings = []
    for view_ids, view_mask in zip(dataset_input_ids, dataset_attention_mask):
        view_batches = [
            model.first_stage_model(
                input_ids=view_ids[i:i + batch_size],
                attention_mask=view_mask[i:i + batch_size],
            )
            for i in range(0, n_dataset_rows, batch_size)
        ]
        per_view_embeddings.append(torch.cat(view_batches, dim=0))

    # Automatically pool over 3D dataset_input_ids.
    dataset_embeddings = torch.stack(per_view_embeddings, dim=0).mean(dim=0)

    outputs = [
        model.second_stage_model(
            input_ids=input_ids[j:j + batch_size],
            attention_mask=attention_mask[j:j + batch_size],
            dataset_embeddings=dataset_embeddings,
            **second_stage_model_kwargs,
        )
        for j in range(0, len(input_ids), batch_size)
    ]
    return torch.cat(outputs, dim=0)


def last_token_pool(hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Pick, for every row, the hidden state at the last position whose mask is 1.

    Adapted from
    https://github.com/ContextualAI/gritlm/blob/main/gritlm/gritlm.py#L190

    `hidden_state` is (b, n, d) and `attention_mask` is (b, n); the result
    is (b, d). Hidden states are multiplied by the mask before selection,
    so a row whose mask is all zeros yields a zero vector.
    """
    b, n, d = hidden_state.size()

    # Locate the last `1` of each row by finding the first `1` of the
    # flipped mask. (A plain `argmin(mask) - 1` breaks when a row is all
    # 1's or when 0's come before the 1's.)
    first_one_from_right = torch.argmax(
        torch.flip(attention_mask, dims=(1,)), dim=1, keepdim=False
    )
    last_positions = attention_mask.size(1) - first_one_from_right - 1
    # Guard against negative indices for empty sequences.
    last_positions = torch.clamp(last_positions, min=0)

    # Index of shape [b, 1, d]: the same position for every feature.
    gather_indices = last_positions[:, None, None].expand(b, 1, d)

    # Mask the hidden states so positions that should not be attended to
    # contribute zeros even if they end up selected.
    mask_3d = attention_mask.unsqueeze(-1).expand((b, n, d)).float()
    masked_states = hidden_state * mask_3d

    # Gather along the seq len: [b, n, d] -> [b, 1, d] -> [b, d]
    return torch.gather(masked_states, 1, gather_indices).squeeze(dim=1)

def print0(*args, **kwargs) -> None:
    """`print` that only emits output on the worker whose rank is 0.

    NOTE(review): an identical `print0` is defined earlier in this file;
    this later definition is the one bound at import time.
    """
    if get_rank() != 0:
        return
    print(*args, **kwargs)


def limit_layers(model: transformers.PreTrainedModel, n_layers: int) -> None:
    """Truncate the model's layer stack in place to its first `n_layers` layers.

    The stack is looked up as `model.transformer.h` (gpt2), else
    `model.transformer.layer`, else `model.encoder.layers`, else
    `model.encoder.layer`.

    Raises:
        RuntimeError: if the model has neither a `transformer` nor an
            `encoder` attribute.
    """
    if hasattr(model, 'transformer'):
        stack_owner = model.transformer
        # gpt2 keeps its blocks under `.h`
        stack_name = 'h' if hasattr(stack_owner, 'h') else 'layer'
    elif hasattr(model, 'encoder'):
        stack_owner = model.encoder
        stack_name = 'layers' if hasattr(stack_owner, 'layers') else 'layer'
    else:
        raise RuntimeError(f"unknown how to limit layers of model {type(model)}")

    kept_layers = getattr(stack_owner, stack_name)[:n_layers]
    setattr(stack_owner, stack_name, kept_layers)
    


def disable_dropout(model: torch.nn.Module):
    """Set `p = 0.0` on every `torch.nn.Dropout` module inside `model`.

    Only modules that are instances of `torch.nn.Dropout` are affected.
    The number of modified modules is reported on rank 0.
    """
    n_disabled = 0
    for module in model.modules():
        if not isinstance(module, torch.nn.Dropout):
            continue
        module.p = 0.0
        n_disabled += 1
    print0(
        f"Disabled {n_disabled} dropout modules from model type {type(model)}"
    )


def disable_causality(model: torch.nn.Module):
    """Set `is_causal = False` on every sub-module that has an `is_causal` attribute.

    The number of modified modules is reported on rank 0.
    """
    causal_modules = [m for m in model.modules() if hasattr(m, "is_causal")]
    for module in causal_modules:
        module.is_causal = False
    print0(
        f"Set is_causal=False in {len(causal_modules)} modules from model type {type(model)}"
    )

class ContextualModelMixin(nn.Module):
    """Shared logic for models that are conditioned on dataset embeddings.

    The host class must set `self.config` and `self.hidden_size` before
    calling `contextual_init()`.
    """

    @property
    def num_corpus_tokens(self) -> int:
        """Number of dataset tokens in the context: documents x tokens per document."""
        return self.transductive_corpus_size * self.transductive_tokens_per_document

    def contextual_init(self):
        """Create the soft-prompt / output projections and read config options.

        Options are read from `vars(self.config)` with defaults:
        `transductive_corpus_size` (1), `transductive_tokens_per_document` (1)
        and `transductive_sequence_dropout_prob` (0.0). The null embedding
        parameter is only created when the dropout probability is positive.
        """
        hidden = self.hidden_size
        options = vars(self.config)

        self.n_soft_prompt = 8
        # Maps a single vector to `n_soft_prompt` learned prompt vectors.
        self.prompt_projection = torch.nn.Sequential(
            torch.nn.Linear(hidden, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, hidden * self.n_soft_prompt),
        )
        self.transductive_corpus_size = options.get("transductive_corpus_size", 1)
        self.transductive_tokens_per_document = options.get("transductive_tokens_per_document", 1)
        self.randomize_dataset_sequence_order = True
        self.sequence_dropout_prob = options.get("transductive_sequence_dropout_prob", 0.0)
        if self.sequence_dropout_prob > 0.0:
            # Learned stand-in for dataset embeddings that get dropped.
            self.sequence_dropout_null_embedding = torch.nn.Parameter(
                torch.randn(hidden) * 0.01,
                requires_grad=True,
            )
        self.output_projection = torch.nn.Sequential(
            torch.nn.Linear(hidden, hidden),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden, hidden),
        )

    def _prepare_dataset_embeddings(
            self,
            input_ids: torch.Tensor, dataset_embeddings: torch.Tensor,
            null_dataset_embedding: bool = False,
        ) -> torch.Tensor:
        """Build the per-example prefix: dataset embeddings followed by the soft prompt.

        Args:
            input_ids: only its device and first dimension (batch size) are used.
            dataset_embeddings: 2-D or 3-D tensor (or anything accepted by
                `torch.tensor`); a 2-D input gets a leading dimension of 1.
            null_dataset_embedding: replace every dataset embedding with the
                null embedding. Ignored while sequence dropout is active in
                training; requires the null embedding to exist.

        Returns:
            Tensor of shape (batch, corpus_size + n_soft_prompt, hidden_size).
            In training mode (with `randomize_dataset_sequence_order`) the
            dataset part is shuffled independently per example; the soft
            prompt positions stay at the end.
        """
        if not isinstance(dataset_embeddings, torch.Tensor):
            dataset_embeddings = torch.tensor(dataset_embeddings)

        if dataset_embeddings.dim() == 2:
            # Auto-expand for a batch: (b, d) -> (1, b, d)
            dataset_embeddings = dataset_embeddings.unsqueeze(0)
        dataset_embeddings = dataset_embeddings.to(input_ids.device)

        batch_size = input_ids.shape[0]
        tokens_per_document = self.transductive_tokens_per_document
        if tokens_per_document > 1:
            if self.training:
                # Choose N random documents to fill our context window with.
                # This logic is a little confusing but allows us to sample a
                # different batch *per-document*
                assert dataset_embeddings.shape[1] == tokens_per_document
                document_choices = torch.randint(
                    low=0,
                    high=len(dataset_embeddings),
                    size=(batch_size, self.config.transductive_corpus_size),
                    device=dataset_embeddings.device,
                )
                # TODO make this deterministic somehow for evaluation?
                sampled = dataset_embeddings[document_choices]
                dataset_embeddings = sampled.reshape(
                    (batch_size, self.num_corpus_tokens, self.hidden_size)
                )
            else:
                dataset_embeddings = dataset_embeddings.reshape(
                    (1, self.num_corpus_tokens, self.hidden_size)
                )

        token_budget = self.num_corpus_tokens
        if dataset_embeddings.shape[1] > token_budget:
            # If too many dataset embeddings are passed in, just take the
            # first N until we have the proper number.
            dataset_embeddings = dataset_embeddings[:, :token_budget, :]

        leading_dim, corpus_size, _ = dataset_embeddings.shape
        if leading_dim == 1:
            # Auto-expand for a batch.
            dataset_embeddings = dataset_embeddings.expand((batch_size, -1, -1))

        if self.training and self.sequence_dropout_prob > 0.0:
            # Independently swap each dataset token for the null embedding.
            drop_mask = (
                torch.rand((batch_size, corpus_size), device=dataset_embeddings.device)
                < self.sequence_dropout_prob
            )
            null_block = self.sequence_dropout_null_embedding[None, None].expand(
                batch_size, corpus_size, -1
            )
            dataset_embeddings = torch.where(
                drop_mask[..., None], null_block, dataset_embeddings
            )
        elif null_dataset_embedding:
            dataset_embeddings = self.sequence_dropout_null_embedding[None, None].expand(
                batch_size, corpus_size, -1
            )

        # The learned prompt is produced by projecting a constant all-ones vector.
        prompt_seed = torch.ones(
            (1, self.hidden_size),
            device=dataset_embeddings.device,
            dtype=dataset_embeddings.dtype,
        )
        prompt_tokens = self.prompt_projection(prompt_seed).reshape(
            (1, self.n_soft_prompt, self.hidden_size)
        )
        prompt_tokens = prompt_tokens.expand((len(dataset_embeddings), -1, -1))
        soft_prompt = torch.cat((dataset_embeddings, prompt_tokens), dim=1)

        if self.training and self.randomize_dataset_sequence_order:
            device = soft_prompt.device
            # The prompt positions keep their place after the dataset tokens.
            prompt_positions = torch.arange(self.n_soft_prompt, device=device) + corpus_size
            per_example_orders = []
            for _ in range(batch_size):
                shuffled = torch.randperm(corpus_size, device=device)
                per_example_orders.append(torch.cat((shuffled, prompt_positions), dim=0))
            randomized_order = torch.stack(per_example_orders).to(device)
            soft_prompt = soft_prompt.gather(
                1, randomized_order[..., None].expand_as(soft_prompt)
            )

        return soft_prompt

class BiEncoder(transformers.PreTrainedModel):
    """Text encoder: backbone -> pooling over tokens -> MLP projection.

    The backbone is loaded from `config.embedder`. With
    `transductive_tokens_per_document > 1` each input is pooled into that
    many vectors instead of one.
    """
    embedder: transformers.PreTrainedModel

    def __init__(
            self,
            config, #: transformers.PreTrainedConfig,
        ):
        """Build the encoder from `config`.

        Reads `config.embedder`, `config.limit_layers`,
        `config.embedding_output_dim`, `config.logit_scale` and
        `config.disable_dropout`, plus the optional
        `transductive_tokens_per_document` (default 1) and
        `pooling_strategy` (default "mean").
        """
        super().__init__(config=config)
        embedder, _ = load_embedder_and_tokenizer(
            config.embedder,
        )

        if config.limit_layers:
            print0(f"Limiting layers to {config.limit_layers}")
            limit_layers(embedder, config.limit_layers)

        self.embedder = embedder
        self.hidden_size = self.embedder.config.hidden_size
        # Allow pooling to multiple tokens per document
        self.transductive_tokens_per_document = vars(self.config).get("transductive_tokens_per_document", 1)
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_size, self.hidden_size),
            torch.nn.GELU(),
            torch.nn.Linear(self.hidden_size, self.config.embedding_output_dim or self.hidden_size),
        )
        self.temp = config.logit_scale

        if config.disable_dropout:
            disable_dropout(self)
        self.pooling_strategy = vars(config).get("pooling_strategy", "mean")

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            dataset_input_ids: Optional[torch.Tensor] = None,
            dataset_attention_mask: Optional[torch.Tensor] = None,
            token_type_ids = None,
            output_hidden_states: bool = False,
        ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Embed a batch of tokenized texts.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: mask passed to the backbone and used for pooling.
            dataset_input_ids: accepted for interface compatibility; unused.
            dataset_attention_mask: accepted for interface compatibility; unused.
            token_type_ids: accepted for interface compatibility; discarded.
            output_hidden_states: also return the backbone hidden states.

        Returns:
            The MLP-projected pooled embeddings, or, when
            `output_hidden_states` is set, a dict with keys
            `"hidden_states"` and `"pooled"`. With several tokens per
            document the pooled output has one vector per token slot and
            `"hidden_states"` holds the chunked (4-D) hidden states.
        """
        del token_type_ids

        outputs = (
            self.embedder(
                input_ids=input_ids,
                attention_mask=attention_mask,
            ).last_hidden_state
        )

        tokens_per_document = self.transductive_tokens_per_document
        if tokens_per_document > 1:
            batch_size, seq_length, output_dim = outputs.shape

            remainder = seq_length % tokens_per_document
            if remainder != 0:
                # Pad to the nearest multiple so the sequence splits evenly.
                # `new_zeros` keeps dtype and device, so concatenating does
                # not change the dtype of the hidden states or of the mask.
                n_extra_embeds = tokens_per_document - remainder
                outputs = torch.cat(
                    (outputs, outputs.new_zeros((batch_size, n_extra_embeds, output_dim))),
                    dim=1
                )
                attention_mask = torch.cat(
                    (attention_mask, attention_mask.new_zeros((batch_size, n_extra_embeds))),
                    dim=1
                )
                seq_length += n_extra_embeds
                print(f"Added {n_extra_embeds} padding tokens to input_ids and attention_mask")

            # Split every sequence into `tokens_per_document` equal chunks
            # and mean-pool each chunk separately.
            outputs = outputs.reshape(
                (batch_size, tokens_per_document, seq_length // tokens_per_document, output_dim)
            )
            attention_mask = attention_mask.reshape((batch_size, tokens_per_document, -1))
            document_embeddings = mean_pool_3d(outputs, attention_mask)
            document_embeddings = document_embeddings.reshape((batch_size, tokens_per_document, output_dim))
        else:
            if self.pooling_strategy == "mean":
                document_embeddings = mean_pool(outputs, attention_mask)
            else:
                # Max-pool over the sequence. Masked positions are filled
                # with the lowest representable value so they never win.
                masked_positions = ~attention_mask.bool()[..., None]
                document_embeddings = outputs.masked_fill(
                    masked_positions, torch.finfo(outputs.dtype).min
                ).max(dim=1).values
        output = self.mlp(document_embeddings)

        if output_hidden_states:
            return {
                "hidden_states": outputs,
                "pooled": output,
            }
        else:
            return output


class DatasetConditionedAutoregressive(transformers.PreTrainedModel, ContextualModelMixin):
    """Second-stage model: a backbone fed a dataset-conditioned prefix plus the text.

    The prefix from `_prepare_dataset_embeddings` lives in the first-stage
    hidden size; it is flattened, padded and re-chunked into vectors of the
    backbone's hidden size before being prepended to the token embeddings.
    """

    def __init__(
            self,
            config,
            dataset_backbone: transformers.PreTrainedModel,
            first_stage_hidden_size: int,
        ):
        """Wrap `dataset_backbone`; `first_stage_hidden_size` is the width of incoming dataset embeddings."""
        super().__init__(config=config)
        self.backbone = dataset_backbone
        self.backbone_hidden_size = self.backbone.config.hidden_size
        self.hidden_size = first_stage_hidden_size # Input token size
        self.contextual_init()
        disable_causality(self.backbone)

        self.input_ln = torch.nn.LayerNorm(
            self.backbone_hidden_size,
            eps=1e-5
        )

        # Override contextual init: the output projection operates on
        # backbone-sized vectors here.
        self.output_projection = torch.nn.Sequential(
            torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(self.backbone_hidden_size, self.backbone_hidden_size)
        )
        self._shift_rotary_embedding()

    @property
    def num_corpus_tokens(self) -> int:
        """Number of dataset tokens in the context: documents x tokens per document."""
        return self.config.transductive_corpus_size * self.transductive_tokens_per_document

    @property
    def corpus_token_ratio(self) -> float:
        # How many tokens from the first stage make one token in the second
        # stage?
        return self.backbone_hidden_size / self.hidden_size

    def corpus_token_pad_size(self, n_tokens: int) -> int:
        """Return `hidden_size % backbone_hidden_size`.

        NOTE(review): `n_tokens` is not used by this computation - confirm
        whether it should be.
        """
        return self.hidden_size % self.backbone_hidden_size

    def _shift_rotary_embedding(self) -> None:
        """Placeholder: positional-embedding shifting is not implemented here; only warns."""
        # TODO: Can we do this for LLAMA?
        print("Warning: Positional embedding disabling not implemented for LLAMA.")

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            dataset_embeddings: torch.Tensor,
            output_hidden_states: bool = False,
            null_dataset_embedding: bool = False,
        ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Embed `input_ids` conditioned on `dataset_embeddings`.

        Returns:
            The projected pooled embedding, or, when `output_hidden_states`
            is set, a dict with keys `"hidden_states"` (last-layer states of
            the text positions) and `"pooled"`.
        """
        soft_prompt = self._prepare_dataset_embeddings(
            input_ids=input_ids,
            dataset_embeddings=dataset_embeddings,
            null_dataset_embedding=null_dataset_embedding,
        )

        # Reshape for this model: flatten every prefix, pad it with ones and
        # re-chunk it into backbone-sized vectors.
        batch_size = soft_prompt.shape[0]
        num_soft_elements = soft_prompt.shape[1] * soft_prompt.shape[2]
        soft_prompt = soft_prompt.reshape((batch_size, num_soft_elements))
        # Note: when the flattened prefix is already a multiple of the
        # backbone width, this adds one full extra vector of ones.
        num_padding_elements = self.backbone_hidden_size - (num_soft_elements % self.backbone_hidden_size)
        # `new_ones` keeps the prefix dtype and device, so the concatenation
        # does not change the dtype fed to the layer norm.
        padding = soft_prompt.new_ones((batch_size, num_padding_elements))
        soft_prompt = torch.cat((soft_prompt, padding), dim=1)
        soft_prompt = soft_prompt.reshape(
            (batch_size, -1, self.backbone_hidden_size)
        )
        soft_prompt = self.input_ln(soft_prompt)

        # Every prefix position is attended to.
        backbone_attention_mask = torch.ones(
            soft_prompt.shape[0:2],
            dtype=torch.long,
            device=soft_prompt.device,
        )
        token_embeddings = self.backbone.get_input_embeddings()
        inputs_embeds = token_embeddings(input_ids) # (b, s) -> (b, s, d)
        inputs_embeds = torch.cat((soft_prompt, inputs_embeds), dim=1)
        input_attention_mask = torch.cat((backbone_attention_mask, attention_mask), dim=1)

        output = self.backbone(
            inputs_embeds=inputs_embeds,
            attention_mask=input_attention_mask,
            output_hidden_states=True,
        )
        # trim soft prompt
        last_hidden_state = output.hidden_states[-1]
        n_soft_prompt_tokens = soft_prompt.shape[1]

        output_vectors = last_hidden_state[:, n_soft_prompt_tokens:, :]
        output_attention_mask = input_attention_mask[:, n_soft_prompt_tokens:]

        # Pool the text positions according to the configured strategy;
        # anything other than "last_token" / "mean" uses weighted mean pooling.
        pooling_strategy = vars(self.config).get("pooling_strategy")
        if pooling_strategy == "last_token":
            output_pooled = last_token_pool(output_vectors, output_attention_mask)
        elif pooling_strategy == "mean":
            output_pooled = mean_pool(output_vectors, output_attention_mask)
        else:
            output_pooled = mean_pool_weighted(output_vectors, output_attention_mask)

        # TODO: Argparse for pooling strategy.
        output = self.output_projection(output_pooled)

        if output_hidden_states:
            return {
                "hidden_states": output_vectors,
                "pooled": output,
            }
        else:
            return output


class DatasetConditionedBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
    """Encoder that conditions a backbone on dataset embeddings via a soft prompt.

    The soft prompt returned by ``_prepare_dataset_embeddings`` is prepended to
    the backbone's token embeddings; after the backbone runs, the soft-prompt
    positions are dropped and the remaining positions are mean-pooled and
    passed through ``output_projection``.

    NOTE(review): ``contextual_init``, ``_prepare_dataset_embeddings``,
    ``output_projection`` and ``transductive_tokens_per_document`` are not
    defined in this class -- presumably they come from ``ContextualModelMixin``.
    """

    def __init__(
            self,
            config,
            dataset_backbone: transformers.PreTrainedModel,
        ):
        """Store the backbone, record its hidden size, and run contextual setup."""
        super().__init__(config=config)
        self.backbone = dataset_backbone
        self.hidden_size = dataset_backbone.config.hidden_size
        self.contextual_init()
        self._shift_rotary_embedding()

    @property
    def num_corpus_tokens(self) -> int:
        """Corpus size times tokens-per-document, both read from config/self."""
        corpus_size = self.config.transductive_corpus_size
        return corpus_size * self.transductive_tokens_per_document

    def _shift_rotary_embedding(self) -> None:
        """Set ``rotary_start_pos`` on the backbone's rotary modules.

        Only acts when the backbone's ``model_type`` starts with ``"nomic"``
        and the config flag ``disable_transductive_rotary_embedding`` is truthy
        (it defaults to ``True`` when absent from the config's instance dict).
        """
        disable_transductive_rotary_embedding = vars(self.config).get(
            "disable_transductive_rotary_embedding", True
        )
        is_nomic_backbone = self.backbone.config.model_type.startswith("nomic")
        if not (is_nomic_backbone and disable_transductive_rotary_embedding):
            return

        # Positional embeddings should only apply to the *text* portion of
        # the backbone input, so rotary modules are told to start after the
        # corpus tokens.
        self.backbone.config.rotary_start_pos = 0.0
        rotary_start_pos = self.num_corpus_tokens

        rotary_modules = [
            submodule
            for submodule in self.backbone.modules()
            if hasattr(submodule, "rotary_emb_dim")
        ]
        for submodule in rotary_modules:
            submodule.rotary_start_pos = rotary_start_pos

        rotary_disabled = len(rotary_modules)
        print0(f"modified {rotary_disabled} rotary modules – set rotary_start_pos to {rotary_start_pos}")

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            dataset_embeddings: torch.Tensor,
            output_hidden_states: bool = False,
            null_dataset_embedding: bool = False,
        ) -> torch.Tensor:
        """Embed ``input_ids`` conditioned on ``dataset_embeddings``.

        Returns the projected pooled embedding, or -- when
        ``output_hidden_states`` is true -- a dict with keys
        ``"hidden_states"`` (backbone outputs at the text positions) and
        ``"pooled"`` (the projected pooled embedding).
        """
        soft_prompt = self._prepare_dataset_embeddings(
            input_ids=input_ids,
            dataset_embeddings=dataset_embeddings,
            null_dataset_embedding=null_dataset_embedding,
        )
        # Number of leading positions occupied by the soft prompt.
        n_soft_prompt_tokens = soft_prompt.shape[1]

        # Every soft-prompt position is attended to.
        prompt_mask = torch.ones(
            soft_prompt.shape[0:2],
            dtype=torch.long,
            device=soft_prompt.device,
        )

        # assumes backbone.embeddings maps (b, s) ids to (b, s, d) -- TODO confirm
        token_embeds = self.backbone.embeddings(input_ids)
        full_embeds = torch.cat((soft_prompt, token_embeds), dim=1)
        full_mask = torch.cat((prompt_mask, attention_mask), dim=1)

        backbone_output = self.backbone(
            inputs_embeds=full_embeds,
            attention_mask=full_mask,
        )

        # Drop the soft-prompt positions; only text positions are pooled.
        text_states = backbone_output.last_hidden_state[:, n_soft_prompt_tokens:, :]
        text_mask = full_mask[:, n_soft_prompt_tokens:]

        pooled = mean_pool(text_states, text_mask)
        projected = self.output_projection(pooled)

        if not output_hidden_states:
            return projected
        return {
            "hidden_states": text_states,
            "pooled": projected,
        }


class DatasetPrefixBiencoder(transformers.PreTrainedModel, ContextualModelMixin):
    """Biencoder that prefixes each input with a randomly chosen dataset document.

    For every input row, one row of ``dataset_input_ids`` is sampled (with
    replacement) and concatenated in front of the input tokens. The embedder
    attends over the whole sequence, but only the original input positions
    contribute to the pooled output.

    NOTE(review): ``contextual_init`` and ``output_projection`` are not
    defined in this class -- presumably they come from ``ContextualModelMixin``.
    """

    def __init__(
            self,
            config, #: transformers.PreTrainedConfig,
            embedder: transformers.PreTrainedModel,
        ):
        """Store the embedder, record its hidden size, and run contextual setup."""
        super().__init__(config=config)
        self.embedder = embedder
        self.hidden_size = self.embedder.config.hidden_size
        self.contextual_init()

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            dataset_input_ids: torch.Tensor,
            dataset_attention_mask: torch.Tensor,
            output_hidden_states: bool = False,
        ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
        """Embed ``input_ids`` with a random dataset document as prefix.

        Args:
            input_ids: token ids of the inputs to embed.
            attention_mask: attention mask matching ``input_ids``.
            dataset_input_ids: token ids of candidate prefix documents; one
                row is sampled per input row.
            dataset_attention_mask: mask for ``dataset_input_ids``. Only its
                dtype and device are used; the prefix is fully attended to.
            output_hidden_states: when true, return a dict instead of a tensor.

        Returns:
            The projected pooled embedding, or a dict with keys
            ``"hidden_states"`` (embedder outputs with the prefix positions
            removed) and ``"pooled"``.
        """
        # One random dataset row per input row (sampling with replacement).
        R = torch.randint(
            low=0,
            high=len(dataset_input_ids),
            size=(len(input_ids),),
            device=dataset_input_ids.device,
        )

        dataset_input_ids = dataset_input_ids[R]
        input_ids = torch.cat((dataset_input_ids, input_ids), dim=1)

        # The prefix mask must have one row per *input*, i.e. the shape of the
        # selected dataset_input_ids. Building it from the un-indexed
        # dataset_attention_mask breaks the concatenation below whenever the
        # number of dataset documents differs from the batch size.
        # NOTE(review): the prefix is attended to everywhere, padding
        # included -- confirm this is intended rather than using the real mask.
        dataset_attention_mask = dataset_attention_mask.new_ones(dataset_input_ids.shape)
        input_attention_mask = torch.cat((dataset_attention_mask, attention_mask), dim=1)
        # Pooling mask: zero over the prefix so only input tokens are averaged.
        output_attention_mask = torch.cat(
            (torch.zeros_like(dataset_input_ids), attention_mask), dim=1
        )

        output = self.embedder(
            input_ids=input_ids,
            attention_mask=input_attention_mask,
        )

        output_vectors = output.last_hidden_state
        output_pooled = mean_pool(output_vectors, output_attention_mask)
        output = self.output_projection(output_pooled)

        if output_hidden_states:
            # Strip the prefix positions so hidden states align with the inputs.
            S_d = dataset_attention_mask.shape[1]
            output_vectors = output_vectors[:, S_d:, :]
            return {
                "hidden_states": output_vectors,
                "pooled": output,
            }
        else:
            return output


class DatasetTransformer(transformers.PreTrainedModel):
    """Two-stage model: embed dataset documents, then embed inputs given them.

    The first stage is a ``BiEncoder`` that turns dataset documents into
    embeddings. The second stage consumes those embeddings alongside the
    input tokens; it is ``DatasetConditionedAutoregressive`` when the config
    sets ``autoregressive_backbone`` and ``DatasetConditionedBiencoder``
    otherwise.
    """
    config_class = ContextualModelConfig
    embedder: transformers.PreTrainedModel
    dataset_backbone: transformers.PreTrainedModel

    def __init__(
            self,
            config,
        ):
        """Build both stages from ``config``."""
        super().__init__(config=config)

        # Second-stage backbone: falls back to the first-stage embedder name
        # when no dedicated ``dataset_backbone`` is configured.
        backbone_name = vars(config).get("dataset_backbone", config.embedder)
        dataset_backbone, _ = load_embedder_and_tokenizer(backbone_name)

        if config.limit_layers:
            print0(f"Limiting layers to {config.limit_layers}")
            limit_layers(dataset_backbone, config.limit_layers)

        # The first stage gets its own config copy with no output projection
        # dimension and a separately configurable layer limit.
        first_stage_config = copy.deepcopy(config)
        first_stage_config.embedding_output_dim = None
        first_stage_config.limit_layers = vars(self.config).get("limit_layers_first_stage", None)
        self.first_stage_model = BiEncoder(
            config=first_stage_config,
        )

        use_autoregressive = vars(config).get("autoregressive_backbone", False)
        if not use_autoregressive:
            self.second_stage_model = DatasetConditionedBiencoder(
                config=config,
                dataset_backbone=dataset_backbone
            )
        else:
            self.second_stage_model = DatasetConditionedAutoregressive(
                config=config,
                dataset_backbone=dataset_backbone,
                first_stage_hidden_size=self.first_stage_model.hidden_size,
            )

        self.temp = config.logit_scale
        if config.disable_dropout:
            disable_dropout(self)

        # Optionally share one word-embedding matrix between the two stages.
        if vars(self.config).get("transductive_tie_token_embeddings", False):
            second_stage_words = self.second_stage_model.backbone.embeddings.word_embeddings
            first_stage_words = self.first_stage_model.embedder.embeddings.word_embeddings
            second_stage_words.weight = first_stage_words.weight

    def forward(
            self,
            input_ids: torch.Tensor,
            attention_mask: torch.Tensor,
            dataset_input_ids: Optional[torch.Tensor],
            dataset_attention_mask: Optional[torch.Tensor],
            output_hidden_states: bool = False,
        ) -> torch.Tensor:
        """Run the first stage on the dataset inputs, then the second stage.

        ``dataset_input_ids`` / ``dataset_attention_mask`` are passed to the
        first-stage model; its output is passed as ``dataset_embeddings`` to
        the second-stage model together with ``input_ids``,
        ``attention_mask`` and ``output_hidden_states``. The second-stage
        result is returned unchanged.
        """
        corpus_embeddings = self.first_stage_model(
            input_ids=dataset_input_ids,
            attention_mask=dataset_attention_mask
        )
        second_stage_output = self.second_stage_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            dataset_embeddings=corpus_embeddings,
            output_hidden_states=output_hidden_states,
        )
        return second_stage_output



def get_model_class(name: str):
    """Return the model class registered under ``name``.

    Recognised names are ``'transductive'`` (``DatasetTransformer``),
    ``'biencoder'`` (``BiEncoder``) and ``'dataset_prefix_biencoder'``
    (``DatasetPrefixBiencoder``).

    Raises:
        ValueError: if ``name`` is not one of the recognised names.
    """
    # Exact comparison: a substring test (``name in 'transductive'``) would
    # also accept '', 't', 'trans', ... and silently return the wrong class.
    if name == 'transductive':
        return DatasetTransformer
    elif name == 'biencoder':
        return BiEncoder
    elif name == "dataset_prefix_biencoder":
        return DatasetPrefixBiencoder
    else:
        raise ValueError(f'unknown model cls {name}')