Florian committed on
Commit 5b2e6a5 · 1 Parent(s): 48f630f

first commit

Files changed (5)
  1. .gitignore +1 -0
  2. app.py +94 -0
  3. requirements.txt +4 -0
  4. src/BranchyModel.py +469 -0
  5. src/utils.py +57 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ model/*
app.py ADDED
@@ -0,0 +1,94 @@
+ # Save this as app.py and run with `streamlit run app.py`
+ import streamlit as st
+ import torch
+ import pandas as pd
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ from src.utils import generate_next_token, breaking_ties
+ from src.BranchyModel import BranchyModel
+
+ st.title("Multi-Head LLM Demo")
+
+ def add_and_run(token, head):
+     # Append the chosen head and the running mean of all heads used so far to the DataFrame
+     head_list = st.session_state["computation_pd"]["Head"].to_list() + [head]
+     mean = sum(head_list) / len(head_list)
+     st.session_state["computation_pd"] = pd.concat([st.session_state["computation_pd"], pd.DataFrame({"Head": [head], "Mean": [mean], "Base model consumption": [st.session_state['head_number']]})], ignore_index=True)
+
+     st.session_state['current_sentence'] += token
+     _, st.session_state['logits'], _, st.session_state['head_tokens'] = generate_next_token(st.session_state.model, st.session_state.tokenizer, st.session_state['current_sentence'])
+
+ def reset():
+     st.session_state['computation_pd'] = pd.DataFrame(columns=["Head", "Mean", "Base model consumption"])
+     st.session_state['current_sentence'] = "The climate in"
+     _, st.session_state['logits'], _, st.session_state['head_tokens'] = generate_next_token(st.session_state.model, st.session_state.tokenizer, st.session_state['current_sentence'])
+
+ @st.cache_resource
+ def load_model(penalty_alpha):
+     penalty_map = {0.1: "model_20240118-144039.bin",
+                    0.5: "model_20240118-192548.bin",
+                    2: "model_20240118-211943.bin",
+                    5: "model_20240118-231333.bin",
+                    10: "model_20240119-010725.bin",
+                    20: "model_20240119-030115.bin",
+                    0: "model_20240119-135506.bin",
+                    1: "model_20240119-154900.bin",
+                    -20: "model_20240208-072350.bin",
+                    -10: "model_20240208-052958.bin",
+                    -5: "model_20240208-033606.bin",
+                    -2: "model_20240208-014211.bin",
+                    -1: "model_20240207-234817.bin",
+                    -0.5: "model_20240207-215423.bin",
+                    -0.1: "model_20240207-200020.bin"}
+
+     model_str = "susnato/phi-1_5_dev"
+     model = AutoModelForCausalLM.from_pretrained(model_str).to("cuda:1")
+     tokenizer = AutoTokenizer.from_pretrained(model_str)
+
+     branch_locations = list(range(0, 23, 5))
+     model = BranchyModel(branch_locations=branch_locations, model=model).to("cuda:1")
+
+     # Load the head weights matching this penalty_alpha
+     model_path = penalty_map.get(penalty_alpha)
+     if model_path:
+         model.load_state_dict(torch.load(model_path, map_location="cuda:1"))
+     else:
+         print("Invalid penalty_alpha. Using default model weights.")
+
+     return model, tokenizer
+
+
+ if "model" not in st.session_state or "tokenizer" not in st.session_state:
+     print("Loading model...")
+     st.session_state.model, st.session_state.tokenizer = load_model(penalty_alpha=-2)  # Example penalty_alpha
+     st.session_state["head_number"] = len(st.session_state.model.branch_locations) + 1
+     print(f"Head number: {st.session_state['head_number']}")
+ # Session state to store the current sentence
+ if 'current_sentence' not in st.session_state:
+     reset()
+
+ # Create a container to hold the buttons
+ cols = st.columns(len(st.session_state.head_tokens))  # Create a column for each token
+
+ # Iterate through each head token and create a button in a separate column
+ for i, (col, token) in enumerate(zip(cols, st.session_state.head_tokens)):
+     col.button(f"{st.session_state['head_tokens'][i]}",
+                key=f"head_{i}",
+                use_container_width=True,
+                on_click=add_and_run,
+                args=(st.session_state['head_tokens'][i], i))
+
+
+ # Display the current sentence
+ st.markdown(f"{st.session_state['current_sentence']}")
+
+ # Reset button to start over
+ st.button('Reset', on_click=reset)
+
+ if 'computation_pd' in st.session_state and not st.session_state['computation_pd'].empty:
+     st.line_chart(st.session_state['computation_pd'])
+     # Use the last row to compute how much of the base model's compute was saved
+     saved_budget = 100 - ((st.session_state["computation_pd"]["Mean"].iloc[-1] * 100) / st.session_state["computation_pd"]["Base model consumption"].iloc[-1])
+     st.markdown(f"You saved **{saved_budget:.2f}%** of the base model consumption.")
+     #st.write(st.session_state['computation_pd'])
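Note on the budget figure at the end of app.py: each button press records the index of the head that produced the accepted token, and the saved budget compares the running mean of those indices to the total head count (5 branches plus the final lm_head, so head_number is 6 for the configuration in load_model). Below is a minimal, self-contained sketch of that bookkeeping; the chosen head indices are made-up example values, not output of the model.

import pandas as pd

head_number = 6           # 5 branches + the final lm_head, as configured in load_model
chosen_heads = [0, 2, 1]  # hypothetical heads clicked by the user

rows = []
for i, head in enumerate(chosen_heads):
    mean = sum(chosen_heads[: i + 1]) / (i + 1)
    rows.append({"Head": head, "Mean": mean, "Base model consumption": head_number})

df = pd.DataFrame(rows)
saved_budget = 100 - (df["Mean"].iloc[-1] * 100) / df["Base model consumption"].iloc[-1]
print(df)
print(f"Saved {saved_budget:.2f}% of the base model consumption")  # 83.33% for these example choices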
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit==1.31.0
+ torch==2.0.1
+ pandas==2.0.3
+ transformers==4.36.0
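Optional sanity check (not part of the commit): a short Python snippet to confirm the pinned dependencies resolve in the active environment before launching the app.

import importlib.metadata as md

for pkg in ("streamlit", "torch", "pandas", "transformers"):
    # Prints the installed version; compare against requirements.txt
    print(pkg, md.version(pkg))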
src/BranchyModel.py ADDED
@@ -0,0 +1,469 @@
+ from typing import Dict, List, Optional, Tuple
+ from dataclasses import dataclass
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from transformers import PreTrainedModel
+ from transformers.cache_utils import Cache, DynamicCache
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+ from transformers.utils import ModelOutput
+
+
+ @dataclass
+ class CausalBranchyLLMOutputWithPast(ModelOutput):
+     loss: Optional[torch.Tensor] = None
+     lm_loss: Optional[torch.Tensor] = None
+     head_loss: Optional[torch.Tensor] = None
+     logits: torch.Tensor = None
+     head_outputs: Optional[torch.Tensor] = None
+     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+     attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+ class Branch(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+
+     def forward(self, x):
+         x = self.layernorm(x)
+         x = self.lm_head(x)
+         return x
+
+ class BranchyModel(PreTrainedModel):
+     """
+     This class is a wrapper for transformer models with added functionality for branchy networks.
+     It uses BranchyConfig to initialize a model and later will be extended to add branches.
+
+     Args:
+         branch_locations (List[int]): The locations of the branches in the model.
+             Indexing starts at 0: branch 0 is placed after layer 0.
+         model (PreTrainedModel): The underlying transformer model to wrap.
+
+     Returns:
+         A model instance with the given configuration.
+     """
+
+     def __init__(self, branch_locations, model, loss_type="kl_div", penality_weight=None):
+         super().__init__(model.config)
+         # Initialize the base transformer model
+         self.model = model
+         self.branch_locations = branch_locations
+         self.loss_type = loss_type
+         self.penality_weight = penality_weight
+         if self.loss_type == "penalized_cross_entropy":
+             assert self.penality_weight is not None, "penality_weight must be provided for penalized_cross_entropy loss"
+         # Get details on layering inside the model
+         if hasattr(self.model.config, "n_layer") or hasattr(
+             self.model.config, "num_hidden_layers"
+         ):  # If there is no n_layer in the config, there might be ways to get it from the model itself
+             self.num_layers = (
+                 self.model.config.n_layer
+                 if hasattr(self.model.config, "n_layer")
+                 else self.model.config.num_hidden_layers
+             )
+         else:
+             raise ValueError("cannot find n_layer in config")
+         # If no branch locations are specified, branch at every layer
+         if self.branch_locations is None:
+             self.branch_locations = list(range(self.num_layers - 1))
+
+         assert self.num_layers > 0, "The number of layers must be greater than 0"
+         assert (
+             len(self.branch_locations) < self.num_layers
+         ), "The number of branches must be less than the number of layers"
+         assert all(
+             [0 <= i < self.num_layers for i in self.branch_locations]
+         ), "The branch locations must be between 0 and num_layers"
+
+
+         # Make sure the base model is frozen
+         for param in self.model.parameters():
+             param.requires_grad = False
+
+         # Instantiate heads. Default: heads are copies of the lm_head
+         self.model.heads = torch.nn.ModuleList(
+             [
+                 Branch(self.model.config) for _ in range(len(self.branch_locations))
+             ]
+         )
+
+         # Initialize heads
+         for head in self.model.heads:
+             head.apply(self.model._init_weights)
+             # Make them trainable
+             for param in head.parameters():
+                 param.requires_grad = True
+
+         self.post_init()
+
+     # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
+     def prepare_inputs_for_generation(
+         self,
+         input_ids,
+         past_key_values=None,
+         attention_mask=None,
+         inputs_embeds=None,
+         **kwargs,
+     ):
+         if past_key_values is not None:
+             if isinstance(past_key_values, Cache):
+                 cache_length = past_key_values.get_seq_length()
+                 past_length = past_key_values.seen_tokens
+                 max_cache_length = past_key_values.get_max_length()
+             else:
+                 cache_length = past_length = past_key_values[0][0].shape[2]
+                 max_cache_length = None
+
+             # Keep only the unprocessed tokens:
+             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+             #     some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
+             #     input)
+             if (
+                 attention_mask is not None
+                 and attention_mask.shape[1] > input_ids.shape[1]
+             ):
+                 input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+             # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+             #     input_ids based on the past_length.
+             elif past_length < input_ids.shape[1]:
+                 input_ids = input_ids[:, past_length:]
+             # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+             # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+             if (
+                 max_cache_length is not None
+                 and attention_mask is not None
+                 and cache_length + input_ids.shape[1] > max_cache_length
+             ):
+                 attention_mask = attention_mask[:, -max_cache_length:]
+
+         position_ids = kwargs.get("position_ids", None)
+         if attention_mask is not None and position_ids is None:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values:
+                 position_ids = position_ids[:, -input_ids.shape[1] :]
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and past_key_values is None:
+             model_inputs = {"inputs_embeds": inputs_embeds}
+         else:
+             model_inputs = {"input_ids": input_ids}
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "past_key_values": past_key_values,
+                 "use_cache": kwargs.get("use_cache"),
+                 "attention_mask": attention_mask,
+                 "fixed_output_head": kwargs.get("fixed_output_head", None),
+             }
+         )
+         return model_inputs
+
+     def compute_self_supervision_loss(
+         self,
+         aux_logits: torch.Tensor,
+         lm_logits: torch.Tensor,
+         return_per_head: bool = False,
+     ) -> Dict[str, torch.Tensor]:
+         last_aux_logits = aux_logits[..., -1, :]
+         last_lm_logits = lm_logits[..., -1, :]
+
+         repeated_last_lm_logits = last_lm_logits.repeat(
+             last_aux_logits.shape[0], 1, 1, 1
+         )
+         losses = []
+         # A detailed loss per head can be useful for comparing head performance
+         if return_per_head:
+             for head_logit in last_aux_logits:
+                 if self.loss_type == "kl_div":
+                     losses.append(
+                         nn.KLDivLoss(reduction="batchmean")(
+                             F.log_softmax(head_logit, dim=-1),
+                             F.softmax(last_lm_logits, dim=-1),
+                         )
+                     )
+                 elif self.loss_type == "cross_entropy":
+                     losses.append(
+                         nn.CrossEntropyLoss(reduction="mean")(
+                             head_logit, torch.argmax(last_lm_logits, dim=-1)
+                         )
+                     )
+                 elif self.loss_type == "penalized_cross_entropy":
+                     ce_loss = nn.CrossEntropyLoss(reduction="mean")(
+                         head_logit, torch.argmax(last_lm_logits, dim=-1)
+                     )
+                     probas = F.softmax(head_logit, dim=-1)
+                     entropy = torch.mean(-torch.sum(probas * torch.log(probas + 1e-8), dim=-1))
+                     #losses.append(ce_loss - self.penality_weight * (1.0 / (1.0 + entropy)))
+                     losses.append(ce_loss - self.penality_weight * entropy)
+                 else:
+                     raise ValueError(
+                         "The loss type must be kl_div, cross_entropy, or penalized_cross_entropy"
+                     )
+             loss = torch.stack(losses, dim=0).mean(dim=-1)
+         else:
+             # Compute a single aggregated loss between the auxiliary heads and the final LM head
+             if self.loss_type == "kl_div":
+                 loss = nn.KLDivLoss(reduction="batchmean")(
+                     F.log_softmax(last_aux_logits.view(-1, self.config.vocab_size), dim=-1),
+                     F.softmax(
+                         repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
+                     ),
+                 )
+             elif self.loss_type == "cross_entropy":
+                 loss = nn.CrossEntropyLoss(reduction="mean")(
+                     last_aux_logits.view(-1, self.config.vocab_size),
+                     torch.argmax(
+                         repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
+                     ),
+                 )
+             elif self.loss_type == "penalized_cross_entropy":
+                 ce_loss = nn.CrossEntropyLoss(reduction="mean")(
+                     last_aux_logits.view(-1, self.config.vocab_size),
+                     torch.argmax(
+                         repeated_last_lm_logits.view(-1, self.config.vocab_size), dim=-1
+                     ),
+                 )
+                 probas = F.softmax(
+                     last_aux_logits.view(-1, self.config.vocab_size), dim=-1
+                 )
+                 entropy = torch.mean(-torch.sum(probas * torch.log(probas + 1e-8), dim=-1))
+                 loss = ce_loss + self.penality_weight * entropy
+             else:
+                 raise ValueError(
+                     "The loss type must be kl_div, cross_entropy, or penalized_cross_entropy"
+                 )
+         if return_per_head:
+             return {"loss": loss, "aux_loss": torch.stack(losses)}
+         else:
+             return {"loss": loss, "aux_loss": None}
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         self_supervision: Optional[bool] = None,
+         fixed_output_head: Optional[int] = None,
+     ):
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+         if self_supervision:
+             output_hidden_states = True
+             return self.forward_for_training(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 past_key_values=past_key_values,
+                 inputs_embeds=inputs_embeds,
+                 labels=labels,
+                 use_cache=use_cache,
+                 output_attentions=output_attentions,
+                 output_hidden_states=output_hidden_states,
+                 return_dict=return_dict,
+             )
+         else:
+             return self.forward_for_inference(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 past_key_values=past_key_values,
+                 inputs_embeds=inputs_embeds,
+                 use_cache=use_cache,
+                 return_dict=return_dict,
+                 fixed_output_head=fixed_output_head,
+             )
+
+     def forward_for_inference(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         fixed_output_head: Optional[int] = None,
+     ):
+         if fixed_output_head is not None and fixed_output_head != -1 and fixed_output_head not in self.branch_locations:
+             raise ValueError(
+                 "The fixed output head must be one of the branch locations"
+             )
+         # retrieve input_ids and inputs_embeds
+         if input_ids is not None and inputs_embeds is not None:
+             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+         elif input_ids is not None:
+             batch_size, seq_length = input_ids.shape
+         elif inputs_embeds is not None:
+             batch_size, seq_length, _ = inputs_embeds.shape
+         else:
+             raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+         past_key_values_length = 0
+
+         if use_cache:
+             use_legacy_cache = not isinstance(past_key_values, Cache)
+             if use_legacy_cache:
+                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+             past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+         if position_ids is None:
+             device = input_ids.device if input_ids is not None else inputs_embeds.device
+             position_ids = torch.arange(
+                 past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+             )
+             position_ids = position_ids.unsqueeze(0)
+
+         if inputs_embeds is None:
+             inputs_embeds = self.model.model.embed_tokens(input_ids)
+
+         inputs_embeds = self.model.model.embed_dropout(inputs_embeds)
+
+         # Attention mask.
+         if self.model.model._use_flash_attention_2:
+             # 2d mask is passed through the layers
+             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+         else:
+             # 4d mask is passed through the layers
+             attention_mask = _prepare_4d_causal_attention_mask(
+                 attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+             )
+         all_head_logits = []
+         hidden_states = inputs_embeds
+         is_early_exited = False
+         for layer_idx, decoder_layer in enumerate(self.model.model.layers):
+             layer_outputs = decoder_layer(
+                 hidden_states,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 past_key_value=past_key_values,
+                 use_cache=use_cache,
+             )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache = layer_outputs[1]
+
+             if fixed_output_head is not None and layer_idx == fixed_output_head:
+                 # find the position of layer_idx in branch_locations
+                 branch_idx = self.branch_locations.index(layer_idx)
+                 logits = self.model.heads[branch_idx](hidden_states)
+                 is_early_exited = True
+                 break
+             elif fixed_output_head == -1 and layer_idx in self.branch_locations:
+                 # -1 means output all heads
+                 branch_idx = self.branch_locations.index(layer_idx)
+                 logits = self.model.heads[branch_idx](hidden_states)
+                 all_head_logits.append(logits)
+
+         if not is_early_exited:
+             hidden_states = self.model.model.final_layernorm(hidden_states)
+             logits = self.model.lm_head(hidden_states)
+             if fixed_output_head == -1:
+                 all_head_logits.append(logits)
+         # Only stack head outputs when branches were collected (i.e. fixed_output_head == -1)
+         all_head_logits = torch.stack(all_head_logits, dim=0) if all_head_logits else None
+         next_cache = None
+         if use_cache:
+             next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+         if not return_dict:
+             return tuple(v for v in [logits, next_cache] if v is not None)
+
+         return CausalBranchyLLMOutputWithPast(
+             logits=logits,
+             head_outputs=all_head_logits,
+             past_key_values=next_cache,
+         )
+
+     def forward_for_training(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ):
+
+         if not output_hidden_states:
+             raise ValueError("output_hidden_states must be True for BranchyLLM")
+         if labels is not None:
+             raise NotImplementedError("BranchyLLM only supports self-supervision")
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+         if not hasattr(outputs, "hidden_states") or outputs.hidden_states is None:
+             raise ValueError("The model must return hidden states")
+         hidden_states = outputs.hidden_states
+
+
+         heads_logits = []
+         for i, branch in enumerate(self.branch_locations):
+             heads_logits.append(
+                 self.model.heads[i](
+                     hidden_states[branch]
+                 )
+             )
+         lm_logits = self.model.lm_head(hidden_states[-1])
+
+         heads_logits = torch.stack(heads_logits, dim=0).float()
+         lm_logits = lm_logits.float()
+         logits = torch.cat([heads_logits, lm_logits.unsqueeze(0)], dim=0)
+
+         loss = None
+         lm_loss = None
+         aux_loss = None
+
+         losses = self.compute_self_supervision_loss(
+             heads_logits, lm_logits, return_per_head=True
+         )
+         loss = losses["loss"]
+         if losses["aux_loss"] is not None:
+             aux_loss = losses["aux_loss"]
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return ((loss, aux_loss, lm_loss) + output) if loss is not None else output
+
+         return CausalBranchyLLMOutputWithPast(
+             loss=loss,
+             lm_loss=lm_loss,
+             head_loss=aux_loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
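Usage note for src/BranchyModel.py: the wrapper freezes the base model, attaches one Branch (LayerNorm + linear head) per entry in branch_locations, and exposes two paths through forward: fixed_output_head=-1 returns logits from every branch plus the final lm_head, while self_supervision=True computes the head-training loss against the final head. The sketch below is illustrative only; the checkpoint name, branch locations, and prompt are taken from app.py, and device placement (app.py uses "cuda:1") is omitted for brevity.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.BranchyModel import BranchyModel

base = AutoModelForCausalLM.from_pretrained("susnato/phi-1_5_dev")
tokenizer = AutoTokenizer.from_pretrained("susnato/phi-1_5_dev")

# Attach a branch after layers 0, 5, 10, 15 and 20; the base model's weights stay frozen.
model = BranchyModel(branch_locations=[0, 5, 10, 15, 20], model=base)

input_ids = tokenizer("The climate in", return_tensors="pt").input_ids

# Inference: fixed_output_head=-1 collects logits from every branch plus the final lm_head.
with torch.no_grad():
    out = model(input_ids, fixed_output_head=-1)
print(out.head_outputs.shape)  # (num_heads, batch, seq_len, vocab_size)

# Training the heads: self_supervision=True distills the final head into the branches.
loss = model(input_ids, self_supervision=True).loss
loss.backward()

Since the branch heads are the only trainable parameters, loss.backward() only populates gradients for them; the base model acts as a frozen teacher.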
src/utils.py ADDED
@@ -0,0 +1,57 @@
+ import torch
+
+ def generate_next_token(model, tokenizer, input, method='greedy'):
+     """
+     Generate the next token of a sequence using the given model and tokenizer.
+     Specific to multi-branched models.
+     Only the token from the last head is appended to the sequence.
+
+     Args:
+         model (torch.nn.Module): The model to use for generation.
+         tokenizer (transformers.PreTrainedTokenizer): The tokenizer to use for generation.
+         input (str): The input text to generate from.
+         method (str): Decoding strategy: 'greedy', 'sample', 'top_k', or 'top_p'.
+
+     Returns:
+         token (str): The next token in the sequence.
+         logits (torch.Tensor): The logits of the next token, of shape [head_count, vocab_size].
+         new_sequence (str): The new sequence after adding the next token.
+         head_tokens (List[str]): The token proposed by each head.
+     """
+     device = model.device
+     input_ids = tokenizer.encode(input, return_tensors="pt").to(device)
+     model.eval()
+     head_outputs = model(input_ids, fixed_output_head=-1).head_outputs
+     if head_outputs is None:
+         raise ValueError("Model does not have head_outputs")
+     logits = head_outputs[..., -1, :].squeeze(1)  # squeeze the batch dimension (batch size is 1); new shape is (head_count, vocab_size)
+     if method == 'greedy':
+         head_tokens = torch.argmax(logits, dim=-1)
+     elif method == 'sample':
+         head_tokens = torch.multinomial(torch.nn.functional.softmax(logits, dim=-1), num_samples=1)
+     elif method == 'top_k':
+         k = 5
+         top_k = torch.topk(logits, k, dim=-1)
+         top_k_logits, top_k_indices = top_k.values, top_k.indices
+         top_k_probs = torch.nn.functional.softmax(top_k_logits, dim=-1)
+         head_tokens = top_k_indices[torch.arange(top_k_probs.shape[0]), torch.multinomial(top_k_probs, num_samples=1).squeeze()]
+     elif method == 'top_p':
+         # logits is of shape [head_count, vocab_size]
+         p = 0.9
+         probs = torch.nn.functional.softmax(logits, dim=-1)
+         sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
+         cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
+         sorted_indices_to_remove = cumulative_probs > p
+         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+         sorted_indices_to_remove[..., 0] = 0
+         indices_to_remove = sorted_indices[sorted_indices_to_remove]
+         tmp_logits = logits.clone()
+         for i in range(logits.shape[0]):
+             tmp_logits[i, indices_to_remove[i]] = float('-inf')
+         head_tokens = torch.multinomial(torch.nn.functional.softmax(tmp_logits, dim=-1), num_samples=1).squeeze()
+     else:
+         raise ValueError(f"Unknown method: {method}")
+     head_tokens = tokenizer.batch_decode(head_tokens)  # Treat the head dimension as the batch dimension
+     new_sequence = input + head_tokens[-1]
+     return head_tokens[-1], logits, new_sequence, head_tokens
+
+
+ def breaking_ties(tensor):
+     # Confidence margin: difference between the top-1 and top-2 values along the last dimension
+     return torch.sub(torch.topk(tensor, 2, dim=-1).values[..., 0], torch.topk(tensor, 2, dim=-1).values[..., 1]).squeeze()
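breaking_ties returns the margin between the two largest values along the last dimension; it is imported by app.py but not yet called in this commit. One plausible use, sketched below, is an early-exit rule that picks the first head whose top-1/top-2 margin is large enough. The pick_head helper, the threshold, and the random logits are all hypothetical, not part of the committed code.

import torch
from src.utils import breaking_ties

def pick_head(head_logits: torch.Tensor, threshold: float = 2.0) -> int:
    # head_logits: (head_count, vocab_size), as returned by generate_next_token
    margins = breaking_ties(head_logits)            # (head_count,) margin per head
    confident = (margins > threshold).nonzero(as_tuple=True)[0]
    # Fall back to the final head if no branch is confident enough
    return int(confident[0]) if len(confident) > 0 else head_logits.shape[0] - 1

logits = torch.randn(6, 50_000)  # fake logits for 6 heads, for illustration only
print(pick_head(logits))         # index of the earliest "confident" head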