Add application file
- easyeditor/__init__.py +2 -0
- easyeditor/__pycache__/__init__.cpython-39.pyc +0 -0
- easyeditor/models/README.md +6 -0
- easyeditor/models/__init__.py +1 -0
- easyeditor/models/__pycache__/__init__.cpython-39.pyc +0 -0
- easyeditor/models/grace/GRACE.py +218 -0
- easyeditor/models/grace/__init__.py +2 -0
- easyeditor/models/grace/__pycache__/GRACE.cpython-39.pyc +0 -0
- easyeditor/models/grace/__pycache__/__init__.cpython-39.pyc +0 -0
- easyeditor/models/grace/__pycache__/grace_hparams.cpython-39.pyc +0 -0
- easyeditor/models/grace/__pycache__/grace_main.cpython-39.pyc +0 -0
- easyeditor/models/grace/__pycache__/metrics.cpython-39.pyc +0 -0
- easyeditor/models/grace/__pycache__/utils.cpython-39.pyc +0 -0
- easyeditor/models/grace/grace_hparams.py +48 -0
- easyeditor/models/grace/grace_main.py +38 -0
- easyeditor/models/grace/metrics.py +59 -0
- easyeditor/models/grace/utils.py +86 -0
- easyeditor/util/__init__.py +2 -0
- easyeditor/util/__pycache__/__init__.cpython-39.pyc +0 -0
- easyeditor/util/__pycache__/hparams.cpython-39.pyc +0 -0
- easyeditor/util/__pycache__/logit_lens.cpython-39.pyc +0 -0
- easyeditor/util/__pycache__/nethook.cpython-39.pyc +0 -0
- easyeditor/util/alg_dict.py +45 -0
- easyeditor/util/alg_train_dict.py +9 -0
- easyeditor/util/generate.py +171 -0
- easyeditor/util/globals.py +43 -0
- easyeditor/util/hparams.py +46 -0
- easyeditor/util/logit_lens.py +97 -0
- easyeditor/util/nethook.py +451 -0
- easyeditor/util/perplexity.py +24 -0
- easyeditor/util/runningstats.py +1883 -0
- hparams/GRACE/README.md +19 -0
- hparams/GRACE/gpt2-xl.yaml +19 -0
- hparams/config.yaml +6 -0
- utils.py +36 -0
easyeditor/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .models import *
+from .util import *
easyeditor/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (174 Bytes).
easyeditor/models/README.md
ADDED
@@ -0,0 +1,6 @@
+We compare ROME against several open sourced state-of-the-art model editors. All are implemented in their respective folders. Implementations other than FT/FT+L are adapted from third parties.
+- Fine-Tuning (`ft`): Direct fine-tuning.
+- Constrained Fine-Tuning (`ft`): FT with $L_\infty$ norm constraint. Inspired by Zhu et al. [[Paper]](https://arxiv.org/abs/2012.00363)
+- Knowledge Neurons (`kn`): Dai et al. [[Code]](https://github.com/EleutherAI/knowledge-neurons) [[Paper]](https://arxiv.org/abs/2104.08696)
+- Knowledge Editor (`efk`): De Cao et al. [[Code]](https://github.com/eric-mitchell/mend) [[Paper]](https://arxiv.org/abs/2104.08164)
+- Model Editor Networks with Gradient Decomposition (`mend`): Mitchell et al. [[Code]](https://github.com/eric-mitchell/mend) [[Paper]](https://arxiv.org/abs/2110.11309)
easyeditor/models/__init__.py
ADDED
@@ -0,0 +1 @@
+from .grace import *
easyeditor/models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (164 Bytes).
easyeditor/models/grace/GRACE.py
ADDED
@@ -0,0 +1,218 @@
+import torch
+from .utils import parent_module, brackets_to_periods
+import transformers
+import os
+os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
+
+def euc(query, key):
+    # Euclidean distance
+    if len(key.shape) < 2:
+        key = key.view(1, -1)
+    return torch.cdist(key, query, p=2)
+
+def perturb_values(chosen_value, num_pert, device):
+    # Create a bunch of noised versions of the value, then create batch, then train value
+    chosen_value = chosen_value
+    noise = torch.normal(0, 1, chosen_value.shape, device=device)
+    noise[0] = noise[0]*0
+    noise.requires_grad = True
+    chosen_value = chosen_value + noise
+    return chosen_value
+
+class GRACE(torch.nn.Module):
+    def __init__(self, config, model, device):
+        super(GRACE, self).__init__()
+        self.config = config
+        self.log_dict = {}
+        self.model = model
+        # self.tokenizer = model.tokenizer
+        layer = config.inner_params[0]
+        self.device = device
+
+        # --- ensure proper formatting (GRACE edits ~layers~ not weight matrices) ---
+        suffixes = [".weight", ".bias"]
+        self.layer = layer.rsplit(".", 1)[0] if any(layer.endswith(x) for x in suffixes) else layer
+
+        for n, p in self.model.named_parameters():
+            p.requires_grad = False
+
+        if isinstance(self.model, transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel):
+            transpose = False
+        else:
+            transpose = True
+
+        # --- Add GRACE to chosen layers ---
+        edit_module = parent_module(self.model, brackets_to_periods(self.layer))
+        layer_name = self.layer.rsplit(".", 1)[-1]
+        original_layer = getattr(edit_module, layer_name)
+
+        if type(original_layer) is not GRACEAdapter:
+            setattr(edit_module, layer_name, GRACEAdapter(config, original_layer, transpose=transpose).to(self.device))
+
+    def __call__(self, **kwargs):
+        # if self.config.task == "hallucination":
+        #     print(kwargs)
+        #     key_id = (kwargs["labels"] == -100).sum() - 1
+        #     setattr(eval(f"self.model.{self.layer}"), "key_id", key_id)  # Tell GRACE which token to use for its query (default is the last token)
+        return self.model(**kwargs)
+
+    def generate(self, *args, **kwargs):
+        setattr(eval(f"self.model.{self.layer}"), "key_id", -1)
+        return self.model.generate(*args, **kwargs)
+
+    def edit(self, config, tokens):
+        key_id = (tokens["labels"] == -100).sum() - 1
+        setattr(eval(f"self.model.{self.layer}"), "key_id", key_id)
+
+        # --- pass edit label, training mode, and key_id into GRACE ---
+        setattr(eval(f"self.model.{self.layer}"), "training", True)
+        setattr(eval(f"self.model.{self.layer}"), "edit_label", tokens["labels"])
+
+        self.losses = []
+        # --- train GRACE value ---
+        for i in range(config.n_iter):
+            # --- insert iteration into each layer (only initiate keys on iteration 1) ---
+            setattr(eval(f"self.model.{self.layer}"), "iter", i)
+
+            # --- pass tokens through model (including through the GRACE layer) ---
+            outputs = self.model(**tokens)
+            if i == 0:
+                # --- we only need to create an optimizer for the first iteration (but the forward pass instantiates the key, so the optimizer is created after the first inference) ---
+                optimizer = torch.optim.Adam(self.model.parameters(), config.edit_lr)
+            loss = outputs.loss
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            self.losses.append(loss.detach().cpu().numpy())
+
+        self.loss = loss  # Log final loss
+
+        # --- pull out info we want to log from the GRACE layer ---
+        setattr(eval(f"self.model.{self.layer}"), "training", False)
+        chosen_key = getattr(eval(f"self.model.{self.layer}"), "chosen_key")
+        nkeys = len(getattr(eval(f"self.model.{self.layer}"), "keys"))
+
+        self.log_dict["chosen_key"] = chosen_key
+        self.log_dict["nkeys"] = nkeys
+
+class GRACEAdapter(torch.nn.Module):
+    def __init__(self, config, layer, transpose):
+        super(GRACEAdapter, self).__init__()
+
+        self.layer = layer
+        self.weight = self.layer.weight
+        self.init_epsilon = config.eps
+        self.dist_fn = config.dist_fn
+        self.replacement = config.replacement
+        self.device = layer.weight.device
+        self.config = config
+        self.num_pert = config.num_pert
+        self.key_id = -1
+        self.ensure_replace_token_loc = False
+
+        if transpose:
+            self.key_shape = layer.weight.shape[1]
+            self.value_shape = layer.weight.shape[0]
+        else:
+            self.key_shape = layer.weight.shape[0]
+            self.value_shape = layer.weight.shape[1]
+        self.training = False
+
+    def add_key(self, new_key, new_value):
+        keys = torch.vstack([self.keys, new_key.detach()])  # Add new key to list of keys
+
+        values = torch.nn.Parameter(torch.vstack([self.values, new_value]), requires_grad=True)  # Add new value to list of values
+
+        new_epsilon = torch.tensor(self.init_epsilon, device=self.device).view(1)
+        epsilons = torch.vstack([self.epsilons, new_epsilon])  # Add new epsilon to list of epsilons
+
+        key_labels = self.key_labels + [self.edit_label]  # Add new key_label to list of key_labels
+
+        return keys, values, epsilons, key_labels
+
+    def init_key_value(self, query, value):
+        key = query.detach()
+        epsilon = torch.tensor(self.init_epsilon, device=self.device, requires_grad=False).view(1)
+        key_label = [self.edit_label]
+        return key, value, epsilon, key_label
+
+    def label_match(self, edit_label, key_label):
+        return edit_label.float().mean() == key_label.float().mean()
+
+    def split_epsilons_in_half(self, nearest_key, smallest_distance):
+        self.epsilons[nearest_key] = (smallest_distance / 2) - 1e-5  # Cut nearest epsilon in half
+        self.epsilons[-1] = smallest_distance / 2  # Cut new epsilon in half
+
+    def forward(self, *args):
+        # Run layer forward and save what it would have returned for this instance
+        layer_out = self.layer(*args)
+
+        ### If training, we need to modify the codebook
+        if (not self.training) & ('keys' not in self.__dict__):
+            # If it's not training time and we haven't added any keys yet (this is before doing any editing)
+            # print(self.__dict__)
+            return layer_out
+        else:
+            if not self.training and not self.ensure_replace_token_loc and self.key_id == -1:
+                token_to_edit = args[0].shape[1] - 1
+                self.key_id = args[0].shape[1] - 1
+                self.ensure_replace_token_loc = True
+            else:
+                token_to_edit = min(self.key_id, args[0].shape[1] - 1)  # args[0].shape[1] - 1 is sequence length
+            query = args[0][:, token_to_edit, :]  # Just use activation for last token
+            if self.config.val_init == "cold":
+                new_value = torch.nn.Parameter(torch.rand(1, self.value_shape, requires_grad=True, device=self.device))
+            elif self.config.val_init == "warm":
+                new_value = torch.nn.Parameter(layer_out[:, token_to_edit, :].detach(), requires_grad=True)
+
+            if 'keys' not in self.__dict__:
+                # If no keys exist, initialize keys, values, epsilons, and key labels
+                self.keys, self.values, self.epsilons, self.key_labels = self.init_key_value(query, new_value)
+            elif self.iter == 0:
+                # Keys exist, so we have to decide whether or not to update them (the fact that we've made it to this point means there was an error!)
+
+                # --- search through keys for a match for query ---
+                dists = torch.cdist(self.keys, query, p=2).view(-1, len(query))
+                smallest_distance, nearest_key = dists.min(0)
+
+                if smallest_distance > (self.init_epsilon + self.epsilons[nearest_key]):
+                    # If there's no close key, make a new key
+                    self.keys, self.values, self.epsilons, self.key_labels = self.add_key(query, new_value)
+                else:
+                    # If there is a close key, we need to handle conflicts
+                    if not self.label_match(self.edit_label, self.key_labels[nearest_key]):
+                        self.keys, self.values, self.epsilons, self.key_labels = self.add_key(query, new_value)
+                        self.split_epsilons_in_half(nearest_key, smallest_distance)
+                    else:
+                        # If the current label is the SAME as the nearest label, just make the nearest epsilon bigger
+                        if smallest_distance > self.epsilons[nearest_key]:
+                            if self.config.eps_expand == "coverage":
+                                self.epsilons[nearest_key] = smallest_distance  # Replace nearest epsilon with dist between old key and new key
+                            elif self.config.eps_expand == "moving_average":
+                                a = 0.5
+                                self.keys[nearest_key] = a*self.keys[nearest_key] + (1-a)*query  # Move old key to be halfway between
+                                self.epsilons[nearest_key] = smallest_distance
+                                # self.epsilons[nearest_key] = smallest_distance + self.init_epsilon
+            else:
+                # If not iter 0, we don't need to change keys, we just need to learn the value
+                pass
+            # print(token_to_edit)
+            # compute distance from query to all keys and find the closest keys
+            dists = torch.cdist(self.keys, query, p=2).view(-1, len(query))
+            smallest_dist, self.chosen_key = dists.min(0)
+            smallest_dist = smallest_dist.view(-1, 1)
+            chosen_value = self.values[self.chosen_key]
+            eps = self.epsilons[self.chosen_key].view(-1, 1)
+
+            if (self.config.val_train == "adv") and (self.training):
+                chosen_value = perturb_values(chosen_value, self.num_pert, self.device)
+
+            if self.replacement == "replace_all":
+                layer_out = torch.where((smallest_dist <= eps).view(-1, 1, 1), chosen_value.unsqueeze(1).repeat_interleave(layer_out.shape[1], 1), layer_out)
+            elif self.replacement == "replace_last":
+                layer_out[:, token_to_edit] = torch.where((smallest_dist <= eps), chosen_value, layer_out[:, token_to_edit])
+            elif self.replacement == "replace_prompt":
+                layer_out[:, :token_to_edit] = torch.where((smallest_dist <= eps), chosen_value, layer_out[:, :token_to_edit])
+            else:
+                print("token replacement choice not found")
+            return layer_out
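To make the adapter's retrieval step easier to follow, here is a minimal, self-contained sketch of the epsilon-ball lookup that `GRACEAdapter.forward` performs at inference time (the tensor sizes and the epsilon radius below are illustrative assumptions, not values from this commit):

```python
import torch

# Toy codebook: 3 stored keys/values of hidden size 8 (sizes are illustrative).
hidden = 8
keys = torch.randn(3, hidden)
values = torch.randn(3, hidden)
epsilons = torch.tensor([2.0, 2.0, 2.0])  # one deferral radius per key (assumed value)

def lookup(query, layer_out):
    """Mimics the retrieval in GRACEAdapter.forward: substitute the stored value
    only when the query falls inside the nearest key's epsilon ball."""
    dists = torch.cdist(keys, query.view(1, -1), p=2).view(-1)  # distance to every key
    smallest_dist, chosen = dists.min(0)
    if smallest_dist <= epsilons[chosen]:
        return values[chosen]   # inside the ball: return the edited value
    return layer_out            # outside: leave the original activation untouched

query = torch.randn(hidden)
print(lookup(query, layer_out=torch.randn(hidden)).shape)
```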
easyeditor/models/grace/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .grace_main import GraceHyperParams, apply_grace_to_model
+from .metrics import F1, PPL, Accuracy, is_qa_error, is_acc_error
easyeditor/models/grace/__pycache__/GRACE.cpython-39.pyc
ADDED
Binary file (6.34 kB).
easyeditor/models/grace/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (342 Bytes).
easyeditor/models/grace/__pycache__/grace_hparams.cpython-39.pyc
ADDED
Binary file (1.49 kB).
easyeditor/models/grace/__pycache__/grace_main.cpython-39.pyc
ADDED
Binary file (1.12 kB).
easyeditor/models/grace/__pycache__/metrics.cpython-39.pyc
ADDED
Binary file (2.07 kB).
easyeditor/models/grace/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (3.53 kB).
easyeditor/models/grace/grace_hparams.py
ADDED
@@ -0,0 +1,48 @@
+from dataclasses import dataclass
+from typing import List
+from ...util.hparams import HyperParams
+import yaml
+
+
+@dataclass
+class GraceHyperParams(HyperParams):
+    # Experiments
+
+    edit_lr: int
+    n_iter: int
+    # Method
+    eps: float
+    dist_fn: str
+    val_init: str
+    val_train: str
+    val_reg: str
+    reg: str
+    replacement: str
+    eps_expand: str
+    num_pert: str
+    dropout: float
+
+    # Module templates
+    inner_params: List[str]
+    device: int
+    alg_name: str
+    model_name: str
+
+    # Defaults
+    batch_size: int = 128
+    max_length: int = 30
+    model_parallel: bool = False
+
+    @classmethod
+    def from_hparams(cls, hparams_name_or_path: str):
+        if '.yaml' not in hparams_name_or_path:
+            hparams_name_or_path = hparams_name_or_path + '.yaml'
+
+        with open(hparams_name_or_path, "r") as stream:
+            config = yaml.safe_load(stream)
+            config = super().construct_float_from_scientific_notation(config)
+
+        assert (config and config['alg_name'] == 'GRACE') or print(
+            f'GraceHyperParams can not load from {hparams_name_or_path}, '
+            f'alg_name is {config["alg_name"]} ')
+        return cls(**config)
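For a sense of what `GraceHyperParams` expects, here is a hypothetical instantiation with hand-picked values. The shipped settings live in `hparams/GRACE/gpt2-xl.yaml` (added in this commit but not reproduced here), so every value in this sketch is an assumption:

```python
from easyeditor.models.grace.grace_hparams import GraceHyperParams

# All field values below are illustrative guesses, not the committed configuration.
hparams = GraceHyperParams(
    edit_lr=1.0,
    n_iter=50,
    eps=1.0,
    dist_fn="euc",
    val_init="cold",
    val_train="sgd",
    val_reg="None",
    reg="early_stop",
    replacement="replace_last",
    eps_expand="coverage",
    num_pert="8",
    dropout=0.0,
    inner_params=["transformer.h.35.mlp.c_fc.weight"],  # hypothetical edited layer
    device=0,
    alg_name="GRACE",
    model_name="gpt2-xl",
)
print(hparams.replacement)
```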
easyeditor/models/grace/grace_main.py
ADDED
@@ -0,0 +1,38 @@
+from typing import Any, Dict, List, Tuple
+import torch
+from copy import deepcopy
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from .GRACE import GRACE
+from .grace_hparams import GraceHyperParams
+from .utils import tokenize
+from ...util import nethook
+
+
+def apply_grace_to_model(
+        model: AutoModelForCausalLM,
+        tok: AutoTokenizer,
+        requests: List[Dict],
+        hparams: GraceHyperParams,
+        copy=False,
+        return_orig_weights=False,
+        keep_original_weight=False,
+        **kwargs: Any,
+) -> Tuple[AutoModelForCausalLM, Dict[str, Any]]:
+    model.to(f'cuda:{hparams.device}')
+    request = requests
+    if copy:
+        model = deepcopy(model)
+    weights_copy = {}
+    device = torch.device(f'cuda:{hparams.device}')
+    editor = GRACE(model=model, config=hparams, device=device)
+
+    tokens = tokenize(request, tokenizer=tok, device=device)
+    editor.edit(config=hparams, tokens=tokens)
+
+    if not keep_original_weight:
+        weights_copy = {}
+
+    editor.to(f'cuda:{hparams.device}')
+    return editor, weights_copy
+
+
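A minimal sketch of how this entry point might be invoked. The model choice, prompt, and new target are illustrative assumptions, and a CUDA device is assumed because the function moves the model to `cuda:{hparams.device}`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from easyeditor.models.grace import GraceHyperParams, apply_grace_to_model

# Model and request contents below are assumptions for illustration.
model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
tok = AutoTokenizer.from_pretrained("gpt2-xl")
tok.pad_token = tok.eos_token  # tokenize() masks pad positions, so a pad token must exist

hparams = GraceHyperParams.from_hparams("hparams/GRACE/gpt2-xl.yaml")
request = {
    "prompt": "The president of the United States is",
    "target_new": "Joe Biden",
}

# Returns the GRACE-wrapped editor and the (empty) weights_copy dict.
edited_model, weights_copy = apply_grace_to_model(model, tok, request, hparams)
```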
easyeditor/models/grace/metrics.py
ADDED
@@ -0,0 +1,59 @@
+import torch
+import numpy as np
+from .utils import *
+
+def is_acc_error(model, tokens):
+    # Check whether or not the model's prediction for a batch element is correct
+    labels = tokens["labels"]
+    logits = model(**tokens).logits
+    probs = torch.softmax(logits, -1).squeeze()
+    argmaxs = torch.argmax(probs, dim=-1).squeeze()
+    return labels != argmaxs
+
+def Accuracy(model, tokens):
+    labels = tokens["labels"]
+    new_tokens = {f"{k}" : v for k, v in tokens.items() if k != "labels"}
+    logits = model(**new_tokens).logits
+    probs = torch.softmax(logits, -1).squeeze()
+    argmaxs = torch.argmax(probs, dim=-1).squeeze()
+    return (labels == argmaxs).float().mean()
+
+def is_qa_error(model, tokens):
+    preds = model.generate(tokens["input_ids"], max_length=20).squeeze()  # Run model to get its predictions
+    labels = tokens["labels"]  # [tokens["labels"] != -100]
+
+    if (len(preds) != len(labels)) or ((preds == labels).sum() != len(preds)):
+        return True
+    else:
+        return False
+
+def PPL(model, batch):
+    input_ids = batch["input_ids"][:, :1024]  # .to(device)
+    if "labels" not in batch:
+        target_ids = batch["input_ids"][:, :1024].clone()
+    else:
+        target_ids = batch["labels"][:, :1024].clone()
+
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, labels=target_ids)
+        nll = outputs.loss
+
+    ppl = torch.exp(nll)  # .clip(0, 100)
+    return ppl
+
+def F1(model, batch):
+    try:
+        preds = model.generate(batch["input_ids"], max_length=20).squeeze()
+        if len(preds) > 1:
+            preds = preds[preds != model.tokenizer.pad_token_id]
+        gold_toks = batch["labels"][batch["labels"] != -100].cpu().squeeze()  # -100 might be nonsense
+        num_same = len(np.intersect1d(preds.cpu().squeeze(), gold_toks))
+        if (num_same == 0) or (len(preds.squeeze()) == 0):
+            return 0
+        precision = num_same / len(preds.squeeze())
+        recall = 1.0 * num_same / len(gold_toks)
+        f1 = (2 * precision * recall) / (precision + recall)
+        return f1
+    except:
+        # Every once in a while, the model just returns the stop token
+        return 0
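A quick way to sanity-check these metrics is to score a single sentence with `PPL`; the model choice below is an assumption for illustration:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from easyeditor.models.grace.metrics import PPL

model = AutoModelForCausalLM.from_pretrained("gpt2")   # illustrative model choice
tok = AutoTokenizer.from_pretrained("gpt2")

# No "labels" key, so PPL scores the input against itself (next-token loss).
batch = tok("GRACE stores edits in a key-value codebook.", return_tensors="pt")
print(PPL(model, batch).item())
```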
easyeditor/models/grace/utils.py
ADDED
@@ -0,0 +1,86 @@
+import transformers
+import torch
+import os
+import numpy as np
+import datetime
+import struct
+from torch.nn.utils.rnn import pad_sequence
+import torch.nn.functional as F
+
+def get_inner_params(named_parameters, inner_names):
+    param_dict = dict(named_parameters)
+    return [(n, param_dict[n]) for n in inner_names]
+
+def param_subset(named_parameters, inner_names):
+    param_dict = dict(named_parameters)
+    return [param_dict[n] for n in inner_names]
+
+def parent_module(model, pname):
+    components = pname.split('.')
+    parent = model
+
+    for component in components[:-1]:
+        if hasattr(parent, component):
+            parent = getattr(parent, component)
+        elif component.isdigit():
+            parent = parent[int(component)]
+        else:
+            raise RuntimeError(f"Couldn't find child module {component}")
+
+    if not hasattr(parent, components[-1]):
+        raise RuntimeError(f"Couldn't find child module {components[-1]}")
+
+    return parent
+
+def uuid(digits=4):
+    if not hasattr(uuid, "uuid_value"):
+        uuid.uuid_value = struct.unpack('I', os.urandom(4))[0] % int(10**digits)
+
+    return uuid.uuid_value
+
+def ckpt_dir():
+    """returns the directory in which to store model checkpoints"""
+    path = "./ckpts/"
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return path
+
+def brackets_to_periods(name):
+    return name.replace("[", ".").replace("]", "")
+
+def get_params(model):
+    return model.state_dict()
+
+def get_shape(p, model):
+    # We need to flip the shapes since OpenAI gpt2 uses convs instead of linear
+    return p.shape if isinstance(model, transformers.GPT2LMHeadModel) else (p.shape[1], p.shape[0])
+
+def get_logits(x):
+    return x.logits if hasattr(x, "logits") else x
+
+def tokenize(batch, tokenizer, device, test=False):
+    prompt, label = batch["prompt"], batch["target_new"]
+    if not isinstance(prompt, list):
+        prompt = [prompt]
+    if not isinstance(label, list):
+        label = [label]
+    mask_token = -100  # ignore_index of CrossEntropyLoss
+    if test or not label:
+        tokens = tokenizer(list(prompt), return_tensors="pt", padding=True, truncation=True)
+        tokens["labels"] = tokens["input_ids"].clone()
+        tokens["labels"][tokens["input_ids"] == tokenizer.pad_token_id] = mask_token
+
+    else:
+        full_prompt = [f"{p} {l}" for p, l in zip(prompt, label)]
+        prompt_ids = tokenizer(list(prompt), return_tensors="pt", padding=True, truncation=True)["input_ids"]
+        num_prompt_toks = [int((i != tokenizer.pad_token_id).sum()) for i in prompt_ids]
+        tokens = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
+        tokens["labels"] = tokens["input_ids"].clone()
+        for i in range(len(prompt)):
+            tokens["labels"][i][:num_prompt_toks[i]] = mask_token
+
+        tokens["labels"][tokens["input_ids"] == tokenizer.pad_token_id] = mask_token
+
+    tokens = {f"{k1}" : v1.to(device) for k1, v1 in tokens.items()}
+    return tokens
+
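The `parent_module` and `brackets_to_periods` helpers are what `GRACE.__init__` uses to resolve a dotted layer name to its parent container before swapping in the adapter. A small sketch on a toy module (the toy model is an assumption for illustration):

```python
import torch
from easyeditor.models.grace.utils import parent_module, brackets_to_periods

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(2))

toy = Toy()
name = brackets_to_periods("blocks[1]")   # -> "blocks.1"
parent = parent_module(toy, name)         # -> the ModuleList that owns blocks.1
print(type(parent).__name__)              # ModuleList
```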
easyeditor/util/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .logit_lens import LogitLens
+from .hparams import *
easyeditor/util/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (208 Bytes).
easyeditor/util/__pycache__/hparams.cpython-39.pyc
ADDED
Binary file (1.21 kB).
easyeditor/util/__pycache__/logit_lens.cpython-39.pyc
ADDED
Binary file (3.35 kB).
easyeditor/util/__pycache__/nethook.cpython-39.pyc
ADDED
Binary file (13.2 kB).
easyeditor/util/alg_dict.py
ADDED
@@ -0,0 +1,45 @@
+from ..models.rome import ROMEHyperParams, apply_rome_to_model
+from ..models.memit import MEMITHyperParams, apply_memit_to_model
+from ..models.kn import KNHyperParams, apply_kn_to_model
+from ..models.mend import MENDHyperParams, MendRewriteExecutor, MendMultimodalRewriteExecutor
+from ..models.ft import FTHyperParams, apply_ft_to_model
+from ..models.serac import SERACHparams, SeracRewriteExecutor, SeracMultimodalRewriteExecutor
+from ..dataset import ZsreDataset, CounterFactDataset, CaptionDataset, VQADataset
+from ..models.ike import IKEHyperParams, apply_ike_to_model, apply_ike_to_multimodal_model
+from ..models.ft_api import FTApiHyperParams, apply_ft_api_to_model
+from ..models.lora import LoRAHyperParams, apply_lora_to_model
+from ..models.grace import GraceHyperParams, apply_grace_to_model
+from ..models.pmet import PMETHyperParams, apply_pmet_to_model
+from ..models.melo import MELOHyperParams, apply_melo_to_model
+
+ALG_DICT = {
+    'ROME': apply_rome_to_model,
+    'MEMIT': apply_memit_to_model,
+    "FT": apply_ft_to_model,
+    'KN': apply_kn_to_model,
+    'MEND': MendRewriteExecutor().apply_to_model,
+    'SERAC': SeracRewriteExecutor().apply_to_model,
+    'IKE': apply_ike_to_model,
+    'FT-Api': apply_ft_api_to_model,
+    'LoRA': apply_lora_to_model,
+    'GRACE': apply_grace_to_model,
+    'PMET': apply_pmet_to_model,
+    'MELO': apply_melo_to_model
+}
+
+ALG_MULTIMODAL_DICT = {
+    'MEND': MendMultimodalRewriteExecutor().apply_to_model,
+    'SERAC': SeracMultimodalRewriteExecutor().apply_to_model,
+    'SERAC_MULTI': SeracMultimodalRewriteExecutor().apply_to_model,
+    'IKE': apply_ike_to_multimodal_model,
+}
+
+DS_DICT = {
+    "cf": CounterFactDataset,
+    "zsre": ZsreDataset,
+}
+
+MULTIMODAL_DS_DICT = {
+    "caption": CaptionDataset,
+    "vqa": VQADataset,
+}
easyeditor/util/alg_train_dict.py
ADDED
@@ -0,0 +1,9 @@
+from ..trainer import MEND
+from ..trainer import SERAC, SERAC_MULTI
+
+
+ALG_TRAIN_DICT = {
+    'MEND': MEND,
+    'SERAC': SERAC,
+    'SERAC_MULTI': SERAC_MULTI,
+}
easyeditor/util/generate.py
ADDED
@@ -0,0 +1,171 @@
+import unicodedata
+from typing import List, Optional
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from .logit_lens import LogitLens
+
+
+def generate_interactive(
+    model: AutoModelForCausalLM,
+    tok: AutoTokenizer,
+    top_k: int = 5,
+    max_out_len: int = 200,
+    compare_against: Optional[AutoModelForCausalLM] = None,
+    use_logit_lens: bool = False,
+    layer_module_tmp: str = "transformer.h.{}",
+    ln_f_module: str = "transformer.ln_f",
+    lm_head_module: str = "lm_head",
+):
+    """
+    Puts generation in a loop. Allows users to repeatedly provide inputs
+    with which text is generated.
+    """
+
+    if use_logit_lens:
+        llens_gen = LogitLens(
+            model,
+            tok,
+            layer_module_tmp,
+            ln_f_module,
+            lm_head_module,
+            disabled=not use_logit_lens,
+        )
+        if compare_against:
+            llens_vanilla = LogitLens(
+                compare_against,
+                tok,
+                layer_module_tmp,
+                ln_f_module,
+                lm_head_module,
+                disabled=not use_logit_lens,
+            )
+
+    while True:
+        prompt = input("Enter a prompt: ").strip(" \r\t\n")
+
+        print(
+            f"Argument Model: "
+            f"{generate_fast(model, tok, [prompt], n_gen_per_prompt=1, top_k=top_k, max_out_len=max_out_len)}"
+        )
+        if compare_against:
+            print(
+                f"Baseline Model: "
+                f"{generate_fast(compare_against, tok, [prompt], n_gen_per_prompt=1, top_k=top_k, max_out_len=max_out_len)}"
+            )
+
+        if use_logit_lens:
+            inp_prompt = tok([prompt], padding=True, return_tensors="pt").to(
+                next(model.parameters()).device
+            )
+
+            with llens_gen:
+                model(**inp_prompt)
+            print("\n--- Argument Model Logit Lens ---")
+            llens_gen.pprint()
+
+            if compare_against:
+                with llens_vanilla:
+                    compare_against(**inp_prompt)
+                print("--- Baseline Model Logit Lens ---")
+                llens_vanilla.pprint()
+
+        print()
+
+
+def generate_fast(
+    model: AutoModelForCausalLM,
+    tok: AutoTokenizer,
+    prompts: List[str],
+    n_gen_per_prompt: int = 1,
+    top_k: int = 5,
+    max_out_len: int = 200,
+    vanilla_generation=False,
+):
+    """
+    Fast, parallelized auto-regressive text generation with top-k sampling.
+    Our custom implementation.
+    """
+
+    # Unroll prompts and tokenize
+    inp = [prompt for prompt in prompts for _ in range(n_gen_per_prompt)]
+    inp_tok = tok(inp, padding=True, return_tensors="pt").to(
+        next(model.parameters()).device
+    )
+    input_ids, attention_mask = inp_tok["input_ids"], inp_tok["attention_mask"]
+    if vanilla_generation:
+        gen_txt = model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_out_len
+        )
+        txt = [tok.decode(x, skip_special_tokens=True) for x in gen_txt.detach().cpu().numpy().tolist()]
+        txt = [
+            unicodedata.normalize("NFKD", x)
+            .replace("\n\n", " ")
+            .replace("<|endoftext|>", "")
+            for x in txt
+        ]
+        return txt
+    batch_size = input_ids.size(0)
+
+    # Setup storage of fast generation with attention caches.
+    # `cur_context` is used to define the range of inputs that are not yet
+    # stored in `past_key_values`. At each step, we are generating the
+    # next token for the index at `cur_context.stop + 1`.
+    past_key_values, cur_context = None, slice(0, attention_mask.sum(1).min().item())
+
+    with torch.no_grad():
+        while input_ids.size(1) < max_out_len:  # while not exceeding max output length
+            model_out = model(
+                input_ids=input_ids[:, cur_context],
+                # Skip the attention mask only for llama/baichuan models. (The original
+                # expression `'llama'or'baichuan' in ...` was always truthy, which
+                # silently dropped the mask for every model.)
+                attention_mask=None if any(x in model.name_or_path.lower() for x in ("llama", "baichuan")) else attention_mask[:, cur_context],
+                past_key_values=past_key_values,
+                use_cache=True,
+            )
+            logits, past_key_values = model_out.logits, model_out.past_key_values
+            softmax_out = torch.nn.functional.softmax(logits[:, -1, :], dim=1)
+
+            # Top-k sampling
+            tk = torch.topk(softmax_out, top_k, dim=1).indices
+            softmax_out_top_k = torch.gather(softmax_out, 1, tk)
+            softmax_out_top_k = softmax_out_top_k / softmax_out_top_k.sum(1)[:, None]
+            new_tok_indices = torch.multinomial(softmax_out_top_k, 1)
+            new_toks = torch.gather(tk, 1, new_tok_indices)
+
+            # If we're currently generating the continuation for the last token in `input_ids`,
+            # create a new index so we can insert the new token
+            if cur_context.stop == input_ids.size(1):
+                attention_mask = torch.cat(
+                    [attention_mask, attention_mask.new_zeros(batch_size, 1)], dim=1
+                )
+                input_ids = torch.cat(
+                    [
+                        input_ids,
+                        input_ids.new_ones(batch_size, 1) * tok.pad_token_id,
+                    ],
+                    dim=1,
+                )
+
+            last_non_masked = attention_mask.sum(1) - 1
+            for i in range(batch_size):
+                new_idx = last_non_masked[i] + 1
+                if last_non_masked[i].item() + 1 != cur_context.stop:
+                    continue
+
+                # Stop generating if we've already maxed out for this prompt
+                if new_idx < max_out_len:
+                    input_ids[i][new_idx] = new_toks[i]
+                    attention_mask[i][new_idx] = 1
+
+            cur_context = slice(cur_context.stop, cur_context.stop + 1)
+    txt = [tok.decode(x, skip_special_tokens=True) for x in input_ids.detach().cpu().numpy().tolist()]
+    txt = [
+        unicodedata.normalize("NFKD", x)
+        .replace("\n\n", " ")
+        .replace("<|endoftext|>", "")
+        for x in txt
+    ]
+
+    return txt
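A hypothetical call to `generate_fast`, assuming a small GPT-2 checkpoint and a pad token (GPT-2 tokenizers ship without one, and the function fills new positions with `pad_token_id`):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from easyeditor.util.generate import generate_fast

model = AutoModelForCausalLM.from_pretrained("gpt2")   # illustrative model choice
tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token  # needed for padding and for extending sequences

texts = generate_fast(
    model, tok,
    prompts=["The Eiffel Tower is located in"],
    n_gen_per_prompt=2,   # two samples for the one prompt
    top_k=5,
    max_out_len=40,
)
print(texts)
```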
easyeditor/util/globals.py
ADDED
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+import logging
+import os
+
+import yaml
+
+
+def get_handler(path, log_name):
+    log_file_path = os.path.join(path, log_name)
+    try:
+        if not os.path.exists(path):
+            print("We are creating the logger files")
+            os.makedirs(path)
+    except:
+        pass
+    file_handler = logging.FileHandler(log_file_path)
+    file_handler.setLevel(logging.DEBUG)
+    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.DEBUG)
+    stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+    return file_handler, stream_handler
+
+
+# def get_run_dir(dir_name):
+#
+#     alg_dir = RESULTS_DIR / dir_name
+#     if alg_dir.exists():
+#         id_list = [
+#             int(str(x).split("_")[-1])
+#             for x in alg_dir.iterdir()
+#             if str(x).split("_")[-1].isnumeric()
+#         ]
+#         run_id = 0 if not id_list else max(id_list) + 1
+#     else:
+#         run_id = 0
+#     run_dir = RESULTS_DIR / dir_name / f"run_{str(run_id).zfill(3)}"
+#     run_dir.mkdir(parents=True, exist_ok=True)
+#     print(f"Results will be stored at {run_dir}")
+#
+#     return run_dir
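A short sketch of wiring `get_handler` into a standard `logging` logger; the log directory and file name below are arbitrary assumptions:

```python
import logging
from easyeditor.util.globals import get_handler

# "./logs" and "run.log" are example locations, not paths used by this commit.
f_h, s_h = get_handler("./logs", log_name="run.log")

logger = logging.getLogger("easyeditor-demo")
logger.setLevel(logging.DEBUG)
logger.addHandler(f_h)    # writes to ./logs/run.log
logger.addHandler(s_h)    # mirrors messages to the console
logger.info("logging configured")
```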
easyeditor/util/hparams.py
ADDED
@@ -0,0 +1,46 @@
+import json
+from dataclasses import dataclass
+from dataclasses import asdict
+
+
+@dataclass
+class HyperParams:
+    """
+    Simple wrapper to store hyperparameters for Python-based rewriting methods.
+    """
+
+    @classmethod
+    def from_json(cls, fpath):
+        with open(fpath, "r") as f:
+            data = json.load(f)
+
+        return cls(**data)
+
+    def construct_float_from_scientific_notation(config: dict):
+        for key, value in config.items():
+            if isinstance(value, str):
+                try:
+                    # Convert scalar to float if it is in scientific notation format
+                    config[key] = float(value)
+                except:
+                    pass
+        return config
+
+    def to_dict(config) -> dict:
+        dict = asdict(config)
+        return dict
+
+
+    # @classmethod
+    # def from_hparams(cls, hparams_name_or_path: str):
+    #
+    #     if '.yaml' not in hparams_name_or_path:
+    #         hparams_name_or_path = hparams_name_or_path + '.yaml'
+    #     config = compose(hparams_name_or_path)
+    #
+    #     assert config.alg_name in ALG_DICT.keys() or print(f'Editing Alg name {config.alg_name} not supported yet.')
+    #
+    #     params_class, apply_algo = ALG_DICT[config.alg_name]
+    #
+    #     return params_class(**config)
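`construct_float_from_scientific_notation` exists because YAML can leave values like `1e-4` as strings; a quick illustration (the config dict is an assumption for demonstration):

```python
from easyeditor.util.hparams import HyperParams

# Hypothetical raw YAML values: scientific-notation numbers arrive as strings.
config = {"edit_lr": "1e-4", "alg_name": "GRACE", "n_iter": 50}
config = HyperParams.construct_float_from_scientific_notation(config)
print(config["edit_lr"])  # 0.0001 -- converted; non-numeric strings are left untouched
```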
easyeditor/util/logit_lens.py
ADDED
@@ -0,0 +1,97 @@
+from collections import defaultdict
+from typing import Dict, Optional
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from . import nethook
+
+
+class LogitLens:
+    """
+    Applies the LM head at the output of each hidden layer, then analyzes the
+    resultant token probability distribution.
+
+    Only works when hooking outputs of *one* individual generation.
+
+    Inspiration: https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens
+
+    Warning: when running multiple times (e.g. generation), will return
+    outputs _only_ for the last processing step.
+    """
+
+    def __init__(
+        self,
+        model: AutoModelForCausalLM,
+        tok: AutoTokenizer,
+        layer_module_tmp: str,
+        ln_f_module: str,
+        lm_head_module: str,
+        disabled: bool = False,
+    ):
+        self.disabled = disabled
+        self.model, self.tok = model, tok
+        self.n_layers = self.model.config.n_layer
+
+        self.lm_head, self.ln_f = (
+            nethook.get_module(model, lm_head_module),
+            nethook.get_module(model, ln_f_module),
+        )
+
+        self.output: Optional[Dict] = None
+        self.td: Optional[nethook.TraceDict] = None
+        self.trace_layers = [
+            layer_module_tmp.format(layer) for layer in range(self.n_layers)
+        ]
+
+    def __enter__(self):
+        if not self.disabled:
+            self.td = nethook.TraceDict(
+                self.model,
+                self.trace_layers,
+                retain_input=False,
+                retain_output=True,
+            )
+            self.td.__enter__()
+
+    def __exit__(self, *args):
+        if self.disabled:
+            return
+        self.td.__exit__(*args)
+
+        self.output = {layer: [] for layer in range(self.n_layers)}
+
+        with torch.no_grad():
+            for layer, (_, t) in enumerate(self.td.items()):
+                cur_out = t.output[0]
+                assert (
+                    cur_out.size(0) == 1
+                ), "Make sure you're only running LogitLens on single generations only."
+
+                self.output[layer] = torch.softmax(
+                    self.lm_head(self.ln_f(cur_out[:, -1, :])), dim=1
+                )
+
+        return self.output
+
+    def pprint(self, k=5):
+        to_print = defaultdict(list)
+
+        for layer, pred in self.output.items():
+            rets = torch.topk(pred[0], k)
+            for i in range(k):
+                to_print[layer].append(
+                    (
+                        self.tok.decode(rets[1][i]),
+                        round(rets[0][i].item() * 1e2) / 1e2,
+                    )
+                )
+
+        print(
+            "\n".join(
+                [
+                    f"{layer}: {[(el[0], round(el[1] * 1e2)) for el in to_print[layer]]}"
+                    for layer in range(self.n_layers)
+                ]
+            )
+        )
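A minimal sketch of using `LogitLens` on GPT-2 (the model and prompt are illustrative; the module names match the defaults used by `generate_interactive` above):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from easyeditor.util.logit_lens import LogitLens

model = AutoModelForCausalLM.from_pretrained("gpt2")   # illustrative model choice
tok = AutoTokenizer.from_pretrained("gpt2")

lens = LogitLens(
    model, tok,
    layer_module_tmp="transformer.h.{}",
    ln_f_module="transformer.ln_f",
    lm_head_module="lm_head",
)
inp = tok("The capital of France is", return_tensors="pt")
with lens:
    model(**inp)   # hooks every block; __exit__ projects each layer's last hidden state
lens.pprint(k=3)   # top-3 tokens per layer
```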
easyeditor/util/nethook.py
ADDED
@@ -0,0 +1,451 @@
+"""
+Utilities for instrumenting a torch model.
+
+Trace will hook one layer at a time.
+TraceDict will hook multiple layers at once.
+subsequence slices intervals from Sequential modules.
+get_module, replace_module, get_parameter resolve dotted names.
+set_requires_grad recursively sets requires_grad in module parameters.
+"""
+
+import contextlib
+import copy
+import inspect
+from collections import OrderedDict
+
+import torch
+
+
+class Trace(contextlib.AbstractContextManager):
+    """
+    To retain the output of the named layer during the computation of
+    the given network:
+
+        with Trace(net, 'layer.name') as ret:
+            _ = net(inp)
+            representation = ret.output
+
+    A layer module can be passed directly without a layer name, and
+    its output will be retained. By default, a direct reference to
+    the output object is returned, but options can control this:
+
+        clone=True - retains a copy of the output, which can be
+            useful if you want to see the output before it might
+            be modified by the network in-place later.
+        detach=True - retains a detached reference or copy. (By
+            default the value would be left attached to the graph.)
+        retain_grad=True - request gradient to be retained on the
+            output. After backward(), ret.output.grad is populated.
+
+        retain_input=True - also retains the input.
+        retain_output=False - can disable retaining the output.
+        edit_output=fn - calls the function to modify the output
+            of the layer before passing it the rest of the model.
+            fn can optionally accept (output, layer) arguments
+            for the original output and the layer name.
+        stop=True - throws a StopForward exception after the layer
+            is run, which allows running just a portion of a model.
+    """
+
+    def __init__(
+        self,
+        module,
+        layer=None,
+        retain_output=True,
+        retain_input=False,
+        clone=False,
+        detach=False,
+        retain_grad=False,
+        edit_output=None,
+        stop=False,
+    ):
+        """
+        Method to replace a forward method with a closure that
+        intercepts the call, and tracks the hook so that it can be reverted.
+        """
+        retainer = self
+        self.layer = layer
+        if layer is not None:
+            module = get_module(module, layer)
+
+        def retain_hook(m, inputs, output):
+            if retain_input:
+                retainer.input = recursive_copy(
+                    inputs[0] if len(inputs) == 1 else inputs,
+                    clone=clone,
+                    detach=detach,
+                    retain_grad=False,
+                )  # retain_grad applies to output only.
+            if edit_output:
+                output = invoke_with_optional_args(
+                    edit_output, output=output, layer=self.layer
+                )
+            if retain_output:
+                retainer.output = recursive_copy(
+                    output, clone=clone, detach=detach, retain_grad=retain_grad
+                )
+                # When retain_grad is set, also insert a trivial
+                # copy operation. That allows in-place operations
+                # to follow without error.
+                if retain_grad:
+                    output = recursive_copy(retainer.output, clone=True, detach=False)
+            if stop:
+                raise StopForward()
+            return output
+
+        self.registered_hook = module.register_forward_hook(retain_hook)
+        self.stop = stop
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+        if self.stop and issubclass(type, StopForward):
+            return True
+
+    def close(self):
+        self.registered_hook.remove()
+
+
+class TraceDict(OrderedDict, contextlib.AbstractContextManager):
+    """
+    To retain the output of multiple named layers during the computation
+    of the given network:
+
+        with TraceDict(net, ['layer1.name1', 'layer2.name2']) as ret:
+            _ = net(inp)
+            representation = ret['layer1.name1'].output
+
+    If edit_output is provided, it should be a function that takes
+    two arguments: output, and the layer name; and then it returns the
+    modified output.
+
+    Other arguments are the same as Trace. If stop is True, then the
+    execution of the network will be stopped after the last layer
+    listed (even if it would not have been the last to be executed).
+    """
+
+    def __init__(
+        self,
+        module,
+        layers=None,
+        retain_output=True,
+        retain_input=False,
+        clone=False,
+        detach=False,
+        retain_grad=False,
+        edit_output=None,
+        stop=False,
+    ):
+        self.stop = stop
+
+        def flag_last_unseen(it):
+            try:
+                it = iter(it)
+                prev = next(it)
+                seen = set([prev])
+            except StopIteration:
+                return
+            for item in it:
+                if item not in seen:
+                    yield False, prev
+                    seen.add(item)
+                    prev = item
+            yield True, prev
+
+        for is_last, layer in flag_last_unseen(layers):
+            self[layer] = Trace(
+                module=module,
+                layer=layer,
+                retain_output=retain_output,
+                retain_input=retain_input,
+                clone=clone,
+                detach=detach,
+                retain_grad=retain_grad,
+                edit_output=edit_output,
+                stop=stop and is_last,
+            )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        self.close()
+        if self.stop and issubclass(type, StopForward):
+            return True
+
+    def close(self):
+        for layer, trace in reversed(self.items()):
+            trace.close()
+
+
+class StopForward(Exception):
+    """
+    If the only output needed from running a network is the retained
+    submodule then Trace(submodule, stop=True) will stop execution
+    immediately after the retained submodule by raising the StopForward()
+    exception. When Trace is used as context manager, it catches that
+    exception and can be used as follows:
+
+        with Trace(net, layername, stop=True) as tr:
+            net(inp)  # Only runs the network up to layername
+        print(tr.output)
+    """
+
+    pass
+
+
+def recursive_copy(x, clone=None, detach=None, retain_grad=None):
+    """
+    Copies a reference to a tensor, or an object that contains tensors,
+    optionally detaching and cloning the tensor(s). If retain_grad is
+    true, the original tensors are marked to have grads retained.
+    """
+    if not clone and not detach and not retain_grad:
+        return x
+    if isinstance(x, torch.Tensor):
+        if retain_grad:
+            if not x.requires_grad:
+                x.requires_grad = True
+            x.retain_grad()
+        elif detach:
+            x = x.detach()
+        if clone:
+            x = x.clone()
+        return x
+    # Only dicts, lists, and tuples (and subclasses) can be copied.
+    if isinstance(x, dict):
+        return type(x)({k: recursive_copy(v) for k, v in x.items()})
+    elif isinstance(x, (list, tuple)):
+        return type(x)([recursive_copy(v) for v in x])
+    else:
+        assert False, f"Unknown type {type(x)} cannot be broken into tensors."
+
+
+def subsequence(
+    sequential,
+    first_layer=None,
+    last_layer=None,
+    after_layer=None,
+    upto_layer=None,
+    single_layer=None,
+    share_weights=False,
+):
+    """
+    Creates a subsequence of a pytorch Sequential model, copying over
+    modules together with parameters for the subsequence. Only
+    modules from first_layer to last_layer (inclusive) are included,
+    or modules between after_layer and upto_layer (exclusive).
+    Handles descent into dotted layer names as long as all references
+    are within nested Sequential models.
+
+    If share_weights is True, then references the original modules
+    and their parameters without copying them. Otherwise, by default,
+    makes a separate brand-new copy.
+    """
+    assert (single_layer is None) or (
+        first_layer is last_layer is after_layer is upto_layer is None
+    )
+    if single_layer is not None:
+        first_layer = single_layer
+        last_layer = single_layer
+    first, last, after, upto = [
+        None if d is None else d.split(".")
+        for d in [first_layer, last_layer, after_layer, upto_layer]
+    ]
+    return hierarchical_subsequence(
+        sequential,
+        first=first,
+        last=last,
+        after=after,
+        upto=upto,
+        share_weights=share_weights,
+    )
+
+
+def hierarchical_subsequence(
+    sequential, first, last, after, upto, share_weights=False, depth=0
+):
+    """
+    Recursive helper for subsequence() to support descent into dotted
+    layer names. In this helper, first, last, after, and upto are
+    arrays of names resulting from splitting on dots. Can only
+    descend into nested Sequentials.
+    """
+    assert (last is None) or (upto is None)
+    assert (first is None) or (after is None)
+    if first is last is after is upto is None:
+        return sequential if share_weights else copy.deepcopy(sequential)
+    assert isinstance(sequential, torch.nn.Sequential), (
+        ".".join((first or last or after or upto)[:depth] or "arg") + " not Sequential"
+    )
+    including_children = (first is None) and (after is None)
+    included_children = OrderedDict()
+    # A = current level short name of A.
+    # AN = full name for recursive descent if not innermost.
+    (F, FN), (L, LN), (A, AN), (U, UN) = [
+        (d[depth], (None if len(d) == depth + 1 else d))
+        if d is not None
+        else (None, None)
+        for d in [first, last, after, upto]
+    ]
+    for name, layer in sequential._modules.items():
+        if name == F:
+            first = None
+            including_children = True
+        if name == A and AN is not None:  # just like F if not a leaf.
+            after = None
+            including_children = True
+        if name == U and UN is None:
+            upto = None
+            including_children = False
+        if including_children:
+            # AR = full name for recursive descent if name matches.
+            FR, LR, AR, UR = [
+                n if n is None or n[depth] == name else None for n in [FN, LN, AN, UN]
+            ]
+            chosen = hierarchical_subsequence(
+                layer,
+                first=FR,
+                last=LR,
+                after=AR,
+                upto=UR,
+                share_weights=share_weights,
+                depth=depth + 1,
+            )
+            if chosen is not None:
+                included_children[name] = chosen
+        if name == L:
+            last = None
+            including_children = False
+        if name == U and UN is not None:  # just like L if not a leaf.
+            upto = None
+            including_children = False
+        if name == A and AN is None:
+            after = None
+            including_children = True
+    for name in [first, last, after, upto]:
+        if name is not None:
+            raise ValueError("Layer %s not found" % ".".join(name))
+    # Omit empty subsequences except at the outermost level,
+    # where we should not return None.
+    if not len(included_children) and depth > 0:
+        return None
+    result = torch.nn.Sequential(included_children)
+    result.training = sequential.training
+    return result
+
+
+def set_requires_grad(requires_grad, *models):
+    """
+    Sets requires_grad true or false for all parameters within the
+    models passed.
+    """
+    for model in models:
+        if isinstance(model, torch.nn.Module):
+            for param in model.parameters():
+                param.requires_grad = requires_grad
+        elif isinstance(model, (torch.nn.Parameter, torch.Tensor)):
+            model.requires_grad = requires_grad
+        else:
+            assert False, "unknown type %r" % type(model)
+
+
+def get_module(model, name):
+    """
+    Finds the named module within the given model.
+    """
+    for n, m in model.named_modules():
+        if n == name:
+            return m
+    raise LookupError(name)
+
+
+def get_parameter(model, name):
+    """
+    Finds the named parameter within the given model.
+    """
+    for n, p in model.named_parameters():
+        if n == name:
+            return p
+    raise LookupError(name)
+
+
+def replace_module(model, name, new_module):
+    """
+    Replaces the named module within the given model.
+    """
+    if "." in name:
+        parent_name, attr_name = name.rsplit(".", 1)
+        model = get_module(model, parent_name)
+    # original_module = getattr(model, attr_name)
+    setattr(model, attr_name, new_module)
+
+
+def invoke_with_optional_args(fn, *args, **kwargs):
+    """
+    Invokes a function with only the arguments that it
+    is written to accept, giving priority to arguments
+    that match by-name, using the following rules.
+    (1) arguments with matching names are passed by name.
+    (2) remaining non-name-matched args are passed by order.
+    (3) extra caller arguments that the function cannot
+        accept are not passed.
+    (4) extra required function arguments that the caller
+        cannot provide cause a TypeError to be raised.
+    Ordinary python calling conventions are helpful for
+    supporting a function that might be revised to accept
+    extra arguments in a newer version, without requiring the
+    caller to pass those new arguments. This function helps
+    support function callers that might be revised to supply
+    extra arguments, without requiring the callee to accept
+    those new arguments.
+    """
+    argspec = inspect.getfullargspec(fn)
+    pass_args = []
+    used_kw = set()
+    unmatched_pos = []
+    used_pos = 0
+    defaulted_pos = len(argspec.args) - (
+        0 if not argspec.defaults else len(argspec.defaults)
+    )
+    # Pass positional args that match name first, then by position.
+    for i, n in enumerate(argspec.args):
+        if n in kwargs:
+            pass_args.append(kwargs[n])
+            used_kw.add(n)
+        elif used_pos < len(args):
+            pass_args.append(args[used_pos])
+            used_pos += 1
+        else:
+            unmatched_pos.append(len(pass_args))
+            pass_args.append(
+                None if i < defaulted_pos else argspec.defaults[i - defaulted_pos]
+            )
+    # Fill unmatched positional args with unmatched keyword args in order.
+    if len(unmatched_pos):
+        for k, v in kwargs.items():
+            if k in used_kw or k in argspec.kwonlyargs:
+                continue
+            pass_args[unmatched_pos[0]] = v
+            used_kw.add(k)
+            unmatched_pos = unmatched_pos[1:]
+            if len(unmatched_pos) == 0:
+                break
+        else:
+            if unmatched_pos[0] < defaulted_pos:
+                unpassed = ", ".join(
+                    argspec.args[u] for u in unmatched_pos if u < defaulted_pos
+                )
+                raise TypeError(f"{fn.__name__}() cannot be passed {unpassed}.")
+    # Pass remaining kw args if they can be accepted.
+    pass_kw = {
k: v
|
445 |
+
for k, v in kwargs.items()
|
446 |
+
if k not in used_kw and (k in argspec.kwonlyargs or argspec.varargs is not None)
|
447 |
+
}
|
448 |
+
# Pass remaining positional args if they can be accepted.
|
449 |
+
if argspec.varargs is not None:
|
450 |
+
pass_args += list(args[used_pos:])
|
451 |
+
return fn(*pass_args, **pass_kw)
|
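A minimal sketch of how these helpers compose (illustrative only; the toy Sequential, its names, and the hook function below are assumptions, not part of this file):

import torch
from collections import OrderedDict

# Illustrative toy model; any nested torch.nn.Sequential works the same way.
net = torch.nn.Sequential(OrderedDict([
    ("layer1", torch.nn.Sequential(OrderedDict([("fc", torch.nn.Linear(4, 8))]))),
    ("layer2", torch.nn.Linear(8, 4)),
]))
set_requires_grad(False, net)                              # freeze every parameter
fc = get_module(net, "layer1.fc")                          # look up a submodule by dotted name
w = get_parameter(net, "layer1.fc.weight")                 # look up a parameter the same way
replace_module(net, "layer1.fc", torch.nn.Linear(4, 8))    # swap a submodule in place

# invoke_with_optional_args forwards only what the callee accepts (rule 3 above):
def hook(output):                       # written for an older, one-argument interface
    return output + 1

value = invoke_with_optional_args(hook, 10, "layer1.fc")   # extra positional arg is dropped -> 11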
easyeditor/util/perplexity.py
ADDED
@@ -0,0 +1,24 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def perplexity(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    text: str,
    max_input_length: int = None,
):
    """
    Computes perplexity of a piece of text, measured on a reference model.
    Text is truncated to max_input_length tokens.
    """

    inputs = tok(
        [text], return_tensors="pt", max_length=max_input_length, truncation=True
    ).to("cuda")

    logits = torch.nn.functional.log_softmax(model(**inputs).logits, dim=2)
    log_probs = torch.gather(logits[:, :-1, :], 2, inputs["input_ids"][:, 1:, None])[0]

    # Perplexity = exp(-1/N * log P(x_1, ..., x_n))
    return torch.exp(-1 / inputs["input_ids"].size(1) * log_probs.sum()).item()
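A minimal usage sketch (illustrative only; the gpt2 checkpoint is an assumption, and the model must already be on the GPU since the function moves its inputs to "cuda"):

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2").to("cuda")   # any causal LM works the same way
tok = AutoTokenizer.from_pretrained("gpt2")
ppl = perplexity(model, tok, "The Eiffel Tower is located in Paris.", max_input_length=100)
print(f"perplexity: {ppl:.2f}")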
easyeditor/util/runningstats.py
ADDED
@@ -0,0 +1,1883 @@
1 |
+
"""
|
2 |
+
To use a runningstats object,
|
3 |
+
|
4 |
+
1. Create the desired stat object, e.g., `m = Mean()`
|
5 |
+
2. Feed it batches via the add method, e.g., `m.add(batch)`
|
6 |
+
3. Repeat step 2 any number of times.
|
7 |
+
4. Read out the statistic of interest, e.g., `m.mean()`
|
8 |
+
|
9 |
+
Built-in runningstats objects include:
|
10 |
+
|
11 |
+
Mean - produces mean().
|
12 |
+
Variance - mean() and variance() and stdev().
|
13 |
+
Covariance - mean(), covariance(), correlation(), variance(), stdev().
|
14 |
+
SecondMoment - moment() is the non-mean-centered covariance, E[x x^T].
|
15 |
+
Quantile - quantiles(), min(), max(), median(), mean(), variance(), stdev().
|
16 |
+
TopK - topk() returns (values, indexes).
|
17 |
+
Bincount - bincount() histograms nonnegative integer data.
|
18 |
+
IoU - intersection(), union(), iou() tally binary co-occurrences.
|
19 |
+
History - history() returns concatenation of data.
|
20 |
+
CrossCovariance - covariance between two signals, without self-covariance.
|
21 |
+
CrossIoU - iou between two signals, without self-IoU.
|
22 |
+
CombinedStat - aggregates any set of stats.
|
23 |
+
|
24 |
+
Add more running stats by subclassing the Stat class.
|
25 |
+
|
26 |
+
These statistics are vectorized along dim>=1, so stat.add()
|
27 |
+
should supply a two-dimensional input where the zeroth
|
28 |
+
dimension is the batch/sampling dimension and the first
|
29 |
+
dimension is the feature dimension.
|
30 |
+
|
31 |
+
The data type and device used match the data passed to add();
|
32 |
+
for example, for higher-precision covariances, convert to double
|
33 |
+
before calling add().
|
34 |
+
|
35 |
+
It is common to want to compute and remember a statistic sampled
|
36 |
+
over a Dataset, computed in batches, possibly caching the computed
|
37 |
+
statistic in a file. The tally(stat, dataset, cache) handles
|
38 |
+
this pattern. It takes a statistic, a dataset, and a cache filename
|
39 |
+
and sets up a data loader that can be run (or not, if cached) to
|
40 |
+
compute the statistic, adopting the convention that cached stats are
|
41 |
+
saved to and loaded from numpy npz files.
|
42 |
+
"""
|
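For concreteness, a minimal sketch of the four steps above (illustrative only; the batch shapes are assumptions, and Variance is defined later in this file):

import torch

v = Variance()                            # step 1: create the stat
for _ in range(10):                       # steps 2-3: feed batches repeatedly
    batch = torch.randn(64, 512)          # dim 0 = samples, dim 1 = features
    v.add(batch)
mu, sigma = v.mean(), v.stdev()           # step 4: read out per-feature statistics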
43 |
+
|
44 |
+
import math
|
45 |
+
import os
|
46 |
+
import random
|
47 |
+
import struct
|
48 |
+
|
49 |
+
import numpy
|
50 |
+
import torch
|
51 |
+
from torch.utils.data.sampler import Sampler
|
52 |
+
|
53 |
+
|
54 |
+
def tally(stat, dataset, cache=None, quiet=False, **kwargs):
|
55 |
+
"""
|
56 |
+
To use tally, write code like the following.
|
57 |
+
|
58 |
+
stat = Mean()
|
59 |
+
ds = MyDataset()
|
60 |
+
for batch in tally(stat, ds, cache='mymean.npz', batch_size=50):
|
61 |
+
stat.add(batch)
|
62 |
+
mean = stat.mean()
|
63 |
+
|
64 |
+
The first argument should be the Stat being computed. After the
|
65 |
+
loader is exhausted, tally will bring this stat to the cpu and
|
66 |
+
cache it (if a cache is specified).
|
67 |
+
|
68 |
+
The dataset can be a torch Dataset or a plain Tensor, or it can
|
69 |
+
be a callable that returns one of those.
|
70 |
+
|
71 |
+
Details on caching via the cache= argument:
|
72 |
+
|
73 |
+
If the given filename cannot be loaded, tally will leave the
|
74 |
+
statistic object empty and set up a DataLoader object so that
|
75 |
+
the loop can be run. After the last iteration of the loop, the
|
76 |
+
completed statistic will be moved to the cpu device and also
|
77 |
+
saved in the cache file.
|
78 |
+
|
79 |
+
If the cached statistic can be loaded from the given file, tally
|
80 |
+
will not set up the data loader and instead will return a fully
|
81 |
+
loaded statistic object (on the cpu device) and an empty list as
|
82 |
+
the loader.
|
83 |
+
|
84 |
+
The `with cache_load_enabled(False):` context manager can
|
85 |
+
be used to disable loading from the cache.
|
86 |
+
|
87 |
+
If needed, a DataLoader will be created to wrap the dataset:
|
88 |
+
|
89 |
+
Keyword arguments of tally are passed to the DataLoader,
|
90 |
+
so batch_size, num_workers, pin_memory, etc. can be specified.
|
91 |
+
|
92 |
+
Subsampling is supported via sample_size= and random_sample=:
|
93 |
+
|
94 |
+
If sample_size=N is specified, rather than loading the whole
|
95 |
+
dataset, only the first N items are sampled. If additionally
|
96 |
+
random_sample=S is specified, the pseudorandom seed S will be
|
97 |
+
used to select a fixed pseudorandom sample of size N.
|
98 |
+
"""
|
99 |
+
assert isinstance(stat, Stat)
|
100 |
+
args = {}
|
101 |
+
for k in ["sample_size"]:
|
102 |
+
if k in kwargs:
|
103 |
+
args[k] = kwargs[k]
|
104 |
+
cached_state = load_cached_state(cache, args, quiet=quiet)
|
105 |
+
if cached_state is not None:
|
106 |
+
stat.load_state_dict(cached_state)
|
107 |
+
|
108 |
+
def empty_loader():
|
109 |
+
return
|
110 |
+
yield
|
111 |
+
|
112 |
+
return empty_loader()
|
113 |
+
loader = make_loader(dataset, **kwargs)
|
114 |
+
|
115 |
+
def wrapped_loader():
|
116 |
+
yield from loader
|
117 |
+
stat.to_(device="cpu")
|
118 |
+
if cache is not None:
|
119 |
+
save_cached_state(cache, stat, args)
|
120 |
+
|
121 |
+
return wrapped_loader()
|
122 |
+
|
123 |
+
|
124 |
+
class cache_load_enabled:
|
125 |
+
"""
|
126 |
+
When used as a context manager, cache_load_enabled(False) will prevent
|
127 |
+
tally from loading cached statistics, forcing them to be recomputed.
|
128 |
+
"""
|
129 |
+
|
130 |
+
def __init__(self, enabled=True):
|
131 |
+
self.prev = False
|
132 |
+
self.enabled = enabled
|
133 |
+
|
134 |
+
def __enter__(self):
|
135 |
+
global global_load_cache_enabled
|
136 |
+
self.prev = global_load_cache_enabled
|
137 |
+
global_load_cache_enabled = self.enabled
|
138 |
+
|
139 |
+
def __exit__(self, exc_type, exc_value, traceback):
|
140 |
+
global global_load_cache_enabled
|
141 |
+
global_load_cache_enabled = self.prev
|
142 |
+
|
143 |
+
|
144 |
+
class Stat:
|
145 |
+
"""
|
146 |
+
Abstract base class for a running pytorch statistic.
|
147 |
+
"""
|
148 |
+
|
149 |
+
def __init__(self, state):
|
150 |
+
"""
|
151 |
+
By convention, all Stat subclasses can be initialized by passing
|
152 |
+
state=; and then they will initialize by calling load_state_dict.
|
153 |
+
"""
|
154 |
+
self.load_state_dict(resolve_state_dict(state))
|
155 |
+
|
156 |
+
def add(self, x, *args, **kwargs):
|
157 |
+
"""
|
158 |
+
Observes a batch of samples to be incorporated into the statistic.
|
159 |
+
Dimension 0 should be the batch dimension, and dimension 1 should
|
160 |
+
be the feature dimension of the pytorch tensor x.
|
161 |
+
"""
|
162 |
+
pass
|
163 |
+
|
164 |
+
def load_state_dict(self, d):
|
165 |
+
"""
|
166 |
+
Loads this Stat from a dictionary of numpy arrays as saved
|
167 |
+
by state_dict.
|
168 |
+
"""
|
169 |
+
pass
|
170 |
+
|
171 |
+
def state_dict(self):
|
172 |
+
"""
|
173 |
+
Saves this Stat as a dictionary of numpy arrays that can be
|
174 |
+
stored in an npz or reloaded later using load_state_dict.
|
175 |
+
"""
|
176 |
+
return {}
|
177 |
+
|
178 |
+
def save(self, filename):
|
179 |
+
"""
|
180 |
+
Saves this stat as an npz file containing the state_dict.
|
181 |
+
"""
|
182 |
+
save_cached_state(filename, self, {})
|
183 |
+
|
184 |
+
def load(self, filename):
|
185 |
+
"""
|
186 |
+
Loads this stat from an npz file containing a saved state_dict.
|
187 |
+
"""
|
188 |
+
self.load_state_dict(load_cached_state(filename, {}, quiet=True, throw=True))
|
189 |
+
|
190 |
+
def to_(self, device):
|
191 |
+
"""
|
192 |
+
Moves this Stat to the given device.
|
193 |
+
"""
|
194 |
+
pass
|
195 |
+
|
196 |
+
def cpu_(self):
|
197 |
+
"""
|
198 |
+
Moves this Stat to the cpu device.
|
199 |
+
"""
|
200 |
+
self.to_("cpu")
|
201 |
+
|
202 |
+
def cuda_(self):
|
203 |
+
"""
|
204 |
+
Moves this Stat to the default cuda device.
|
205 |
+
"""
|
206 |
+
self.to_("cuda")
|
207 |
+
|
208 |
+
def _normalize_add_shape(self, x, attr="data_shape"):
|
209 |
+
"""
|
210 |
+
Flattens input data to 2d.
|
211 |
+
"""
|
212 |
+
if not torch.is_tensor(x):
|
213 |
+
x = torch.tensor(x)
|
214 |
+
if len(x.shape) < 1:
|
215 |
+
x = x.view(-1)
|
216 |
+
data_shape = getattr(self, attr, None)
|
217 |
+
if data_shape is None:
|
218 |
+
data_shape = x.shape[1:]
|
219 |
+
setattr(self, attr, data_shape)
|
220 |
+
else:
|
221 |
+
assert x.shape[1:] == data_shape
|
222 |
+
return x.view(x.shape[0], int(numpy.prod(data_shape)))
|
223 |
+
|
224 |
+
def _restore_result_shape(self, x, attr="data_shape"):
|
225 |
+
"""
|
226 |
+
Restores output data to input data shape.
|
227 |
+
"""
|
228 |
+
data_shape = getattr(self, attr, None)
|
229 |
+
if data_shape is None:
|
230 |
+
return x
|
231 |
+
return x.view(data_shape * len(x.shape))
|
232 |
+
|
233 |
+
|
234 |
+
class Mean(Stat):
|
235 |
+
"""
|
236 |
+
Running mean.
|
237 |
+
"""
|
238 |
+
|
239 |
+
def __init__(self, state=None):
|
240 |
+
if state is not None:
|
241 |
+
return super().__init__(state)
|
242 |
+
self.count = 0
|
243 |
+
self.batchcount = 0
|
244 |
+
self._mean = None
|
245 |
+
self.data_shape = None
|
246 |
+
|
247 |
+
def add(self, a):
|
248 |
+
a = self._normalize_add_shape(a)
|
249 |
+
if len(a) == 0:
|
250 |
+
return
|
251 |
+
batch_count = a.shape[0]
|
252 |
+
batch_mean = a.sum(0) / batch_count
|
253 |
+
self.batchcount += 1
|
254 |
+
# Initial batch.
|
255 |
+
if self._mean is None:
|
256 |
+
self.count = batch_count
|
257 |
+
self._mean = batch_mean
|
258 |
+
return
|
259 |
+
# Update a batch using Chan-style update for numerical stability.
|
260 |
+
self.count += batch_count
|
261 |
+
new_frac = float(batch_count) / self.count
|
262 |
+
# Update the mean according to the batch deviation from the old mean.
|
263 |
+
delta = batch_mean.sub_(self._mean).mul_(new_frac)
|
264 |
+
self._mean.add_(delta)
|
265 |
+
|
266 |
+
def size(self):
|
267 |
+
return self.count
|
268 |
+
|
269 |
+
def mean(self):
|
270 |
+
return self._restore_result_shape(self._mean)
|
271 |
+
|
272 |
+
def to_(self, device):
|
273 |
+
if self._mean is not None:
|
274 |
+
self._mean = self._mean.to(device)
|
275 |
+
|
276 |
+
def load_state_dict(self, state):
|
277 |
+
self.count = state["count"]
|
278 |
+
self.batchcount = state["batchcount"]
|
279 |
+
self._mean = torch.from_numpy(state["mean"])
|
280 |
+
self.data_shape = (
|
281 |
+
None if state["data_shape"] is None else tuple(state["data_shape"])
|
282 |
+
)
|
283 |
+
|
284 |
+
def state_dict(self):
|
285 |
+
return dict(
|
286 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
287 |
+
count=self.count,
|
288 |
+
data_shape=self.data_shape and tuple(self.data_shape),
|
289 |
+
batchcount=self.batchcount,
|
290 |
+
mean=self._mean.cpu().numpy(),
|
291 |
+
)
|
292 |
+
|
293 |
+
|
294 |
+
class NormMean(Mean):
|
295 |
+
"""
|
296 |
+
Running average of the norm of input vectors
|
297 |
+
"""
|
298 |
+
|
299 |
+
def __init__(self, state=None):
|
300 |
+
super().__init__(state)
|
301 |
+
|
302 |
+
def add(self, a):
|
303 |
+
super().add(a.norm(dim=-1))
|
304 |
+
|
305 |
+
|
306 |
+
class Variance(Stat):
|
307 |
+
"""
|
308 |
+
Running computation of mean and variance. Use this when you just need
|
309 |
+
basic stats without covariance.
|
310 |
+
"""
|
311 |
+
|
312 |
+
def __init__(self, state=None):
|
313 |
+
if state is not None:
|
314 |
+
return super().__init__(state)
|
315 |
+
self.count = 0
|
316 |
+
self.batchcount = 0
|
317 |
+
self._mean = None
|
318 |
+
self.v_cmom2 = None
|
319 |
+
self.data_shape = None
|
320 |
+
|
321 |
+
def add(self, a):
|
322 |
+
a = self._normalize_add_shape(a)
|
323 |
+
if len(a) == 0:
|
324 |
+
return
|
325 |
+
batch_count = a.shape[0]
|
326 |
+
batch_mean = a.sum(0) / batch_count
|
327 |
+
centered = a - batch_mean
|
328 |
+
self.batchcount += 1
|
329 |
+
# Initial batch.
|
330 |
+
if self._mean is None:
|
331 |
+
self.count = batch_count
|
332 |
+
self._mean = batch_mean
|
333 |
+
self.v_cmom2 = centered.pow(2).sum(0)
|
334 |
+
return
|
335 |
+
# Update a batch using Chan-style update for numerical stability.
|
336 |
+
oldcount = self.count
|
337 |
+
self.count += batch_count
|
338 |
+
new_frac = float(batch_count) / self.count
|
339 |
+
# Update the mean according to the batch deviation from the old mean.
|
340 |
+
delta = batch_mean.sub_(self._mean).mul_(new_frac)
|
341 |
+
self._mean.add_(delta)
|
342 |
+
# Update the variance using the batch deviation
|
343 |
+
self.v_cmom2.add_(centered.pow(2).sum(0))
|
344 |
+
self.v_cmom2.add_(delta.pow_(2).mul_(new_frac * oldcount))
|
345 |
+
|
346 |
+
def size(self):
|
347 |
+
return self.count
|
348 |
+
|
349 |
+
def mean(self):
|
350 |
+
return self._restore_result_shape(self._mean)
|
351 |
+
|
352 |
+
def variance(self, unbiased=True):
|
353 |
+
return self._restore_result_shape(
|
354 |
+
self.v_cmom2 / (self.count - (1 if unbiased else 0))
|
355 |
+
)
|
356 |
+
|
357 |
+
def stdev(self, unbiased=True):
|
358 |
+
return self.variance(unbiased=unbiased).sqrt()
|
359 |
+
|
360 |
+
def to_(self, device):
|
361 |
+
if self._mean is not None:
|
362 |
+
self._mean = self._mean.to(device)
|
363 |
+
if self.v_cmom2 is not None:
|
364 |
+
self.v_cmom2 = self.v_cmom2.to(device)
|
365 |
+
|
366 |
+
def load_state_dict(self, state):
|
367 |
+
self.count = state["count"]
|
368 |
+
self.batchcount = state["batchcount"]
|
369 |
+
self._mean = torch.from_numpy(state["mean"])
|
370 |
+
self.v_cmom2 = torch.from_numpy(state["cmom2"])
|
371 |
+
self.data_shape = (
|
372 |
+
None if state["data_shape"] is None else tuple(state["data_shape"])
|
373 |
+
)
|
374 |
+
|
375 |
+
def state_dict(self):
|
376 |
+
return dict(
|
377 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
378 |
+
count=self.count,
|
379 |
+
data_shape=self.data_shape and tuple(self.data_shape),
|
380 |
+
batchcount=self.batchcount,
|
381 |
+
mean=self._mean.cpu().numpy(),
|
382 |
+
cmom2=self.v_cmom2.cpu().numpy(),
|
383 |
+
)
|
384 |
+
|
385 |
+
|
386 |
+
class Covariance(Stat):
|
387 |
+
"""
|
388 |
+
Running computation. Use this when the entire covariance matrix is needed,
|
389 |
+
and when the whole covariance matrix fits in the GPU.
|
390 |
+
|
391 |
+
Chan-style numerically stable update of mean and full covariance matrix.
|
392 |
+
Chan, Golub. LeVeque. 1983. http://www.jstor.org/stable/2683386
|
393 |
+
"""
|
394 |
+
|
395 |
+
def __init__(self, state=None):
|
396 |
+
if state is not None:
|
397 |
+
return super().__init__(state)
|
398 |
+
self.count = 0
|
399 |
+
self._mean = None
|
400 |
+
self.cmom2 = None
|
401 |
+
self.data_shape = None
|
402 |
+
|
403 |
+
def add(self, a):
|
404 |
+
a = self._normalize_add_shape(a)
|
405 |
+
if len(a) == 0:
|
406 |
+
return
|
407 |
+
batch_count = a.shape[0]
|
408 |
+
# Initial batch.
|
409 |
+
if self._mean is None:
|
410 |
+
self.count = batch_count
|
411 |
+
self._mean = a.sum(0) / batch_count
|
412 |
+
centered = a - self._mean
|
413 |
+
self.cmom2 = centered.t().mm(centered)
|
414 |
+
return
|
415 |
+
# Update a batch using Chan-style update for numerical stability.
|
416 |
+
self.count += batch_count
|
417 |
+
# Update the mean according to the batch deviation from the old mean.
|
418 |
+
delta = a - self._mean
|
419 |
+
self._mean.add_(delta.sum(0) / self.count)
|
420 |
+
delta2 = a - self._mean
|
421 |
+
# Update the variance using the batch deviation
|
422 |
+
self.cmom2.addmm_(mat1=delta.t(), mat2=delta2)
|
423 |
+
|
424 |
+
def to_(self, device):
|
425 |
+
if self._mean is not None:
|
426 |
+
self._mean = self._mean.to(device)
|
427 |
+
if self.cmom2 is not None:
|
428 |
+
self.cmom2 = self.cmom2.to(device)
|
429 |
+
|
430 |
+
def mean(self):
|
431 |
+
return self._restore_result_shape(self._mean)
|
432 |
+
|
433 |
+
def covariance(self, unbiased=True):
|
434 |
+
return self._restore_result_shape(
|
435 |
+
self.cmom2 / (self.count - (1 if unbiased else 0))
|
436 |
+
)
|
437 |
+
|
438 |
+
def correlation(self, unbiased=True):
|
439 |
+
cov = self.cmom2 / (self.count - (1 if unbiased else 0))
|
440 |
+
rstdev = cov.diag().sqrt().reciprocal()
|
441 |
+
return self._restore_result_shape(rstdev[:, None] * cov * rstdev[None, :])
|
442 |
+
|
443 |
+
def variance(self, unbiased=True):
|
444 |
+
return self._restore_result_shape(
|
445 |
+
self.cmom2.diag() / (self.count - (1 if unbiased else 0))
|
446 |
+
)
|
447 |
+
|
448 |
+
def stdev(self, unbiased=True):
|
449 |
+
return self.variance(unbiased=unbiased).sqrt()
|
450 |
+
|
451 |
+
def state_dict(self):
|
452 |
+
return dict(
|
453 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
454 |
+
count=self.count,
|
455 |
+
data_shape=self.data_shape and tuple(self.data_shape),
|
456 |
+
mean=self._mean.cpu().numpy(),
|
457 |
+
cmom2=self.cmom2.cpu().numpy(),
|
458 |
+
)
|
459 |
+
|
460 |
+
def load_state_dict(self, state):
|
461 |
+
self.count = state["count"]
|
462 |
+
self._mean = torch.from_numpy(state["mean"])
|
463 |
+
self.cmom2 = torch.from_numpy(state["cmom2"])
|
464 |
+
self.data_shape = (
|
465 |
+
None if state["data_shape"] is None else tuple(state["data_shape"])
|
466 |
+
)
|
467 |
+
|
468 |
+
|
469 |
+
class SecondMoment(Stat):
|
470 |
+
"""
|
471 |
+
Running computation. Use this when the entire non-centered 2nd-moment
|
472 |
+
'covariance-like' matrix is needed, and when the whole matrix fits
|
473 |
+
in the GPU.
|
474 |
+
"""
|
475 |
+
|
476 |
+
def __init__(self, split_batch=True, state=None):
|
477 |
+
if state is not None:
|
478 |
+
return super().__init__(state)
|
479 |
+
self.count = 0
|
480 |
+
self.mom2 = None
|
481 |
+
self.split_batch = split_batch
|
482 |
+
|
483 |
+
def add(self, a):
|
484 |
+
a = self._normalize_add_shape(a)
|
485 |
+
if len(a) == 0:
|
486 |
+
return
|
487 |
+
# Initial batch reveals the shape of the data.
|
488 |
+
if self.count == 0:
|
489 |
+
self.mom2 = a.new(a.shape[1], a.shape[1]).zero_()
|
490 |
+
batch_count = a.shape[0]
|
491 |
+
# Update the covariance using the batch deviation
|
492 |
+
self.count += batch_count
|
493 |
+
self.mom2 += a.t().mm(a)
|
494 |
+
|
495 |
+
def to_(self, device):
|
496 |
+
if self.mom2 is not None:
|
497 |
+
self.mom2 = self.mom2.to(device)
|
498 |
+
|
499 |
+
def moment(self):
|
500 |
+
return self.mom2 / self.count
|
501 |
+
|
502 |
+
def state_dict(self):
|
503 |
+
return dict(
|
504 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
505 |
+
count=self.count,
|
506 |
+
mom2=self.mom2.cpu().numpy(),
|
507 |
+
)
|
508 |
+
|
509 |
+
def load_state_dict(self, state):
|
510 |
+
self.count = int(state["count"])
|
511 |
+
self.mom2 = torch.from_numpy(state["mom2"])
|
512 |
+
|
513 |
+
|
514 |
+
class Bincount(Stat):
|
515 |
+
"""
|
516 |
+
Running bincount. The counted array should be an integer type with
|
517 |
+
non-negative integers.
|
518 |
+
"""
|
519 |
+
|
520 |
+
def __init__(self, state=None):
|
521 |
+
if state is not None:
|
522 |
+
return super().__init__(state)
|
523 |
+
self.count = 0
|
524 |
+
self._bincount = None
|
525 |
+
|
526 |
+
def add(self, a, size=None):
|
527 |
+
a = a.view(-1)
|
528 |
+
bincount = a.bincount()
|
529 |
+
if self._bincount is None:
|
530 |
+
self._bincount = bincount
|
531 |
+
elif len(self._bincount) < len(bincount):
|
532 |
+
bincount[: len(self._bincount)] += self._bincount
|
533 |
+
self._bincount = bincount
|
534 |
+
else:
|
535 |
+
self._bincount[: len(bincount)] += bincount
|
536 |
+
if size is None:
|
537 |
+
self.count += len(a)
|
538 |
+
else:
|
539 |
+
self.count += size
|
540 |
+
|
541 |
+
def to_(self, device):
|
542 |
+
self._bincount = self._bincount.to(device)
|
543 |
+
|
544 |
+
def size(self):
|
545 |
+
return self.count
|
546 |
+
|
547 |
+
def bincount(self):
|
548 |
+
return self._bincount
|
549 |
+
|
550 |
+
def state_dict(self):
|
551 |
+
return dict(
|
552 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
553 |
+
count=self.count,
|
554 |
+
bincount=self._bincount.cpu().numpy(),
|
555 |
+
)
|
556 |
+
|
557 |
+
def load_state_dict(self, dic):
|
558 |
+
self.count = int(dic["count"])
|
559 |
+
self._bincount = torch.from_numpy(dic["bincount"])
|
560 |
+
|
561 |
+
|
562 |
+
class CrossCovariance(Stat):
|
563 |
+
"""
|
564 |
+
Covariance. Use this when an off-diagonal block of the covariance
|
565 |
+
matrix is needed (e.g., when the whole covariance matrix does
|
566 |
+
not fit in the GPU, this could use a quarter of the memory).
|
567 |
+
|
568 |
+
Chan-style numerically stable update of mean and full covariance matrix.
|
569 |
+
Chan, Golub. LeVeque. 1983. http://www.jstor.org/stable/2683386
|
570 |
+
"""
|
571 |
+
|
572 |
+
def __init__(self, split_batch=True, state=None):
|
573 |
+
if state is not None:
|
574 |
+
return super().__init__(state)
|
575 |
+
self.count = 0
|
576 |
+
self._mean = None
|
577 |
+
self.cmom2 = None
|
578 |
+
self.v_cmom2 = None
|
579 |
+
self.split_batch = split_batch
|
580 |
+
|
581 |
+
def add(self, a, b):
|
582 |
+
if len(a.shape) == 1:
|
583 |
+
a = a[None, :]
|
584 |
+
b = b[None, :]
|
585 |
+
assert a.shape[0] == b.shape[0]
|
586 |
+
if len(a.shape) > 2:
|
587 |
+
a, b = [
|
588 |
+
d.view(d.shape[0], d.shape[1], -1)
|
589 |
+
.permute(0, 2, 1)
|
590 |
+
.reshape(-1, d.shape[1])
|
591 |
+
for d in [a, b]
|
592 |
+
]
|
593 |
+
batch_count = a.shape[0]
|
594 |
+
# Initial batch.
|
595 |
+
if self._mean is None:
|
596 |
+
self.count = batch_count
|
597 |
+
self._mean = [d.sum(0) / batch_count for d in [a, b]]
|
598 |
+
centered = [d - bm for d, bm in zip([a, b], self._mean)]
|
599 |
+
self.v_cmom2 = [c.pow(2).sum(0) for c in centered]
|
600 |
+
self.cmom2 = centered[0].t().mm(centered[1])
|
601 |
+
return
|
602 |
+
# Update a batch using Chan-style update for numerical stability.
|
603 |
+
self.count += batch_count
|
604 |
+
# Update the mean according to the batch deviation from the old mean.
|
605 |
+
delta = [(d - bm) for d, bm in zip([a, b], self._mean)]
|
606 |
+
for m, d in zip(self._mean, delta):
|
607 |
+
m.add_(d.sum(0) / self.count)
|
608 |
+
delta2 = [(d - bm) for d, bm in zip([a, b], self._mean)]
|
609 |
+
# Update the cross-covariance using the batch deviation
|
610 |
+
self.cmom2.addmm_(mat1=delta[0].t(), mat2=delta2[1])
|
611 |
+
# Update the variance using the batch deviation
|
612 |
+
for vc2, d, d2 in zip(self.v_cmom2, delta, delta2):
|
613 |
+
vc2.add_((d * d2).sum(0))
|
614 |
+
|
615 |
+
def mean(self):
|
616 |
+
return self._mean
|
617 |
+
|
618 |
+
def variance(self, unbiased=True):
|
619 |
+
return [vc2 / (self.count - (1 if unbiased else 0)) for vc2 in self.v_cmom2]
|
620 |
+
|
621 |
+
def stdev(self, unbiased=True):
|
622 |
+
return [v.sqrt() for v in self.variance(unbiased=unbiased)]
|
623 |
+
|
624 |
+
def covariance(self, unbiased=True):
|
625 |
+
return self.cmom2 / (self.count - (1 if unbiased else 0))
|
626 |
+
|
627 |
+
def correlation(self):
|
628 |
+
covariance = self.covariance(unbiased=False)
|
629 |
+
rstdev = [s.reciprocal() for s in self.stdev(unbiased=False)]
|
630 |
+
cor = rstdev[0][:, None] * covariance * rstdev[1][None, :]
|
631 |
+
# Remove NaNs
|
632 |
+
cor[torch.isnan(cor)] = 0
|
633 |
+
return cor
|
634 |
+
|
635 |
+
def to_(self, device):
|
636 |
+
self._mean = [m.to(device) for m in self._mean]
|
637 |
+
self.v_cmom2 = [vcs.to(device) for vcs in self.v_cmom2]
|
638 |
+
self.cmom2 = self.cmom2.to(device)
|
639 |
+
|
640 |
+
def state_dict(self):
|
641 |
+
return dict(
|
642 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
643 |
+
count=self.count,
|
644 |
+
mean_a=self._mean[0].cpu().numpy(),
|
645 |
+
mean_b=self._mean[1].cpu().numpy(),
|
646 |
+
cmom2_a=self.v_cmom2[0].cpu().numpy(),
|
647 |
+
cmom2_b=self.v_cmom2[1].cpu().numpy(),
|
648 |
+
cmom2=self.cmom2.cpu().numpy(),
|
649 |
+
)
|
650 |
+
|
651 |
+
def load_state_dict(self, state):
|
652 |
+
self.count = int(state["count"])
|
653 |
+
self._mean = [torch.from_numpy(state[f"mean_{k}"]) for k in "ab"]
|
654 |
+
self.v_cmom2 = [torch.from_numpy(state[f"cmom2_{k}"]) for k in "ab"]
|
655 |
+
self.cmom2 = torch.from_numpy(state["cmom2"])
|
656 |
+
|
657 |
+
|
658 |
+
def _float_from_bool(a):
|
659 |
+
"""
|
660 |
+
Since pytorch only supports matrix multiplication on float,
|
661 |
+
IoU computations are done using floating point types.
|
662 |
+
|
663 |
+
This function binarizes the input (positive to True and
|
664 |
+
nonpositive to False), and converts from bool to float.
|
665 |
+
If the data is already a floating-point type, it
|
666 |
+
keeps the same type; otherwise it uses float.
|
667 |
+
"""
|
668 |
+
if a.dtype == torch.bool:
|
669 |
+
return a.float()
|
670 |
+
if a.dtype.is_floating_point:
|
671 |
+
return a.sign().clamp_(0)
|
672 |
+
return (a > 0).float()
|
673 |
+
|
674 |
+
|
675 |
+
class IoU(Stat):
|
676 |
+
"""
|
677 |
+
Running computation of intersections and unions of all features.
|
678 |
+
"""
|
679 |
+
|
680 |
+
def __init__(self, state=None):
|
681 |
+
if state is not None:
|
682 |
+
return super().__init__(state)
|
683 |
+
self.count = 0
|
684 |
+
self._intersection = None
|
685 |
+
|
686 |
+
def add(self, a):
|
687 |
+
assert len(a.shape) == 2
|
688 |
+
a = _float_from_bool(a)
|
689 |
+
if self._intersection is None:
|
690 |
+
self._intersection = torch.mm(a.t(), a)
|
691 |
+
else:
|
692 |
+
self._intersection.addmm_(a.t(), a)
|
693 |
+
self.count += len(a)
|
694 |
+
|
695 |
+
def size(self):
|
696 |
+
return self.count
|
697 |
+
|
698 |
+
def intersection(self):
|
699 |
+
return self._intersection
|
700 |
+
|
701 |
+
def union(self):
|
702 |
+
total = self._intersection.diagonal(0)
|
703 |
+
return total[:, None] + total[None, :] - self._intersection
|
704 |
+
|
705 |
+
def iou(self):
|
706 |
+
return self.intersection() / (self.union() + 1e-20)
|
707 |
+
|
708 |
+
def to_(self, _device):
|
709 |
+
self._intersection = self._intersection.to(_device)
|
710 |
+
|
711 |
+
def state_dict(self):
|
712 |
+
return dict(
|
713 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
714 |
+
count=self.count,
|
715 |
+
intersection=self._intersection.cpu().numpy(),
|
716 |
+
)
|
717 |
+
|
718 |
+
def load_state_dict(self, state):
|
719 |
+
self.count = int(state["count"])
|
720 |
+
self._intersection = torch.tensor(state["intersection"])
|
721 |
+
|
722 |
+
|
723 |
+
class CrossIoU(Stat):
|
724 |
+
"""
|
725 |
+
Running computation of intersections and unions of two binary vectors.
|
726 |
+
"""
|
727 |
+
|
728 |
+
def __init__(self, state=None):
|
729 |
+
if state is not None:
|
730 |
+
return super().__init__(state)
|
731 |
+
self.count = 0
|
732 |
+
self._intersection = None
|
733 |
+
self.total_a = None
|
734 |
+
self.total_b = None
|
735 |
+
|
736 |
+
def add(self, a, b):
|
737 |
+
assert len(a.shape) == 2 and len(b.shape) == 2
|
738 |
+
assert len(a) == len(b), f"{len(a)} vs {len(b)}"
|
739 |
+
a = _float_from_bool(a) # CUDA only supports mm on float...
|
740 |
+
b = _float_from_bool(b) # otherwise we would use integers.
|
741 |
+
intersection = torch.mm(a.t(), b)
|
742 |
+
asum = a.sum(0)
|
743 |
+
bsum = b.sum(0)
|
744 |
+
if self._intersection is None:
|
745 |
+
self._intersection = intersection
|
746 |
+
self.total_a = asum
|
747 |
+
self.total_b = bsum
|
748 |
+
else:
|
749 |
+
self._intersection += intersection
|
750 |
+
self.total_a += asum
|
751 |
+
self.total_b += bsum
|
752 |
+
self.count += len(a)
|
753 |
+
|
754 |
+
def size(self):
|
755 |
+
return self.count
|
756 |
+
|
757 |
+
def intersection(self):
|
758 |
+
return self._intersection
|
759 |
+
|
760 |
+
def union(self):
|
761 |
+
return self.total_a[:, None] + self.total_b[None, :] - self._intersection
|
762 |
+
|
763 |
+
def iou(self):
|
764 |
+
return self.intersection() / (self.union() + 1e-20)
|
765 |
+
|
766 |
+
def to_(self, _device):
|
767 |
+
self.total_a = self.total_a.to(_device)
|
768 |
+
self.total_b = self.total_b.to(_device)
|
769 |
+
self._intersection = self._intersection.to(_device)
|
770 |
+
|
771 |
+
def state_dict(self):
|
772 |
+
return dict(
|
773 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
774 |
+
count=self.count,
|
775 |
+
total_a=self.total_a.cpu().numpy(),
|
776 |
+
total_b=self.total_b.cpu().numpy(),
|
777 |
+
intersection=self._intersection.cpu().numpy(),
|
778 |
+
)
|
779 |
+
|
780 |
+
def load_state_dict(self, state):
|
781 |
+
self.count = int(state["count"])
|
782 |
+
self.total_a = torch.tensor(state["total_a"])
|
783 |
+
self.total_b = torch.tensor(state["total_b"])
|
784 |
+
self._intersection = torch.tensor(state["intersection"])
|
785 |
+
|
786 |
+
|
787 |
+
class Quantile(Stat):
|
788 |
+
"""
|
789 |
+
Streaming randomized quantile computation for torch.
|
790 |
+
|
791 |
+
Add any amount of data repeatedly via add(data). At any time,
|
792 |
+
quantile estimates can be read out using quantiles(q).
|
793 |
+
|
794 |
+
Implemented as a sorted sample that retains at least r samples
|
795 |
+
(by default r = 3072); the number of retained samples will grow to
|
796 |
+
a finite ceiling as the data is accumulated. Accuracy scales according
|
797 |
+
to r: the default is to set resolution to be accurate to better than about
|
798 |
+
0.1%, while limiting storage to about 50,000 samples.
|
799 |
+
|
800 |
+
Good for computing quantiles of huge data without using much memory.
|
801 |
+
Works well on arbitrary data with probability near 1.
|
802 |
+
|
803 |
+
Based on the optimal KLL quantile algorithm by Karnin, Lang, and Liberty
|
804 |
+
from FOCS 2016. http://ieee-focs.org/FOCS-2016-Papers/3933a071.pdf
|
805 |
+
"""
|
806 |
+
|
807 |
+
def __init__(self, r=3 * 1024, buffersize=None, seed=None, state=None):
|
808 |
+
if state is not None:
|
809 |
+
return super().__init__(state)
|
810 |
+
self.depth = None
|
811 |
+
self.dtype = None
|
812 |
+
self.device = None
|
813 |
+
resolution = r * 2 # sample array is at least half full before discard
|
814 |
+
self.resolution = resolution
|
815 |
+
# Default buffersize: 128 samples (and smaller than resolution).
|
816 |
+
if buffersize is None:
|
817 |
+
buffersize = min(128, (resolution + 7) // 8)
|
818 |
+
self.buffersize = buffersize
|
819 |
+
self.samplerate = 1.0
|
820 |
+
self.data = None
|
821 |
+
self.firstfree = [0]
|
822 |
+
self.randbits = torch.ByteTensor(resolution)
|
823 |
+
self.currentbit = len(self.randbits) - 1
|
824 |
+
self.extremes = None
|
825 |
+
self.count = 0
|
826 |
+
self.batchcount = 0
|
827 |
+
|
828 |
+
def size(self):
|
829 |
+
return self.count
|
830 |
+
|
831 |
+
def _lazy_init(self, incoming):
|
832 |
+
self.depth = incoming.shape[1]
|
833 |
+
self.dtype = incoming.dtype
|
834 |
+
self.device = incoming.device
|
835 |
+
self.data = [
|
836 |
+
torch.zeros(
|
837 |
+
self.depth, self.resolution, dtype=self.dtype, device=self.device
|
838 |
+
)
|
839 |
+
]
|
840 |
+
self.extremes = torch.zeros(self.depth, 2, dtype=self.dtype, device=self.device)
|
841 |
+
self.extremes[:, 0] = float("inf")
|
842 |
+
self.extremes[:, -1] = -float("inf")
|
843 |
+
|
844 |
+
def to_(self, device):
|
845 |
+
"""Switches internal storage to specified device."""
|
846 |
+
if device != self.device:
|
847 |
+
old_data = self.data
|
848 |
+
old_extremes = self.extremes
|
849 |
+
self.data = [d.to(device) for d in self.data]
|
850 |
+
self.extremes = self.extremes.to(device)
|
851 |
+
self.device = self.extremes.device
|
852 |
+
del old_data
|
853 |
+
del old_extremes
|
854 |
+
|
855 |
+
def add(self, incoming):
|
856 |
+
if self.depth is None:
|
857 |
+
self._lazy_init(incoming)
|
858 |
+
assert len(incoming.shape) == 2
|
859 |
+
assert incoming.shape[1] == self.depth, (incoming.shape[1], self.depth)
|
860 |
+
self.count += incoming.shape[0]
|
861 |
+
self.batchcount += 1
|
862 |
+
# Convert to a flat torch array.
|
863 |
+
if self.samplerate >= 1.0:
|
864 |
+
self._add_every(incoming)
|
865 |
+
return
|
866 |
+
# If we are sampling, then subsample a large chunk at a time.
|
867 |
+
self._scan_extremes(incoming)
|
868 |
+
chunksize = int(math.ceil(self.buffersize / self.samplerate))
|
869 |
+
for index in range(0, len(incoming), chunksize):
|
870 |
+
batch = incoming[index : index + chunksize]
|
871 |
+
sample = sample_portion(batch, self.samplerate)
|
872 |
+
if len(sample):
|
873 |
+
self._add_every(sample)
|
874 |
+
|
875 |
+
def _add_every(self, incoming):
|
876 |
+
supplied = len(incoming)
|
877 |
+
index = 0
|
878 |
+
while index < supplied:
|
879 |
+
ff = self.firstfree[0]
|
880 |
+
available = self.data[0].shape[1] - ff
|
881 |
+
if available == 0:
|
882 |
+
if not self._shift():
|
883 |
+
# If we shifted by subsampling, then subsample.
|
884 |
+
incoming = incoming[index:]
|
885 |
+
if self.samplerate >= 0.5:
|
886 |
+
# First time sampling - the data source is very large.
|
887 |
+
self._scan_extremes(incoming)
|
888 |
+
incoming = sample_portion(incoming, self.samplerate)
|
889 |
+
index = 0
|
890 |
+
supplied = len(incoming)
|
891 |
+
ff = self.firstfree[0]
|
892 |
+
available = self.data[0].shape[1] - ff
|
893 |
+
copycount = min(available, supplied - index)
|
894 |
+
self.data[0][:, ff : ff + copycount] = torch.t(
|
895 |
+
incoming[index : index + copycount, :]
|
896 |
+
)
|
897 |
+
self.firstfree[0] += copycount
|
898 |
+
index += copycount
|
899 |
+
|
900 |
+
def _shift(self):
|
901 |
+
index = 0
|
902 |
+
# If remaining space at the current layer is less than half prev
|
903 |
+
# buffer size (rounding up), then we need to shift it up to ensure
|
904 |
+
# enough space for future shifting.
|
905 |
+
while self.data[index].shape[1] - self.firstfree[index] < (
|
906 |
+
-(-self.data[index - 1].shape[1] // 2) if index else 1
|
907 |
+
):
|
908 |
+
if index + 1 >= len(self.data):
|
909 |
+
return self._expand()
|
910 |
+
data = self.data[index][:, 0 : self.firstfree[index]]
|
911 |
+
data = data.sort()[0]
|
912 |
+
if index == 0 and self.samplerate >= 1.0:
|
913 |
+
self._update_extremes(data[:, 0], data[:, -1])
|
914 |
+
offset = self._randbit()
|
915 |
+
position = self.firstfree[index + 1]
|
916 |
+
subset = data[:, offset::2]
|
917 |
+
self.data[index + 1][:, position : position + subset.shape[1]] = subset
|
918 |
+
self.firstfree[index] = 0
|
919 |
+
self.firstfree[index + 1] += subset.shape[1]
|
920 |
+
index += 1
|
921 |
+
return True
|
922 |
+
|
923 |
+
def _scan_extremes(self, incoming):
|
924 |
+
# When sampling, we need to scan every item still to get extremes
|
925 |
+
self._update_extremes(
|
926 |
+
torch.min(incoming, dim=0)[0], torch.max(incoming, dim=0)[0]
|
927 |
+
)
|
928 |
+
|
929 |
+
def _update_extremes(self, minr, maxr):
|
930 |
+
self.extremes[:, 0] = torch.min(
|
931 |
+
torch.stack([self.extremes[:, 0], minr]), dim=0
|
932 |
+
)[0]
|
933 |
+
self.extremes[:, -1] = torch.max(
|
934 |
+
torch.stack([self.extremes[:, -1], maxr]), dim=0
|
935 |
+
)[0]
|
936 |
+
|
937 |
+
def _randbit(self):
|
938 |
+
self.currentbit += 1
|
939 |
+
if self.currentbit >= len(self.randbits):
|
940 |
+
self.randbits.random_(to=2)
|
941 |
+
self.currentbit = 0
|
942 |
+
return self.randbits[self.currentbit]
|
943 |
+
|
944 |
+
def state_dict(self):
|
945 |
+
state = dict(
|
946 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
947 |
+
resolution=self.resolution,
|
948 |
+
depth=self.depth,
|
949 |
+
buffersize=self.buffersize,
|
950 |
+
samplerate=self.samplerate,
|
951 |
+
sizes=numpy.array([d.shape[1] for d in self.data]),
|
952 |
+
extremes=self.extremes.cpu().detach().numpy(),
|
953 |
+
size=self.count,
|
954 |
+
batchcount=self.batchcount,
|
955 |
+
)
|
956 |
+
for i, (d, f) in enumerate(zip(self.data, self.firstfree)):
|
957 |
+
state[f"data.{i}"] = d.cpu().detach().numpy()[:, :f].T
|
958 |
+
return state
|
959 |
+
|
960 |
+
def load_state_dict(self, state):
|
961 |
+
self.resolution = int(state["resolution"])
|
962 |
+
self.randbits = torch.ByteTensor(self.resolution)
|
963 |
+
self.currentbit = len(self.randbits) - 1
|
964 |
+
self.depth = int(state["depth"])
|
965 |
+
self.buffersize = int(state["buffersize"])
|
966 |
+
self.samplerate = float(state["samplerate"])
|
967 |
+
firstfree = []
|
968 |
+
buffers = []
|
969 |
+
for i, s in enumerate(state["sizes"]):
|
970 |
+
d = state[f"data.{i}"]
|
971 |
+
firstfree.append(d.shape[0])
|
972 |
+
buf = numpy.zeros((d.shape[1], s), dtype=d.dtype)
|
973 |
+
buf[:, : d.shape[0]] = d.T
|
974 |
+
buffers.append(torch.from_numpy(buf))
|
975 |
+
self.firstfree = firstfree
|
976 |
+
self.data = buffers
|
977 |
+
self.extremes = torch.from_numpy((state["extremes"]))
|
978 |
+
self.count = int(state["size"])
|
979 |
+
self.batchcount = int(state.get("batchcount", 0))
|
980 |
+
self.dtype = self.extremes.dtype
|
981 |
+
self.device = self.extremes.device
|
982 |
+
|
983 |
+
def min(self):
|
984 |
+
return self.minmax()[0]
|
985 |
+
|
986 |
+
def max(self):
|
987 |
+
return self.minmax()[-1]
|
988 |
+
|
989 |
+
def minmax(self):
|
990 |
+
if self.firstfree[0]:
|
991 |
+
self._scan_extremes(self.data[0][:, : self.firstfree[0]].t())
|
992 |
+
return self.extremes.clone()
|
993 |
+
|
994 |
+
def median(self):
|
995 |
+
return self.quantiles(0.5)
|
996 |
+
|
997 |
+
def mean(self):
|
998 |
+
return self.integrate(lambda x: x) / self.count
|
999 |
+
|
1000 |
+
def variance(self, unbiased=True):
|
1001 |
+
mean = self.mean()[:, None]
|
1002 |
+
return self.integrate(lambda x: (x - mean).pow(2)) / (
|
1003 |
+
self.count - (1 if unbiased else 0)
|
1004 |
+
)
|
1005 |
+
|
1006 |
+
def stdev(self, unbiased=True):
|
1007 |
+
return self.variance(unbiased=unbiased).sqrt()
|
1008 |
+
|
1009 |
+
def _expand(self):
|
1010 |
+
cap = self._next_capacity()
|
1011 |
+
if cap > 0:
|
1012 |
+
# First, make a new layer of the proper capacity.
|
1013 |
+
self.data.insert(
|
1014 |
+
0, torch.zeros(self.depth, cap, dtype=self.dtype, device=self.device)
|
1015 |
+
)
|
1016 |
+
self.firstfree.insert(0, 0)
|
1017 |
+
else:
|
1018 |
+
# Unless we're so big we are just subsampling.
|
1019 |
+
assert self.firstfree[0] == 0
|
1020 |
+
self.samplerate *= 0.5
|
1021 |
+
for index in range(1, len(self.data)):
|
1022 |
+
# Scan for existing data that needs to be moved down a level.
|
1023 |
+
amount = self.firstfree[index]
|
1024 |
+
if amount == 0:
|
1025 |
+
continue
|
1026 |
+
position = self.firstfree[index - 1]
|
1027 |
+
# Move data down if it would leave enough empty space there
|
1028 |
+
# This is the key invariant: enough empty space to fit half
|
1029 |
+
# of the previous level's buffer size (rounding up)
|
1030 |
+
if self.data[index - 1].shape[1] - (amount + position) >= (
|
1031 |
+
-(-self.data[index - 2].shape[1] // 2) if (index - 1) else 1
|
1032 |
+
):
|
1033 |
+
self.data[index - 1][:, position : position + amount] = self.data[
|
1034 |
+
index
|
1035 |
+
][:, :amount]
|
1036 |
+
self.firstfree[index - 1] += amount
|
1037 |
+
self.firstfree[index] = 0
|
1038 |
+
else:
|
1039 |
+
# Scrunch the data if it would not.
|
1040 |
+
data = self.data[index][:, :amount]
|
1041 |
+
data = data.sort()[0]
|
1042 |
+
if index == 1:
|
1043 |
+
self._update_extremes(data[:, 0], data[:, -1])
|
1044 |
+
offset = self._randbit()
|
1045 |
+
scrunched = data[:, offset::2]
|
1046 |
+
self.data[index][:, : scrunched.shape[1]] = scrunched
|
1047 |
+
self.firstfree[index] = scrunched.shape[1]
|
1048 |
+
return cap > 0
|
1049 |
+
|
1050 |
+
def _next_capacity(self):
|
1051 |
+
cap = int(math.ceil(self.resolution * (0.67 ** len(self.data))))
|
1052 |
+
if cap < 2:
|
1053 |
+
return 0
|
1054 |
+
# Round up to the nearest multiple of 8 for better GPU alignment.
|
1055 |
+
cap = -8 * (-cap // 8)
|
1056 |
+
return max(self.buffersize, cap)
|
1057 |
+
|
1058 |
+
def _weighted_summary(self, sort=True):
|
1059 |
+
if self.firstfree[0]:
|
1060 |
+
self._scan_extremes(self.data[0][:, : self.firstfree[0]].t())
|
1061 |
+
size = sum(self.firstfree)
|
1062 |
+
weights = torch.FloatTensor(size) # Floating point
|
1063 |
+
summary = torch.zeros(self.depth, size, dtype=self.dtype, device=self.device)
|
1064 |
+
index = 0
|
1065 |
+
for level, ff in enumerate(self.firstfree):
|
1066 |
+
if ff == 0:
|
1067 |
+
continue
|
1068 |
+
summary[:, index : index + ff] = self.data[level][:, :ff]
|
1069 |
+
weights[index : index + ff] = 2.0**level
|
1070 |
+
index += ff
|
1071 |
+
assert index == summary.shape[1]
|
1072 |
+
if sort:
|
1073 |
+
summary, order = torch.sort(summary, dim=-1)
|
1074 |
+
weights = weights[order.view(-1).cpu()].view(order.shape)
|
1075 |
+
summary = torch.cat(
|
1076 |
+
[self.extremes[:, :1], summary, self.extremes[:, 1:]], dim=-1
|
1077 |
+
)
|
1078 |
+
weights = torch.cat(
|
1079 |
+
[
|
1080 |
+
torch.zeros(weights.shape[0], 1),
|
1081 |
+
weights,
|
1082 |
+
torch.zeros(weights.shape[0], 1),
|
1083 |
+
],
|
1084 |
+
dim=-1,
|
1085 |
+
)
|
1086 |
+
return (summary, weights)
|
1087 |
+
|
1088 |
+
def quantiles(self, quantiles):
|
1089 |
+
if not hasattr(quantiles, "cpu"):
|
1090 |
+
quantiles = torch.tensor(quantiles)
|
1091 |
+
qshape = quantiles.shape
|
1092 |
+
if self.count == 0:
|
1093 |
+
return torch.full((self.depth,) + qshape, torch.nan)
|
1094 |
+
summary, weights = self._weighted_summary()
|
1095 |
+
cumweights = torch.cumsum(weights, dim=-1) - weights / 2
|
1096 |
+
cumweights /= torch.sum(weights, dim=-1, keepdim=True)
|
1097 |
+
result = torch.zeros(
|
1098 |
+
self.depth, quantiles.numel(), dtype=self.dtype, device=self.device
|
1099 |
+
)
|
1100 |
+
# numpy is needed for interpolation
|
1101 |
+
nq = quantiles.view(-1).cpu().detach().numpy()
|
1102 |
+
ncw = cumweights.cpu().detach().numpy()
|
1103 |
+
nsm = summary.cpu().detach().numpy()
|
1104 |
+
for d in range(self.depth):
|
1105 |
+
result[d] = torch.tensor(
|
1106 |
+
numpy.interp(nq, ncw[d], nsm[d]), dtype=self.dtype, device=self.device
|
1107 |
+
)
|
1108 |
+
return result.view((self.depth,) + qshape)
|
1109 |
+
|
1110 |
+
def integrate(self, fun):
|
1111 |
+
result = []
|
1112 |
+
for level, ff in enumerate(self.firstfree):
|
1113 |
+
if ff == 0:
|
1114 |
+
continue
|
1115 |
+
result.append(
|
1116 |
+
torch.sum(fun(self.data[level][:, :ff]) * (2.0**level), dim=-1)
|
1117 |
+
)
|
1118 |
+
if len(result) == 0:
|
1119 |
+
return None
|
1120 |
+
return torch.stack(result).sum(dim=0) / self.samplerate
|
1121 |
+
|
1122 |
+
def readout(self, count=1001):
|
1123 |
+
return self.quantiles(torch.linspace(0.0, 1.0, count))
|
1124 |
+
|
1125 |
+
def normalize(self, data):
|
1126 |
+
"""
|
1127 |
+
Given input data as taken from the training distribution,
|
1128 |
+
normalizes every channel to reflect quantile values,
|
1129 |
+
uniformly distributed, within [0, 1].
|
1130 |
+
"""
|
1131 |
+
assert self.count > 0
|
1132 |
+
assert data.shape[0] == self.depth
|
1133 |
+
summary, weights = self._weighted_summary()
|
1134 |
+
cumweights = torch.cumsum(weights, dim=-1) - weights / 2
|
1135 |
+
cumweights /= torch.sum(weights, dim=-1, keepdim=True)
|
1136 |
+
result = torch.zeros_like(data).float()
|
1137 |
+
# numpy is needed for interpolation
|
1138 |
+
ndata = data.cpu().numpy().reshape((data.shape[0], -1))
|
1139 |
+
ncw = cumweights.cpu().numpy()
|
1140 |
+
nsm = summary.cpu().numpy()
|
1141 |
+
for d in range(self.depth):
|
1142 |
+
normed = torch.tensor(
|
1143 |
+
numpy.interp(ndata[d], nsm[d], ncw[d]),
|
1144 |
+
dtype=torch.float,
|
1145 |
+
device=data.device,
|
1146 |
+
).clamp_(0.0, 1.0)
|
1147 |
+
if len(data.shape) > 1:
|
1148 |
+
normed = normed.view(*(data.shape[1:]))
|
1149 |
+
result[d] = normed
|
1150 |
+
return result
|
1151 |
+
|
1152 |
+
|
1153 |
+
def sample_portion(vec, p=0.5):
|
1154 |
+
"""
|
1155 |
+
Subsamples a fraction (given by p) of the given batch. Used by
|
1156 |
+
Quantile when the data gets very very large.
|
1157 |
+
"""
|
1158 |
+
bits = torch.bernoulli(
|
1159 |
+
torch.zeros(vec.shape[0], dtype=torch.uint8, device=vec.device), p
|
1160 |
+
)
|
1161 |
+
return vec[bits]
|
1162 |
+
|
1163 |
+
|
1164 |
+
class TopK:
|
1165 |
+
"""
|
1166 |
+
A class to keep a running tally of the top k values (and indexes)
|
1167 |
+
of any number of torch feature components. Will work on the GPU if
|
1168 |
+
the data is on the GPU. Tracks largest by default, but tracks smallest
|
1169 |
+
if largest=False is passed.
|
1170 |
+
|
1171 |
+
This version flattens all arrays to avoid crashes.
|
1172 |
+
"""
|
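A minimal sketch of the running top-k interface (illustrative only; the batch shapes are assumptions):

tk = TopK(k=5)
for _ in range(20):
    tk.add(torch.randn(100, 32))          # dim 0 enumerates observations
values, indexes = tk.topk()               # each of shape [32, 5]; indexes are global sample ids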
1173 |
+
|
1174 |
+
def __init__(self, k=100, largest=True, state=None):
|
1175 |
+
if state is not None:
|
1176 |
+
return super().__init__(state)
|
1177 |
+
self.k = k
|
1178 |
+
self.count = 0
|
1179 |
+
# This version flattens all data internally to 2-d tensors,
|
1180 |
+
# to avoid crashes with the current pytorch topk implementation.
|
1181 |
+
# The data is puffed back out to arbitrary tensor shapes on output.
|
1182 |
+
self.data_shape = None
|
1183 |
+
self.top_data = None
|
1184 |
+
self.top_index = None
|
1185 |
+
self.next = 0
|
1186 |
+
self.linear_index = 0
|
1187 |
+
self.perm = None
|
1188 |
+
self.largest = largest
|
1189 |
+
|
1190 |
+
def add(self, data, index=None):
|
1191 |
+
"""
|
1192 |
+
Adds a batch of data to be considered for the running top k.
|
1193 |
+
The zeroth dimension enumerates the observations. All other
|
1194 |
+
dimensions enumerate different features.
|
1195 |
+
"""
|
1196 |
+
if self.top_data is None:
|
1197 |
+
# Allocation: allocate a buffer of size 5*k, at least 10, for each.
|
1198 |
+
self.data_shape = data.shape[1:]
|
1199 |
+
feature_size = int(numpy.prod(self.data_shape))
|
1200 |
+
self.top_data = torch.zeros(
|
1201 |
+
feature_size, max(10, self.k * 5), out=data.new()
|
1202 |
+
)
|
1203 |
+
self.top_index = self.top_data.clone().long()
|
1204 |
+
self.linear_index = (
|
1205 |
+
0
|
1206 |
+
if len(data.shape) == 1
|
1207 |
+
else torch.arange(feature_size, out=self.top_index.new()).mul_(
|
1208 |
+
self.top_data.shape[-1]
|
1209 |
+
)[:, None]
|
1210 |
+
)
|
1211 |
+
size = data.shape[0]
|
1212 |
+
sk = min(size, self.k)
|
1213 |
+
if self.top_data.shape[-1] < self.next + sk:
|
1214 |
+
# Compression: if full, keep topk only.
|
1215 |
+
self.top_data[:, : self.k], self.top_index[:, : self.k] = self.topk(
|
1216 |
+
sorted=False, flat=True
|
1217 |
+
)
|
1218 |
+
self.next = self.k
|
1219 |
+
# Pick: copy the top sk of the next batch into the buffer.
|
1220 |
+
# Currently strided topk is slow. So we clone after transpose.
|
1221 |
+
# TODO: remove the clone() if it becomes faster.
|
1222 |
+
cdata = data.reshape(size, numpy.prod(data.shape[1:])).t().clone()
|
1223 |
+
td, ti = cdata.topk(sk, sorted=False, largest=self.largest)
|
1224 |
+
self.top_data[:, self.next : self.next + sk] = td
|
1225 |
+
if index is not None:
|
1226 |
+
ti = index[ti]
|
1227 |
+
else:
|
1228 |
+
ti = ti + self.count
|
1229 |
+
self.top_index[:, self.next : self.next + sk] = ti
|
1230 |
+
self.next += sk
|
1231 |
+
self.count += size
|
1232 |
+
|
1233 |
+
def size(self):
|
1234 |
+
return self.count
|
1235 |
+
|
1236 |
+
def topk(self, sorted=True, flat=False):
|
1237 |
+
"""
|
1238 |
+
Returns top k data items and indexes in each dimension,
|
1239 |
+
with channels in the first dimension and k in the last dimension.
|
1240 |
+
"""
|
1241 |
+
k = min(self.k, self.next)
|
1242 |
+
# bti are top indexes relative to buffer array.
|
1243 |
+
td, bti = self.top_data[:, : self.next].topk(
|
1244 |
+
k, sorted=sorted, largest=self.largest
|
1245 |
+
)
|
1246 |
+
# we want to report top indexes globally, which is ti.
|
1247 |
+
ti = self.top_index.view(-1)[(bti + self.linear_index).view(-1)].view(
|
1248 |
+
*bti.shape
|
1249 |
+
)
|
1250 |
+
if flat:
|
1251 |
+
return td, ti
|
1252 |
+
else:
|
1253 |
+
return (
|
1254 |
+
td.view(*(self.data_shape + (-1,))),
|
1255 |
+
ti.view(*(self.data_shape + (-1,))),
|
1256 |
+
)
|
1257 |
+
|
1258 |
+
def to_(self, device):
|
1259 |
+
if self.top_data is not None:
|
1260 |
+
self.top_data = self.top_data.to(device)
|
1261 |
+
if self.top_index is not None:
|
1262 |
+
self.top_index = self.top_index.to(device)
|
1263 |
+
if isinstance(self.linear_index, torch.Tensor):
|
1264 |
+
self.linear_index = self.linear_index.to(device)
|
1265 |
+
|
1266 |
+
def state_dict(self):
|
1267 |
+
return dict(
|
1268 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
1269 |
+
k=self.k,
|
1270 |
+
count=self.count,
|
1271 |
+
largest=self.largest,
|
1272 |
+
data_shape=self.data_shape and tuple(self.data_shape),
|
1273 |
+
top_data=self.top_data.cpu().detach().numpy(),
|
1274 |
+
top_index=self.top_index.cpu().detach().numpy(),
|
1275 |
+
next=self.next,
|
1276 |
+
linear_index=(
|
1277 |
+
self.linear_index.cpu().numpy()
|
1278 |
+
if isinstance(self.linear_index, torch.Tensor)
|
1279 |
+
else self.linear_index
|
1280 |
+
),
|
1281 |
+
perm=self.perm,
|
1282 |
+
)
|
1283 |
+
|
1284 |
+
def load_state_dict(self, state):
|
1285 |
+
self.k = int(state["k"])
|
1286 |
+
self.count = int(state["count"])
|
1287 |
+
self.largest = bool(state.get("largest", True))
|
1288 |
+
self.data_shape = (
|
1289 |
+
None if state["data_shape"] is None else tuple(state["data_shape"])
|
1290 |
+
)
|
1291 |
+
self.top_data = torch.from_numpy(state["top_data"])
|
1292 |
+
self.top_index = torch.from_numpy(state["top_index"])
|
1293 |
+
self.next = int(state["next"])
|
1294 |
+
self.linear_index = (
|
1295 |
+
torch.from_numpy(state["linear_index"])
|
1296 |
+
if len(state["linear_index"].shape) > 0
|
1297 |
+
else int(state["linear_index"])
|
1298 |
+
)
|
1299 |
+
|
1300 |
+
|
1301 |
+
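A short illustrative sketch for `TopK`: feed batches with observations along dimension zero, then read back the per-feature top values together with the global indices of the observations that produced them.

```python
import torch

tk = TopK(k=3)                     # track the 3 largest values per feature
for _ in range(5):
    tk.add(torch.randn(200, 10))   # 200 observations, 10 features per batch
values, indices = tk.topk()        # both shaped (10, 3); indices are global row numbers
assert values.shape == (10, 3) and tk.size() == 1000
```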
class History(Stat):
|
1302 |
+
"""
|
1303 |
+
Accumulates the concatenation of all the added data.
|
1304 |
+
"""
|
1305 |
+
|
1306 |
+
def __init__(self, data=None, state=None):
|
1307 |
+
if state is not None:
|
1308 |
+
return super().__init__(state)
|
1309 |
+
self._data = data
|
1310 |
+
self._added = []
|
1311 |
+
|
1312 |
+
def _cat_added(self):
|
1313 |
+
if len(self._added):
|
1314 |
+
self._data = torch.cat(
|
1315 |
+
([self._data] if self._data is not None else []) + self._added
|
1316 |
+
)
|
1317 |
+
self._added = []
|
1318 |
+
|
1319 |
+
def add(self, d):
|
1320 |
+
self._added.append(d)
|
1321 |
+
if len(self._added) > 100:
|
1322 |
+
self._cat_added()
|
1323 |
+
|
1324 |
+
def history(self):
|
1325 |
+
self._cat_added()
|
1326 |
+
return self._data
|
1327 |
+
|
1328 |
+
def load_state_dict(self, state):
|
1329 |
+
data = state["data"]
|
1330 |
+
self._data = None if data is None else torch.from_numpy(data)
|
1331 |
+
self._added = []
|
1332 |
+
|
1333 |
+
def state_dict(self):
|
1334 |
+
self._cat_added()
|
1335 |
+
return dict(
|
1336 |
+
constructor=self.__module__ + "." + self.__class__.__name__ + "()",
|
1337 |
+
data=None if self._data is None else self._data.cpu().numpy(),
|
1338 |
+
)
|
1339 |
+
|
1340 |
+
def to_(self, device):
|
1341 |
+
"""Switches internal storage to specified device."""
|
1342 |
+
self._cat_added()
|
1343 |
+
if self._data is not None:
|
1344 |
+
self._data = self._data.to(device)
|
1345 |
+
|
1346 |
+
|
1347 |
+
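A tiny illustrative sketch of `History`: add batches as they stream past and read back their concatenation at the end.

```python
import torch

h = History()
for batch in torch.arange(10).split(3):   # batches of size 3, 3, 3, 1
    h.add(batch)
assert torch.equal(h.history(), torch.arange(10))
```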
class CombinedStat(Stat):
|
1348 |
+
"""
|
1349 |
+
A Stat that bundles together multiple Stat objects.
|
1350 |
+
Convenient for loading and saving a state_dict made up of a
|
1351 |
+
hierarchy of stats, and for use with the tally() function.
|
1352 |
+
Example:
|
1353 |
+
|
1354 |
+
cs = CombinedStat(m=Mean(), q=Quantile())
|
1355 |
+
for [b] in tally(cs, MyDataSet(), cache=fn, batch_size=100):
|
1356 |
+
cs.add(b)
|
1357 |
+
print(cs.m.mean())
|
1358 |
+
print(cs.q.median())
|
1359 |
+
"""
|
1360 |
+
|
1361 |
+
def __init__(self, state=None, **kwargs):
|
1362 |
+
self._objs = kwargs
|
1363 |
+
if state is not None:
|
1364 |
+
return super().__init__(state)
|
1365 |
+
|
1366 |
+
def __getattr__(self, k):
|
1367 |
+
if k in self._objs:
|
1368 |
+
return self._objs[k]
|
1369 |
+
raise AttributeError()
|
1370 |
+
|
1371 |
+
def add(self, d, *args, **kwargs):
|
1372 |
+
for obj in self._objs.values():
|
1373 |
+
obj.add(d, *args, **kwargs)
|
1374 |
+
|
1375 |
+
def load_state_dict(self, state):
|
1376 |
+
for prefix, obj in self._objs.items():
|
1377 |
+
obj.load_state_dict(pull_key_prefix(prefix, state))
|
1378 |
+
|
1379 |
+
def state_dict(self):
|
1380 |
+
result = {}
|
1381 |
+
for prefix, obj in self._objs.items():
|
1382 |
+
result.update(push_key_prefix(prefix, obj.state_dict()))
|
1383 |
+
return result
|
1384 |
+
|
1385 |
+
def to_(self, device):
|
1386 |
+
"""Switches internal storage to specified device."""
|
1387 |
+
for v in self._objs.values():
|
1388 |
+
v.to_(device)
|
1389 |
+
|
1390 |
+
|
1391 |
+
def push_key_prefix(prefix, d):
|
1392 |
+
"""
|
1393 |
+
Returns a dict with the same values as d, but where each key
|
1394 |
+
adds the prefix, followed by a dot.
|
1395 |
+
"""
|
1396 |
+
return {prefix + "." + k: v for k, v in d.items()}
|
1397 |
+
|
1398 |
+
|
1399 |
+
def pull_key_prefix(prefix, d):
|
1400 |
+
"""
|
1401 |
+
Returns a filtered dict of all the items of d that start with
|
1402 |
+
the given key prefix, plus a dot, with that prefix removed.
|
1403 |
+
"""
|
1404 |
+
pd = prefix + "."
|
1405 |
+
lpd = len(pd)
|
1406 |
+
return {k[lpd:]: v for k, v in d.items() if k.startswith(pd)}
|
1407 |
+
|
1408 |
+
|
1409 |
+
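The two prefix helpers above are inverses of one another for a matching prefix; a quick illustrative check:

```python
nested = push_key_prefix("m", {"count": 10, "mean": 0.5})
# nested == {"m.count": 10, "m.mean": 0.5}
assert pull_key_prefix("m", nested) == {"count": 10, "mean": 0.5}
```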
# We wish to be able to save None (null) values in numpy npz files,
|
1410 |
+
# yet do so without setting the insecure 'allow_pickle' flag. To do
|
1411 |
+
# that, we will encode null as a special kind of IEEE 754 NaN value.
|
1412 |
+
# Inspired by https://github.com/zuiderkwast/nanbox/blob/master/nanbox.h
|
1413 |
+
# we follow the same Nanboxing scheme used in JavaScriptCore
|
1414 |
+
# (search for JSCJSValue.h#L435), which encodes null values in NaN
|
1415 |
+
# as the NaN value with hex pattern 0xfff8000000000002.
|
1416 |
+
|
1417 |
+
null_numpy_value = numpy.array(
|
1418 |
+
struct.unpack(">d", struct.pack(">Q", 0xFFF8000000000002))[0], dtype=numpy.float64
|
1419 |
+
)
|
1420 |
+
|
1421 |
+
|
1422 |
+
def is_null_numpy_value(v):
|
1423 |
+
"""
|
1424 |
+
True if v is a 64-bit float numpy scalar NaN matching null_numpy_value.
|
1425 |
+
"""
|
1426 |
+
return (
|
1427 |
+
isinstance(v, numpy.ndarray)
|
1428 |
+
and numpy.ndim(v) == 0
|
1429 |
+
and v.dtype == numpy.float64
|
1430 |
+
and numpy.isnan(v)
|
1431 |
+
and 0xFFF8000000000002 == struct.unpack(">Q", struct.pack(">d", v))[0]
|
1432 |
+
)
|
1433 |
+
|
1434 |
+
|
1435 |
+
def box_numpy_null(d):
|
1436 |
+
"""
|
1437 |
+
Replaces None with null_numpy_value, leaving non-None values unchanged.
|
1438 |
+
Recursively descends into a dictionary replacing None values.
|
1439 |
+
"""
|
1440 |
+
try:
|
1441 |
+
return {k: box_numpy_null(v) for k, v in d.items()}
|
1442 |
+
except Exception:
|
1443 |
+
return null_numpy_value if d is None else d
|
1444 |
+
|
1445 |
+
|
1446 |
+
def unbox_numpy_null(d):
|
1447 |
+
"""
|
1448 |
+
Reverses box_numpy_null, replacing null_numpy_value with None.
|
1449 |
+
Recursively descends into a dictionary replacing None values.
|
1450 |
+
"""
|
1451 |
+
try:
|
1452 |
+
return {k: unbox_numpy_null(v) for k, v in d.items()}
|
1453 |
+
except Exception:
|
1454 |
+
return None if is_null_numpy_value(d) else d
|
1455 |
+
|
1456 |
+
|
1457 |
+
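An illustrative round trip of the NaN-boxing helpers through an `.npz` file (the file path is made up for the example):

```python
import numpy

state = {"k": 3, "perm": None}
numpy.savez("/tmp/nanbox_demo.npz", **box_numpy_null(state))
restored = unbox_numpy_null(dict(numpy.load("/tmp/nanbox_demo.npz")))
assert restored["perm"] is None       # the boxed NaN decodes back to None
assert int(restored["k"]) == 3        # scalars come back as 0-d numpy arrays
```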
def resolve_state_dict(s):
|
1458 |
+
"""
|
1459 |
+
Resolves a state, which can be a filename or a dict-like object.
|
1460 |
+
"""
|
1461 |
+
if isinstance(s, str):
|
1462 |
+
return unbox_numpy_null(numpy.load(s))
|
1463 |
+
return s
|
1464 |
+
|
1465 |
+
|
1466 |
+
global_load_cache_enabled = True
|
1467 |
+
|
1468 |
+
|
1469 |
+
def load_cached_state(cachefile, args, quiet=False, throw=False):
|
1470 |
+
"""
|
1471 |
+
Loads a cached state from a dict or an npz file, returning None unless every entry in args matches what was saved.
|
1472 |
+
"""
|
1473 |
+
if not global_load_cache_enabled or cachefile is None:
|
1474 |
+
return None
|
1475 |
+
try:
|
1476 |
+
if isinstance(cachefile, dict):
|
1477 |
+
dat = cachefile
|
1478 |
+
cachefile = "state" # for printed messages
|
1479 |
+
else:
|
1480 |
+
dat = unbox_numpy_null(numpy.load(cachefile))
|
1481 |
+
for a, v in args.items():
|
1482 |
+
if a not in dat or dat[a] != v:
|
1483 |
+
if not quiet:
|
1484 |
+
print("%s %s changed from %s to %s" % (cachefile, a, dat[a], v))
|
1485 |
+
return None
|
1486 |
+
except (FileNotFoundError, ValueError) as e:
|
1487 |
+
if throw:
|
1488 |
+
raise e
|
1489 |
+
return None
|
1490 |
+
else:
|
1491 |
+
if not quiet:
|
1492 |
+
print("Loading cached %s" % cachefile)
|
1493 |
+
return dat
|
1494 |
+
|
1495 |
+
|
1496 |
+
def save_cached_state(cachefile, obj, args):
|
1497 |
+
"""
|
1498 |
+
Saves the state_dict of the given object in a dict or npz file.
|
1499 |
+
"""
|
1500 |
+
if cachefile is None:
|
1501 |
+
return
|
1502 |
+
dat = obj.state_dict()
|
1503 |
+
for a, v in args.items():
|
1504 |
+
if a in dat:
|
1505 |
+
assert dat[a] == v
|
1506 |
+
dat[a] = v
|
1507 |
+
if isinstance(cachefile, dict):
|
1508 |
+
cachefile.clear()
|
1509 |
+
cachefile.update(dat)
|
1510 |
+
else:
|
1511 |
+
os.makedirs(os.path.dirname(cachefile), exist_ok=True)
|
1512 |
+
numpy.savez(cachefile, **box_numpy_null(dat))
|
1513 |
+
|
1514 |
+
|
1515 |
+
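A hedged sketch of the caching pattern that `load_cached_state` and `save_cached_state` support; the file name and the `args` keys are invented for the example, and it assumes `Quantile` accepts the same `state=` keyword as the other stats in this file:

```python
import torch

args = dict(sample_size=10_000)       # settings that must match for the cache to be reused
cached = load_cached_state("results/quantile.npz", args, quiet=True)
q = Quantile(state=cached) if cached is not None else Quantile()  # state= assumed, as for other stats
if cached is None:
    q.add(torch.randn(args["sample_size"], 8))
    save_cached_state("results/quantile.npz", q, args)
```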
class FixedSubsetSampler(Sampler):
|
1516 |
+
"""Represents a fixed sequence of data set indices.
|
1517 |
+
Subsets can be created by specifying a subset of output indexes.
|
1518 |
+
"""
|
1519 |
+
|
1520 |
+
def __init__(self, samples):
|
1521 |
+
self.samples = samples
|
1522 |
+
|
1523 |
+
def __iter__(self):
|
1524 |
+
return iter(self.samples)
|
1525 |
+
|
1526 |
+
def __len__(self):
|
1527 |
+
return len(self.samples)
|
1528 |
+
|
1529 |
+
def __getitem__(self, key):
|
1530 |
+
return self.samples[key]
|
1531 |
+
|
1532 |
+
def subset(self, new_subset):
|
1533 |
+
return FixedSubsetSampler(self.dereference(new_subset))
|
1534 |
+
|
1535 |
+
def dereference(self, indices):
|
1536 |
+
"""
|
1537 |
+
Translate output sample indices (small numbers indexing the sample)
|
1538 |
+
to input sample indices (larger numbers indexing the original full set).
|
1539 |
+
"""
|
1540 |
+
return [self.samples[i] for i in indices]
|
1541 |
+
|
1542 |
+
|
1543 |
+
class FixedRandomSubsetSampler(FixedSubsetSampler):
|
1544 |
+
"""Samples a fixed number of samples from the dataset, deterministically.
|
1545 |
+
Arguments:
|
1546 |
+
data_source,
|
1547 |
+
start, end (optional bounds selecting a slice of the shuffled order),
|
1548 |
+
seed (optional)
|
1549 |
+
"""
|
1550 |
+
|
1551 |
+
def __init__(self, data_source, start=None, end=None, seed=1):
|
1552 |
+
rng = random.Random(seed)
|
1553 |
+
shuffled = list(range(len(data_source)))
|
1554 |
+
rng.shuffle(shuffled)
|
1555 |
+
self.data_source = data_source
|
1556 |
+
super(FixedRandomSubsetSampler, self).__init__(shuffled[start:end])
|
1557 |
+
|
1558 |
+
def class_subset(self, class_filter):
|
1559 |
+
"""
|
1560 |
+
Returns only the subset matching the given rule.
|
1561 |
+
"""
|
1562 |
+
if isinstance(class_filter, int):
|
1563 |
+
|
1564 |
+
def rule(d):
|
1565 |
+
return d[1] == class_filter
|
1566 |
+
|
1567 |
+
else:
|
1568 |
+
rule = class_filter
|
1569 |
+
return self.subset(
|
1570 |
+
[i for i, j in enumerate(self.samples) if rule(self.data_source[j])]
|
1571 |
+
)
|
1572 |
+
|
1573 |
+
|
1574 |
+
def make_loader(
|
1575 |
+
dataset, sample_size=None, batch_size=1, sampler=None, random_sample=None, **kwargs
|
1576 |
+
):
|
1577 |
+
"""Utility for creating a dataloader on fixed sample subset."""
|
1578 |
+
import typing
|
1579 |
+
|
1580 |
+
if isinstance(dataset, typing.Callable):
|
1581 |
+
# To support deferred dataset loading, support passing a factory
|
1582 |
+
# that creates the dataset when called.
|
1583 |
+
dataset = dataset()
|
1584 |
+
if isinstance(dataset, torch.Tensor):
|
1585 |
+
# The dataset can be a simple tensor.
|
1586 |
+
dataset = torch.utils.data.TensorDataset(dataset)
|
1587 |
+
if sample_size is not None:
|
1588 |
+
assert sampler is None, "sampler cannot be specified with sample_size"
|
1589 |
+
if sample_size > len(dataset):
|
1590 |
+
print(
|
1591 |
+
"Warning: sample size %d > dataset size %d"
|
1592 |
+
% (sample_size, len(dataset))
|
1593 |
+
)
|
1594 |
+
sample_size = len(dataset)
|
1595 |
+
if random_sample is None:
|
1596 |
+
sampler = FixedSubsetSampler(list(range(sample_size)))
|
1597 |
+
else:
|
1598 |
+
sampler = FixedRandomSubsetSampler(
|
1599 |
+
dataset, seed=random_sample, end=sample_size
|
1600 |
+
)
|
1601 |
+
return torch.utils.data.DataLoader(
|
1602 |
+
dataset, sampler=sampler, batch_size=batch_size, **kwargs
|
1603 |
+
)
|
1604 |
+
|
1605 |
+
|
1606 |
+
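An illustrative use of `make_loader` with a plain tensor: draw a deterministic random subset of 1,000 rows and iterate it in batches.

```python
import torch

data = torch.randn(50_000, 8)
loader = make_loader(data, sample_size=1_000, batch_size=100, random_sample=1)
for [batch] in loader:                # TensorDataset wrapping yields one-element lists
    assert batch.shape == (100, 8)
```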
# Unit Tests
|
1607 |
+
def _unit_test():
|
1608 |
+
import warnings
|
1609 |
+
|
1610 |
+
warnings.filterwarnings("error")
|
1611 |
+
import argparse
|
1612 |
+
import random
|
1613 |
+
import shutil
|
1614 |
+
import tempfile
|
1615 |
+
import time
|
1616 |
+
|
1617 |
+
parser = argparse.ArgumentParser(description="Test things out")
|
1618 |
+
parser.add_argument("--mode", default="cpu", help="cpu or cuda")
|
1619 |
+
parser.add_argument("--test_size", type=int, default=1000000)
|
1620 |
+
args = parser.parse_args()
|
1621 |
+
testdir = tempfile.mkdtemp()
|
1622 |
+
batch_size = random.randint(500, 1500)
|
1623 |
+
|
1624 |
+
# Test NaNboxing.
|
1625 |
+
assert numpy.isnan(null_numpy_value)
|
1626 |
+
assert is_null_numpy_value(null_numpy_value)
|
1627 |
+
assert not is_null_numpy_value(numpy.nan)
|
1628 |
+
|
1629 |
+
# Test Covariance
|
1630 |
+
goal = torch.tensor(numpy.random.RandomState(1).standard_normal(10 * 10)).view(
|
1631 |
+
10, 10
|
1632 |
+
)
|
1633 |
+
data = (
|
1634 |
+
torch.tensor(numpy.random.RandomState(2).standard_normal(args.test_size * 10))
|
1635 |
+
.view(args.test_size, 10)
|
1636 |
+
.mm(goal)
|
1637 |
+
)
|
1638 |
+
data += torch.randn(1, 10) * 999
|
1639 |
+
dcov = data.t().cov()
|
1640 |
+
dcorr = data.t().corrcoef()
|
1641 |
+
rcov = Covariance()
|
1642 |
+
rcov.add(data) # All one batch
|
1643 |
+
assert (rcov.covariance() - dcov).abs().max() < 1e-16
|
1644 |
+
cs = CombinedStat(cov=Covariance(), xcov=CrossCovariance())
|
1645 |
+
ds = torch.utils.data.TensorDataset(data)
|
1646 |
+
for [a] in tally(cs, ds, batch_size=9876):
|
1647 |
+
cs.cov.add(a)
|
1648 |
+
cs.xcov.add(a[:, :3], a[:, 3:])
|
1649 |
+
assert (data.mean(0) - cs.cov.mean()).abs().max() < 1e-12
|
1650 |
+
assert (dcov - cs.cov.covariance()).abs().max() < 2e-12
|
1651 |
+
assert (dcov[:3, 3:] - cs.xcov.covariance()).abs().max() < 1e-12
|
1652 |
+
assert (dcov.diagonal() - torch.cat(cs.xcov.variance())).abs().max() < 1e-12
|
1653 |
+
assert (dcorr - cs.cov.correlation()).abs().max() < 2e-12
|
1654 |
+
|
1655 |
+
# Test CrossCovariance and CrossIoU
|
1656 |
+
fn = f"{testdir}/cross_cache.npz"
|
1657 |
+
ds = torch.utils.data.TensorDataset(
|
1658 |
+
(
|
1659 |
+
torch.arange(args.test_size)[:, None] % torch.arange(1, 6)[None, :] == 0
|
1660 |
+
).double(),
|
1661 |
+
(
|
1662 |
+
torch.arange(args.test_size)[:, None] % torch.arange(5, 8)[None, :] == 0
|
1663 |
+
).double(),
|
1664 |
+
)
|
1665 |
+
c = CombinedStat(c=CrossCovariance(), iou=CrossIoU())
|
1666 |
+
riou = IoU()
|
1667 |
+
count = 0
|
1668 |
+
for [a, b] in tally(c, ds, cache=fn, batch_size=100):
|
1669 |
+
count += 1
|
1670 |
+
c.add(a, b)
|
1671 |
+
riou.add(torch.cat([a, b], dim=1))
|
1672 |
+
assert count == -(-args.test_size // 100)
|
1673 |
+
cor = c.c.correlation()
|
1674 |
+
iou = c.iou.iou()
|
1675 |
+
assert cor.shape == iou.shape == (5, 3)
|
1676 |
+
assert iou[4, 0] == 1.0
|
1677 |
+
assert abs(iou[0, 2] + (-args.test_size // 7 / float(args.test_size))) < 1e-6
|
1678 |
+
assert abs(cor[4, 0] - 1.0) < 1e-2
|
1679 |
+
assert abs(cor[0, 2] - 0.0) < 1e-6
|
1680 |
+
assert all((riou.iou()[:5, -3:] == iou).view(-1))
|
1681 |
+
assert all(riou.iou().diagonal(0) == 1)
|
1682 |
+
c = CombinedStat(c=CrossCovariance(), iou=CrossIoU())
|
1683 |
+
count = 0
|
1684 |
+
for [a, b] in tally(c, ds, cache=fn, batch_size=10):
|
1685 |
+
count += 1
|
1686 |
+
c.add(a, b)
|
1687 |
+
assert count == 0
|
1688 |
+
assert all((c.c.correlation() == cor).view(-1))
|
1689 |
+
assert all((c.iou.iou() == iou).view(-1))
|
1690 |
+
|
1691 |
+
# Test Concatenation, Mean, Bincount, and tally.
|
1692 |
+
fn = f"{testdir}/series_cache.npz"
|
1693 |
+
count = 0
|
1694 |
+
ds = torch.utils.data.TensorDataset(torch.arange(args.test_size))
|
1695 |
+
c = CombinedStat(s=History(), m=Mean(), b=Bincount())
|
1696 |
+
for [b] in tally(c, ds, cache=fn, batch_size=batch_size):
|
1697 |
+
count += 1
|
1698 |
+
c.add(b)
|
1699 |
+
assert count == -(-args.test_size // batch_size)
|
1700 |
+
assert len(c.s.history()) == args.test_size
|
1701 |
+
assert c.s.history()[-1] == args.test_size - 1
|
1702 |
+
assert all(c.s.history() == ds.tensors[0])
|
1703 |
+
assert all(c.b.bincount() == torch.ones(args.test_size))
|
1704 |
+
assert c.m.mean() == float(args.test_size - 1) / 2.0
|
1705 |
+
c2 = CombinedStat(s=History(), m=Mean(), b=Bincount())
|
1706 |
+
batches = tally(c2, ds, cache=fn)
|
1707 |
+
assert len(c2.s.history()) == args.test_size
|
1708 |
+
assert all(c2.s.history() == c.s.history())
|
1709 |
+
assert all(c2.b.bincount() == torch.ones(args.test_size))
|
1710 |
+
assert c2.m.mean() == c.m.mean()
|
1711 |
+
count = 0
|
1712 |
+
for b in batches:
|
1713 |
+
count += 1
|
1714 |
+
assert count == 0 # Shouldn't do anything when it's cached
|
1715 |
+
|
1716 |
+
# An adversarial case: we keep finding more numbers in the middle
|
1717 |
+
# as the stream goes on.
|
1718 |
+
amount = args.test_size
|
1719 |
+
quantiles = 1000
|
1720 |
+
data = numpy.arange(float(amount))
|
1721 |
+
data[1::2] = data[-1::-2] + (len(data) - 1)
|
1722 |
+
data /= 2
|
1723 |
+
depth = 50
|
1724 |
+
alldata = data[:, None] + (numpy.arange(depth) * amount)[None, :]
|
1725 |
+
actual_sum = torch.FloatTensor(numpy.sum(alldata * alldata, axis=0))
|
1726 |
+
amt = amount // depth
|
1727 |
+
for r in range(depth):
|
1728 |
+
numpy.random.shuffle(alldata[r * amt : r * amt + amt, r])
|
1729 |
+
if args.mode == "cuda":
|
1730 |
+
alldata = torch.cuda.FloatTensor(alldata)
|
1731 |
+
device = torch.device("cuda")
|
1732 |
+
else:
|
1733 |
+
alldata = torch.FloatTensor(alldata)
|
1734 |
+
device = None
|
1735 |
+
starttime = time.time()
|
1736 |
+
cs = CombinedStat(
|
1737 |
+
qc=Quantile(),
|
1738 |
+
m=Mean(),
|
1739 |
+
v=Variance(),
|
1740 |
+
c=Covariance(),
|
1741 |
+
s=SecondMoment(),
|
1742 |
+
t=TopK(),
|
1743 |
+
i=IoU(),
|
1744 |
+
)
|
1745 |
+
# Feed data in little batches
|
1746 |
+
i = 0
|
1747 |
+
while i < len(alldata):
|
1748 |
+
batch_size = numpy.random.randint(1000)
|
1749 |
+
cs.add(alldata[i : i + batch_size])
|
1750 |
+
i += batch_size
|
1751 |
+
# Test state dict
|
1752 |
+
saved = cs.state_dict()
|
1753 |
+
# numpy.savez(f'{testdir}/saved.npz', **box_numpy_null(saved))
|
1754 |
+
# saved = unbox_numpy_null(numpy.load(f'{testdir}/saved.npz'))
|
1755 |
+
cs.save(f"{testdir}/saved.npz")
|
1756 |
+
loaded = unbox_numpy_null(numpy.load(f"{testdir}/saved.npz"))
|
1757 |
+
assert set(loaded.keys()) == set(saved.keys())
|
1758 |
+
|
1759 |
+
# Restore using state=saved in constructor.
|
1760 |
+
cs2 = CombinedStat(
|
1761 |
+
qc=Quantile(),
|
1762 |
+
m=Mean(),
|
1763 |
+
v=Variance(),
|
1764 |
+
c=Covariance(),
|
1765 |
+
s=SecondMoment(),
|
1766 |
+
t=TopK(),
|
1767 |
+
i=IoU(),
|
1768 |
+
state=saved,
|
1769 |
+
)
|
1770 |
+
# saved = unbox_numpy_null(numpy.load(f'{testdir}/saved.npz'))
|
1771 |
+
assert not cs2.qc.device.type == "cuda"
|
1772 |
+
cs2.to_(device)
|
1773 |
+
# alldata = alldata.cpu()
|
1774 |
+
cs2.add(alldata)
|
1775 |
+
actual_sum *= 2
|
1776 |
+
# print(abs(alldata.mean(0) - cs2.m.mean()) / alldata.mean())
|
1777 |
+
assert all(abs(alldata.mean(0) - cs2.m.mean()) / alldata.mean() < 1e-5)
|
1778 |
+
assert all(abs(alldata.mean(0) - cs2.v.mean()) / alldata.mean() < 1e-5)
|
1779 |
+
assert all(abs(alldata.mean(0) - cs2.c.mean()) / alldata.mean() < 1e-5)
|
1780 |
+
# print(abs(alldata.var(0) - cs2.v.variance()) / alldata.var(0))
|
1781 |
+
assert all(abs(alldata.var(0) - cs2.v.variance()) / alldata.var(0) < 1e-3)
|
1782 |
+
assert all(abs(alldata.var(0) - cs2.c.variance()) / alldata.var(0) < 1e-2)
|
1783 |
+
# print(abs(alldata.std(0) - cs2.v.stdev()) / alldata.std(0))
|
1784 |
+
assert all(abs(alldata.std(0) - cs2.v.stdev()) / alldata.std(0) < 1e-4)
|
1785 |
+
# print(abs(alldata.std(0) - cs2.c.stdev()) / alldata.std(0))
|
1786 |
+
assert all(abs(alldata.std(0) - cs2.c.stdev()) / alldata.std(0) < 2e-3)
|
1787 |
+
moment = (alldata.t() @ alldata) / len(alldata)
|
1788 |
+
# print(abs(moment - cs2.s.moment()) / moment.abs())
|
1789 |
+
assert all((abs(moment - cs2.s.moment()) / moment.abs()).view(-1) < 1e-2)
|
1790 |
+
assert all(alldata.max(dim=0)[0] == cs2.t.topk()[0][:, 0])
|
1791 |
+
assert cs2.i.iou()[0, 0] == 1
|
1792 |
+
assert all((cs2.i.iou()[1:, 1:] == 1).view(-1))
|
1793 |
+
assert all(cs2.i.iou()[1:, 0] < 1)
|
1794 |
+
assert all(cs2.i.iou()[1:, 0] == cs2.i.iou()[0, 1:])
|
1795 |
+
|
1796 |
+
# Restore using cs.load() method.
|
1797 |
+
cs = CombinedStat(
|
1798 |
+
qc=Quantile(),
|
1799 |
+
m=Mean(),
|
1800 |
+
v=Variance(),
|
1801 |
+
c=Covariance(),
|
1802 |
+
s=SecondMoment(),
|
1803 |
+
t=TopK(),
|
1804 |
+
i=IoU(),
|
1805 |
+
)
|
1806 |
+
cs.load(f"{testdir}/saved.npz")
|
1807 |
+
assert not cs.qc.device.type == "cuda"
|
1808 |
+
cs.to_(device)
|
1809 |
+
cs.add(alldata)
|
1810 |
+
# actual_sum *= 2
|
1811 |
+
# print(abs(alldata.mean(0) - cs.m.mean()) / alldata.mean())
|
1812 |
+
assert all(abs(alldata.mean(0) - cs.m.mean()) / alldata.mean() < 1e-5)
|
1813 |
+
assert all(abs(alldata.mean(0) - cs.v.mean()) / alldata.mean() < 1e-5)
|
1814 |
+
assert all(abs(alldata.mean(0) - cs.c.mean()) / alldata.mean() < 1e-5)
|
1815 |
+
# print(abs(alldata.var(0) - cs.v.variance()) / alldata.var(0))
|
1816 |
+
assert all(abs(alldata.var(0) - cs.v.variance()) / alldata.var(0) < 1e-3)
|
1817 |
+
assert all(abs(alldata.var(0) - cs.c.variance()) / alldata.var(0) < 1e-2)
|
1818 |
+
# print(abs(alldata.std(0) - cs.v.stdev()) / alldata.std(0))
|
1819 |
+
assert all(abs(alldata.std(0) - cs.v.stdev()) / alldata.std(0) < 1e-4)
|
1820 |
+
# print(abs(alldata.std(0) - cs.c.stdev()) / alldata.std(0))
|
1821 |
+
assert all(abs(alldata.std(0) - cs.c.stdev()) / alldata.std(0) < 2e-3)
|
1822 |
+
moment = (alldata.t() @ alldata) / len(alldata)
|
1823 |
+
# print(abs(moment - cs.s.moment()) / moment.abs())
|
1824 |
+
assert all((abs(moment - cs.s.moment()) / moment.abs()).view(-1) < 1e-2)
|
1825 |
+
assert all(alldata.max(dim=0)[0] == cs.t.topk()[0][:, 0])
|
1826 |
+
assert cs.i.iou()[0, 0] == 1
|
1827 |
+
assert all((cs.i.iou()[1:, 1:] == 1).view(-1))
|
1828 |
+
assert all(cs.i.iou()[1:, 0] < 1)
|
1829 |
+
assert all(cs.i.iou()[1:, 0] == cs.i.iou()[0, 1:])
|
1830 |
+
|
1831 |
+
# Randomized quantile test
|
1832 |
+
qc = cs.qc
|
1833 |
+
ro = qc.readout(1001).cpu()
|
1834 |
+
endtime = time.time()
|
1835 |
+
gt = (
|
1836 |
+
torch.linspace(0, amount, quantiles + 1)[None, :]
|
1837 |
+
+ (torch.arange(qc.depth, dtype=torch.float) * amount)[:, None]
|
1838 |
+
)
|
1839 |
+
maxreldev = torch.max(torch.abs(ro - gt) / amount) * quantiles
|
1840 |
+
print("Randomized quantile test results:")
|
1841 |
+
print("Maximum relative deviation among %d perentiles: %f" % (quantiles, maxreldev))
|
1842 |
+
minerr = torch.max(
|
1843 |
+
torch.abs(
|
1844 |
+
qc.minmax().cpu()[:, 0] - torch.arange(qc.depth, dtype=torch.float) * amount
|
1845 |
+
)
|
1846 |
+
)
|
1847 |
+
maxerr = torch.max(
|
1848 |
+
torch.abs(
|
1849 |
+
(qc.minmax().cpu()[:, -1] + 1)
|
1850 |
+
- (torch.arange(qc.depth, dtype=torch.float) + 1) * amount
|
1851 |
+
)
|
1852 |
+
)
|
1853 |
+
print("Minmax error %f, %f" % (minerr, maxerr))
|
1854 |
+
interr = torch.max(
|
1855 |
+
torch.abs(qc.integrate(lambda x: x * x).cpu() - actual_sum) / actual_sum
|
1856 |
+
)
|
1857 |
+
print("Integral error: %f" % interr)
|
1858 |
+
medianerr = torch.max(
|
1859 |
+
torch.abs(qc.median() - alldata.median(0)[0]) / alldata.median(0)[0]
|
1860 |
+
).cpu()
|
1861 |
+
print("Median error: %f" % medianerr)
|
1862 |
+
meanerr = torch.max(torch.abs(qc.mean() - alldata.mean(0)) / alldata.mean(0)).cpu()
|
1863 |
+
print("Mean error: %f" % meanerr)
|
1864 |
+
varerr = torch.max(torch.abs(qc.variance() - alldata.var(0)) / alldata.var(0)).cpu()
|
1865 |
+
print("Variance error: %f" % varerr)
|
1866 |
+
counterr = (
|
1867 |
+
(qc.integrate(lambda x: torch.ones(x.shape[-1]).cpu()) - qc.size())
|
1868 |
+
/ (0.0 + qc.size())
|
1869 |
+
).item()
|
1870 |
+
print("Count error: %f" % counterr)
|
1871 |
+
print("Time %f" % (endtime - starttime))
|
1872 |
+
# Algorithm is randomized, so some of these will fail with low probability.
|
1873 |
+
assert maxreldev < 1.0
|
1874 |
+
assert minerr == 0.0
|
1875 |
+
assert maxerr == 0.0
|
1876 |
+
assert interr < 0.01
|
1877 |
+
assert abs(counterr) < 0.001
|
1878 |
+
shutil.rmtree(testdir, ignore_errors=True)
|
1879 |
+
print("OK")
|
1880 |
+
|
1881 |
+
|
1882 |
+
if __name__ == "__main__":
|
1883 |
+
_unit_test()
|
hparams/GRACE/README.md
ADDED
@@ -0,0 +1,19 @@
1 |
+
alg_name: "GRACE"
|
2 |
+
model_name: "./hugging_cache/gpt-j-6B"
|
3 |
+
device: 0
|
4 |
+
|
5 |
+
inner_params:
|
6 |
+
- transformer.h[25].mlp.fc_out.weight
|
7 |
+
|
8 |
+
edit_lr: 1.0
|
9 |
+
n_iter: 200
|
10 |
+
eps: 1.0
|
11 |
+
dist_fn: euc # euc, mmd, cos
|
12 |
+
val_init: cold # cold, warm
|
13 |
+
val_train: sgd # sgd, pert
|
14 |
+
val_reg: None # early
|
15 |
+
reg: early_stop # early_stop
|
16 |
+
replacement: replace_last # replace_last, replace_all, replace_prompt
|
17 |
+
eps_expand: coverage # , moving_avg, decay
|
18 |
+
num_pert: 8 # only matters when using perturbation training
|
19 |
+
dropout: 0.0
|
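The `inner_params` entry names the layer GRACE attaches its edit to (here the `fc_out` projection in block 25 of GPT-J). A hedged sketch, not EasyEdit's own resolver, of how the bracketed name maps onto a standard parameter path:

```python
# Illustrative only: convert the bracketed layer name to a dotted path and look it up.
name = "transformer.h[25].mlp.fc_out.weight"
dotted = name.replace("[", ".").replace("]", "")    # transformer.h.25.mlp.fc_out.weight
# weight = dict(model.named_parameters())[dotted]   # assumes a GPT-J style model is loaded
```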
hparams/GRACE/gpt2-xl.yaml
ADDED
@@ -0,0 +1,19 @@
1 |
+
alg_name: "GRACE"
|
2 |
+
model_name: "./hugging_cache/gpt2-xl"
|
3 |
+
device: 0
|
4 |
+
|
5 |
+
inner_params:
|
6 |
+
- transformer.h[35].mlp.c_fc.weight
|
7 |
+
|
8 |
+
edit_lr: 1.0
|
9 |
+
n_iter: 50
|
10 |
+
eps: 1.0
|
11 |
+
dist_fn: euc # euc, mmd, cos
|
12 |
+
val_init: cold # cold, warm
|
13 |
+
val_train: sgd # sgd, pert
|
14 |
+
val_reg: None # early
|
15 |
+
reg: early_stop # early_stop
|
16 |
+
replacement: replace_last # replace_last, replace_all, replace_prompt
|
17 |
+
eps_expand: coverage # , moving_avg, decay
|
18 |
+
num_pert: 8 # only matters when using perturbation training
|
19 |
+
dropout: 0.0
|
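The `utils.py` below loads this file through `GraceHyperParams.from_hparams`; a minimal sketch of doing the same directly (attribute names are assumed to mirror the YAML keys):

```python
from easyeditor import GraceHyperParams

hparams = GraceHyperParams.from_hparams("./hparams/GRACE/gpt2-xl.yaml")
print(hparams.alg_name, hparams.device)   # "GRACE", 0
```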
hparams/config.yaml
ADDED
@@ -0,0 +1,6 @@
1 |
+
save_dir: models/
|
2 |
+
log_dir: logs/
|
3 |
+
|
4 |
+
defaults:
|
5 |
+
alg_name: KN # Editing Method
|
6 |
+
hparams_name: KN/t5-3b # Edited Model Config Path
|
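This top-level config selects an editing method (`alg_name`) and a per-model hyperparameter file (`hparams_name`) under `hparams/`. A hedged sketch of one way such a mapping could be resolved (not necessarily how this repo consumes it, and assuming `defaults` is a nested mapping as shown):

```python
import yaml  # PyYAML, assumed available

with open("hparams/config.yaml") as f:
    cfg = yaml.safe_load(f)
hparams_path = f"hparams/{cfg['defaults']['hparams_name']}.yaml"   # e.g. hparams/KN/t5-3b.yaml
```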
utils.py
ADDED
@@ -0,0 +1,36 @@
1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
|
2 |
+
from transformers import GPT2TokenizerFast, GPT2Tokenizer
|
3 |
+
from easyeditor import apply_grace_to_model, GraceHyperParams,nethook
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
def edit(prompt, target_new):
|
9 |
+
request={"prompt":prompt,"target_new":target_new}
|
10 |
+
hparams = GraceHyperParams.from_hparams("./hparams/GRACE/gpt2-xl.yaml")
|
11 |
+
|
12 |
+
model = AutoModelForCausalLM.from_pretrained("./models/gpt2-xl")
|
13 |
+
tok = GPT2Tokenizer.from_pretrained("./models/gpt2-xl")
|
14 |
+
tok.pad_token_id = tok.eos_token_id
|
15 |
+
global edit_model
|
16 |
+
edit_model,_ = apply_grace_to_model(model,tok,request,hparams,keep_original_weight=True)
|
17 |
+
return "finish"
|
18 |
+
|
19 |
+
def generate(input_text):
|
20 |
+
tok = GPT2Tokenizer.from_pretrained("./models/gpt2-xl")
|
21 |
+
hparams = GraceHyperParams.from_hparams("./hparams/GRACE/gpt2-xl.yaml")
|
22 |
+
tok.pad_token_id = tok.eos_token_id
|
23 |
+
|
24 |
+
global edit_model
|
25 |
+
|
26 |
+
input_ids = tok.encode(input_text, return_tensors='pt').to(f'cuda:{hparams.device}')
|
27 |
+
edit_output = edit_model.generate(input_ids, max_length=30, pad_token_id=tok.eos_token_id)
|
28 |
+
edit_reply = tok.decode(edit_output[0], skip_special_tokens=True)
|
29 |
+
del edit_model
|
30 |
+
torch.cuda.empty_cache()
|
31 |
+
|
32 |
+
ori_model = AutoModelForCausalLM.from_pretrained("./models/gpt2-xl").to(f'cuda:{hparams.device}')
|
33 |
+
ori_output = ori_model.generate(input_ids, max_length=30, pad_token_id=tok.eos_token_id)
|
34 |
+
ori_reply = tok.decode(ori_output[0], skip_special_tokens=True)
|
35 |
+
|
36 |
+
return ori_reply, edit_reply
|
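A minimal sketch of calling these two helpers in sequence; the prompt and target are made up, and it assumes the local `./models/gpt2-xl` checkpoint and the CUDA device configured above are available:

```python
# Illustrative only: apply one GRACE edit, then compare pre- and post-edit generations.
edit("The capital of France is", " Rome")          # made-up edit request
original, edited = generate("The capital of France is")
print("before:", original)
print("after: ", edited)
```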