"""Long Range Arena datasets"""
import io
import logging
import os
import pickle
from pathlib import Path
import torch
from torch import nn
import torch.nn.functional as F
import torchtext
import torchvision
from einops.layers.torch import Rearrange, Reduce
from PIL import Image # Only used for Pathfinder
from datasets import DatasetDict, Value, load_dataset, load_from_disk
from .base import default_data_path, SequenceDataset, ImageResolutionSequenceDataset
class IMDB(SequenceDataset):
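    """Character- or word-level binary text classification in the style of the
    LRA "Text" (IMDB) task. Sequences are truncated to ``l_max`` tokens and a
    vocabulary is built from the training split."""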
_name_ = "imdb"
d_output = 2
l_output = 0
@property
def init_defaults(self):
return {
"l_max": 4096,
"level": "char",
"min_freq": 15,
"seed": 42,
"val_split": 0.0,
"append_bos": False,
"append_eos": True,
# 'max_vocab': 135,
"n_workers": 4, # Only used for tokenizing dataset before caching
}
@property
def n_tokens(self):
return len(self.vocab)
def prepare_data(self):
if self.cache_dir is None: # Just download the dataset
load_dataset(self._name_, cache_dir=self.data_dir)
else: # Process the dataset and save it
self.process_dataset()
def setup(self, stage=None):
"""If cache_dir is not None, we'll cache the processed dataset there."""
# # NOTE - AW - we manually set these elsewhere.
# self.data_dir = self.data_dir or default_data_path / self._name_
# self.cache_dir = self.data_dir / "cache"
assert self.level in [
"word",
"char",
], f"level {self.level} not supported"
if stage == "test" and hasattr(self, "dataset_test"):
return
dataset, self.tokenizer, self.vocab = self.process_dataset()
print(
f"IMDB {self.level} level | min_freq {self.min_freq} | vocab size {len(self.vocab)}"
)
dataset.set_format(type="torch", columns=["input_ids", "label"])
# Create all splits
dataset_train, self.dataset_test = dataset["train"], dataset["test"]
if self.val_split == 0.0:
# Use test set as val set, as done in the LRA paper
self.dataset_train, self.dataset_val = dataset_train, None
else:
train_val = dataset_train.train_test_split(
test_size=self.val_split, seed=self.seed
)
self.dataset_train, self.dataset_val = (
train_val["train"],
train_val["test"],
)
def _collate_fn(self, batch):
xs, ys = zip(*[(data["input_ids"], data["label"]) for data in batch])
lengths = torch.tensor([len(x) for x in xs])
xs = nn.utils.rnn.pad_sequence(
xs, padding_value=self.vocab["<pad>"], batch_first=True
)
ys = torch.tensor(ys)
return xs, ys, {"lengths": lengths}
def process_dataset(self):
cache_dir = (
None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
)
if cache_dir is not None:
if cache_dir.is_dir():
return self._load_from_cache(cache_dir)
print(f"self._name_: {self._name_}")
print(f"self.data_dir: {self.data_dir}")
# dataset = load_dataset(self._name_, cache_dir=self.data_dir)
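        # NOTE: loads a locally saved dataset ('essays') from disk instead of
        # downloading the HuggingFace IMDB dataset (see the commented-out call above).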
dataset = load_from_disk('essays')
print(type(dataset))
dataset = DatasetDict(train=dataset["train"], test=dataset["test"])
if self.level == "word":
tokenizer = torchtext.data.utils.get_tokenizer(
"spacy", language="en_core_web_sm"
)
else: # self.level == 'char'
tokenizer = list # Just convert a string to a list of chars
# Account for <bos> and <eos> tokens
l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
tokenize = lambda example: {"tokens": tokenizer(example["text"])[:l_max]}
dataset = dataset.map(
tokenize,
remove_columns=["text"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
vocab = torchtext.vocab.build_vocab_from_iterator(
dataset["train"]["tokens"],
min_freq=self.min_freq,
specials=(
["<pad>", "<unk>"]
+ (["<bos>"] if self.append_bos else [])
+ (["<eos>"] if self.append_eos else [])
),
)
vocab.set_default_index(vocab["<unk>"])
numericalize = lambda example: {
"input_ids": vocab(
(["<bos>"] if self.append_bos else [])
+ example["tokens"]
+ (["<eos>"] if self.append_eos else [])
)
}
dataset = dataset.map(
numericalize,
remove_columns=["tokens"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
if cache_dir is not None:
self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
return dataset, tokenizer, vocab
def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
cache_dir = self.cache_dir / self._cache_dir_name
logger = logging.getLogger(__name__)
logger.info(f"Saving to cache at {str(cache_dir)}")
dataset.save_to_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)
with open(cache_dir / "vocab.pkl", "wb") as f:
pickle.dump(vocab, f)
def _load_from_cache(self, cache_dir):
assert cache_dir.is_dir()
logger = logging.getLogger(__name__)
logger.info(f"Load from cache at {str(cache_dir)}")
dataset = DatasetDict.load_from_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "rb") as f:
tokenizer = pickle.load(f)
with open(cache_dir / "vocab.pkl", "rb") as f:
vocab = pickle.load(f)
return dataset, tokenizer, vocab
@property
def _cache_dir_name(self):
return f"l_max-{self.l_max}-level-{self.level}-min_freq-{self.min_freq}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"
class TabularDataset(torch.utils.data.Dataset):
def __init__(
self,
path,
format,
col_idx=None,
skip_header=False,
csv_reader_params=None,
):
"""
col_idx: the indices of the columns.
"""
if csv_reader_params is None:
csv_reader_params = {}
format = format.lower()
assert format in ["tsv", "csv"]
with io.open(os.path.expanduser(path), encoding="utf8") as f:
if format == "csv":
reader = torchtext.utils.unicode_csv_reader(f, **csv_reader_params)
elif format == "tsv":
reader = torchtext.utils.unicode_csv_reader(
f, delimiter="\t", **csv_reader_params
)
else:
reader = f
if skip_header:
next(reader)
self._data = [
line if col_idx is None else [line[c] for c in col_idx]
for line in reader
]
def __len__(self):
return len(self._data)
def __getitem__(self, idx):
return self._data[idx]
# LRA tokenizer renames ']' to 'X' and delete parentheses as their tokenizer removes
# non-alphanumeric characters.
# https://github.com/google-research/long-range-arena/blob/264227cbf9591e39dd596d2dc935297a2070bdfe/lra_benchmarks/listops/input_pipeline.py#L46
def listops_tokenizer(s):
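    """E.g. "[MAX 4 3 ( [MIN 2 3 ] 1 ) ]" -> ['[MAX', '4', '3', '[MIN', '2', '3', 'X', '1', 'X']."""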
return s.translate({ord("]"): ord("X"), ord("("): None, ord(")"): None}).split()
class ListOps(SequenceDataset):
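    """ListOps (LRA): 10-way classification of the value of a nested
    list-operation expression such as [MAX 4 3 [MIN 2 3 ] 1 0 ], given as a
    whitespace-tokenized sequence of up to ``l_max`` tokens."""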
_name_ = "listops"
d_output = 10
l_output = 0
@property
def init_defaults(self):
return {
"l_max": 2048,
"append_bos": False,
"append_eos": True,
# 'max_vocab': 20, # Actual size 18
"n_workers": 4, # Only used for tokenizing dataset
}
@property
def n_tokens(self):
return len(self.vocab)
@property
def _cache_dir_name(self):
return f"l_max-{self.l_max}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"
def init(self):
if self.data_dir is None:
self.data_dir = default_data_path / self._name_
self.cache_dir = self.data_dir / self._cache_dir_name
def prepare_data(self):
if self.cache_dir is None:
for split in ["train", "val", "test"]:
split_path = self.data_dir / f"basic_{split}.tsv"
if not split_path.is_file():
raise FileNotFoundError(
f"""
File {str(split_path)} not found.
To get the dataset, download lra_release.gz from
https://github.com/google-research/long-range-arena,
then unzip it with tar -xvf lra_release.gz.
Then point data_dir to the listops-1000 directory.
"""
)
else: # Process the dataset and save it
self.process_dataset()
def setup(self, stage=None):
if stage == "test" and hasattr(self, "dataset_test"):
return
dataset, self.tokenizer, self.vocab = self.process_dataset()
self.vocab_size = len(self.vocab)
dataset.set_format(type="torch", columns=["input_ids", "Target"])
self.dataset_train, self.dataset_val, self.dataset_test = (
dataset["train"],
dataset["val"],
dataset["test"],
)
def collate_batch(batch):
xs, ys = zip(*[(data["input_ids"], data["Target"]) for data in batch])
lengths = torch.tensor([len(x) for x in xs])
xs = nn.utils.rnn.pad_sequence(
xs, padding_value=self.vocab["<pad>"], batch_first=True
)
ys = torch.tensor(ys)
return xs, ys, {"lengths": lengths}
self._collate_fn = collate_batch
def process_dataset(self):
cache_dir = (
None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
)
if cache_dir is not None:
if cache_dir.is_dir():
return self._load_from_cache(cache_dir)
dataset = load_dataset(
"csv",
data_files={
"train": str(self.data_dir / "basic_train.tsv"),
"val": str(self.data_dir / "basic_val.tsv"),
"test": str(self.data_dir / "basic_test.tsv"),
},
delimiter="\t",
keep_in_memory=True,
)
tokenizer = listops_tokenizer
# Account for <bos> and <eos> tokens
l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
tokenize = lambda example: {"tokens": tokenizer(example["Source"])[:l_max]}
dataset = dataset.map(
tokenize,
remove_columns=["Source"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
vocab = torchtext.vocab.build_vocab_from_iterator(
dataset["train"]["tokens"],
specials=(
["<pad>", "<unk>"]
+ (["<bos>"] if self.append_bos else [])
+ (["<eos>"] if self.append_eos else [])
),
)
vocab.set_default_index(vocab["<unk>"])
numericalize = lambda example: {
"input_ids": vocab(
(["<bos>"] if self.append_bos else [])
+ example["tokens"]
+ (["<eos>"] if self.append_eos else [])
)
}
dataset = dataset.map(
numericalize,
remove_columns=["tokens"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
if cache_dir is not None:
self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
return dataset, tokenizer, vocab
def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
cache_dir = self.cache_dir / self._cache_dir_name
logger = logging.getLogger(__name__)
logger.info(f"Saving to cache at {str(cache_dir)}")
dataset.save_to_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)
with open(cache_dir / "vocab.pkl", "wb") as f:
pickle.dump(vocab, f)
def _load_from_cache(self, cache_dir):
assert cache_dir.is_dir()
logger = logging.getLogger(__name__)
logger.info(f"Load from cache at {str(cache_dir)}")
dataset = DatasetDict.load_from_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "rb") as f:
tokenizer = pickle.load(f)
with open(cache_dir / "vocab.pkl", "rb") as f:
vocab = pickle.load(f)
return dataset, tokenizer, vocab
class PathFinderDataset(torch.utils.data.Dataset):
"""Path Finder dataset."""
# There's an empty file in the dataset
blacklist = {"pathfinder32/curv_baseline/imgs/0/sample_172.png"}
def __init__(self, data_dir, transform=None):
"""
Args:
data_dir (string): Directory with all the images.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.data_dir = Path(data_dir).expanduser()
assert self.data_dir.is_dir(), f"data_dir {str(self.data_dir)} does not exist"
self.transform = transform
samples = []
# for diff_level in ['curv_baseline', 'curv_contour_length_9', 'curv_contour_length_14']:
for diff_level in ["curv_contour_length_14"]:
path_list = sorted(
list((self.data_dir / diff_level / "metadata").glob("*.npy")),
key=lambda path: int(path.stem),
)
assert path_list, "No metadata found"
for metadata_file in path_list:
with open(metadata_file, "r") as f:
for metadata in f.read().splitlines():
metadata = metadata.split()
image_path = Path(diff_level) / metadata[0] / metadata[1]
if (
str(Path(self.data_dir.stem) / image_path)
not in self.blacklist
):
label = int(metadata[3])
samples.append((image_path, label))
self.samples = samples
def __len__(self):
return len(self.samples)
def __getitem__(self, idx):
path, target = self.samples[idx]
# https://github.com/pytorch/vision/blob/9b29f3f22783112406d9c1a6db47165a297c3942/torchvision/datasets/folder.py#L247
with open(self.data_dir / path, "rb") as f:
sample = Image.open(f).convert("L") # Open in grayscale
if self.transform is not None:
sample = self.transform(sample)
return sample, target
class PathFinder(ImageResolutionSequenceDataset):
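    """Pathfinder (LRA): binary classification of whether two marked points in a
    grayscale image are connected by a dashed path. Images are served either as a
    flattened pixel sequence (``sequential=True``) or as 2D images."""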
_name_ = "pathfinder"
d_input = 1
d_output = 2
l_output = 0
@property
def n_tokens(self):
if self.tokenize:
return 256
@property
def init_defaults(self):
return {
"resolution": 32,
"sequential": True,
"tokenize": False,
"pool": 1,
"val_split": 0.1,
"test_split": 0.1,
"seed": 42, # Controls the train/val/test split
}
def default_transforms(self):
transform_list = [torchvision.transforms.ToTensor()]
if self.pool > 1:
transform_list.append(
Reduce(
"1 (h h2) (w w2) -> 1 h w",
"mean",
h2=self.pool,
w2=self.pool,
)
)
if self.tokenize:
transform_list.append(
torchvision.transforms.Lambda(lambda x: (x * 255).long())
)
else:
transform_list.append(torchvision.transforms.Normalize(mean=0.5, std=0.5))
if self.sequential:
# If tokenize, it makes more sense to get rid of the channel dimension
transform_list.append(
Rearrange("1 h w -> (h w)")
if self.tokenize
else Rearrange("1 h w -> (h w) 1")
)
else:
transform_list.append(Rearrange("1 h w -> h w 1"))
return torchvision.transforms.Compose(transform_list)
def prepare_data(self):
if not self.data_dir.is_dir():
raise FileNotFoundError(
f"""
Directory {str(self.data_dir)} not found.
To get the dataset, download lra_release.gz from
https://github.com/google-research/long-range-arena,
then unzip it with tar -xvf lra_release.gz.
Then point data_dir to the pathfinderX directory, where X is either 32, 64, 128, or 256.
"""
)
def setup(self, stage=None):
if self.data_dir is None:
self.data_dir = (
default_data_path / self._name_ / f"pathfinder{self.resolution}"
)
if self.cache_dir is not None:
if Path(self.cache_dir / (self._cache_dir_name + '.pt')).exists():
with open(self.cache_dir / (self._cache_dir_name + '.pt'), 'rb') as f:
dset = torch.load(f)
self.dataset_train = dset['train']
self.dataset_val = dset['val']
self.dataset_test = dset['test']
return None
if stage == "test" and hasattr(self, "dataset_test"):
return
# [2021-08-18] TD: I ran into RuntimeError: Too many open files.
# https://github.com/pytorch/pytorch/issues/11201
torch.multiprocessing.set_sharing_strategy("file_system")
dataset = PathFinderDataset(self.data_dir, transform=self.default_transforms())
len_dataset = len(dataset)
val_len = int(self.val_split * len_dataset)
test_len = int(self.test_split * len_dataset)
train_len = len_dataset - val_len - test_len
(
self.dataset_train,
self.dataset_val,
self.dataset_test,
) = torch.utils.data.random_split(
dataset,
[train_len, val_len, test_len],
generator=torch.Generator().manual_seed(self.seed),
)
# AW - Now we need to iterate over each of these datasets and store them in a proper cache.
def _compile_convert(dset, tag):
"""
:param dset:
:param tag:
:return:
"""
loader = torch.utils.data.DataLoader(dataset=dset, batch_size=len(dset), shuffle=False, drop_last=False)
inp, out = next(iter(loader))
dset_compiled = torch.utils.data.TensorDataset(inp, out)
return dset_compiled
os.makedirs(self.cache_dir, exist_ok=True)
self.dataset_train = _compile_convert(self.dataset_train, tag='train')
self.dataset_val = _compile_convert(self.dataset_val, tag='val')
self.dataset_test = _compile_convert(self.dataset_test, tag='test')
# Cache.
cache_path = self.cache_dir / (self._cache_dir_name + '.pt')
logger = logging.getLogger(__name__)
logger.info(f"Saving to cache at {str(cache_path)}")
with open(cache_path, 'wb') as f:
torch.save({'train': self.dataset_train,
'val': self.dataset_val,
'test': self.dataset_test},
f)
@property
def _cache_dir_name(self):
return f"pathfinder-resolution-{self.resolution}"
class AAN(SequenceDataset):
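    """ACL Anthology Network document matching (the LRA "Retrieval" task): binary
    classification of whether two character-tokenized documents are related. The
    two padded sequences are concatenated along the batch dimension in the collate
    function, with their lengths returned alongside."""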
_name_ = "aan"
d_output = 2 # Use accuracy instead of binary_accuracy
l_output = 0
@property
def n_tokens(self):
return len(self.vocab)
@property
def init_defaults(self):
return {
"l_max": 4000,
# 'max_vocab': 100, # Full size 98
"append_bos": False,
"append_eos": True,
"n_workers": 4, # For tokenizing only
}
@property
def _cache_dir_name(self):
return f"l_max-{self.l_max}-append_bos-{self.append_bos}-append_eos-{self.append_eos}"
def init(self):
if self.data_dir is None:
self.data_dir = default_data_path / self._name_
self.cache_dir = self.data_dir / self._cache_dir_name
def prepare_data(self):
if self.cache_dir is None:
for split in ["train", "eval", "test"]:
split_path = self.data_dir / f"new_aan_pairs.{split}.tsv"
if not split_path.is_file():
raise FileNotFoundError(
f"""
File {str(split_path)} not found.
To get the dataset, download lra_release.gz from
https://github.com/google-research/long-range-arena,
then unzip it with tar -xvf lra_release.gz.
Then point data_dir to the tsv_data directory.
"""
)
else: # Process the dataset and save it
self.process_dataset()
def setup(self, stage=None):
if stage == "test" and hasattr(self, "dataset_test"):
return
# [2021-08-18] TD: I ran into RuntimeError: Too many open files.
# https://github.com/pytorch/pytorch/issues/11201
torch.multiprocessing.set_sharing_strategy("file_system")
dataset, self.tokenizer, self.vocab = self.process_dataset()
# self.vocab_size = len(self.vocab)
print("AAN vocab size:", len(self.vocab))
dataset.set_format(type="torch", columns=["input_ids1", "input_ids2", "label"])
self.dataset_train, self.dataset_val, self.dataset_test = (
dataset["train"],
dataset["val"],
dataset["test"],
)
def collate_batch(batch):
xs1, xs2, ys = zip(
*[
(data["input_ids1"], data["input_ids2"], data["label"])
for data in batch
]
)
lengths1 = torch.tensor([len(x) for x in xs1])
lengths2 = torch.tensor([len(x) for x in xs2])
xs1 = nn.utils.rnn.pad_sequence(
xs1, padding_value=self.vocab["<pad>"], batch_first=True
)
xs2 = nn.utils.rnn.pad_sequence(
xs2, padding_value=self.vocab["<pad>"], batch_first=True
)
# Pad both to same length
# Shape (batch, length)
L = max(xs1.size(1), xs2.size(1))
xs1 = F.pad(xs1, (0, L-xs1.size(1)), value=self.vocab["<pad>"])
xs2 = F.pad(xs2, (0, L-xs2.size(1)), value=self.vocab["<pad>"])
ys = torch.tensor(ys)
# return xs1, xs2, ys, lengths1, lengths2
# Concatenate two batches
xs = torch.cat([xs1, xs2], dim=0)
lengths = torch.cat([lengths1, lengths2], dim=0)
return xs, ys, {"lengths": lengths}
self._collate_fn = collate_batch
def process_dataset(self):
cache_dir = (
None if self.cache_dir is None else self.cache_dir / self._cache_dir_name
)
if cache_dir is not None:
if cache_dir.is_dir():
return self._load_from_cache(cache_dir)
dataset = load_dataset(
"csv",
data_files={
"train": str(self.data_dir / "new_aan_pairs.train.tsv"),
"val": str(self.data_dir / "new_aan_pairs.eval.tsv"),
"test": str(self.data_dir / "new_aan_pairs.test.tsv"),
},
delimiter="\t",
column_names=["label", "input1_id", "input2_id", "text1", "text2"],
keep_in_memory=True,
        )
dataset = dataset.remove_columns(["input1_id", "input2_id"])
new_features = dataset["train"].features.copy()
new_features["label"] = Value("int32")
dataset = dataset.cast(new_features)
tokenizer = list # Just convert a string to a list of chars
# Account for <bos> and <eos> tokens
l_max = self.l_max - int(self.append_bos) - int(self.append_eos)
tokenize = lambda example: {
"tokens1": tokenizer(example["text1"])[:l_max],
"tokens2": tokenizer(example["text2"])[:l_max],
}
dataset = dataset.map(
tokenize,
remove_columns=["text1", "text2"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
vocab = torchtext.vocab.build_vocab_from_iterator(
dataset["train"]["tokens1"] + dataset["train"]["tokens2"],
specials=(
["<pad>", "<unk>"]
+ (["<bos>"] if self.append_bos else [])
+ (["<eos>"] if self.append_eos else [])
),
)
vocab.set_default_index(vocab["<unk>"])
encode = lambda text: vocab(
(["<bos>"] if self.append_bos else [])
+ text
+ (["<eos>"] if self.append_eos else [])
)
numericalize = lambda example: {
"input_ids1": encode(example["tokens1"]),
"input_ids2": encode(example["tokens2"]),
}
dataset = dataset.map(
numericalize,
remove_columns=["tokens1", "tokens2"],
keep_in_memory=True,
load_from_cache_file=False,
num_proc=max(self.n_workers, 1),
)
if cache_dir is not None:
self._save_to_cache(dataset, tokenizer, vocab, cache_dir)
return dataset, tokenizer, vocab
def _save_to_cache(self, dataset, tokenizer, vocab, cache_dir):
cache_dir = self.cache_dir / self._cache_dir_name
logger = logging.getLogger(__name__)
logger.info(f"Saving to cache at {str(cache_dir)}")
dataset.save_to_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)
with open(cache_dir / "vocab.pkl", "wb") as f:
pickle.dump(vocab, f)
def _load_from_cache(self, cache_dir):
assert cache_dir.is_dir()
logger = logging.getLogger(__name__)
logger.info(f"Load from cache at {str(cache_dir)}")
dataset = DatasetDict.load_from_disk(str(cache_dir))
with open(cache_dir / "tokenizer.pkl", "rb") as f:
tokenizer = pickle.load(f)
with open(cache_dir / "vocab.pkl", "rb") as f:
vocab = pickle.load(f)
return dataset, tokenizer, vocab
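# Example usage (a minimal sketch, kept as a comment since it is not part of the
# module). It assumes the SequenceDataset base class accepts `data_dir` plus the
# keys from `init_defaults` as keyword arguments; adapt the constructor call to
# however datasets are actually instantiated/registered in this repo.
#
#     dataset = ListOps(_name_="listops", data_dir="data/listops-1000")
#     dataset.init()          # resolve data_dir / cache_dir defaults
#     dataset.prepare_data()  # check raw .tsv files exist, or process and cache
#     dataset.setup()         # build vocab and train/val/test splits
#     train_loader = torch.utils.data.DataLoader(
#         dataset.dataset_train,
#         batch_size=32,
#         shuffle=True,
#         collate_fn=dataset._collate_fn,  # pads variable-length sequences
#     )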