Spaces:
Paused
Paused
#!/usr/bin/env python3 | |
# Copyright (c) Facebook, Inc. and its affiliates. | |
import os | |
import numpy as np | |
import sys | |
sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/') | |
sys.path.insert(0, 'third_party/Deformable-DETR') | |
from detic.data.tar_dataset import _TarDataset, DiskTarDataset | |
import pickle | |
import io | |
import gzip | |
import time | |
class _RawTarDataset(object):
    """Random access to the JPEG members of an uncompressed tar archive.

    The archive is accessed through a text index (presumably the output of
    ``tar -tR``; verify against the tool that produced it) whose lines look
    like ``block <N>: <member name>`` and which ends with a
    ``block <N>: ** Block of NULs **`` line.  Block numbers are in units of
    512 bytes.

    Attributes:
        filename: path of the tar archive.
        names: member names ending in ``JPEG``, in index order.
        offsets: starting block of every kept member, followed by the block
            number of the terminating NUL-block line (so it normally holds
            ``len(names) + 1`` entries).
        data: ``np.memmap`` over the archive, or ``None`` until first needed.
    """

    # Size in bytes of one tar block.
    _BLOCK = 512
    # Text that closes the index listing.
    _END_MARKER = '** Block of NULs **'

    def __init__(self, filename, indexname, preload=False):
        """Parse ``indexname`` and optionally memory-map ``filename``.

        Args:
            filename: path of the tar archive.
            indexname: path of the text index described on the class.
            preload: when true, map the archive immediately instead of on
                the first ``__getitem__`` call.

        Raises:
            ValueError: if the second field of an index line is not
                ``<integer>:``.
        """
        self.filename = filename
        self.names = []
        self.offsets = []
        # ``with`` guarantees the index file is closed, even on a parse error.
        with open(indexname) as index_file:
            for line in index_file:
                fields = line.split()
                if len(fields) < 3:
                    # blank or truncated line: nothing to index
                    continue
                # second field is "<block>:"; drop the trailing colon
                offset = int(fields[1][:-1])
                if line.rstrip().endswith(self._END_MARKER):
                    # end-of-archive marker: its block closes the last member
                    self.offsets.append(offset)
                    break
                name = fields[2]
                if name.endswith('JPEG'):
                    self.names.append(name)
                    self.offsets.append(offset)
                # anything else (e.g. directories) is ignored
        if preload:
            self.data = np.memmap(filename, mode='r', dtype='uint8')
        else:
            self.data = None

    def __len__(self):
        """Return the number of indexed JPEG members."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return the raw bytes of member ``idx``.

        The member's extent is derived from block offsets, so for members
        stored uncompressed the returned bytes run up to the next indexed
        block boundary (i.e. they may carry trailing padding).  Members that
        start with the gzip magic number are transparently decompressed.

        Args:
            idx: integer position; negative values count from the end.

        Returns:
            bytes: the member payload.

        Raises:
            IndexError: if ``idx`` is out of range.
        """
        count = len(self.names)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError('index out of range')
        if self.data is None:
            # lazy mapping keeps construction cheap when preload=False
            self.data = np.memmap(self.filename, mode='r', dtype='uint8')
        block = self._BLOCK
        start = self.offsets[idx] * block
        size = block * (self.offsets[idx + 1] - self.offsets[idx])
        data = self.data[start:start + size]
        # Compare bytes with bytes: the header name is raw archive content.
        if data[:13].tobytes() == b'././@LongLink':
            # long-name entry: skip its header, the name block and the
            # real header that follows
            data = data[3 * block:]
        else:
            # skip the regular one-block header
            data = data[block:]
        # just to make it more fun a few JPEGs are GZIP compressed...
        # catch this case
        if data[:2].tobytes() == b'\x1f\x8b':
            with gzip.GzipFile(fileobj=io.BytesIO(data.tobytes()),
                               mode='rb') as gz_file:
                return gz_file.read()
        return data.tobytes()
def preprocess(
        i22kdir='/datasets01/imagenet-22k/062717/',
        i22ktarlogs='/checkpoint/imisra/datasets/imagenet-22k/tarindex',
        class_names_file='/checkpoint/imisra/datasets/imagenet-22k/words.txt',
        output_dir='/checkpoint/zhouxy/Datasets/ImageNet/metadata-22k/',
        i22knpytarlogs='/checkpoint/zhouxy/Datasets/ImageNet/metadata-22k/tarindex_npy',
        min_count=0,
        create_npy_tarlogs=True):
    """Build the ImageNet-22k metadata arrays from per-synset tar indexes.

    For every ``<synset>.tarlog`` found in ``i22ktarlogs`` the function
    optionally converts the text index to ``<synset>_names.npy`` /
    ``<synset>_offsets.npy``, counts the samples of the matching tar archive,
    and finally writes ``tarlog_files.npy``, ``tar_files.npy`` and
    ``class_names.npy`` (one entry per retained synset) to ``output_dir``.

    Args:
        i22kdir: directory holding the ``<synset>.tar`` archives.
        i22ktarlogs: directory holding the ``<synset>.tarlog`` text indexes.
        class_names_file: tab-separated file mapping synset id to class name.
        output_dir: directory receiving the three metadata arrays.
        i22knpytarlogs: directory receiving / providing the ``.npy`` indexes.
        min_count: a synset is kept only if it has strictly more samples
            than this.
        create_npy_tarlogs: when true, (re)generate the ``.npy`` indexes from
            the text indexes before counting.

    Raises:
        KeyError: if a retained synset is missing from ``class_names_file``.
    """
    # Follow https://github.com/Alibaba-MIIL/ImageNet21K/blob/main/dataset_preprocessing/processing_script.sh
    # Expect 12358684 samples with 11221 classes
    # ImageNet folder has 21841 classes (synsets)
    suffix = ".tarlog"

    print('Listing dir')
    log_files = sorted(
        entry for entry in os.listdir(i22ktarlogs) if entry.endswith(suffix))
    # Strip only the trailing extension; every entry is known to end with it.
    synsets = [entry[:-len(suffix)] for entry in log_files]

    print('Creating folders')
    if create_npy_tarlogs:
        os.makedirs(i22knpytarlogs, exist_ok=True)
        for syn in synsets:
            dataset = _RawTarDataset(os.path.join(i22kdir, syn + ".tar"),
                                     os.path.join(i22ktarlogs, syn + suffix),
                                     preload=False)
            names = np.array(dataset.names)
            offsets = np.array(dataset.offsets, dtype=np.int64)
            np.save(os.path.join(i22knpytarlogs, f"{syn}_names.npy"), names)
            np.save(os.path.join(i22knpytarlogs, f"{syn}_offsets.npy"), offsets)

    os.makedirs(output_dir, exist_ok=True)

    # Count the samples of every synset through the .npy indexes.
    start_time = time.time()
    dataset_lens = []
    for syn in synsets:
        dataset = _TarDataset(os.path.join(i22kdir, syn + ".tar"), i22knpytarlogs)
        dataset_lens.append(len(dataset))
    end_time = time.time()
    print(f"Time {end_time - start_time}")

    dataset_lens = np.array(dataset_lens)
    dataset_valid = dataset_lens > min_count

    # synset id -> human-readable class name
    syn2class = {}
    with open(class_names_file) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of failing on fields[1]
                continue
            fields = line.split("\t")
            syn2class[fields[0]] = fields[1]

    kept_synsets = [syn for syn, keep in zip(synsets, dataset_valid) if keep]
    tarlog_files = np.array(
        [os.path.join(i22ktarlogs, syn + suffix) for syn in kept_synsets])
    tar_files = np.array(
        [os.path.join(i22kdir, syn + ".tar") for syn in kept_synsets])
    class_names = np.array([syn2class[syn] for syn in kept_synsets])

    print(f"Have {len(class_names)} classes and {dataset_lens[dataset_valid].sum()} samples")

    np.save(os.path.join(output_dir, "tarlog_files.npy"), tarlog_files)
    np.save(os.path.join(output_dir, "tar_files.npy"), tar_files)
    np.save(os.path.join(output_dir, "class_names.npy"), class_names)
# Script entry point: run the preprocessing only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    preprocess()