import json
import glob
import os
from pathlib import Path
import sys

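# Support running this script from a repo checkout without installing lit_llama
# as a package: add the script's parent's parent directory (the repo root) to sys.path.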
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

import numpy as np
from tqdm import tqdm

from lit_llama import Tokenizer
import lit_llama.packed_dataset as packed_dataset

filenames_sample = [
    "arxiv_sample.jsonl",
    "book_sample.jsonl",
    "c4_sample.jsonl",
    "cc_2019-30_sample.jsonl",
    "cc_2020-05_sample.jsonl",
    "cc_2021-04_sample.jsonl",
    "cc_2022-05_sample.jsonl",
    "cc_2023-06_sample.jsonl",
    "github_sample.jsonl",
    "stackexchange_sample.jsonl",
    "wikipedia_sample.jsonl",
]
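
# Glob patterns used by prepare_full() to locate each subset of the full
# RedPajama-Data-1T download under source_path.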
filename_sets = {
    "arxiv": "arxiv/arxiv*",
    "book": "book/book*",
    "c4": "c4/c4-train*",
    "common_crawl": "common_crawl/*",
    "github": "github/filtered*",
    "stackexchange": "stackexchange/stackexchange*",
    "wikipedia": "wikipedia/wiki*",
}

def prepare_sample(
    source_path: Path,
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
    match: str = "",
) -> None:
    """Prepare the "Red Pajama" sample dataset (RedPajama-Data-1T-Sample). We assume the tokenizer has already been trained, i.e. we reuse LLaMA's tokenizer model."""
    destination_path.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(tokenizer_path)

    for name in filenames_sample:
        if match and match not in name:
            continue

        filepath = source_path / name

        if not filepath.is_file():
            raise RuntimeError(
                f"Input file not found at {filepath}. \n"
                "Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
            )

        prefix, _ = os.path.splitext(name)

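        # Pack token ids into fixed-size binary chunks on disk: with dtype="auto" the
        # builder picks an integer dtype based on vocab_size, and tokenizer.bos_id is
        # passed as the separator token.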
        builder = packed_dataset.PackedDatasetBuilder(
            outdir=destination_path,
            prefix=prefix,
            chunk_size=chunk_size,
            sep_token=tokenizer.bos_id,
            dtype="auto",
            vocab_size=tokenizer.vocab_size,
        )

        print(f"Processing {name}")

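        # Stream the JSONL file line by line; each line is one document whose
        # "text" field is tokenized and appended to the packed dataset.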
        with open(filepath, encoding="utf-8") as f:
            for row in tqdm(f):
                text = json.loads(row)["text"]
                text_ids = tokenizer.encode(text)
                builder.add_array(np.array(text_ids, dtype=builder.dtype))

        builder.write_reminder()


def prepare_full(
    source_path: Path,
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
    match: str = "",
) -> None:
    """Prepare the full "Red Pajama" dataset (RedPajama-Data-1T). We assume the tokenizer has already been trained, i.e. we reuse LLaMA's tokenizer model."""
    # zstandard is only needed here (for the compressed Common Crawl shards), so import it lazily.
    import zstandard as zstd

    destination_path.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(tokenizer_path)

    for set_name, pattern in filename_sets.items():
        if match and match not in set_name:
            continue

        is_cc = set_name == "common_crawl"

        filenames = glob.glob(os.path.join(source_path, pattern), recursive=True)

        if not filenames:
            raise RuntimeError(
                f"No files matching {pattern} found at {source_path}. \n"
                "Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
            )

        builder = packed_dataset.PackedDatasetBuilder(
            outdir=destination_path,
            prefix=set_name,
            chunk_size=chunk_size,
            sep_token=tokenizer.bos_id,
            dtype="auto",
            vocab_size=tokenizer.vocab_size,
        )

        for name in filenames:
            # glob already returns paths rooted at source_path, so use them directly
            # instead of joining with source_path again.
            filepath = Path(name)

            print(f"Processing {name}")

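            # The Common Crawl subset is zstandard-compressed JSONL, so it is
            # decompressed on the fly; the other subsets are plain JSONL.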
            if is_cc:
                with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
                    for row in tqdm(f):
                        text = json.loads(row)["text"]
                        text_ids = tokenizer.encode(text)
                        builder.add_array(np.array(text_ids, dtype=builder.dtype))
            else:
                with open(filepath, encoding="utf-8") as f:
                    for row in tqdm(f):
                        text = json.loads(row)["text"]
                        text_ids = tokenizer.encode(text)
                        builder.add_array(np.array(text_ids, dtype=builder.dtype))

        builder.write_reminder()


def prepare(
    source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
    destination_path: Path = Path("data/red_pajama_sample"),
    chunk_size: int = 2049 * 1024,  # 2048 block size + 1 for the causal target shift, times 1024 blocks per chunk
    sample: bool = False,
    match: str = "",
) -> None:
    """Prepare the "Red Pajama" dataset. We assume the tokenizer has already been trained, i.e. we reuse
    LLaMA's tokenizer model. Set sample=True to process the RedPajama-Data-1T-Sample files instead of the full dataset."""
    if sample:
        prepare_sample(
            source_path=source_path,
            tokenizer_path=tokenizer_path,
            destination_path=destination_path,
            chunk_size=chunk_size,
            match=match,
        )
    else:
        prepare_full(
            source_path=source_path,
            tokenizer_path=tokenizer_path,
            destination_path=destination_path,
            chunk_size=chunk_size,
            match=match,
        )


if __name__ == "__main__":
    from jsonargparse import CLI
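
    # jsonargparse turns each parameter of `prepare` into a command-line flag,
    # e.g. (illustrative invocation):
    #   python <this script> --source_path data/RedPajama-Data-1T-Sample \
    #       --tokenizer_path checkpoints/lit-llama/tokenizer.model --sample true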
    CLI(prepare)