jeduardogruiz
committed on
Upload 2 files
Browse files- benchmark.py +40 -0
- redact.py +67 -0
benchmark.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import functools
|
3 |
+
import gzip
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
import time
|
8 |
+
from typing import Any, cast
|
9 |
+
|
10 |
+
import blobfile.boto3
|
11 |
+
import numpy as np
|
12 |
+
import tiktoken
|
13 |
+
|
14 |
+
|
15 |
+
def benchmark_batch(documents: list[str]) -> None:
    """Print the batch-encoding throughput of tiktoken and HuggingFace on ``documents``.

    Both tokenizers use their "gpt2" vocabulary, are warmed up with one small
    call before timing, and encode the same ``documents`` so that the two
    "bytes / s" figures are directly comparable.

    Args:
        documents: The texts to encode. Throughput is reported relative to
            their total UTF-8 encoded size.

    Raises:
        KeyError: If the ``RAYON_NUM_THREADS`` environment variable is unset.
        ValueError: If ``RAYON_NUM_THREADS`` is not an integer.
    """
    num_threads = int(os.environ["RAYON_NUM_THREADS"])
    num_bytes = sum(map(len, map(str.encode, documents)))
    print(f"num_threads: {num_threads}, num_bytes: {num_bytes}")

    enc = tiktoken.get_encoding("gpt2")
    # Warm up so one-time initialisation is not part of the measurement.
    enc.encode("warmup")

    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents, num_threads=num_threads)
    end = time.perf_counter_ns()
    print(f"tiktoken \t{num_bytes / (end - start) * 1e9} bytes / s")

    # Imported here rather than at module level, so importing this module
    # does not require transformers to be installed.
    import transformers

    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained("gpt2")
    # NOTE(review): presumably raises the length limit to silence the
    # "sequence longer than model_max_length" warning - confirm.
    hf_enc.model_max_length = 1e30
    hf_enc.pad_token = hf_enc.eos_token
    hf_enc.encode("warmup")

    start = time.perf_counter_ns()
    # Encode the same batch that tiktoken was timed on.
    hf_enc(documents)
    end = time.perf_counter_ns()
    print(f"huggingface \t{num_bytes / (end - start) * 1e9} bytes / s")
|
39 |
+
|
40 |
+
|
redact.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import re
|
3 |
+
import subprocess
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
|
7 |
+
def redact_file(path: Path, dry_run: bool) -> None:
    """Delete ``path`` or strip its redaction markers, printing what was done.

    Behaviour, in order:

    * Missing paths, directories and empty files are ignored silently.
    * Files that cannot be decoded as UTF-8 text are reported as skipped.
    * If the first line contains the word "redact", the whole file is deleted.
    * Otherwise, if the file contains ``redact-beg`` / ``redact-end`` marker
      lines (``#`` or ``<!---`` flavoured), the marker lines and the text
      between each pair are removed and the file is rewritten.
    * Anything else is reported as skipped.

    Args:
        path: File to process.
        dry_run: When true, only print the action; do not touch the file.
    """
    if not path.exists() or path.is_dir():
        return

    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Binary / non-UTF-8 files cannot carry text markers; one such file
        # must not abort the whole run.
        print(f"Skipped {path}")
        return
    if not text:
        return

    first_line = text.splitlines()[0]
    if "redact" in first_line:
        if not dry_run:
            path.unlink()
        print(f"Deleted {path}")
        return

    # Each marker must be a full line; leading spaces (indented markers) are
    # consumed together with the marker.
    pattern = "|".join(
        r" *" + re.escape(marker)
        for marker in (
            "# ===== redact-beg =====\n",
            "# ===== redact-end =====\n",
            "<!--- redact-beg -->\n",
            "<!--- redact-end -->\n",
        )
    )

    if re.search(pattern, text):
        # Splitting on markers alternates kept / redacted segments, so the
        # even-indexed pieces are the text outside the marker pairs.
        redacted_text = "".join(re.split(pattern, text)[::2])
        if not dry_run:
            path.write_text(redacted_text, encoding="utf-8")
        print(f"Redacted {path}")
        return

    print(f"Skipped {path}")
|
40 |
+
|
41 |
+
|
42 |
+
def redact(dry_run: bool) -> None:
    """Apply ``redact_file`` to every file of the enclosing tiktoken checkout.

    The checkout root is the grandparent directory of this file. It must be
    named "tiktoken" and contain a ``pyproject.toml``; these checks guard
    against running the (destructive) redaction in the wrong tree.

    Files are enumerated with ``git ls-files``; if git fails or is not
    installed, every path under the root is used instead.

    Args:
        dry_run: Forwarded to ``redact_file``; when true nothing is modified.

    Raises:
        AssertionError: If the computed root does not look like a tiktoken
            checkout.
    """
    tiktoken_root = Path(__file__).parent.parent
    # Explicit raises instead of ``assert`` so the safety checks still run
    # under ``python -O``.
    if tiktoken_root.name != "tiktoken":
        raise AssertionError(f"unexpected root directory: {tiktoken_root}")
    if not (tiktoken_root / "pyproject.toml").exists():
        raise AssertionError(f"no pyproject.toml in {tiktoken_root}")

    try:
        output = subprocess.check_output(["git", "ls-files"], cwd=tiktoken_root, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        # Not a git checkout, or git is not installed: fall back to a walk.
        paths = list(tiktoken_root.glob("**/*"))
    else:
        # ``git ls-files`` prints paths relative to its cwd (the root), so
        # anchor them there rather than at the process working directory.
        paths = [tiktoken_root / p for p in output.splitlines()]

    for path in paths:
        redact_file(path, dry_run=dry_run)
|
55 |
+
|
56 |
+
|
57 |
+
def main() -> None:
    """Command-line entry point: parse ``--dry-run`` and run the redaction.

    ``--dry-run`` defaults to true. A supplied value turns it off only when
    it starts with "f" or "F" (e.g. ``--dry-run=false``); an empty value or
    anything else keeps dry-run mode on.
    """

    def _parse_flag(raw: str) -> bool:
        # Empty string counts as true; otherwise only a leading f/F is false.
        if raw == "":
            return True
        return raw[0].lower() != "f"

    cli = argparse.ArgumentParser()
    cli.add_argument("--dry-run", type=_parse_flag, default=True)
    options = cli.parse_args()

    dry_run = options.dry_run
    redact(dry_run)
    if dry_run:
        print("Dry run, use --dry-run=false to actually redact files")


if __name__ == "__main__":
    main()
|