from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer
import argparse, os
import sys
import json
from conversion.tokenize import tokenize
from conversion.quantize import embeddings, measure_quant, quant
from conversion.optimize import optimize
from conversion.compile import compile_model

# import tracemalloc
# tracemalloc.start()

parser = argparse.ArgumentParser(description = "Convert model to ExLlamaV2")
parser.add_argument("-i", "--in_dir", type = str, help = "Input directory", default = "")
parser.add_argument("-o", "--out_dir", type = str, help = "Output directory")
parser.add_argument("-c", "--cal_dataset", type = str, help = "Calibration dataset (.parquet file)", default = "")
parser.add_argument("-r", "--dataset_rows", type = int, default = 100, help = "Number of rows to apply from dataset")
parser.add_argument("-mr", "--measurement_rows", type = int, default = 16, help = "Number of rows to apply from dataset when measuring")
parser.add_argument("-gr", "--gpu_rows", type = int, default = 16, help = "Threshold for paging hidden state to CPU")
parser.add_argument("-l", "--length", type = int, default = 2048, help = "Max no. tokens per sample")
parser.add_argument("-ml", "--measurement_length", type = int, default = 2048, help = "Max no. tokens per sample when measuring")
parser.add_argument("-b", "--bits", type = float, default = 4.156, help = "Target bits per weight")
parser.add_argument("-hb", "--head_bits", type = int, default = 6, help = "Target bits per weight (head layer)")
parser.add_argument("-m", "--measurement", type = str, help = "Reuse previous measurement")

args = parser.parse_args()

# Arguments

in_dir = None if args.in_dir == "" else os.path.abspath(args.in_dir)
out_dir = os.path.abspath(args.out_dir)
cal_dataset = None if args.cal_dataset == "" else os.path.abspath(args.cal_dataset)
dataset_rows = args.dataset_rows
measurement_rows = args.measurement_rows
gpu_rows = args.gpu_rows
length = args.length
measurement_length = args.measurement_length
bits = args.bits
head_bits = args.head_bits
reuse_measurement = args.measurement

if not os.path.exists(out_dir):
    print(f" ## Error: Directory not found: {out_dir}")
    sys.exit()

# Create model without loading weights

config = ExLlamaV2Config()
config.model_dir = in_dir
config.prepare()

model = ExLlamaV2(config)
model.load(lazy = True)

tokenizer = ExLlamaV2Tokenizer(config)

# Job file

job_file = os.path.join(out_dir, "job.json")

# Create new job

def save_job():
    global job_file, job
    with open(job_file, "w") as f:
        f.write(json.dumps(job, indent = 4))

if not os.path.exists(job_file):

    print(f" -- Beginning new job")

    if len(os.listdir(out_dir)) != 0:
        print(f" !! Warning: Output directory is not empty: {out_dir}")

    if in_dir is None:
        print(f" ## Error: No input directory specified")
        sys.exit()

    if cal_dataset is None:
        print(f" ## Error: No calibration dataset specified")
        sys.exit()

    job = { "in_dir": in_dir,
            "out_dir": out_dir,
            "cal_dataset": cal_dataset,
            "dataset_rows": dataset_rows,
            "measurement_rows": measurement_rows,
            "gpu_rows": gpu_rows,
            "length": length,
            "measurement_length": measurement_length,
            "bits": bits,
            "head_bits": head_bits,
            "progress": "begin",
            }

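    # An imported measurement lets the "begin" stage below skip the tokenize and
    # measure passes and jump straight to the optimize step.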
    if reuse_measurement is not None:

        with open(reuse_measurement, "r") as f:

            imp_measurement = json.load(f)
            job["measurement"] = imp_measurement["measurement"]
            job["last_module_idx"] = imp_measurement["last_module_idx"]
            job["base_perplexity"] = imp_measurement["base_perplexity"]
            job["reuse_measurement"] = reuse_measurement

    save_job()

# Resume existing job

else:

    print(f" -- Resuming job")
    print(f" !! Note: Overriding options with settings from existing job")

    with open(job_file, "r") as f:
        job = json.load(f)

    if "invalid" in job:
        print(" ** Error: Corrupted job")
        sys.exit()

    job["out_dir"] = out_dir

# Feedback

print(f" -- Input: {job['in_dir']}")
print(f" -- Output: {out_dir}")
print(f" -- Calibration dataset: {job['cal_dataset']}, {job['dataset_rows']} / {job['measurement_rows']} ({job['gpu_rows']}) rows, {job['length']} tokens per sample")
print(f" -- Target bits per weight: {job['bits']} (decoder), {job['head_bits']} (head)")

# Make sure subfolders exist

out_tensor_dir = os.path.join(job["out_dir"], "out_tensor")
if not os.path.exists(out_tensor_dir):
    os.makedirs(out_tensor_dir)

# Do the things
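# Each stage below updates job["progress"] and calls save_job(), so an interrupted
# conversion can be resumed from the last completed stage by re-running the script
# with the same output directory.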

while True:

    progress = job["progress"]

    if progress == "begin":

        if "reuse_measurement" in job:

            print(f" -- Reusing measurement: {job['reuse_measurement']}")
            job["progress"] = "optimize"
            save_job()

        else:

            print(f" -- Tokenizing samples (measurement)...")
            tokenize(job, save_job, tokenizer, measure = True)
            job["progress"] = "initial_embeddings"
            save_job()

    if progress == "initial_embeddings":

        print(f" -- Token embeddings (measurement)...")
        embeddings(job, save_job, model)
        job["progress"] = "measure_quant"
        save_job()

    if progress == "measure_quant":

        print(f" -- Measuring quantization impact...")
        measure_quant(job, save_job, model)
        job["progress"] = "optimize"
        save_job()

    if progress == "optimize":

        print(f" -- Optimizing...")
        optimize(job, save_job)
        job["progress"] = "tokens_cal"
        save_job()

    if progress == "tokens_cal":

        print(f" -- Tokenizing samples...")
        tokenize(job, save_job, tokenizer)
        job["progress"] = "embeddings"
        save_job()

    if progress == "embeddings":
        print(f" -- Token embeddings again...")
        embeddings(job, save_job, model)
        job["progress"] = "quant"
        save_job()

    if progress == "quant":

        print(f" -- Quantizing...")
        quant(job, save_job, model)
        job["progress"] = "compile"
        save_job()

    if progress == "compile":

        print(f" -- Compiling output file...")
        compile_model(job, save_job, model)
        job["progress"] = "finished"
        save_job()

    if progress == "finished": break

print(f" -- Finished")