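"""Entry point for the Flux API server.

Parses the command line, builds a FluxPipeline (either from a config file or
from individual model paths), and serves the API app with uvicorn.
"""
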
import argparse

import uvicorn

from api import app


def parse_args():
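    """Parse command-line options for model paths, devices, quantization,
    and CPU offloading behavior."""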
    parser = argparse.ArgumentParser(description="Launch Flux API server")
    parser.add_argument(
        "-c",
        "--config-path",
        type=str,
        help="Path to the configuration file; if not provided, the model is loaded from the command-line arguments",
    )
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=8088,
        help="Port to run the server on",
    )
    parser.add_argument(
        "-H",
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to run the server on",
    )
    parser.add_argument(
        "-f", "--flow-model-path", type=str, help="Path to the flow model"
    )
    parser.add_argument(
        "-t", "--text-enc-path", type=str, help="Path to the text encoder"
    )
    parser.add_argument(
        "-a", "--autoencoder-path", type=str, help="Path to the autoencoder"
    )
    parser.add_argument(
        "-m",
        "--model-version",
        type=str,
        choices=["flux-dev", "flux-schnell"],
        default="flux-dev",
        help="Choose model version",
    )
    parser.add_argument(
        "-F",
        "--flux-device",
        type=str,
        default="cuda:0",
        help="Device to run the flow model on",
    )
    parser.add_argument(
        "-T",
        "--text-enc-device",
        type=str,
        default="cuda:0",
        help="Device to run the text encoder on",
    )
    parser.add_argument(
        "-A",
        "--autoencoder-device",
        type=str,
        default="cuda:0",
        help="Device to run the autoencoder on",
    )
    parser.add_argument(
        "-q",
        "--num-to-quant",
        type=int,
        default=20,
        help="Number of linear layers in the flow transformer (the 'unet') to quantize",
    )
    parser.add_argument(
        "-C",
        "--compile",
        action="store_true",
        default=False,
        help="Compile the flow model with extra optimizations",
    )
    parser.add_argument(
        "-qT",
        "--quant-text-enc",
        type=str,
        default="qfloat8",
        choices=["qint4", "qfloat8", "qint2", "qint8", "bf16"],
        help="Quantize the T5 text encoder to the given dtype; bf16 leaves the encoder unquantized",
        dest="quant_text_enc",
    )
    parser.add_argument(
        "-qA",
        "--quant-ae",
        action="store_true",
        default=False,
        help="Quantize the autoencoder with float8 linear layers; otherwise bfloat16 is used",
        dest="quant_ae",
    )
    # Offloading defaults: the flow model stays on the GPU unless -OF opts in;
    # the autoencoder and text encoder are offloaded unless -OA/-OT opt out.
    parser.add_argument(
        "-OF",
        "--offload-flow",
        action="store_true",
        default=False,
        dest="offload_flow",
        help="Offload the flow model to the CPU when not in use, to save memory",
    )
    parser.add_argument(
        "-OA",
        "--no-offload-ae",
        action="store_false",
        default=True,
        dest="offload_ae",
        help="Disable offloading the autoencoder to the CPU when not in use, which increases end-to-end inference speed",
    )
    parser.add_argument(
        "-OT",
        "--no-offload-text-enc",
        action="store_false",
        default=True,
        dest="offload_text_enc",
        help="Disable offloading the text encoder to the CPU when not in use, which increases end-to-end inference speed",
    )
    parser.add_argument(
        "-PF",
        "--prequantized-flow",
        action="store_true",
        default=False,
        dest="prequantized_flow",
        # A literal "%" must be written as "%%": argparse applies %-formatting
        # when rendering help text, so a bare "%" would raise a ValueError.
        help="Load the flow model from a prequantized checkpoint "
        "(produced by loading the flow model, running a minimum of 24 steps, "
        "and then saving the state_dict as a safetensors file), "
        "which reduces the size of the checkpoint by about 50%% and reduces startup time",
    )
    parser.add_argument(
        "-nqfm",
        "--no-quantize-flow-modulation",
        action="store_false",
        default=True,
        dest="quantize_modulation",
        help="Disable quantization of the modulation layers in the flow model; adds ~2GB of VRAM usage for moderate precision improvements",
    )
    parser.add_argument(
        "-qfl",
        "--quantize-flow-embedder-layers",
        action="store_true",
        default=False,
        dest="quantize_flow_embedder_layers",
        help="Quantize the flow embedder layers in the flow model; saves ~512MB of VRAM, but the precision loss is very noticeable",
    )
    return parser.parse_args()


def main():
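    """Construct the FluxPipeline from a config file or CLI arguments, then serve it."""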
    args = parse_args()

    # Imports are deferred until after argument parsing so the heavyweight
    # model dependencies are only loaded when the server actually starts.
    from flux_pipeline import FluxPipeline
    from util import load_config, ModelVersion

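    # The loaded pipeline is stored on app.state so request handlers can reach
    # it (this assumes `app` from the api module is a FastAPI/Starlette app).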
    if args.config_path:
        app.state.model = FluxPipeline.load_pipeline_from_config_path(
            args.config_path, flow_model_path=args.flow_model_path
        )
    else:
        model_version = (
            ModelVersion.flux_dev
            if args.model_version == "flux-dev"
            else ModelVersion.flux_schnell
        )
        config = load_config(
            model_version,
            flux_path=args.flow_model_path,
            flux_device=args.flux_device,
            ae_path=args.autoencoder_path,
            ae_device=args.autoencoder_device,
            text_enc_path=args.text_enc_path,
            text_enc_device=args.text_enc_device,
            flow_dtype="float16",
            text_enc_dtype="bfloat16",
            ae_dtype="bfloat16",
            num_to_quant=args.num_to_quant,
            compile_extras=args.compile,
            compile_blocks=args.compile,
            quant_text_enc=(
                None if args.quant_text_enc == "bf16" else args.quant_text_enc
            ),
            quant_ae=args.quant_ae,
            offload_flow=args.offload_flow,
            offload_ae=args.offload_ae,
            offload_text_enc=args.offload_text_enc,
            prequantized_flow=args.prequantized_flow,
            quantize_modulation=args.quantize_modulation,
            quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
        )
        app.state.model = FluxPipeline.load_pipeline_from_config(config)

    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
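
# Example invocations (script and checkpoint names below are placeholders;
# substitute the actual file names used in your setup):
#
#   python main.py --config-path configs/config-dev.json
#   python main.py -f flux-dev.safetensors -t t5-enc/ -a ae.safetensors -q 20 -C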