Safetensors
aredden committed
Commit d9aea20 · 0 Parent(s)

initial commit
.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__
README.md ADDED
@@ -0,0 +1,267 @@
4
+ # Flux FP16 Accumulate Model Implementation with FastAPI
5
+
6
+ This repository contains an implementation of the Flux model, along with a FastAPI server that generates images from text prompts. The server is configured and launched via command-line arguments.
7
+
8
+ ## Table of Contents
9
+
10
+ - [Installation](#installation)
11
+ - [Usage](#usage)
12
+ - [Configuration](#configuration)
13
+ - [API Endpoints](#api-endpoints)
14
+ - [Examples](#examples)
15
+ - [License](#license)
16
+
17
+ ## Installation
18
+
19
+ To install the required dependencies, run:
20
+
21
+ ```bash
22
+ pip install -r requirements.txt
23
+ ```
25
+
26
+ ## Usage
27
+
28
+ You can run the API server using the following command:
29
+
30
+ ```bash
31
+ python main.py --config-path <path_to_config> --port <port_number> --host <host_address>
32
+ ```
33
+
34
+ ### Command-Line Arguments
35
+
36
+ - `--config-path`: Path to the configuration file. If not provided, the model is configured from the remaining command-line arguments.
37
+ - `--port`: Port to run the server on (default: 8088).
38
+ - `--host`: Host to run the server on (default: 0.0.0.0).
39
+ - `--flow-model-path`: Path to the flow model.
40
+ - `--text-enc-path`: Path to the text encoder.
41
+ - `--autoencoder-path`: Path to the autoencoder.
42
+ - `--model-version`: Choose model version (`flux-dev` or `flux-schnell`).
43
+ - `--flux-device`: Device to run the flow model on (default: cuda:0).
44
+ - `--text-enc-device`: Device to run the text encoder on (default: cuda:0).
45
+ - `--autoencoder-device`: Device to run the autoencoder on (default: cuda:0).
46
+ - `--num-to-quant`: Number of linear layers in the flow transformer to quantize (default: 20).
47
+
48
+ ## Configuration
49
+
50
+ The configuration files are located in the `configs` directory. You can specify different configurations for different model versions and devices.
51
+
52
+ Example configuration file (`configs/config-dev.json`):
53
+
54
+ ```json
55
+ {
56
+ "version": "flux-dev",
57
+ "params": {
58
+ "in_channels": 64,
59
+ "vec_in_dim": 768,
60
+ "context_in_dim": 4096,
61
+ "hidden_size": 3072,
62
+ "mlp_ratio": 4.0,
63
+ "num_heads": 24,
64
+ "depth": 19,
65
+ "depth_single_blocks": 38,
66
+ "axes_dim": [16, 56, 56],
67
+ "theta": 10000,
68
+ "qkv_bias": true,
69
+ "guidance_embed": true
70
+ },
71
+ "ae_params": {
72
+ "resolution": 256,
73
+ "in_channels": 3,
74
+ "ch": 128,
75
+ "out_ch": 3,
76
+ "ch_mult": [1, 2, 4, 4],
77
+ "num_res_blocks": 2,
78
+ "z_channels": 16,
79
+ "scale_factor": 0.3611,
80
+ "shift_factor": 0.1159
81
+ },
82
+ "ckpt_path": "/path/to/your/flux1-dev.sft",
83
+ "ae_path": "/path/to/your/ae.sft",
84
+ "repo_id": "black-forest-labs/FLUX.1-dev",
85
+ "repo_flow": "flux1-dev.sft",
86
+ "repo_ae": "ae.sft",
87
+ "text_enc_max_length": 512,
88
+ "text_enc_path": "path/to/your/t5-v1_1-xxl-encoder-bf16", // or "city96/t5-v1_1-xxl-encoder-bf16" for an easy-to-download version
89
+ "text_enc_device": "cuda:1",
90
+ "ae_device": "cuda:1",
91
+ "flux_device": "cuda:0",
92
+ "flow_dtype": "float16",
93
+ "ae_dtype": "bfloat16",
94
+ "text_enc_dtype": "bfloat16",
95
+ "num_to_quant": 20
96
+ }
97
+ ```
98
+
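+ As a quick sketch of loading a configuration programmatically (assuming `util.load_config_from_path` behaves as it is used in `flux_impl.py`; `util.py` is not shown in this commit view), you could do:
+
+ ```py
+ from util import load_config_from_path
+
+ # Parse the JSON config into a ModelSpec and inspect a few fields
+ config = load_config_from_path("configs/config-dev.json")
+ print(config.version, config.flux_device, config.num_to_quant)
+ ```
+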
99
+ ## API Endpoints
100
+
101
+ ### Generate Image
102
+
103
+ - **URL**: `/generate`
104
+ - **Method**: `POST`
105
+ - **Request Body**:
106
+
107
+ - `prompt` (str): The text prompt for image generation.
108
+ - `width` (int, optional): The width of the generated image (default: 720).
109
+ - `height` (int, optional): The height of the generated image (default: 1024).
110
+ - `num_steps` (int, optional): The number of steps for the generation process (default: 24).
111
+ - `guidance` (float, optional): The guidance scale for the generation process (default: 3.5).
112
+ - `seed` (int, optional): The seed for random number generation.
113
+
114
+ - **Response**: A JPEG image stream.
115
+
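+ The request body maps onto the `GenerateArgs` pydantic model in `api.py`. A minimal sketch of how defaults are filled in (assuming pydantic v2, which provides `model_dump`; the prompt below is just an example):
+
+ ```py
+ from api import GenerateArgs
+
+ # Only `prompt` is required; other fields fall back to the defaults listed above,
+ # and `seed` is drawn from numpy's RNG when omitted.
+ args = GenerateArgs(prompt="a misty forest at dawn")
+ print(args.width, args.height, args.num_steps, args.guidance)  # 720 1024 24 3.5
+ ```
+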
116
+ ## Examples
117
+
118
+ ### Running the Server
119
+
120
+ ```bash
121
+ python main.py --config-path configs/config-dev.json --port 8088 --host 0.0.0.0
122
+ ```
123
+
124
+ Or, if you need more granular control, you can launch the server with explicit arguments:
125
+
126
+ ```bash
127
+ python main.py --port 8088 --host 0.0.0.0 \
128
+ --flow-model-path /path/to/your/flux1-dev.sft \
129
+ --text-enc-path /path/to/your/t5-v1_1-xxl-encoder-bf16 \
130
+ --autoencoder-path /path/to/your/ae.sft \
131
+ --model-version flux-dev \
132
+ --flux-device cuda:0 \
133
+ --text-enc-device cuda:1 \
134
+ --autoencoder-device cuda:1 \
135
+ --num-to-quant 20
136
+ ```
137
+
138
+ ### Generating an Image
139
+
140
+ Send a POST request to `http://<host>:<port>/generate` with the following JSON body:
141
+
142
+ ```json
143
+ {
144
+ "prompt": "a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
145
+ "width": 1024,
146
+ "height": 1024,
147
+ "num_steps": 24,
148
+ "guidance": 3.0,
149
+ "seed": 13456
150
+ }
151
+ ```
152
+
153
+ Here is an example of generating an image from a Python client using the FastAPI server:
154
+
155
+ ```py
156
+ import requests
158
+
159
+ prompt = "a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns"
160
+ res = requests.post(
161
+ "http://localhost:8088/generate",
162
+ json={
163
+ "width": 1024,
164
+ "height": 720,
165
+ "num_steps": 20,
166
+ "guidance": 4,
167
+ "prompt": prompt,
168
+ },
169
+ stream=True,
170
+ )
171
+
172
+ with open("output.jpg", "wb") as f:
173
+ f.write(res.content)
174
+
175
+ ```
176
+
177
+ ## License
178
+
179
+ This project is licensed under the MIT License.
180
+
182
+
183
+ ## References
184
+
185
+ - Code for loading the pipeline from the configuration path:
186
+
187
+ ```200:310:flux_impl.py
188
+ @torch.inference_mode()
189
+ def load_pipeline_from_config(config: ModelSpec) -> Model:
190
+ models = load_models_from_config(config)
191
+ config = models.config
192
+ num_quanted = 0
193
+ max_quanted = config.num_to_quant
194
+ flux_device = into_device(config.flux_device)
195
+ ae_device = into_device(config.ae_device)
196
+ clip_device = into_device(config.text_enc_device)
197
+ t5_device = into_device(config.text_enc_device)
198
+ flux_dtype = into_dtype(config.flow_dtype)
199
+ device_index = flux_device.index or 0
200
+ flow_model = models.flow.requires_grad_(False).eval().type(flux_dtype)
201
+ for block in flow_model.single_blocks:
202
+ block.cuda(flux_device)
203
+ if num_quanted < max_quanted:
204
+ num_quanted = quant_module(
205
+ block.linear1, num_quanted, device_index=device_index
206
+ )
207
+
208
+ for block in flow_model.double_blocks:
209
+ block.cuda(flux_device)
210
+ if num_quanted < max_quanted:
211
+ num_quanted = full_quant(
212
+ block, max_quanted, num_quanted, device_index=device_index
213
+ )
214
+
215
+ to_gpu_extras = [
216
+ "vector_in",
217
+ "img_in",
218
+ "txt_in",
219
+ "time_in",
220
+ "guidance_in",
221
+ "final_layer",
222
+ "pe_embedder",
223
+ ]
224
+ for extra in to_gpu_extras:
225
+ getattr(flow_model, extra).cuda(flux_device).type(flux_dtype)
226
+ ```
227
+
228
+ - Code for the main entry point:
229
+
230
+ ```59:85:main.py
231
+ def main():
232
+ args = parse_args()
233
+
234
+ if args.config_path:
235
+ app.state.model = load_pipeline_from_config_path(args.config_path)
236
+ else:
237
+ model_version = (
238
+ ModelVersion.flux_dev
239
+ if args.model_version == "flux-dev"
240
+ else ModelVersion.flux_schnell
241
+ )
242
+ config = load_config(
243
+ model_version,
244
+ flux_path=args.flow_model_path,
245
+ flux_device=args.flux_device,
246
+ ae_path=args.autoencoder_path,
247
+ ae_device=args.autoencoder_device,
248
+ text_enc_path=args.text_enc_path,
249
+ text_enc_device=args.text_enc_device,
250
+ flow_dtype="float16",
251
+ text_enc_dtype="bfloat16",
252
+ ae_dtype="bfloat16",
253
+ num_to_quant=args.num_to_quant,
254
+ )
255
+ app.state.model = load_pipeline_from_config(config)
256
+
257
+ uvicorn.run(app, host=args.host, port=args.port)
258
+ ```
259
+
260
+ - Code for the API endpoint:
261
+
262
+ ```22:25:api.py
263
+ @app.post("/generate")
264
+ def generate(args: GenerateArgs):
265
+ result = app.state.model.generate(**args.model_dump())
266
+ return StreamingResponse(result, media_type="image/jpeg")
267
+ ```
api.py ADDED
@@ -0,0 +1,25 @@
1
+ from typing import Optional
2
+
3
+ import numpy as np
4
+ from fastapi import FastAPI
5
+ from fastapi.responses import StreamingResponse
6
+ from pydantic import BaseModel, Field
7
+
8
+ app = FastAPI()
9
+
10
+
11
+ class GenerateArgs(BaseModel):
12
+ prompt: str
13
+ width: Optional[int] = Field(default=720)
14
+ height: Optional[int] = Field(default=1024)
15
+ num_steps: Optional[int] = Field(default=24)
16
+ guidance: Optional[float] = Field(default=3.5)
17
+ seed: Optional[int] = Field(
18
+ default_factory=lambda: np.random.randint(0, 2**32 - 1), ge=0, lt=2**32 - 1
19
+ )
20
+
21
+
22
+ @app.post("/generate")
23
+ def generate(args: GenerateArgs):
24
+ result = app.state.model.generate(**args.model_dump())
25
+ return StreamingResponse(result, media_type="image/jpeg")
configs/config-dev-cuda0.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "version": "flux-dev",
3
+ "params": {
4
+ "in_channels": 64,
5
+ "vec_in_dim": 768,
6
+ "context_in_dim": 4096,
7
+ "hidden_size": 3072,
8
+ "mlp_ratio": 4.0,
9
+ "num_heads": 24,
10
+ "depth": 19,
11
+ "depth_single_blocks": 38,
12
+ "axes_dim": [
13
+ 16,
14
+ 56,
15
+ 56
16
+ ],
17
+ "theta": 10000,
18
+ "qkv_bias": true,
19
+ "guidance_embed": true
20
+ },
21
+ "ae_params": {
22
+ "resolution": 256,
23
+ "in_channels": 3,
24
+ "ch": 128,
25
+ "out_ch": 3,
26
+ "ch_mult": [
27
+ 1,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "z_channels": 16,
34
+ "scale_factor": 0.3611,
35
+ "shift_factor": 0.1159
36
+ },
37
+ "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
38
+ "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
39
+ "repo_id": "black-forest-labs/FLUX.1-dev",
40
+ "repo_flow": "flux1-dev.sft",
41
+ "repo_ae": "ae.sft",
42
+ "text_enc_max_length": 512,
43
+ "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
44
+ "text_enc_device": "cuda:0",
45
+ "ae_device": "cuda:0",
46
+ "flux_device": "cuda:0",
47
+ "flow_dtype": "float16",
48
+ "ae_dtype": "bfloat16",
49
+ "text_enc_dtype": "bfloat16",
50
+ "num_to_quant": 20
51
+ }
configs/config-dev.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "version": "flux-dev",
3
+ "params": {
4
+ "in_channels": 64,
5
+ "vec_in_dim": 768,
6
+ "context_in_dim": 4096,
7
+ "hidden_size": 3072,
8
+ "mlp_ratio": 4.0,
9
+ "num_heads": 24,
10
+ "depth": 19,
11
+ "depth_single_blocks": 38,
12
+ "axes_dim": [
13
+ 16,
14
+ 56,
15
+ 56
16
+ ],
17
+ "theta": 10000,
18
+ "qkv_bias": true,
19
+ "guidance_embed": true
20
+ },
21
+ "ae_params": {
22
+ "resolution": 256,
23
+ "in_channels": 3,
24
+ "ch": 128,
25
+ "out_ch": 3,
26
+ "ch_mult": [
27
+ 1,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "z_channels": 16,
34
+ "scale_factor": 0.3611,
35
+ "shift_factor": 0.1159
36
+ },
37
+ "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
38
+ "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
39
+ "repo_id": "black-forest-labs/FLUX.1-dev",
40
+ "repo_flow": "flux1-dev.sft",
41
+ "repo_ae": "ae.sft",
42
+ "text_enc_max_length": 512,
43
+ "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
44
+ "text_enc_device": "cuda:1",
45
+ "ae_device": "cuda:1",
46
+ "flux_device": "cuda:0",
47
+ "flow_dtype": "float16",
48
+ "ae_dtype": "bfloat16",
49
+ "text_enc_dtype": "bfloat16",
50
+ "num_to_quant": 20
51
+ }
configs/config-schnell-cuda0.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "version": "flux-schnell",
3
+ "params": {
4
+ "in_channels": 64,
5
+ "vec_in_dim": 768,
6
+ "context_in_dim": 4096,
7
+ "hidden_size": 3072,
8
+ "mlp_ratio": 4.0,
9
+ "num_heads": 24,
10
+ "depth": 19,
11
+ "depth_single_blocks": 38,
12
+ "axes_dim": [
13
+ 16,
14
+ 56,
15
+ 56
16
+ ],
17
+ "theta": 10000,
18
+ "qkv_bias": true,
19
+ "guidance_embed": true
20
+ },
21
+ "ae_params": {
22
+ "resolution": 256,
23
+ "in_channels": 3,
24
+ "ch": 128,
25
+ "out_ch": 3,
26
+ "ch_mult": [
27
+ 1,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "z_channels": 16,
34
+ "scale_factor": 0.3611,
35
+ "shift_factor": 0.1159
36
+ },
37
+ "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/flux1-schnell.sft",
38
+ "ae_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/ae.sft",
39
+ "repo_id": "black-forest-labs/FLUX.1-schnell",
40
+ "repo_flow": "flux1-schnell.sft",
41
+ "repo_ae": "ae.sft",
42
+ "text_enc_max_length": 256,
43
+ "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
44
+ "text_enc_device": "cuda:0",
45
+ "ae_device": "cuda:0",
46
+ "flux_device": "cuda:0",
47
+ "flow_dtype": "float16",
48
+ "ae_dtype": "bfloat16",
49
+ "text_enc_dtype": "bfloat16",
50
+ "num_to_quant": 20
51
+ }
configs/config-schnell.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "version": "flux-schnell",
3
+ "params": {
4
+ "in_channels": 64,
5
+ "vec_in_dim": 768,
6
+ "context_in_dim": 4096,
7
+ "hidden_size": 3072,
8
+ "mlp_ratio": 4.0,
9
+ "num_heads": 24,
10
+ "depth": 19,
11
+ "depth_single_blocks": 38,
12
+ "axes_dim": [
13
+ 16,
14
+ 56,
15
+ 56
16
+ ],
17
+ "theta": 10000,
18
+ "qkv_bias": true,
19
+ "guidance_embed": true
20
+ },
21
+ "ae_params": {
22
+ "resolution": 256,
23
+ "in_channels": 3,
24
+ "ch": 128,
25
+ "out_ch": 3,
26
+ "ch_mult": [
27
+ 1,
28
+ 2,
29
+ 4,
30
+ 4
31
+ ],
32
+ "num_res_blocks": 2,
33
+ "z_channels": 16,
34
+ "scale_factor": 0.3611,
35
+ "shift_factor": 0.1159
36
+ },
37
+ "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/flux1-schnell.sft",
38
+ "ae_path": "/big/generator-ui/flux-testing/flux/model-dir-schnell/ae.sft",
39
+ "repo_id": "black-forest-labs/FLUX.1-schnell",
40
+ "repo_flow": "flux1-schnell.sft",
41
+ "repo_ae": "ae.sft",
42
+ "text_enc_max_length": 256,
43
+ "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
44
+ "text_enc_device": "cuda:1",
45
+ "ae_device": "cuda:1",
46
+ "flux_device": "cuda:0",
47
+ "flow_dtype": "float16",
48
+ "ae_dtype": "bfloat16",
49
+ "text_enc_dtype": "bfloat16",
50
+ "num_to_quant": 20
51
+ }
cublas_linear.py ADDED
@@ -0,0 +1,152 @@
1
+ import math
2
+ from typing import Any, Literal, Optional
3
+
4
+ import torch
5
+ from torch.nn import functional as F
6
+
7
+ from cublas_ops_ext import _simt_hgemv
8
+ from cublas_ops_ext import cublas_hgemm_axbT as _cublas_hgemm_axbT
9
+ from cublas_ops_ext import cublas_hgemm_batched_simple as _cublas_hgemm_batched_simple
10
+ from cublas_ops_ext import (
11
+ cublaslt_hgemm_batched_simple as _cublaslt_hgemm_batched_simple,
12
+ )
13
+ from cublas_ops_ext import cublaslt_hgemm_simple as _cublaslt_hgemm_simple
14
+ from torch import Tensor, nn
15
+
16
+ global has_moved
17
+ has_moved = {idx: False for idx in range(torch.cuda.device_count())}
18
+
19
+
20
+ class StaticState:
21
+ workspace = {
22
+ idx: torch.empty((1024 * 1024 * 8,), dtype=torch.uint8)
23
+ for idx in range(torch.cuda.device_count())
24
+ }
25
+ workspace_size = workspace[0].nelement()
26
+ bias_g = {
27
+ idx: torch.tensor([], dtype=torch.float16)
28
+ for idx in range(torch.cuda.device_count())
29
+ }
30
+
31
+ @classmethod
32
+ def get(cls, __name: str, device: torch.device) -> Any:
33
+ global has_moved
34
+ idx = device.index if device.index is not None else 0
35
+ if not has_moved[idx]:
36
+ cls.workspace[idx] = cls.workspace[idx].cuda(idx)
37
+ cls.bias_g[idx] = cls.bias_g[idx].cuda(idx)
38
+ has_moved[idx] = True
39
+ if "bias" in __name:
40
+ return cls.bias_g[idx]
41
+ if "workspace" in __name:
42
+ return cls.workspace[idx]
43
+ if "workspace_size" in __name:
44
+ return cls.workspace_size
45
+
46
+
47
+ @torch.no_grad()
48
+ def hgemv_simt(vec: torch.HalfTensor, mat: torch.HalfTensor, block_dim_x: int = 32):
49
+ prev_dims = vec.shape[:-1]
50
+ out = _simt_hgemv(mat, vec.view(-1, 1), block_dim_x=block_dim_x).view(
51
+ *prev_dims, -1
52
+ )
53
+ return out
54
+
55
+
56
+ @torch.no_grad()
57
+ def cublas_half_matmul_batched_simple(a: torch.Tensor, b: torch.Tensor):
58
+ out = _cublas_hgemm_batched_simple(a, b)
59
+ return out
60
+
61
+
62
+ @torch.no_grad()
63
+ def cublas_half_matmul_simple(a: torch.Tensor, b: torch.Tensor):
64
+ out = _cublas_hgemm_axbT(b, a)
65
+ return out
66
+
67
+
68
+ @torch.no_grad()
69
+ def cublaslt_fused_half_matmul_simple(
70
+ a: torch.Tensor,
71
+ b: torch.Tensor,
72
+ bias: Optional[torch.Tensor] = None,
73
+ epilogue_str: Optional[Literal["NONE", "RELU", "GELU"]] = "NONE",
74
+ ):
75
+ if bias is None:
76
+ bias = StaticState.get("bias", a.device)
77
+ out = _cublaslt_hgemm_simple(
78
+ a, b, bias, epilogue_str, StaticState.get("workspace", a.device)
79
+ )
80
+ return out
81
+
82
+
83
+ @torch.no_grad()
84
+ def cublaslt_fused_half_matmul_batched_simple(
85
+ a: torch.Tensor,
86
+ b: torch.Tensor,
87
+ bias: Optional[torch.Tensor] = None,
88
+ epilogue_str: Optional[Literal["NONE", "RELU", "GELU"]] = "NONE",
89
+ ):
90
+ if bias is None:
91
+ bias = StaticState.get("bias", a.device)
92
+ out = _cublaslt_hgemm_batched_simple(
93
+ a, b, bias, epilogue_str, StaticState.get("workspace", a.device)
94
+ )
95
+ return out
96
+
97
+
98
+ class CublasLinear(nn.Linear):
99
+ def __init__(
100
+ self,
101
+ in_features,
102
+ out_features,
103
+ bias=True,
104
+ device=None,
105
+ dtype=torch.float16,
106
+ epilogue_str="NONE",
107
+ ):
108
+ super().__init__(
109
+ in_features, out_features, bias=bias, device=device, dtype=dtype
110
+ )
111
+ self._epilogue_str = epilogue_str
112
+ self.has_bias = bias
113
+ self.has_checked_weight = False
114
+
115
+ def forward(self, x: Tensor) -> Tensor:
116
+ if not self.has_checked_weight:
117
+ if not self.weight.dtype == torch.float16:
118
+ self.to(dtype=torch.float16)
119
+ self.has_checked_weight = True
120
+ out_dtype = x.dtype
121
+ needs_convert = out_dtype != torch.float16
122
+ if needs_convert:
123
+ x = x.type(torch.float16)
124
+
125
+ use_cublasLt = self.has_bias or self._epilogue_str != "NONE"
126
+ if x.ndim == 1:
127
+ x = x.unsqueeze(0)
128
+ if math.prod(x.shape) == x.shape[-1]:
129
+ out = F.linear(x, self.weight, bias=self.bias)
130
+ if self._epilogue_str == "RELU":
131
+ return F.relu(out)
132
+ elif self._epilogue_str == "GELU":
133
+ return F.gelu(out)
134
+ if needs_convert:
135
+ return out.type(out_dtype)
136
+ return out
137
+ if use_cublasLt:
138
+ leading_dims = x.shape[:-1]
139
+ x = x.reshape(-1, x.shape[-1])
140
+ out = cublaslt_fused_half_matmul_simple(
141
+ x, self.weight, bias=self.bias.data, epilogue_str=self._epilogue_str
142
+ )
143
+ if needs_convert:
144
+ return out.view(*leading_dims, out.shape[-1]).type(out_dtype)
145
+ return out.view(*leading_dims, out.shape[-1])
146
+ else:
147
+ leading_dims = x.shape[:-1]
148
+ x = x.reshape(-1, x.shape[-1])
149
+ out = cublas_half_matmul_simple(x, self.weight)
150
+ if needs_convert:
151
+ return out.view(*leading_dims, out.shape[-1]).type(out_dtype)
152
+ return out.view(*leading_dims, out.shape[-1])
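
A minimal usage sketch for the `CublasLinear` layer defined above, assuming the `cublas_ops_ext` extension is built and a CUDA device is available:

```py
import torch

from cublas_linear import CublasLinear

# Drop-in replacement for nn.Linear with an optional fused epilogue
layer = CublasLinear(3072, 3072, bias=True, device="cuda:0", epilogue_str="GELU")
x = torch.randn(2, 16, 3072, dtype=torch.float16, device="cuda:0")
y = layer(x)  # fp16 matmul + bias + GELU via cuBLASLt
print(y.shape)  # torch.Size([2, 16, 3072])
```
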
flux_impl.py ADDED
@@ -0,0 +1,272 @@
1
+ import io
2
+ from typing import List
3
+
4
+ import torch
5
+ from torch import nn
6
+
7
+ torch.backends.cuda.matmul.allow_tf32 = True
8
+ torch.backends.cudnn.allow_tf32 = True
9
+ torch.backends.cudnn.benchmark = True
10
+ torch.backends.cudnn.benchmark_limit = 20
11
+ torch.set_float32_matmul_precision("high")
12
+ from torch._dynamo import config
13
+ from torch._inductor import config as ind_config
14
+
15
+ config.cache_size_limit = 10000000000
16
+ ind_config.force_fuse_int_mm_with_mul = True
17
+
18
+ from loguru import logger
19
+ from torchao.quantization.quant_api import int8_weight_only, quantize_
20
+
21
+ from cublas_linear import CublasLinear as F16Linear
22
+ from modules.flux_model import RMSNorm
23
+ from sampling import denoise, get_noise, get_schedule, prepare, unpack
24
+ from turbojpeg_imgs import TurboImage
25
+ from util import (
26
+ ModelSpec,
27
+ into_device,
28
+ into_dtype,
29
+ load_config_from_path,
30
+ load_models_from_config,
31
+ )
32
+
33
+
34
+ class Model:
35
+ def __init__(
36
+ self,
37
+ name,
38
+ offload=False,
39
+ clip=None,
40
+ t5=None,
41
+ model=None,
42
+ ae=None,
43
+ dtype=torch.bfloat16,
44
+ verbose=False,
45
+ flux_device="cuda:0",
46
+ ae_device="cuda:1",
47
+ clip_device="cuda:1",
48
+ t5_device="cuda:1",
49
+ ):
50
+
51
+ self.name = name
52
+ self.device_flux = (
53
+ flux_device
54
+ if isinstance(flux_device, torch.device)
55
+ else torch.device(flux_device)
56
+ )
57
+ self.device_ae = (
58
+ ae_device
59
+ if isinstance(ae_device, torch.device)
60
+ else torch.device(ae_device)
61
+ )
62
+ self.device_clip = (
63
+ clip_device
64
+ if isinstance(clip_device, torch.device)
65
+ else torch.device(clip_device)
66
+ )
67
+ self.device_t5 = (
68
+ t5_device
69
+ if isinstance(t5_device, torch.device)
70
+ else torch.device(t5_device)
71
+ )
72
+ self.dtype = dtype
73
+ self.offload = offload
74
+ self.clip = clip
75
+ self.t5 = t5
76
+ self.model = model
77
+ self.ae = ae
78
+ self.rng = torch.Generator(device="cpu")
79
+ self.turbojpeg = TurboImage()
80
+ self.verbose = verbose
81
+
82
+ @torch.inference_mode()
83
+ def generate(
84
+ self,
85
+ prompt,
86
+ width=720,
87
+ height=1024,
88
+ num_steps=24,
89
+ guidance=3.5,
90
+ seed=None,
91
+ ):
92
+ if num_steps is None:
93
+ num_steps = 4 if self.name == "flux-schnell" else 50
94
+
95
+ # allow for packing and conversion to latent space
96
+ height = 16 * (height // 16)
97
+ width = 16 * (width // 16)
98
+
99
+ if seed is None:
100
+ seed = self.rng.seed()
101
+ logger.info(f"Generating with:\nSeed: {seed}\nPrompt: {prompt}")
102
+
103
+ x = get_noise(
104
+ 1,
105
+ height,
106
+ width,
107
+ device=self.device_t5,
108
+ dtype=torch.bfloat16,
109
+ seed=seed,
110
+ )
111
+ inp = prepare(self.t5, self.clip, x, prompt=prompt)
112
+ timesteps = get_schedule(
113
+ num_steps, inp["img"].shape[1], shift=(self.name != "flux-schnell")
114
+ )
115
+ for k in inp:
116
+ inp[k] = inp[k].to(self.device_flux).type(self.dtype)
117
+
118
+ # denoise initial noise
119
+ x = denoise(
120
+ self.model,
121
+ **inp,
122
+ timesteps=timesteps,
123
+ guidance=guidance,
124
+ dtype=self.dtype,
125
+ device=self.device_flux,
126
+ )
127
+ inp.clear()
128
+ timesteps.clear()
129
+ torch.cuda.empty_cache()
130
+ x = x.to(self.device_ae)
131
+
132
+ # decode latents to pixel space
133
+ x = unpack(x.float(), height, width)
134
+ with torch.autocast(
135
+ device_type=self.device_ae.type, dtype=torch.bfloat16, cache_enabled=False
136
+ ):
137
+ x = self.ae.decode(x)
138
+
139
+ # bring into PIL format and save
140
+ x = x.clamp(-1, 1)
141
+ num_images = x.shape[0]
142
+ images: List[torch.Tensor] = []
143
+ for i in range(num_images):
144
+ img = x[i].permute(1, 2, 0).add(1.0).mul(127.5).type(torch.uint8).contiguous()
145
+ images.append(img)
146
+ if len(images) == 1:
147
+ im = images[0]
148
+ else:
149
+ im = torch.vstack(images)
150
+
151
+ im = self.turbojpeg.encode_torch(im, quality=95)
152
+ images.clear()
153
+ return io.BytesIO(im)
154
+
155
+
156
+ def quant_module(module, running_sum_quants=0, device_index=0):
157
+ if isinstance(module, nn.Linear) and not isinstance(module, F16Linear):
158
+ module.cuda(device_index)
159
+ module.compile()
160
+ quantize_(module, int8_weight_only())
161
+ running_sum_quants += 1
162
+ elif isinstance(module, F16Linear):
163
+ module.cuda(device_index)
164
+ elif isinstance(module, nn.Conv2d):
165
+ module.cuda(device_index)
166
+ elif isinstance(module, nn.Embedding):
167
+ module.cuda(device_index)
168
+ elif isinstance(module, nn.ConvTranspose2d):
169
+ module.cuda(device_index)
170
+ elif isinstance(module, nn.Conv1d):
171
+ module.cuda(device_index)
172
+ elif isinstance(module, nn.Conv3d):
173
+ module.cuda(device_index)
174
+ elif isinstance(module, nn.ConvTranspose3d):
175
+ module.cuda(device_index)
176
+ elif isinstance(module, nn.RMSNorm):
177
+ module.cuda(device_index)
178
+ elif isinstance(module, RMSNorm):
179
+ module.cuda(device_index)
180
+ elif isinstance(module, nn.LayerNorm):
181
+ module.cuda(device_index)
182
+ return running_sum_quants
183
+
184
+
185
+ def full_quant(model, max_quants=24, current_quants=0, device_index=0):
186
+ for module in model.modules():
187
+ if current_quants < max_quants:
188
+ current_quants = quant_module(
189
+ module, current_quants, device_index=device_index
190
+ )
191
+ return current_quants
192
+
193
+
194
+ @torch.inference_mode()
195
+ def load_pipeline_from_config_path(path: str) -> Model:
196
+ config = load_config_from_path(path)
197
+ return load_pipeline_from_config(config)
198
+
199
+
200
+ @torch.inference_mode()
201
+ def load_pipeline_from_config(config: ModelSpec) -> Model:
202
+ models = load_models_from_config(config)
203
+ config = models.config
204
+ num_quanted = 0
205
+ max_quanted = config.num_to_quant
206
+ flux_device = into_device(config.flux_device)
207
+ ae_device = into_device(config.ae_device)
208
+ clip_device = into_device(config.text_enc_device)
209
+ t5_device = into_device(config.text_enc_device)
210
+ flux_dtype = into_dtype(config.flow_dtype)
211
+ device_index = flux_device.index or 0
212
+ flow_model = models.flow.requires_grad_(False).eval().type(flux_dtype)
213
+ for block in flow_model.single_blocks:
214
+ block.cuda(flux_device)
215
+ if num_quanted < max_quanted:
216
+ num_quanted = quant_module(
217
+ block.linear1, num_quanted, device_index=device_index
218
+ )
219
+
220
+ for block in flow_model.double_blocks:
221
+ block.cuda(flux_device)
222
+ if num_quanted < max_quanted:
223
+ num_quanted = full_quant(
224
+ block, max_quanted, num_quanted, device_index=device_index
225
+ )
226
+
227
+ to_gpu_extras = [
228
+ "vector_in",
229
+ "img_in",
230
+ "txt_in",
231
+ "time_in",
232
+ "guidance_in",
233
+ "final_layer",
234
+ "pe_embedder",
235
+ ]
236
+ for extra in to_gpu_extras:
237
+ getattr(flow_model, extra).cuda(flux_device).type(flux_dtype)
238
+ return Model(
239
+ name=config.version,
240
+ clip=models.clip,
241
+ t5=models.t5,
242
+ model=flow_model,
243
+ ae=models.ae,
244
+ dtype=flux_dtype,
245
+ verbose=False,
246
+ flux_device=flux_device,
247
+ ae_device=ae_device,
248
+ clip_device=clip_device,
249
+ t5_device=t5_device,
250
+ )
251
+
252
+
253
+ if __name__ == "__main__":
254
+ pipe = load_pipeline_from_config_path("config-dev.json")
255
+ o = pipe.generate(
256
+ prompt="a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
257
+ height=1024,
258
+ width=1024,
259
+ seed=13456,
260
+ num_steps=24,
261
+ guidance=3.0,
262
+ )
263
+ open("out.jpg", "wb").write(o.read())
264
+ o = pipe.generate(
265
+ prompt="a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
266
+ height=1024,
267
+ width=1024,
268
+ seed=7,
269
+ num_steps=24,
270
+ guidance=3.0,
271
+ )
272
+ open("out2.jpg", "wb").write(o.read())
main.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ import uvicorn
3
+ from api import app
4
+ from flux_impl import load_pipeline_from_config, load_pipeline_from_config_path
5
+ from util import load_config, ModelVersion
6
+
7
+
8
+ def parse_args():
9
+ parser = argparse.ArgumentParser(description="Launch Flux API server")
10
+ parser.add_argument(
11
+ "--config-path",
12
+ type=str,
13
+ help="Path to the configuration file, if not provided, the model will be loaded from the command line arguments",
14
+ )
15
+ parser.add_argument(
16
+ "--port", type=int, default=8088, help="Port to run the server on"
17
+ )
18
+ parser.add_argument(
19
+ "--host", type=str, default="0.0.0.0", help="Host to run the server on"
20
+ )
21
+ parser.add_argument("--flow-model-path", type=str, help="Path to the flow model")
22
+ parser.add_argument("--text-enc-path", type=str, help="Path to the text encoder")
23
+ parser.add_argument("--autoencoder-path", type=str, help="Path to the autoencoder")
24
+ parser.add_argument(
25
+ "--model-version",
26
+ type=str,
27
+ choices=["flux-dev", "flux-schnell"],
28
+ default="flux-dev",
29
+ help="Choose model version",
30
+ )
31
+ parser.add_argument(
32
+ "--flux-device",
33
+ type=str,
34
+ default="cuda:0",
35
+ help="Device to run the flow model on",
36
+ )
37
+ parser.add_argument(
38
+ "--text-enc-device",
39
+ type=str,
40
+ default="cuda:0",
41
+ help="Device to run the text encoder on",
42
+ )
43
+ parser.add_argument(
44
+ "--autoencoder-device",
45
+ type=str,
46
+ default="cuda:0",
47
+ help="Device to run the autoencoder on",
48
+ )
49
+ parser.add_argument(
50
+ "--num-to-quant",
51
+ type=int,
52
+ default=20,
53
+ help="Number of linear layers in flow transformer (the 'unet') to quantize",
54
+ )
55
+
56
+ return parser.parse_args()
57
+
58
+
59
+ def main():
60
+ args = parse_args()
61
+
62
+ if args.config_path:
63
+ app.state.model = load_pipeline_from_config_path(args.config_path)
64
+ else:
65
+ model_version = (
66
+ ModelVersion.flux_dev
67
+ if args.model_version == "flux-dev"
68
+ else ModelVersion.flux_schnell
69
+ )
70
+ config = load_config(
71
+ model_version,
72
+ flux_path=args.flow_model_path,
73
+ flux_device=args.flux_device,
74
+ ae_path=args.autoencoder_path,
75
+ ae_device=args.autoencoder_device,
76
+ text_enc_path=args.text_enc_path,
77
+ text_enc_device=args.text_enc_device,
78
+ flow_dtype="float16",
79
+ text_enc_dtype="bfloat16",
80
+ ae_dtype="bfloat16",
81
+ num_to_quant=args.num_to_quant,
82
+ )
83
+ app.state.model = load_pipeline_from_config(config)
84
+
85
+ uvicorn.run(app, host=args.host, port=args.port)
86
+
87
+
88
+ if __name__ == "__main__":
89
+ main()
modules/autoencoder.py ADDED
@@ -0,0 +1,336 @@
1
+ import torch
2
+ from einops import rearrange
3
+ from torch import Tensor, nn
4
+ from pydantic import BaseModel
5
+
6
+
7
+ class AutoEncoderParams(BaseModel):
8
+ resolution: int
9
+ in_channels: int
10
+ ch: int
11
+ out_ch: int
12
+ ch_mult: list[int]
13
+ num_res_blocks: int
14
+ z_channels: int
15
+ scale_factor: float
16
+ shift_factor: float
17
+
18
+
19
+ def swish(x: Tensor) -> Tensor:
20
+ return x * torch.sigmoid(x)
21
+
22
+
23
+ class AttnBlock(nn.Module):
24
+ def __init__(self, in_channels: int):
25
+ super().__init__()
26
+ self.in_channels = in_channels
27
+
28
+ self.norm = nn.GroupNorm(
29
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
30
+ )
31
+
32
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
33
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
34
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
35
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
36
+
37
+ def attention(self, h_: Tensor) -> Tensor:
38
+ h_ = self.norm(h_)
39
+ q = self.q(h_)
40
+ k = self.k(h_)
41
+ v = self.v(h_)
42
+
43
+ b, c, h, w = q.shape
44
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
45
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
46
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
47
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
48
+
49
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
50
+
51
+ def forward(self, x: Tensor) -> Tensor:
52
+ return x + self.proj_out(self.attention(x))
53
+
54
+
55
+ class ResnetBlock(nn.Module):
56
+ def __init__(self, in_channels: int, out_channels: int):
57
+ super().__init__()
58
+ self.in_channels = in_channels
59
+ out_channels = in_channels if out_channels is None else out_channels
60
+ self.out_channels = out_channels
61
+
62
+ self.norm1 = nn.GroupNorm(
63
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
64
+ )
65
+ self.conv1 = nn.Conv2d(
66
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
67
+ )
68
+ self.norm2 = nn.GroupNorm(
69
+ num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
70
+ )
71
+ self.conv2 = nn.Conv2d(
72
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
73
+ )
74
+ if self.in_channels != self.out_channels:
75
+ self.nin_shortcut = nn.Conv2d(
76
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
77
+ )
78
+
79
+ def forward(self, x):
80
+ h = x
81
+ h = self.norm1(h)
82
+ h = swish(h)
83
+ h = self.conv1(h)
84
+
85
+ h = self.norm2(h)
86
+ h = swish(h)
87
+ h = self.conv2(h)
88
+
89
+ if self.in_channels != self.out_channels:
90
+ x = self.nin_shortcut(x)
91
+
92
+ return x + h
93
+
94
+
95
+ class Downsample(nn.Module):
96
+ def __init__(self, in_channels: int):
97
+ super().__init__()
98
+ # no asymmetric padding in torch conv, must do it ourselves
99
+ self.conv = nn.Conv2d(
100
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
101
+ )
102
+
103
+ def forward(self, x: Tensor):
104
+ pad = (0, 1, 0, 1)
105
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
106
+ x = self.conv(x)
107
+ return x
108
+
109
+
110
+ class Upsample(nn.Module):
111
+ def __init__(self, in_channels: int):
112
+ super().__init__()
113
+ self.conv = nn.Conv2d(
114
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
115
+ )
116
+
117
+ def forward(self, x: Tensor):
118
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
119
+ x = self.conv(x)
120
+ return x
121
+
122
+
123
+ class Encoder(nn.Module):
124
+ def __init__(
125
+ self,
126
+ resolution: int,
127
+ in_channels: int,
128
+ ch: int,
129
+ ch_mult: list[int],
130
+ num_res_blocks: int,
131
+ z_channels: int,
132
+ ):
133
+ super().__init__()
134
+ self.ch = ch
135
+ self.num_resolutions = len(ch_mult)
136
+ self.num_res_blocks = num_res_blocks
137
+ self.resolution = resolution
138
+ self.in_channels = in_channels
139
+ # downsampling
140
+ self.conv_in = nn.Conv2d(
141
+ in_channels, self.ch, kernel_size=3, stride=1, padding=1
142
+ )
143
+
144
+ curr_res = resolution
145
+ in_ch_mult = (1,) + tuple(ch_mult)
146
+ self.in_ch_mult = in_ch_mult
147
+ self.down = nn.ModuleList()
148
+ block_in = self.ch
149
+ for i_level in range(self.num_resolutions):
150
+ block = nn.ModuleList()
151
+ attn = nn.ModuleList()
152
+ block_in = ch * in_ch_mult[i_level]
153
+ block_out = ch * ch_mult[i_level]
154
+ for _ in range(self.num_res_blocks):
155
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
156
+ block_in = block_out
157
+ down = nn.Module()
158
+ down.block = block
159
+ down.attn = attn
160
+ if i_level != self.num_resolutions - 1:
161
+ down.downsample = Downsample(block_in)
162
+ curr_res = curr_res // 2
163
+ self.down.append(down)
164
+
165
+ # middle
166
+ self.mid = nn.Module()
167
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
168
+ self.mid.attn_1 = AttnBlock(block_in)
169
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
170
+
171
+ # end
172
+ self.norm_out = nn.GroupNorm(
173
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
174
+ )
175
+ self.conv_out = nn.Conv2d(
176
+ block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
177
+ )
178
+
179
+ def forward(self, x: Tensor) -> Tensor:
180
+ # downsampling
181
+ hs = [self.conv_in(x)]
182
+ for i_level in range(self.num_resolutions):
183
+ for i_block in range(self.num_res_blocks):
184
+ h = self.down[i_level].block[i_block](hs[-1])
185
+ if len(self.down[i_level].attn) > 0:
186
+ h = self.down[i_level].attn[i_block](h)
187
+ hs.append(h)
188
+ if i_level != self.num_resolutions - 1:
189
+ hs.append(self.down[i_level].downsample(hs[-1]))
190
+
191
+ # middle
192
+ h = hs[-1]
193
+ h = self.mid.block_1(h)
194
+ h = self.mid.attn_1(h)
195
+ h = self.mid.block_2(h)
196
+ # end
197
+ h = self.norm_out(h)
198
+ h = swish(h)
199
+ h = self.conv_out(h)
200
+ return h
201
+
202
+
203
+ class Decoder(nn.Module):
204
+ def __init__(
205
+ self,
206
+ ch: int,
207
+ out_ch: int,
208
+ ch_mult: list[int],
209
+ num_res_blocks: int,
210
+ in_channels: int,
211
+ resolution: int,
212
+ z_channels: int,
213
+ ):
214
+ super().__init__()
215
+ self.ch = ch
216
+ self.num_resolutions = len(ch_mult)
217
+ self.num_res_blocks = num_res_blocks
218
+ self.resolution = resolution
219
+ self.in_channels = in_channels
220
+ self.ffactor = 2 ** (self.num_resolutions - 1)
221
+
222
+ # compute in_ch_mult, block_in and curr_res at lowest res
223
+ block_in = ch * ch_mult[self.num_resolutions - 1]
224
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
225
+ self.z_shape = (1, z_channels, curr_res, curr_res)
226
+
227
+ # z to block_in
228
+ self.conv_in = nn.Conv2d(
229
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
230
+ )
231
+
232
+ # middle
233
+ self.mid = nn.Module()
234
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
235
+ self.mid.attn_1 = AttnBlock(block_in)
236
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
237
+
238
+ # upsampling
239
+ self.up = nn.ModuleList()
240
+ for i_level in reversed(range(self.num_resolutions)):
241
+ block = nn.ModuleList()
242
+ attn = nn.ModuleList()
243
+ block_out = ch * ch_mult[i_level]
244
+ for _ in range(self.num_res_blocks + 1):
245
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
246
+ block_in = block_out
247
+ up = nn.Module()
248
+ up.block = block
249
+ up.attn = attn
250
+ if i_level != 0:
251
+ up.upsample = Upsample(block_in)
252
+ curr_res = curr_res * 2
253
+ self.up.insert(0, up) # prepend to get consistent order
254
+
255
+ # end
256
+ self.norm_out = nn.GroupNorm(
257
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
258
+ )
259
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
260
+
261
+ def forward(self, z: Tensor) -> Tensor:
262
+ # z to block_in
263
+ h = self.conv_in(z)
264
+
265
+ # middle
266
+ h = self.mid.block_1(h)
267
+ h = self.mid.attn_1(h)
268
+ h = self.mid.block_2(h)
269
+
270
+ # upsampling
271
+ for i_level in reversed(range(self.num_resolutions)):
272
+ for i_block in range(self.num_res_blocks + 1):
273
+ h = self.up[i_level].block[i_block](h)
274
+ if len(self.up[i_level].attn) > 0:
275
+ h = self.up[i_level].attn[i_block](h)
276
+ if i_level != 0:
277
+ h = self.up[i_level].upsample(h)
278
+
279
+ # end
280
+ h = self.norm_out(h)
281
+ h = swish(h)
282
+ h = self.conv_out(h)
283
+ return h
284
+
285
+
286
+ class DiagonalGaussian(nn.Module):
287
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
288
+ super().__init__()
289
+ self.sample = sample
290
+ self.chunk_dim = chunk_dim
291
+
292
+ def forward(self, z: Tensor) -> Tensor:
293
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
294
+ if self.sample:
295
+ std = torch.exp(0.5 * logvar)
296
+ return mean + std * torch.randn_like(mean)
297
+ else:
298
+ return mean
299
+
300
+
301
+ class AutoEncoder(nn.Module):
302
+ def __init__(self, params: AutoEncoderParams):
303
+ super().__init__()
304
+ self.encoder = Encoder(
305
+ resolution=params.resolution,
306
+ in_channels=params.in_channels,
307
+ ch=params.ch,
308
+ ch_mult=params.ch_mult,
309
+ num_res_blocks=params.num_res_blocks,
310
+ z_channels=params.z_channels,
311
+ )
312
+ self.decoder = Decoder(
313
+ resolution=params.resolution,
314
+ in_channels=params.in_channels,
315
+ ch=params.ch,
316
+ out_ch=params.out_ch,
317
+ ch_mult=params.ch_mult,
318
+ num_res_blocks=params.num_res_blocks,
319
+ z_channels=params.z_channels,
320
+ )
321
+ self.reg = DiagonalGaussian()
322
+
323
+ self.scale_factor = params.scale_factor
324
+ self.shift_factor = params.shift_factor
325
+
326
+ def encode(self, x: Tensor) -> Tensor:
327
+ z = self.reg(self.encoder(x))
328
+ z = self.scale_factor * (z - self.shift_factor)
329
+ return z
330
+
331
+ def decode(self, z: Tensor) -> Tensor:
332
+ z = z / self.scale_factor + self.shift_factor
333
+ return self.decoder(z)
334
+
335
+ def forward(self, x: Tensor) -> Tensor:
336
+ return self.decode(self.encode(x))
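
A construction sketch for the autoencoder above, using the `ae_params` values from the configs (random weights on CPU, for shape checking only):

```py
import torch

from modules.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(
    resolution=256, in_channels=3, ch=128, out_ch=3,
    ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16,
    scale_factor=0.3611, shift_factor=0.1159,
)
ae = AutoEncoder(params).eval()
with torch.no_grad():
    z = ae.encode(torch.randn(1, 3, 256, 256))  # latent: (1, 16, 32, 32)
    img = ae.decode(z)                          # image:  (1, 3, 256, 256)
```
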
modules/conditioner.py ADDED
@@ -0,0 +1,53 @@
1
+ from torch import Tensor, nn
2
+ import torch
3
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
4
+
5
+ from transformers.utils.quantization_config import BitsAndBytesConfig
6
+
7
+
8
+ class HFEmbedder(nn.Module):
9
+ def __init__(
10
+ self, version: str, max_length: int, device: torch.device | int, **hf_kwargs
11
+ ):
12
+ super().__init__()
13
+ self.is_clip = version.startswith("openai")
14
+ self.max_length = max_length
15
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
16
+
17
+ if self.is_clip:
18
+ self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
19
+ version, max_length=max_length
20
+ )
21
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
22
+ version, **hf_kwargs
23
+ )
24
+ self.hf_module = self.hf_module.eval().requires_grad_(False).to(device)
25
+ else:
26
+ self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
27
+ version, max_length=max_length
28
+ )
29
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
30
+ version,
31
+ **hf_kwargs,
32
+ device_map={"": device},
33
+ quantization_config=BitsAndBytesConfig(
34
+ load_in_4bit=True,
35
+ ),
36
+ )
37
+
38
+ def forward(self, text: list[str]) -> Tensor:
39
+ batch_encoding = self.tokenizer(
40
+ text,
41
+ truncation=True,
42
+ max_length=self.max_length,
43
+ return_length=False,
44
+ return_overflowing_tokens=False,
45
+ padding="max_length",
46
+ return_tensors="pt",
47
+ )
48
+ outputs = self.hf_module(
49
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
50
+ attention_mask=None,
51
+ output_hidden_states=False,
52
+ )
53
+ return outputs[self.output_key]
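
A usage sketch for `HFEmbedder` above. The T5 repo id comes from the configs; the CLIP repo id is an assumed example (any `openai/...` model name selects the pooled-output branch), and the T5 branch loads 4-bit weights via bitsandbytes:

```py
import torch

from modules.conditioner import HFEmbedder

# T5 encoder: returns last_hidden_state, loaded 4-bit on device 0
t5 = HFEmbedder("city96/t5-v1_1-xxl-encoder-bf16", max_length=512, device=0,
                torch_dtype=torch.bfloat16)
# CLIP text encoder: returns pooler_output (repo id is an assumed example)
clip = HFEmbedder("openai/clip-vit-large-patch14", max_length=77, device=0,
                  torch_dtype=torch.bfloat16)

ctx = t5(["a photo of a cat"])    # shape (1, 512, 4096)
vec = clip(["a photo of a cat"])  # shape (1, 768)
```
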
modules/flux_model.py ADDED
@@ -0,0 +1,492 @@
1
+ import torch
2
+
3
+ torch.backends.cuda.matmul.allow_tf32 = True
4
+ torch.backends.cudnn.allow_tf32 = True
5
+ torch.backends.cudnn.benchmark = True
6
+ torch.backends.cudnn.benchmark_limit = 20
7
+ torch.set_float32_matmul_precision("high")
8
+ import math
9
+ from dataclasses import dataclass
10
+
11
+ from cublas_linear import CublasLinear as F16Linear
12
+ from einops.layers.torch import Rearrange
13
+ from torch import Tensor, nn
14
+ from torch._dynamo import config
15
+ from torch._inductor import config as ind_config
16
+ from xformers.ops import memory_efficient_attention
17
+ from pydantic import BaseModel
18
+
19
+ config.cache_size_limit = 10000000000
20
+ ind_config.force_fuse_int_mm_with_mul = True
21
+
22
+
23
+ class FluxParams(BaseModel):
24
+ in_channels: int
25
+ vec_in_dim: int
26
+ context_in_dim: int
27
+ hidden_size: int
28
+ mlp_ratio: float
29
+ num_heads: int
30
+ depth: int
31
+ depth_single_blocks: int
32
+ axes_dim: list[int]
33
+ theta: int
34
+ qkv_bias: bool
35
+ guidance_embed: bool
36
+
37
+
38
+ @torch.compile(mode="reduce-overhead")
39
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
40
+ q, k = apply_rope(q, k, pe)
41
+ x = memory_efficient_attention(
42
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
43
+ )
44
+ x = x.reshape(*x.shape[:-2], -1)
45
+ return x
46
+
47
+
48
+ @torch.compile(mode="reduce-overhead")
49
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
50
+ scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
51
+ omega = 1.0 / (theta**scale)
52
+ out = torch.einsum("...n,d->...nd", pos, omega)
53
+ out = torch.stack(
54
+ [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
55
+ )
56
+ out = out.reshape(*out.shape[:-1], 2, 2)
57
+ return out
58
+
59
+
60
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
61
+ xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)
62
+ xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)
63
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
64
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
65
+ return xq_out.reshape(*xq.shape), xk_out.reshape(*xk.shape)
66
+
67
+
68
+ class EmbedND(nn.Module):
69
+ def __init__(
70
+ self,
71
+ dim: int,
72
+ theta: int,
73
+ axes_dim: list[int],
74
+ dtype: torch.dtype = torch.bfloat16,
75
+ ):
76
+ super().__init__()
77
+ self.dim = dim
78
+ self.theta = theta
79
+ self.axes_dim = axes_dim
80
+ self.dtype = dtype
81
+
82
+ def forward(self, ids: Tensor) -> Tensor:
83
+ n_axes = ids.shape[-1]
84
+ emb = torch.cat(
85
+ [
86
+ rope(ids[..., i], self.axes_dim[i], self.theta).type(self.dtype)
87
+ for i in range(n_axes)
88
+ ],
89
+ dim=-3,
90
+ )
91
+
92
+ return emb.unsqueeze(1)
93
+
94
+
95
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
96
+ """
97
+ Create sinusoidal timestep embeddings.
98
+ :param t: a 1-D Tensor of N indices, one per batch element.
99
+ These may be fractional.
100
+ :param dim: the dimension of the output.
101
+ :param max_period: controls the minimum frequency of the embeddings.
102
+ :return: an (N, D) Tensor of positional embeddings.
103
+ """
104
+ t = time_factor * t
105
+ half = dim // 2
106
+ freqs = torch.exp(
107
+ -math.log(max_period)
108
+ * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
109
+ / half
110
+ )
111
+
112
+ args = t[:, None].float() * freqs[None]
113
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
114
+ if dim % 2:
115
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
116
+ return embedding
117
+
118
+
119
+ class MLPEmbedder(nn.Module):
120
+ def __init__(self, in_dim: int, hidden_dim: int):
121
+ super().__init__()
122
+ self.in_layer = F16Linear(in_dim, hidden_dim, bias=True)
123
+ self.silu = nn.SiLU()
124
+ self.out_layer = F16Linear(hidden_dim, hidden_dim, bias=True)
125
+
126
+ def forward(self, x: Tensor) -> Tensor:
127
+ return self.out_layer(self.silu(self.in_layer(x)))
128
+
129
+
130
+ @torch.compile(mode="reduce-overhead", dynamic=True)
131
+ def calculation(
132
+ x,
133
+ ):
134
+ rrms = torch.rsqrt(torch.mean(x.pow(2), dim=-1, keepdim=True) + 1e-6)
135
+ x = x * rrms
136
+ return x
137
+
138
+
139
+ class RMSNorm(torch.nn.Module):
140
+ def __init__(self, dim: int):
141
+ super().__init__()
142
+ self.scale = nn.Parameter(torch.ones(dim))
143
+
144
+ def forward(self, x: Tensor):
145
+ return calculation(x) * self.scale
146
+
147
+
148
+ class QKNorm(torch.nn.Module):
149
+ def __init__(self, dim: int):
150
+ super().__init__()
151
+ self.query_norm = RMSNorm(dim)
152
+ self.key_norm = RMSNorm(dim)
153
+
154
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
155
+ q = self.query_norm(q)
156
+ k = self.key_norm(k)
157
+ return q, k
158
+
159
+
160
+ class SelfAttention(nn.Module):
161
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
162
+ super().__init__()
163
+ self.num_heads = num_heads
164
+ head_dim = dim // num_heads
165
+
166
+ self.qkv = F16Linear(dim, dim * 3, bias=qkv_bias)
167
+ self.norm = QKNorm(head_dim)
168
+ self.proj = F16Linear(dim, dim)
169
+ self.rearrange = Rearrange("B L (K H D) -> K B H L D", K=3, H=num_heads)
170
+
171
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
172
+ qkv = self.qkv(x)
173
+ q, k, v = self.rearrange(qkv)
174
+ q, k = self.norm(q, k, v)
175
+ x = attention(q, k, v, pe=pe)
176
+ x = self.proj(x)
177
+ return x
178
+
179
+
180
+ @dataclass
181
+ class ModulationOut:
182
+ shift: Tensor
183
+ scale: Tensor
184
+ gate: Tensor
185
+
186
+
187
+ class Modulation(nn.Module):
188
+ def __init__(self, dim: int, double: bool):
189
+ super().__init__()
190
+ self.is_double = double
191
+ self.multiplier = 6 if double else 3
192
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
193
+ self.act = nn.SiLU()
194
+
195
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
196
+ out = self.lin(self.act(vec))[:, None, :].chunk(self.multiplier, dim=-1)
197
+
198
+ return (
199
+ ModulationOut(*out[:3]),
200
+ ModulationOut(*out[3:]) if self.is_double else None,
201
+ )
202
+
203
+
204
+ class DoubleStreamBlock(nn.Module):
205
+ def __init__(
206
+ self,
207
+ hidden_size: int,
208
+ num_heads: int,
209
+ mlp_ratio: float,
210
+ qkv_bias: bool = False,
211
+ dtype: torch.dtype = torch.bfloat16,
212
+ idx: int = 0,
213
+ ):
214
+ super().__init__()
215
+ self.dtype = dtype
216
+
217
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
218
+ self.num_heads = num_heads
219
+ self.hidden_size = hidden_size
220
+ self.img_mod = Modulation(hidden_size, double=True)
221
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
222
+ self.img_attn = SelfAttention(
223
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
224
+ )
225
+
226
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
227
+ self.img_mlp = nn.Sequential(
228
+ F16Linear(hidden_size, mlp_hidden_dim, bias=True),
229
+ nn.GELU(approximate="tanh"),
230
+ F16Linear(mlp_hidden_dim, hidden_size, bias=True),
231
+ )
232
+
233
+ self.txt_mod = Modulation(hidden_size, double=True)
234
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
235
+ self.txt_attn = SelfAttention(
236
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
237
+ )
238
+
239
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
240
+ self.txt_mlp = nn.Sequential(
241
+ (F16Linear(hidden_size, mlp_hidden_dim, bias=True)),
242
+ nn.GELU(approximate="tanh"),
243
+ (F16Linear(mlp_hidden_dim, hidden_size, bias=True)),
244
+ )
245
+ self.rearrange_for_norm = Rearrange(
246
+ "B L (K H D) -> K B H L D", K=3, H=num_heads
247
+ )
248
+
249
+ def forward(
250
+ self,
251
+ img: Tensor,
252
+ txt: Tensor,
253
+ vec: Tensor,
254
+ pe: Tensor,
255
+ ) -> tuple[Tensor, Tensor]:
256
+ img_mod1, img_mod2 = self.img_mod(vec)
257
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
258
+
259
+ # prepare image for attention
260
+ img_modulated = self.img_norm1(img)
261
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
262
+ img_qkv = self.img_attn.qkv(img_modulated)
263
+ img_q, img_k, img_v = self.rearrange_for_norm(img_qkv)
264
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
265
+
266
+ # prepare txt for attention
267
+ txt_modulated = self.txt_norm1(txt)
268
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
269
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
270
+ txt_q, txt_k, txt_v = self.rearrange_for_norm(txt_qkv)
271
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
272
+
273
+ q = torch.cat((txt_q, img_q), dim=2)
274
+ k = torch.cat((txt_k, img_k), dim=2)
275
+ v = torch.cat((txt_v, img_v), dim=2)
276
+
277
+ attn = attention(q, k, v, pe=pe)
278
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
279
+ # calculate the img bloks
280
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
281
+ img = img + img_mod2.gate * self.img_mlp(
282
+ (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
283
+ ).clamp(min=-384, max=384)
284
+
285
+ # calculate the txt bloks
286
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
287
+ txt = txt + txt_mod2.gate * self.txt_mlp(
288
+ (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
289
+ ).clamp(min=-384, max=384)
290
+
291
+ return img, txt
292
+
293
+
294
+ class SingleStreamBlock(nn.Module):
295
+ """
296
+ A DiT block with parallel linear layers as described in
297
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
298
+ """
299
+
300
+ def __init__(
301
+ self,
302
+ hidden_size: int,
303
+ num_heads: int,
304
+ mlp_ratio: float = 4.0,
305
+ qk_scale: float | None = None,
306
+ dtype: torch.dtype = torch.bfloat16,
307
+ ):
308
+ super().__init__()
309
+ self.dtype = dtype
310
+ self.hidden_dim = hidden_size
311
+ self.num_heads = num_heads
312
+ head_dim = hidden_size // num_heads
313
+ self.scale = qk_scale or head_dim**-0.5
314
+
315
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
316
+ # qkv and mlp_in
317
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
318
+ # proj and mlp_out
319
+ self.linear2 = F16Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
320
+
321
+ self.norm = QKNorm(head_dim)
322
+
323
+ self.hidden_size = hidden_size
324
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
325
+
326
+ self.mlp_act = nn.GELU(approximate="tanh")
327
+ self.modulation = Modulation(hidden_size, double=False)
328
+ self.rearrange_for_norm = Rearrange(
329
+ "B L (K H D) -> K B H L D", K=3, H=num_heads
330
+ )
331
+
332
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
333
+ mod = self.modulation(vec)[0]
334
+ pre_norm = self.pre_norm(x)
335
+ x_mod = (1 + mod.scale) * pre_norm + mod.shift
336
+ qkv, mlp = torch.split(
337
+ self.linear1(x_mod),
338
+ [3 * self.hidden_size, self.mlp_hidden_dim],
339
+ dim=-1,
340
+ )
341
+ q, k, v = self.rearrange_for_norm(qkv)
342
+ q, k = self.norm(q, k, v)
343
+ attn = attention(q, k, v, pe=pe)
344
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)).clamp(
345
+ min=-384, max=384
346
+ )
347
+ return x + mod.gate * output
348
+
349
+
350
+ class LastLayer(nn.Module):
351
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
352
+ super().__init__()
353
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
354
+ self.linear = nn.Linear(
355
+ hidden_size, patch_size * patch_size * out_channels, bias=True
356
+ )
357
+ self.adaLN_modulation = nn.Sequential(
358
+ nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
359
+ )
360
+
361
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
362
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
363
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
364
+ x = self.linear(x)
365
+ return x
366
+
367
+
368
+ class Flux(nn.Module):
369
+ """
370
+ Transformer model for flow matching on sequences.
371
+ """
372
+
373
+ def __init__(self, params: FluxParams, dtype: torch.dtype = torch.bfloat16):
374
+ super().__init__()
375
+
376
+ self.dtype = dtype
377
+ self.params = params
378
+ self.in_channels = params.in_channels
379
+ self.out_channels = self.in_channels
380
+ if params.hidden_size % params.num_heads != 0:
381
+ raise ValueError(
382
+ f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
383
+ )
384
+ pe_dim = params.hidden_size // params.num_heads
385
+ if sum(params.axes_dim) != pe_dim:
386
+ raise ValueError(
387
+ f"Got {params.axes_dim} but expected positional dim {pe_dim}"
388
+ )
389
+ self.hidden_size = params.hidden_size
390
+ self.num_heads = params.num_heads
391
+ self.pe_embedder = EmbedND(
392
+ dim=pe_dim,
393
+ theta=params.theta,
394
+ axes_dim=params.axes_dim,
395
+ dtype=self.dtype,
396
+ )
397
+ self.img_in = F16Linear(self.in_channels, self.hidden_size, bias=True)
398
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
399
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
400
+ self.guidance_in = (
401
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
402
+ if params.guidance_embed
403
+ else nn.Identity()
404
+ )
405
+ self.txt_in = F16Linear(params.context_in_dim, self.hidden_size)
406
+
407
+ self.double_blocks = nn.ModuleList(
408
+ [
409
+ DoubleStreamBlock(
410
+ self.hidden_size,
411
+ self.num_heads,
412
+ mlp_ratio=params.mlp_ratio,
413
+ qkv_bias=params.qkv_bias,
414
+ dtype=self.dtype,
415
+ idx=idx,
416
+ )
417
+ for idx in range(params.depth)
418
+ ]
419
+ )
420
+
421
+ self.single_blocks = nn.ModuleList(
422
+ [
423
+ SingleStreamBlock(
424
+ self.hidden_size,
425
+ self.num_heads,
426
+ mlp_ratio=params.mlp_ratio,
427
+ dtype=self.dtype,
428
+ )
429
+ for _ in range(params.depth_single_blocks)
430
+ ]
431
+ )
432
+
433
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
434
+
435
+ def forward(
436
+ self,
437
+ img: Tensor,
438
+ img_ids: Tensor,
439
+ txt: Tensor,
440
+ txt_ids: Tensor,
441
+ timesteps: Tensor,
442
+ y: Tensor,
443
+ guidance: Tensor | None = None,
444
+ ) -> Tensor:
445
+ if img.ndim != 3 or txt.ndim != 3:
446
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
447
+
448
+ # running on sequences img
449
+ img = self.img_in(img)
450
+ vec = self.time_in(timestep_embedding(timesteps, 256).type(self.dtype))
451
+
452
+ if self.params.guidance_embed:
453
+ if guidance is None:
454
+ raise ValueError(
455
+ "Didn't get guidance strength for guidance distilled model."
456
+ )
457
+ vec = vec + self.guidance_in(
458
+ timestep_embedding(guidance, 256).type(self.dtype)
459
+ )
460
+ vec = vec + self.vector_in(y)
461
+
462
+ txt = self.txt_in(txt)
463
+
464
+ ids = torch.cat((txt_ids, img_ids), dim=1)
465
+ pe = self.pe_embedder(ids)
466
+
467
+ for i, block in enumerate(self.double_blocks):
468
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
469
+
470
+ img = torch.cat((txt, img), 1)
471
+ for block in self.single_blocks:
472
+ img = block(img, vec=vec, pe=pe)
473
+
474
+ img = img[:, txt.shape[1] :, ...]
475
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
476
+ return img
477
+
478
+ @classmethod
479
+ def from_safetensors(
480
+ cls,
481
+ model_path: str,
482
+ model_params: FluxParams,
483
+ dtype: torch.dtype = torch.bfloat16,
484
+ device: torch.device = torch.device(
485
+ "cuda" if torch.cuda.is_available() else "cpu"
486
+ ),
487
+ ):
488
+
+ from safetensors.torch import load_file
+
+ model = Flux(params=model_params, dtype=dtype)
+ # model_path is a filesystem path, so load the weights via safetensors
+ model.load_state_dict(load_file(model_path, device="cpu"))
+ model.to(device)
+ return model
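For reference, a minimal sketch of building the flow transformer directly from a checkpoint via `Flux.from_safetensors`. The checkpoint path is hypothetical; the `FluxParams` values mirror the flux-dev configuration constructed in `util.load_config` further down in this commit.

```python
import torch

from modules.flux_model import Flux, FluxParams

# flux-dev parameters, matching util.load_config
params = FluxParams(
    in_channels=64, vec_in_dim=768, context_in_dim=4096, hidden_size=3072,
    mlp_ratio=4.0, num_heads=24, depth=19, depth_single_blocks=38,
    axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True, guidance_embed=True,
)

# hypothetical checkpoint location
flow = Flux.from_safetensors(
    "/path/to/flux1-dev.sft",
    model_params=params,
    dtype=torch.bfloat16,
)
```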
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ git+https://github.com/aredden/torch-cublas-hgemm.git@master
2
+ git+https://github.com/pytorch/ao.git@main
3
+ einops
4
+ PyTurboJPEG
5
+ pydantic
6
+ fastapi
7
+ bitsandbytes
8
+ xformers
9
+ loguru
10
+ transformers
11
+ tokenizers
12
+ sentencepiece
sampling.py ADDED
@@ -0,0 +1,152 @@
1
+ import math
2
+ from typing import Callable
3
+
4
+ import torch
5
+ from einops import rearrange, repeat
6
+ from torch import Tensor
7
+
8
+ from modules.flux_model import Flux
9
+ from modules.conditioner import HFEmbedder
10
+
11
+
12
+ @torch.inference_mode()
13
+ def get_noise(
14
+ num_samples: int,
15
+ height: int,
16
+ width: int,
17
+ device: torch.device,
18
+ dtype: torch.dtype,
19
+ seed: int,
20
+ ):
21
+ return torch.randn(
22
+ num_samples,
23
+ 16,
24
+ # allow for packing
25
+ 2 * math.ceil(height / 16),
26
+ 2 * math.ceil(width / 16),
27
+ device=device,
28
+ dtype=dtype,
29
+ generator=torch.Generator(device=device).manual_seed(seed),
30
+ )
31
+
32
+
33
+ @torch.inference_mode()
34
+ def prepare(
35
+ t5: HFEmbedder, clip: HFEmbedder, img: Tensor, prompt: str | list[str]
36
+ ) -> dict[str, Tensor]:
37
+ bs, c, h, w = img.shape
38
+ if bs == 1 and not isinstance(prompt, str):
39
+ bs = len(prompt)
40
+
41
+ img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
42
+ if img.shape[0] == 1 and bs > 1:
43
+ img = repeat(img, "1 ... -> bs ...", bs=bs)
44
+
45
+ img_ids = torch.zeros(h // 2, w // 2, 3)
46
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
47
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
48
+ img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
49
+
50
+ if isinstance(prompt, str):
51
+ prompt = [prompt]
52
+ txt = t5(prompt)
53
+ if txt.shape[0] == 1 and bs > 1:
54
+ txt = repeat(txt, "1 ... -> bs ...", bs=bs)
55
+ txt_ids = torch.zeros(bs, txt.shape[1], 3)
56
+
57
+ vec = clip(prompt)
58
+ if vec.shape[0] == 1 and bs > 1:
59
+ vec = repeat(vec, "1 ... -> bs ...", bs=bs)
60
+
61
+ return {
62
+ "img": img,
63
+ "img_ids": img_ids.to(img.device),
64
+ "txt": txt.to(img.device),
65
+ "txt_ids": txt_ids.to(img.device),
66
+ "vec": vec.to(img.device),
67
+ }
68
+
69
+
70
+ def time_shift(mu: float, sigma: float, t: Tensor):
71
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
72
+
73
+
74
+ def get_lin_function(
75
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
76
+ ) -> Callable[[float], float]:
77
+ m = (y2 - y1) / (x2 - x1)
78
+ b = y1 - m * x1
79
+ return lambda x: m * x + b
80
+
81
+
82
+ def get_schedule(
83
+ num_steps: int,
84
+ image_seq_len: int,
85
+ base_shift: float = 0.5,
86
+ max_shift: float = 1.15,
87
+ shift: bool = True,
88
+ ) -> list[float]:
89
+ # extra step for zero
90
+ timesteps = torch.linspace(1, 0, num_steps + 1)
91
+
92
+ # shifting the schedule to favor high timesteps for higher signal images
93
+ if shift:
94
+ # estimate mu based on linear interpolation between two points
95
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
96
+ timesteps = time_shift(mu, 1.0, timesteps)
97
+
98
+ return timesteps.tolist()
99
+
100
+
101
+ @torch.inference_mode()
102
+ def denoise(
103
+ model: Flux,
104
+ # model input
105
+ img: Tensor,
106
+ img_ids: Tensor,
107
+ txt: Tensor,
108
+ txt_ids: Tensor,
109
+ vec: Tensor,
110
+ # sampling parameters
111
+ timesteps: list[float],
112
+ guidance: float = 4.0,
113
+ dtype: torch.dtype = torch.bfloat16,
114
+ device: torch.device = torch.device("cuda:0"),
115
+ ):
116
+ from tqdm import tqdm
117
+
118
+ # the guidance value below is only used by guidance-distilled models and is ignored for schnell
119
+ img = img.to(device=device, dtype=dtype)
120
+ img_ids = img_ids.to(device=device, dtype=dtype)
121
+ txt = txt.to(device=device, dtype=dtype)
122
+ txt_ids = txt_ids.to(device=device, dtype=dtype)
123
+ vec = vec.to(device=device, dtype=dtype)
124
+ guidance_vec = torch.full((img.shape[0],), guidance, device=device, dtype=dtype)
125
+ for t_curr, t_prev in tqdm(
126
+ zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1
127
+ ):
128
+ t_vec = torch.full((img.shape[0],), t_curr, dtype=dtype, device=device)
129
+ pred = model(
130
+ img=img,
131
+ img_ids=img_ids,
132
+ txt=txt,
133
+ txt_ids=txt_ids,
134
+ y=vec,
135
+ timesteps=t_vec,
136
+ guidance=guidance_vec,
137
+ )
138
+
139
+ img = img + (t_prev - t_curr) * pred
140
+
141
+ return img
142
+
143
+
144
+ def unpack(x: Tensor, height: int, width: int) -> Tensor:
145
+ return rearrange(
146
+ x,
147
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
148
+ h=math.ceil(height / 16),
149
+ w=math.ceil(width / 16),
150
+ ph=2,
151
+ pw=2,
152
+ )
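Taken together, the helpers above form the full sampling loop. A minimal sketch, assuming a loaded `Flux` instance `model` on `cuda:0` and two `HFEmbedder` instances `t5` and `clip` (e.g. obtained from the loaders in `util.py`):

```python
import torch

from sampling import get_noise, prepare, get_schedule, denoise, unpack

height, width = 1024, 1024

# 1. latent noise
x = get_noise(1, height, width, device=torch.device("cuda:0"),
              dtype=torch.bfloat16, seed=0)

# 2. text conditioning + positional ids (t5, clip are HFEmbedder instances)
inputs = prepare(t5, clip, img=x, prompt="a photo of a forest with mist")

# 3. timestep schedule; shift=True for flux-dev, schnell typically runs unshifted
timesteps = get_schedule(num_steps=28, image_seq_len=inputs["img"].shape[1], shift=True)

# 4. flow-matching denoising loop
x = denoise(model, **inputs, timesteps=timesteps, guidance=4.0)

# 5. unpack back to the latent image layout; decode with the autoencoder afterwards
latents = unpack(x.float(), height, width)
```

The resulting latents are decoded by the `AutoEncoder` and can then be JPEG-encoded with `turbojpeg_imgs.TurboImage` below.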
turbojpeg_imgs.py ADDED
@@ -0,0 +1,134 @@
1
+ import numpy as np
2
+ import torch
3
+ from turbojpeg import (
4
+ TurboJPEG,
5
+ TJPF_GRAY,
6
+ TJFLAG_PROGRESSIVE,
7
+ TJFLAG_FASTUPSAMPLE,
8
+ TJFLAG_FASTDCT,
9
+ TJPF_RGB,
10
+ TJPF_BGR,
11
+ TJSAMP_GRAY,
12
+ TJSAMP_411,
13
+ TJSAMP_420,
14
+ TJSAMP_422,
15
+ TJSAMP_444,
16
+ TJSAMP_440,
17
+ TJSAMP_441,
18
+ )
19
+
20
+
21
+ class Subsampling:
22
+ S411 = TJSAMP_411
23
+ S420 = TJSAMP_420
24
+ S422 = TJSAMP_422
25
+ S444 = TJSAMP_444
26
+ S440 = TJSAMP_440
27
+ S441 = TJSAMP_441
28
+ GRAY = TJSAMP_GRAY
29
+
30
+
31
+ class Flags:
32
+ PROGRESSIVE = TJFLAG_PROGRESSIVE
33
+ FASTUPSAMPLE = TJFLAG_FASTUPSAMPLE
34
+ FASTDCT = TJFLAG_FASTDCT
35
+
36
+
37
+ class PixelFormat:
38
+ GRAY = TJPF_GRAY
39
+ RGB = TJPF_RGB
40
+ BGR = TJPF_BGR
41
+
42
+
43
+ class TurboImage:
44
+ def __init__(self):
45
+ self.tj = TurboJPEG()
46
+ self.flags = Flags.PROGRESSIVE
47
+
48
+ self.subsampling_gray = Subsampling.GRAY
49
+ self.pixel_format_gray = PixelFormat.GRAY
50
+ self.subsampling_rgb = Subsampling.S420
51
+ self.pixel_format_rgb = PixelFormat.RGB
52
+
53
+ def set_subsampling_gray(self, subsampling):
54
+ self.subsampling_gray = subsampling
55
+
56
+ def set_subsampling_rgb(self, subsampling):
57
+ self.subsampling_rgb = subsampling
58
+
59
+ def set_pixel_format_gray(self, pixel_format):
60
+ self.pixel_format_gray = pixel_format
61
+
62
+ def set_pixel_format_rgb(self, pixel_format):
63
+ self.pixel_format_rgb = pixel_format
64
+
65
+ def set_flags(self, flags):
66
+ self.flags = flags
67
+
68
+ def encode(
69
+ self,
70
+ img,
71
+ subsampling,
72
+ pixel_format,
73
+ quality=90,
74
+ ):
75
+ return self.tj.encode(
76
+ img,
77
+ quality=quality,
78
+ flags=self.flags,
79
+ pixel_format=pixel_format,
80
+ jpeg_subsample=subsampling,
81
+ )
82
+
83
+ @torch.inference_mode()
84
+ def encode_torch(self, img: torch.Tensor, quality=90):
85
+ if img.ndim == 2:
86
+ subsampling = self.subsampling_gray
87
+ pixel_format = self.pixel_format_gray
88
+ img = img.clamp(0, 255).cpu().contiguous().numpy().astype(np.uint8)
89
+ elif img.ndim == 3:
90
+ subsampling = self.subsampling_rgb
91
+ pixel_format = self.pixel_format_rgb
92
+ if img.shape[0] == 3:
93
+ img = (
94
+ img.permute(1, 2, 0)
95
+ .clamp(0, 255)
96
+ .cpu()
97
+ .contiguous()
98
+ .numpy()
99
+ .astype(np.uint8)
100
+ )
101
+ elif img.shape[2] == 3:
102
+ img = img.clamp(0, 255).cpu().contiguous().numpy().astype(np.uint8)
103
+ else:
104
+ raise ValueError(f"Unsupported image shape: {img.shape}")
105
+ else:
106
+ raise ValueError(f"Unsupported image num dims: {img.ndim}")
107
+
108
+ return self.encode(
109
+ img,
110
+ quality=quality,
111
+ subsampling=subsampling,
112
+ pixel_format=pixel_format,
113
+ )
114
+
115
+ def encode_numpy(self, img: np.ndarray, quality=90):
116
+ if img.ndim == 2:
117
+ subsampling = self.subsampling_gray
118
+ pixel_format = self.pixel_format_gray
119
+ elif img.ndim == 3:
120
+ if img.shape[0] == 3:
121
+ img = np.ascontiguousarray(img.transpose(1, 2, 0))
122
+ elif img.shape[2] == 3:
123
+ img = np.ascontiguousarray(img)
124
+ else:
125
+ raise ValueError(f"Unsupported image shape: {img.shape}")
126
+ subsampling = self.subsampling_rgb
127
+ pixel_format = self.pixel_format_rgb
128
+ else:
129
+ raise ValueError(f"Unsupported image num dims: {img.ndim}")
130
+
131
+ img = img.clip(0, 255).astype(np.uint8)
132
+ return self.encode(
133
+ img, quality=quality, subsampling=subsampling, pixel_format=pixel_format
134
+ )
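A short usage sketch for `TurboImage`, assuming `libturbojpeg` is installed and using a random CHW tensor as a stand-in for a decoded image:

```python
import torch

from turbojpeg_imgs import TurboImage, Subsampling

tj = TurboImage()
tj.set_subsampling_rgb(Subsampling.S444)  # optional: full chroma resolution

# stand-in image: CHW float tensor with values in [0, 255]
img = torch.rand(3, 512, 512) * 255

jpeg_bytes = tj.encode_torch(img, quality=95)
with open("out.jpg", "wb") as f:
    f.write(jpeg_bytes)
```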
util.py ADDED
@@ -0,0 +1,275 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from modules.autoencoder import AutoEncoder, AutoEncoderParams
7
+ from modules.conditioner import HFEmbedder
8
+ from modules.flux_model import Flux, FluxParams
9
+
10
+ from safetensors.torch import load_file as load_sft
11
+ from enum import StrEnum
12
+ from pydantic import BaseModel, ConfigDict
13
+ from loguru import logger
14
+
15
+
16
+ class ModelVersion(StrEnum):
17
+ flux_dev = "flux-dev"
18
+ flux_schnell = "flux-schnell"
19
+
20
+
21
+ class ModelSpec(BaseModel):
22
+ version: ModelVersion
23
+ params: FluxParams
24
+ ae_params: AutoEncoderParams
25
+ ckpt_path: str | None
26
+ ae_path: str | None
27
+ repo_id: str | None
28
+ repo_flow: str | None
29
+ repo_ae: str | None
30
+ text_enc_max_length: int = 512
31
+ text_enc_path: str | None
32
+ text_enc_device: str | torch.device | None = "cuda:0"
33
+ ae_device: str | torch.device | None = "cuda:0"
34
+ flux_device: str | torch.device | None = "cuda:0"
35
+ flow_dtype: str = "float16"
36
+ ae_dtype: str = "bfloat16"
37
+ text_enc_dtype: str = "bfloat16"
38
+ num_to_quant: Optional[int] = 20
39
+
40
+ model_config: ConfigDict = {
41
+ "arbitrary_types_allowed": True,
42
+ "use_enum_values": True,
43
+ }
44
+
45
+
46
+ def load_models(config: ModelSpec) -> tuple[Flux, AutoEncoder, HFEmbedder, HFEmbedder]:
47
+ flow = load_flow_model(config)
48
+ ae = load_autoencoder(config)
49
+ clip, t5 = load_text_encoders(config)
50
+ return flow, ae, clip, t5
51
+
52
+
53
+ def parse_device(device: str | torch.device | None) -> torch.device:
54
+ if isinstance(device, str):
55
+ return torch.device(device)
56
+ elif isinstance(device, torch.device):
57
+ return device
58
+ else:
59
+ return torch.device("cuda:0")
60
+
61
+
62
+ def into_dtype(dtype: str) -> torch.dtype:
63
+ if dtype == "float16":
64
+ return torch.float16
65
+ elif dtype == "bfloat16":
66
+ return torch.bfloat16
67
+ elif dtype == "float32":
68
+ return torch.float32
69
+ else:
70
+ raise ValueError(f"Invalid dtype: {dtype}")
71
+
72
+
73
+ def into_device(device: str | torch.device | None) -> torch.device:
74
+ if isinstance(device, str):
75
+ return torch.device(device)
76
+ elif isinstance(device, torch.device):
77
+ return device
78
+ elif isinstance(device, int):
79
+ return torch.device(f"cuda:{device}")
80
+ else:
81
+ return torch.device("cuda:0")
82
+
83
+
84
+ def load_config(
85
+ name: ModelVersion = ModelVersion.flux_dev,
86
+ flux_path: str | None = None,
87
+ ae_path: str | None = None,
88
+ text_enc_path: str | None = None,
89
+ text_enc_device: str | torch.device | None = None,
90
+ ae_device: str | torch.device | None = None,
91
+ flux_device: str | torch.device | None = None,
92
+ flow_dtype: str = "float16",
93
+ ae_dtype: str = "bfloat16",
94
+ text_enc_dtype: str = "bfloat16",
95
+ num_to_quant: Optional[int] = 20,
96
+ ):
97
+ text_enc_device = str(parse_device(text_enc_device))
98
+ ae_device = str(parse_device(ae_device))
99
+ flux_device = str(parse_device(flux_device))
100
+ return ModelSpec(
101
+ version=name,
102
+ repo_id=(
103
+ "black-forest-labs/FLUX.1-dev"
104
+ if name == ModelVersion.flux_dev
105
+ else "black-forest-labs/FLUX.1-schnell"
106
+ ),
107
+ repo_flow=(
108
+ "flux1-dev.sft" if name == ModelVersion.flux_dev else "flux1-schnell.sft"
109
+ ),
110
+ repo_ae="ae.sft",
111
+ ckpt_path=flux_path,
112
+ params=FluxParams(
113
+ in_channels=64,
114
+ vec_in_dim=768,
115
+ context_in_dim=4096,
116
+ hidden_size=3072,
117
+ mlp_ratio=4.0,
118
+ num_heads=24,
119
+ depth=19,
120
+ depth_single_blocks=38,
121
+ axes_dim=[16, 56, 56],
122
+ theta=10_000,
123
+ qkv_bias=True,
124
+ guidance_embed=True,
125
+ ),
126
+ ae_path=ae_path,
127
+ ae_params=AutoEncoderParams(
128
+ resolution=256,
129
+ in_channels=3,
130
+ ch=128,
131
+ out_ch=3,
132
+ ch_mult=[1, 2, 4, 4],
133
+ num_res_blocks=2,
134
+ z_channels=16,
135
+ scale_factor=0.3611,
136
+ shift_factor=0.1159,
137
+ ),
138
+ text_enc_path=text_enc_path,
139
+ text_enc_device=text_enc_device,
140
+ ae_device=ae_device,
141
+ flux_device=flux_device,
142
+ flow_dtype=flow_dtype,
143
+ ae_dtype=ae_dtype,
144
+ text_enc_dtype=text_enc_dtype,
145
+ text_enc_max_length=512 if name == ModelVersion.flux_dev else 256,
146
+ num_to_quant=num_to_quant,
147
+ )
148
+
149
+
150
+ def load_config_from_path(path: str) -> ModelSpec:
151
+ path_path = Path(path)
152
+ if not path_path.exists():
153
+ raise ValueError(f"Path {path} does not exist")
154
+ if not path_path.is_file():
155
+ raise ValueError(f"Path {path} is not a file")
156
+ return ModelSpec(**json.loads(path_path.read_text()))
157
+
158
+
159
+ def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
160
+ if len(missing) > 0 and len(unexpected) > 0:
161
+ logger.warning(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
162
+ logger.warning("\n" + "-" * 79 + "\n")
163
+ logger.warning(
164
+ f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
165
+ )
166
+ elif len(missing) > 0:
167
+ logger.warning(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
168
+ elif len(unexpected) > 0:
169
+ logger.warning(
170
+ f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
171
+ )
172
+
173
+
174
+ def load_flow_model(config: ModelSpec) -> Flux:
175
+ ckpt_path = config.ckpt_path
176
+
177
+ with torch.device("meta"):
178
+ model = Flux(config.params, dtype=into_dtype(config.flow_dtype)).type(
179
+ into_dtype(config.flow_dtype)
180
+ )
181
+
182
+ if ckpt_path is not None:
183
+ # load_sft doesn't support torch.device
184
+ sd = load_sft(ckpt_path, device="cpu")
185
+ missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
186
+ print_load_warning(missing, unexpected)
187
+ return model
188
+
189
+
190
+ def load_text_encoders(config: ModelSpec) -> tuple[HFEmbedder, HFEmbedder]:
191
+ clip = HFEmbedder(
192
+ "openai/clip-vit-large-patch14",
193
+ max_length=77,
194
+ torch_dtype=into_dtype(config.text_enc_dtype),
195
+ device=into_device(config.text_enc_device),
196
+ )
197
+ t5 = HFEmbedder(
198
+ config.text_enc_path,
199
+ max_length=config.text_enc_max_length,
200
+ torch_dtype=into_dtype(config.text_enc_dtype),
201
+ device=into_device(config.text_enc_device).index or 0,
202
+ )
203
+ return clip, t5
204
+
205
+
206
+ def load_autoencoder(config: ModelSpec) -> AutoEncoder:
207
+ ckpt_path = config.ae_path
208
+ with torch.device("meta" if ckpt_path is not None else config.ae_device):
209
+ ae = AutoEncoder(config.ae_params)
210
+
211
+ if ckpt_path is not None:
212
+ sd = load_sft(ckpt_path, device=str(config.ae_device))
213
+ missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
214
+ print_load_warning(missing, unexpected)
215
+ return ae
216
+
217
+
218
+ class LoadedModels(BaseModel):
219
+ flow: Flux
220
+ ae: AutoEncoder
221
+ clip: HFEmbedder
222
+ t5: HFEmbedder
223
+ config: ModelSpec
224
+
225
+ model_config = {
226
+ "arbitrary_types_allowed": True,
227
+ "use_enum_values": True,
228
+ }
229
+
230
+
231
+ def load_models_from_config_path(
232
+ path: str,
233
+ ) -> LoadedModels:
234
+ config = load_config_from_path(path)
235
+ clip, t5 = load_text_encoders(config)
236
+ return LoadedModels(
237
+ flow=load_flow_model(config),
238
+ ae=load_autoencoder(config),
239
+ clip=clip,
240
+ t5=t5,
241
+ config=config,
242
+ )
243
+
244
+
245
+ def load_models_from_config(config: ModelSpec) -> LoadedModels:
246
+ clip, t5 = load_text_encoders(config)
247
+ return LoadedModels(
248
+ flow=load_flow_model(config),
249
+ ae=load_autoencoder(config),
250
+ clip=clip,
251
+ t5=t5,
252
+ config=config,
253
+ )
254
+
255
+
256
+ if __name__ == "__main__":
257
+ p = "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft"
258
+ ae_p = "/big/generator-ui/flux-testing/flux/model-dir/ae.sft"
259
+
260
+ config = load_config(
261
+ ModelVersion.flux_dev,
262
+ flux_path=p,
263
+ ae_path=ae_p,
264
+ text_enc_path="city96/t5-v1_1-xxl-encoder-bf16",
265
+ text_enc_device="cuda:0",
266
+ ae_device="cuda:0",
267
+ flux_device="cuda:0",
268
+ flow_dtype="float16",
269
+ ae_dtype="bfloat16",
270
+ text_enc_dtype="bfloat16",
271
+ num_to_quant=20,
272
+ )
273
+ with open("configs/config-dev-cuda0.json", "w") as f:
274
+ json.dump(config.model_dump(), f, indent=2)
275
+ print(config)
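Finally, a minimal sketch of loading every component from a JSON config written by the block above (this assumes `configs/config-dev-cuda0.json` has been generated and its checkpoint paths point at real files):

```python
from util import load_models_from_config_path

models = load_models_from_config_path("configs/config-dev-cuda0.json")
flow, ae, clip, t5 = models.flow, models.ae, models.clip, models.t5
print(models.config.version, models.config.flux_device)
```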