multimodalart (HF staff) committed
Commit
3aff77a
1 Parent(s): 073f81a

Upload folder using huggingface_hub

Files changed (48)
  1. .gitattributes +11 -0
  2. .gitignore +162 -0
  3. README.md +46 -12
  4. app_ctrlx.py +412 -0
  5. assets/images/bear_avocado__spatext.jpg +0 -0
  6. assets/images/bedroom__sketch.jpg +0 -0
  7. assets/images/cat__mesh.jpg +0 -0
  8. assets/images/cat__point_cloud.jpg +0 -0
  9. assets/images/dog__sketch.jpg +0 -0
  10. assets/images/fruit_bowl.jpg +0 -0
  11. assets/images/grapes.jpg +0 -0
  12. assets/images/horse.jpg +0 -0
  13. assets/images/horse__point_cloud.jpg +0 -0
  14. assets/images/knight__humanoid.jpg +0 -0
  15. assets/images/library__mesh.jpg +0 -0
  16. assets/images/living_room__seg.jpg +0 -0
  17. assets/images/living_room_modern.jpg +0 -0
  18. assets/images/man_park.jpg +0 -0
  19. assets/images/person__mesh.jpg +0 -0
  20. assets/images/running__pose.jpg +0 -0
  21. assets/images/squirrel.jpg +0 -0
  22. assets/images/tiger.jpg +0 -0
  23. assets/images/van_gogh.jpg +0 -0
  24. ctrl_x/__init__.py +0 -0
  25. ctrl_x/pipelines/__init__.py +0 -0
  26. ctrl_x/pipelines/pipeline_sdxl.py +665 -0
  27. ctrl_x/utils/__init__.py +3 -0
  28. ctrl_x/utils/feature.py +79 -0
  29. ctrl_x/utils/media.py +21 -0
  30. ctrl_x/utils/sdxl.py +274 -0
  31. ctrl_x/utils/utils.py +88 -0
  32. docs/assets/bootstrap.min.css +0 -0
  33. docs/assets/cross_image_attention.jpg +3 -0
  34. docs/assets/ctrl-x.jpg +3 -0
  35. docs/assets/font.css +37 -0
  36. docs/assets/freecontrol.jpg +3 -0
  37. docs/assets/genforce.png +0 -0
  38. docs/assets/pipeline.jpg +3 -0
  39. docs/assets/results_animatediff.mp4 +3 -0
  40. docs/assets/results_multi_subject.jpg +3 -0
  41. docs/assets/results_struct+app.jpg +3 -0
  42. docs/assets/results_struct+app_2.jpg +3 -0
  43. docs/assets/results_struct+prompt.jpg +3 -0
  44. docs/assets/style.css +139 -0
  45. docs/assets/teaser_github.jpg +3 -0
  46. docs/assets/teaser_small.jpg +3 -0
  47. docs/index.html +186 -0
  48. environment.yaml +125 -0
.gitattributes CHANGED
@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/assets/cross_image_attention.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/ctrl-x.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/freecontrol.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/pipeline.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/results_animatediff.mp4 filter=lfs diff=lfs merge=lfs -text
+ docs/assets/results_multi_subject.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/results_struct+app.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/results_struct+app_2.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/results_struct+prompt.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/teaser_github.jpg filter=lfs diff=lfs merge=lfs -text
+ docs/assets/teaser_small.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
README.md CHANGED
@@ -1,12 +1,46 @@
- ---
- title: Ctrl X
- emoji: 🌖
- colorFrom: purple
- colorTo: gray
- sdk: gradio
- sdk_version: 4.44.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance (NeurIPS 2024)
+
+ <a href="https://arxiv.org/abs/2406.07540"><img src="https://img.shields.io/badge/arXiv-Paper-red"></a>
+ <a href="https://genforce.github.io/ctrl-x"><img src="https://img.shields.io/badge/Project-Page-yellow"></a>
+ [![GitHub](https://img.shields.io/github/stars/genforce/ctrl-x?style=social)](https://github.com/genforce/ctrl-x)
+
+ [Kuan Heng Lin](https://kuanhenglin.github.io)<sup>1*</sup>, [Sicheng Mo](https://sichengmo.github.io/)<sup>1*</sup>, [Ben Klingher](https://bklingher.github.io)<sup>1</sup>, [Fangzhou Mu](https://pages.cs.wisc.edu/~fmu/)<sup>2</sup>, [Bolei Zhou](https://boleizhou.github.io/)<sup>1</sup> <br>
+ <sup>1</sup>UCLA&emsp;<sup>2</sup>NVIDIA <br>
+ <sup>*</sup>Equal contribution <br>
+
+ ![Ctrl-X teaser figure](docs/assets/teaser_github.jpg)
+
+ ## Getting started
+
+ ### Environment setup
+
+ Our code is built on top of [`diffusers v0.28.0`](https://github.com/huggingface/diffusers). To set up the environment, please run the following.
+ ```
+ conda env create -f environment.yaml
+ conda activate ctrlx
+ ```
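If you prefer not to use conda, a rough pip-based setup covering the packages this repo imports (PyTorch, diffusers, Gradio, PyYAML) may also work. Note that `environment.yaml` remains the authoritative dependency list, and the pinned versions below are assumptions rather than tested requirements.
```
pip install torch "diffusers==0.28.0" gradio pyyaml
```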
+
+ ### Gradio demo
+
+ We provide a graphical user interface for testing our method. Run the following command to start the demo.
+ ```
+ python3 app_ctrlx.py
+ ```
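The demo also accepts an optional `-m`/`--model` argument (see the `ArgumentParser` at the top of `app_ctrlx.py`), which loads the SDXL base model from a single checkpoint file via `from_single_file` instead of the default `stabilityai/stable-diffusion-xl-base-1.0`. The path below is only a placeholder.
```
python3 app_ctrlx.py --model path/to/sdxl_checkpoint.safetensors
```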
+ Have fun playing around! :D
+
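For reference, here is a minimal sketch of how the Gradio app drives `CtrlXStableDiffusionXLPipeline` programmatically. This is an untested outline distilled from `app_ctrlx.py` and `ctrl_x/pipelines/pipeline_sdxl.py` in this commit rather than an official API: the control-schedule YAML is the default that `get_control_config(0.6, 0.6)` builds in `app_ctrlx.py`, and `register_control` / `get_self_recurrence_schedule` are the helpers `app_ctrlx.py` pulls in from `ctrl_x.utils`.

```python
import torch
import yaml
from diffusers import DDIMScheduler
from PIL import Image

from ctrl_x.pipelines.pipeline_sdxl import CtrlXStableDiffusionXLPipeline
from ctrl_x.utils import *        # register_control, get_self_recurrence_schedule, ... are provided
from ctrl_x.utils.sdxl import *   # by these two modules (imported the same way app_ctrlx.py does)

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
scheduler = DDIMScheduler.from_config(model_id, subfolder="scheduler")  # mirrors app_ctrlx.py
pipe = CtrlXStableDiffusionXLPipeline.from_pretrained(
    model_id, scheduler=scheduler, torch_dtype=dtype, use_safetensors=True
).to(device)

# Default control schedule: the same YAML the demo builds with both sliders at 0.6
control_config = """
control_schedule:
  encoder:
    0: [[ ], [ ], [ ]]
    1: [[ ], [ ], [0.6, 0.6]]
    2: [[ ], [ ], [0.6, 0.6]]
  middle: [[ ], [ ], [ ]]
  decoder:
    0: [[0.6], [0.6, 0.6, 0.6], [0.0, 0.6, 0.6]]
    1: [[ ], [ ], [0.6, 0.6]]
    2: [[ ], [ ], [ ]]
control_target:
  - [output_tensor]
  - [query, key]
  - [before]
self_recurrence_schedule:
  - [0.1, 0.5, 2]
"""
config = yaml.safe_load(control_config)

num_inference_steps = 50
pipe.scheduler.set_timesteps(num_inference_steps, device=device)
register_control(  # hooks structure/appearance control into the U-Net
    model=pipe,
    timesteps=pipe.scheduler.timesteps,
    control_schedule=config["control_schedule"],
    control_target=config["control_target"],
)
self_recurrence_schedule = get_self_recurrence_schedule(
    config["self_recurrence_schedule"], num_inference_steps
)

# Structure + appearance control; either image may be omitted for single-control modes
result, structure, appearance = pipe(
    prompt="a photo of a tiger standing on snow",
    structure_prompt="a 3D mesh of a cat",
    structure_image=Image.open("assets/images/cat__mesh.jpg"),
    appearance_image=Image.open("assets/images/tiger.jpg"),
    num_inference_steps=num_inference_steps,
    control_schedule=config["control_schedule"],
    self_recurrence_schedule=self_recurrence_schedule,
    return_dict=False,
)
result[0].save("output.jpg")
```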
+ ## Contact
+
+ For questions, thoughts, discussions, or anything else you would like to reach out about, please contact [Kuan Heng (Jordan) Lin](https://kuanhenglin.github.io) (kuanhenglin@ucla.edu).
+
+ ## Reference
+
+ If you use our code in your research, please cite the following work.
+
+ ```bibtex
+ @inproceedings{lin2024ctrlx,
+     author = {Lin, {Kuan Heng} and Mo, Sicheng and Klingher, Ben and Mu, Fangzhou and Zhou, Bolei},
+     booktitle = {Advances in Neural Information Processing Systems},
+     title = {Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance},
+     year = {2024}
+ }
+ ```
app_ctrlx.py ADDED
@@ -0,0 +1,412 @@
1
+ from argparse import ArgumentParser
2
+
3
+ from diffusers import DDIMScheduler, StableDiffusionXLImg2ImgPipeline
4
+ import gradio as gr
5
+ import torch
6
+ import yaml
7
+
8
+ from ctrl_x.pipelines.pipeline_sdxl import CtrlXStableDiffusionXLPipeline
9
+ from ctrl_x.utils import *
10
+ from ctrl_x.utils.sdxl import *
11
+
12
+
13
+ parser = ArgumentParser()
14
+ parser.add_argument("-m", "--model", type=str, default=None) # Optionally, load model checkpoint from single file
15
+ args = parser.parse_args()
16
+
17
+ torch.backends.cudnn.enabled = False # Sometimes necessary to suppress CUDNN_STATUS_NOT_SUPPORTED
18
+
19
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
20
+
21
+ model_id_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
22
+ refiner_id_or_path = "stabilityai/stable-diffusion-xl-refiner-1.0"
23
+ device = "cuda" if torch.cuda.is_available() else "cpu"
24
+ variant = "fp16" if device == "cuda" else None  # diffusers ships no "fp32" variant; use default weights on CPU
25
+
26
+ scheduler = DDIMScheduler.from_config(model_id_or_path, subfolder="scheduler") # TODO: Support other schedulers
27
+ if args.model is None:
28
+ pipe = CtrlXStableDiffusionXLPipeline.from_pretrained(
29
+ model_id_or_path, scheduler=scheduler, torch_dtype=torch_dtype, variant=variant, use_safetensors=True
30
+ )
31
+ else:
32
+ print(f"Using weights {args.model} for SDXL base model.")
33
+ pipe = CtrlXStableDiffusionXLPipeline.from_single_file(args.model, scheduler=scheduler, torch_dtype=torch_dtype)
34
+ refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
35
+ refiner_id_or_path, scheduler=scheduler, text_encoder_2=pipe.text_encoder_2, vae=pipe.vae,
36
+ torch_dtype=torch_dtype, variant=variant, use_safetensors=True,
37
+ )
38
+
39
+ if torch.cuda.is_available():
40
+ pipe = pipe.to("cuda")
41
+ refiner = refiner.to("cuda")
42
+
43
+
44
+ def get_control_config(structure_schedule, appearance_schedule):
45
+ s = structure_schedule
46
+ a = appearance_schedule
47
+
48
+ control_config =\
49
+ f"""control_schedule:
50
+ # structure_conv structure_attn appearance_attn conv/attn
51
+ encoder: # (num layers)
52
+ 0: [[ ], [ ], [ ]] # 2/0
53
+ 1: [[ ], [ ], [{a}, {a} ]] # 2/2
54
+ 2: [[ ], [ ], [{a}, {a} ]] # 2/2
55
+ middle: [[ ], [ ], [ ]] # 2/1
56
+ decoder:
57
+ 0: [[{s} ], [{s}, {s}, {s}], [0.0, {a}, {a}]] # 3/3
58
+ 1: [[ ], [ ], [{a}, {a} ]] # 3/3
59
+ 2: [[ ], [ ], [ ]] # 3/0
60
+
61
+ control_target:
62
+ - [output_tensor] # structure_conv choices: {{hidden_states, output_tensor}}
63
+ - [query, key] # structure_attn choices: {{query, key, value}}
64
+ - [before] # appearance_attn choices: {{before, value, after}}
65
+
66
+ self_recurrence_schedule:
67
+ - [0.1, 0.5, 2] # format: [start, end, num_recurrence]"""
68
+
69
+ return control_config
70
+
71
+
72
+ css = """
73
+ .config textarea {font-family: monospace; font-size: 80%; white-space: pre}
74
+ .mono {font-family: monospace}
75
+ """
76
+
77
+ title = """
78
+ <div style="display: flex; align-items: center; justify-content: center;margin-bottom: -15px">
79
+ <h1 style="margin-left: 12px;text-align: center;display: inline-block">
80
+ Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance
81
+ </h1>
82
+ <h3 style="display: inline-block; margin-left: 10px; margin-top: 7.5px; font-weight: 500">
83
+ SDXL v1.0
84
+ </h3>
85
+ </div>
86
+ <div style="display: flex; align-items: center; justify-content: center;margin-bottom: 25px">
87
+ <h3 style="text-align: center">
88
+ [<a href="https://genforce.github.io/ctrl-x/">Page</a>]
89
+ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
90
+ [<a href="https://arxiv.org/abs/2406.07540">Paper</a>]
91
+ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
92
+ [<a href="https://github.com/genforce/ctrl-x">Code</a>]
93
+ </h3>
94
+ </div>
95
+ <div>
96
+ <p>
97
+ <b>Ctrl-X</b> is a simple training-free and guidance-free framework for text-to-image (T2I) generation with
98
+ structure and appearance control. Given a structure image and an appearance image, Ctrl-X applies
+ feedforward structure control to align the output with the structure image, and semantic-aware
+ appearance transfer to carry the look of the appearance image over to the output.
101
+ </p>
102
+ <p>
103
+ Here are some notes and tips for this demo:
104
+ </p>
105
+ <ul>
106
+ <li> On input images:
107
+ <ul>
108
+ <li>
109
+ If both the structure and appearance images are provided, then Ctrl-X does <i>structure and
110
+ appearance</i> control.
111
+ </li>
112
+ <li>
113
+ If only the structure image is provided, then Ctrl-X does <i>structure-only</i> control and the
114
+ appearance image is jointly generated with the output image.
115
+ </li>
116
+ <li>
117
+ Similarly, if only the appearance image is provided, then Ctrl-X does <i>appearance-only</i>
118
+ control.
119
+ </li>
120
+ </ul>
121
+ </li>
122
+ <li> On prompts:
123
+ <ul>
124
+ <li>
125
+ Though the output prompt can noticeably affect the output image, the "accuracy" of the
+ structure and appearance prompts has little impact on the final image.
127
+ </li>
128
+ <li>
129
+ If the structure or appearance prompt is left blank, then it uses the (non-optional) output prompt
130
+ by default.
131
+ </li>
132
+ </ul>
133
+ </li>
134
+ <li> On control schedules:
135
+ <ul>
136
+ <li>
137
+ When "Use advanced config" is <b>OFF</b>, the demo uses the structure guidance
138
+ (<span class="mono">structure_conv</span> and <span class="mono">structure_attn</span>
139
+ in the advanced config) and appearance guidance (<span class="mono">appearance_attn</span> in the
140
+ advanced config) sliders to change the control schedules.
141
+ </li>
142
+ <li>
143
+ Otherwise, the demo uses "Advanced control config," which allows per-layer structure and
144
+ appearance schedule control, along with self-recurrence control. <i>This should be used
145
+ carefully</i>, and we recommend switching "Use advanced config" <b>OFF</b> in most cases. (For the
146
+ examples provided at the bottom of the demo, the advanced config uses the default schedules that
147
+ may not be the best settings for these examples.)
148
+ </li>
149
+ </ul>
150
+ </li>
151
+ </ul>
152
+ <p>
153
+ Have fun! :D
154
+ </p>
155
+ </div>
156
+ """
157
+
158
+
159
+ def inference(
160
+ structure_image, appearance_image,
161
+ prompt, structure_prompt, appearance_prompt,
162
+ positive_prompt, negative_prompt,
163
+ guidance_scale, structure_guidance_scale, appearance_guidance_scale,
164
+ num_inference_steps, eta, seed,
165
+ width, height,
166
+ structure_schedule, appearance_schedule, use_advanced_config,
167
+ control_config,
168
+ ):
169
+ torch.manual_seed(seed)
170
+
171
+ pipe.scheduler.set_timesteps(num_inference_steps, device=device)
172
+ timesteps = pipe.scheduler.timesteps
173
+
174
+ print(f"\nUsing the following control config (use_advanced_config={use_advanced_config}):")
175
+ if not use_advanced_config:
176
+ control_config = get_control_config(structure_schedule, appearance_schedule)
177
+ print(control_config, end="\n\n")
178
+
179
+ config = yaml.safe_load(control_config)
180
+ register_control(
181
+ model = pipe,
182
+ timesteps = timesteps,
183
+ control_schedule = config["control_schedule"],
184
+ control_target = config["control_target"],
185
+ )
186
+
187
+ pipe.safety_checker = None
188
+ pipe.requires_safety_checker = False
189
+
190
+ self_recurrence_schedule = get_self_recurrence_schedule(config["self_recurrence_schedule"], num_inference_steps)
191
+
192
+ pipe.set_progress_bar_config(desc="Ctrl-X inference")
193
+ refiner.set_progress_bar_config(desc="Refiner")
194
+
195
+ result, structure, appearance = pipe(
196
+ prompt = prompt,
197
+ structure_prompt = structure_prompt,
198
+ appearance_prompt = appearance_prompt,
199
+ structure_image = structure_image,
200
+ appearance_image = appearance_image,
201
+ num_inference_steps = num_inference_steps,
202
+ negative_prompt = negative_prompt,
203
+ positive_prompt = positive_prompt,
204
+ height = height,
205
+ width = width,
206
+ guidance_scale = guidance_scale,
207
+ structure_guidance_scale = structure_guidance_scale,
208
+ appearance_guidance_scale = appearance_guidance_scale,
209
+ eta = eta,
210
+ output_type = "pil",
211
+ return_dict = False,
212
+ control_schedule = config["control_schedule"],
213
+ self_recurrence_schedule = self_recurrence_schedule,
214
+ )
215
+
216
+ result_refiner = refiner(
217
+ image = pipe.refiner_args["latents"],
218
+ prompt = pipe.refiner_args["prompt"],
219
+ negative_prompt = pipe.refiner_args["negative_prompt"],
220
+ height = height,
221
+ width = width,
222
+ num_inference_steps = num_inference_steps,
223
+ guidance_scale = guidance_scale,
224
+ guidance_rescale = 0.7,
225
+ num_images_per_prompt = 1,
226
+ eta = eta,
227
+ output_type = "pil",
228
+ ).images
229
+ del pipe.refiner_args
230
+
231
+ return [result[0], result_refiner[0], structure[0], appearance[0]]
232
+
233
+
234
+ with gr.Blocks(theme=gr.themes.Default(), css=css, title="Ctrl-X (SDXL v1.0)") as app:
235
+ gr.HTML(title)
236
+
237
+ with gr.Row():
238
+
239
+ with gr.Column(scale=55):
240
+ with gr.Group():
241
+ kwargs = {} # {"width": 400, "height": 400}
242
+ with gr.Row():
243
+ result = gr.Image(label="Output image", format="jpg", **kwargs)
244
+ result_refiner = gr.Image(label="Output image w/ refiner", format="jpg", **kwargs)
245
+ with gr.Row():
246
+ structure_recon = gr.Image(label="Structure image", format="jpg", **kwargs)
247
+ appearance_recon = gr.Image(label="Style image", format="jpg", **kwargs)
248
+ with gr.Row():
249
+ structure_image = gr.Image(label="Upload structure image (optional)", type="pil", **kwargs)
250
+ appearance_image = gr.Image(label="Upload appearance image (optional)", type="pil", **kwargs)
251
+
252
+ with gr.Column(scale=45):
253
+ with gr.Group():
254
+ with gr.Row():
255
+ structure_prompt = gr.Textbox(label="Structure prompt (optional)", placeholder="Prompt which describes the structure image")
256
+ appearance_prompt = gr.Textbox(label="Appearance prompt (optional)", placeholder="Prompt which describes the style image")
257
+ with gr.Row():
258
+ prompt = gr.Textbox(label="Output prompt", placeholder="Prompt which describes the output image")
259
+ with gr.Row():
260
+ positive_prompt = gr.Textbox(label="Positive prompt", value="high quality", placeholder="")
261
+ negative_prompt = gr.Textbox(label="Negative prompt", value="ugly, blurry, dark, low res, unrealistic", placeholder="")
262
+ with gr.Row():
263
+ guidance_scale = gr.Slider(label="Target guidance scale", value=5.0, minimum=1, maximum=10)
264
+ structure_guidance_scale = gr.Slider(label="Structure guidance scale", value=5.0, minimum=1, maximum=10)
265
+ appearance_guidance_scale = gr.Slider(label="Appearance guidance scale", value=5.0, minimum=1, maximum=10)
266
+ with gr.Row():
267
+ num_inference_steps = gr.Slider(label="# inference steps", value=50, minimum=1, maximum=200, step=1)
268
+ eta = gr.Slider(label="Eta (noise)", value=1.0, minimum=0, maximum=1.0, step=0.01)
269
+ seed = gr.Slider(0, 2147483647, label="Seed", value=90095, step=1)
270
+ with gr.Row():
271
+ width = gr.Slider(label="Width", value=1024, minimum=256, maximum=2048, step=pipe.vae_scale_factor)
272
+ height = gr.Slider(label="Height", value=1024, minimum=256, maximum=2048, step=pipe.vae_scale_factor)
273
+ with gr.Row():
274
+ structure_schedule = gr.Slider(label="Structure schedule", value=0.6, minimum=0.0, maximum=1.0, step=0.01, scale=2)
275
+ appearance_schedule = gr.Slider(label="Appearance schedule", value=0.6, minimum=0.0, maximum=1.0, step=0.01, scale=2)
276
+ use_advanced_config = gr.Checkbox(label="Use advanced config", value=False, scale=1)
277
+ with gr.Row():
278
+ control_config = gr.Textbox(
279
+ label="Advanced control config", lines=20, value=get_control_config(0.6, 0.6), elem_classes=["config"], visible=False,
280
+ )
281
+ use_advanced_config.change(
282
+ fn=lambda value: gr.update(visible=value), inputs=use_advanced_config, outputs=control_config,
283
+ )
284
+ with gr.Row():
285
+ generate = gr.Button(value="Run")
286
+
287
+ inputs = [
288
+ structure_image, appearance_image,
289
+ prompt, structure_prompt, appearance_prompt,
290
+ positive_prompt, negative_prompt,
291
+ guidance_scale, structure_guidance_scale, appearance_guidance_scale,
292
+ num_inference_steps, eta, seed,
293
+ width, height,
294
+ structure_schedule, appearance_schedule, use_advanced_config,
295
+ control_config,
296
+ ]
297
+ outputs = [result, result_refiner, structure_recon, appearance_recon]
298
+
299
+ generate.click(inference, inputs=inputs, outputs=outputs)
300
+
301
+ examples = gr.Examples(
302
+ [
303
+ [
304
+ "assets/images/horse__point_cloud.jpg",
305
+ "assets/images/horse.jpg",
306
+ "a 3D point cloud of a horse",
307
+ "",
308
+ "a photo of a horse standing on grass",
309
+ 0.6, 0.6,
310
+ ],
311
+ [
312
+ "assets/images/cat__mesh.jpg",
313
+ "assets/images/tiger.jpg",
314
+ "a 3D mesh of a cat",
315
+ "",
316
+ "a photo of a tiger standing on snow",
317
+ 0.6, 0.6,
318
+ ],
319
+ [
320
+ "assets/images/dog__sketch.jpg",
321
+ "assets/images/squirrel.jpg",
322
+ "a sketch of a dog",
323
+ "",
324
+ "a photo of a squirrel",
325
+ 0.6, 0.6,
326
+ ],
327
+ [
328
+ "assets/images/living_room__seg.jpg",
329
+ "assets/images/van_gogh.jpg",
330
+ "a segmentation map of a living room",
331
+ "",
332
+ "a Van Gogh painting of a living room",
333
+ 0.6, 0.6,
334
+ ],
335
+ [
336
+ "assets/images/bedroom__sketch.jpg",
337
+ "assets/images/living_room_modern.jpg",
338
+ "a sketch of a bedroom",
339
+ "",
340
+ "a photo of a modern bedroom during sunset",
341
+ 0.6, 0.6,
342
+ ],
343
+ [
344
+ "assets/images/running__pose.jpg",
345
+ "assets/images/man_park.jpg",
346
+ "a pose image of a person running",
347
+ "",
348
+ "a photo of a man running in a park",
349
+ 0.4, 0.6,
350
+ ],
351
+ [
352
+ "assets/images/fruit_bowl.jpg",
353
+ "assets/images/grapes.jpg",
354
+ "a photo of a bowl of fruits",
355
+ "",
356
+ "a photo of a bowl of grapes in the trees",
357
+ 0.6, 0.6,
358
+ ],
359
+ [
360
+ "assets/images/bear_avocado__spatext.jpg",
361
+ None,
362
+ "a segmentation map of a bear and an avocado",
363
+ "",
364
+ "a realistic photo of a bear and an avocado in a forest",
365
+ 0.6, 0.6,
366
+ ],
367
+ [
368
+ "assets/images/cat__point_cloud.jpg",
369
+ None,
370
+ "a 3D point cloud of a cat",
371
+ "",
372
+ "an embroidery of a white cat sitting on a rock under the night sky",
373
+ 0.6, 0.6,
374
+ ],
375
+ [
376
+ "assets/images/library__mesh.jpg",
377
+ None,
378
+ "a 3D mesh of a library",
379
+ "",
380
+ "a Polaroid photo of an old library, sunlight streaming in",
381
+ 0.6, 0.6,
382
+ ],
383
+ [
384
+ "assets/images/knight__humanoid.jpg",
385
+ None,
386
+ "a 3D model of a person holding a sword and shield",
387
+ "",
388
+ "a photo of a medieval soldier standing on a barren field, raining",
389
+ 0.6, 0.6,
390
+ ],
391
+ [
392
+ "assets/images/person__mesh.jpg",
393
+ None,
394
+ "a 3D mesh of a person",
395
+ "",
396
+ "a photo of a Karate man performing in a cyberpunk city at night",
397
+ 0.5, 0.6,
398
+ ],
399
+ ],
400
+ [
401
+ structure_image,
402
+ appearance_image,
403
+ structure_prompt,
404
+ appearance_prompt,
405
+ prompt,
406
+ structure_schedule,
407
+ appearance_schedule,
408
+ ],
409
+ examples_per_page=50,
410
+ )
411
+
412
+ app.launch(debug=False, share=False)
assets/images/bear_avocado__spatext.jpg ADDED
assets/images/bedroom__sketch.jpg ADDED
assets/images/cat__mesh.jpg ADDED
assets/images/cat__point_cloud.jpg ADDED
assets/images/dog__sketch.jpg ADDED
assets/images/fruit_bowl.jpg ADDED
assets/images/grapes.jpg ADDED
assets/images/horse.jpg ADDED
assets/images/horse__point_cloud.jpg ADDED
assets/images/knight__humanoid.jpg ADDED
assets/images/library__mesh.jpg ADDED
assets/images/living_room__seg.jpg ADDED
assets/images/living_room_modern.jpg ADDED
assets/images/man_park.jpg ADDED
assets/images/person__mesh.jpg ADDED
assets/images/running__pose.jpg ADDED
assets/images/squirrel.jpg ADDED
assets/images/tiger.jpg ADDED
assets/images/van_gogh.jpg ADDED
ctrl_x/__init__.py ADDED
File without changes
ctrl_x/pipelines/__init__.py ADDED
File without changes
ctrl_x/pipelines/pipeline_sdxl.py ADDED
@@ -0,0 +1,665 @@
1
+ from copy import deepcopy
2
+ from dataclasses import dataclass
3
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
4
+
5
+ from diffusers import StableDiffusionXLPipeline
6
+ from diffusers.image_processor import PipelineImageInput
7
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import\
8
+ rescale_noise_cfg, retrieve_latents, retrieve_timesteps
9
+ from diffusers.utils import BaseOutput, deprecate
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ import numpy as np
12
+ import PIL
13
+ import torch
14
+
15
+ from ..utils import *
16
+ from ..utils.sdxl import *
17
+
18
+
19
+ BATCH_ORDER = [
20
+ "structure_uncond", "appearance_uncond", "uncond", "structure_cond", "appearance_cond", "cond",
21
+ ]
22
+
23
+
24
+ def get_last_control_i(control_schedule, num_inference_steps):
25
+ if control_schedule is None:
26
+ return num_inference_steps, num_inference_steps
27
+
28
+ def max_(l):
29
+ if len(l) == 0:
30
+ return 0.0
31
+ return max(l)
32
+
33
+ structure_max = 0.0
34
+ appearance_max = 0.0
35
+ for block in control_schedule.values():
36
+ if isinstance(block, list): # Handling mid_block
37
+ block = {0: block}
38
+ for layer in block.values():
39
+ structure_max = max(structure_max, max_(layer[0] + layer[1]))
40
+ appearance_max = max(appearance_max, max_(layer[2]))
41
+
42
+ structure_i = round(num_inference_steps * structure_max)
43
+ appearance_i = round(num_inference_steps * appearance_max)
44
+ return structure_i, appearance_i
45
+
46
+
47
+ @dataclass
48
+ class CtrlXStableDiffusionXLPipelineOutput(BaseOutput):
49
+ images: Union[List[PIL.Image.Image], np.ndarray]
50
+ structures: Union[List[PIL.Image.Image], np.ndarray]
+ appearances: Union[List[PIL.Image.Image], np.ndarray]
52
+
53
+
54
+ class CtrlXStableDiffusionXLPipeline(StableDiffusionXLPipeline): # diffusers==0.28.0
55
+
56
+ def prepare_latents(
57
+ self, image, batch_size, num_images_per_prompt, num_channels_latents, height, width,
58
+ dtype, device, generator=None, noise=None,
59
+ ):
60
+ batch_size = batch_size * num_images_per_prompt
61
+
62
+ if noise is None:
63
+ shape = (
64
+ batch_size,
65
+ num_channels_latents,
66
+ height // self.vae_scale_factor,
67
+ width // self.vae_scale_factor
68
+ )
69
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
70
+ noise = noise * self.scheduler.init_noise_sigma # Starting noise, need to scale
71
+ else:
72
+ noise = noise.to(device)
73
+
74
+ if image is None:
75
+ return noise, None
76
+
77
+ if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
78
+ raise ValueError(
79
+ f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
80
+ )
81
+
82
+ # Offload text encoder if `enable_model_cpu_offload` was enabled
83
+ if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
84
+ self.text_encoder_2.to("cpu")
85
+ torch.cuda.empty_cache()
86
+
87
+ image = image.to(device=device, dtype=dtype)
88
+
89
+ if image.shape[1] == 4: # Image already in latents form
90
+ init_latents = image
91
+
92
+ else:
93
+ # Make sure the VAE is in float32 mode, as it overflows in float16
94
+ if self.vae.config.force_upcast:
95
+ image = image.to(torch.float32)
96
+ self.vae.to(torch.float32)
97
+
98
+ if isinstance(generator, list) and len(generator) != batch_size:
99
+ raise ValueError(
100
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
101
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
102
+ )
103
+ elif isinstance(generator, list):
104
+ init_latents = [
105
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
106
+ for i in range(batch_size)
107
+ ]
108
+ init_latents = torch.cat(init_latents, dim=0)
109
+ else:
110
+ init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
111
+
112
+ if self.vae.config.force_upcast:
113
+ self.vae.to(dtype)
114
+
115
+ init_latents = init_latents.to(dtype)
116
+ init_latents = self.vae.config.scaling_factor * init_latents
117
+
118
+ if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
119
+ # Expand init_latents for batch_size
120
+ additional_image_per_prompt = batch_size // init_latents.shape[0]
121
+ init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
122
+ elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
123
+ raise ValueError(
124
+ f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
125
+ )
126
+ else:
127
+ init_latents = torch.cat([init_latents], dim=0)
128
+
129
+ return noise, init_latents
130
+
131
+ @property
132
+ def structure_guidance_scale(self):
133
+ return self._guidance_scale if self._structure_guidance_scale is None else self._structure_guidance_scale
134
+
135
+ @property
136
+ def appearance_guidance_scale(self):
137
+ return self._guidance_scale if self._appearance_guidance_scale is None else self._appearance_guidance_scale
138
+
139
+ @torch.no_grad()
140
+ def __call__(
141
+ self,
142
+ prompt: Union[str, List[str]] = None, # TODO: Support prompt_2 and negative_prompt_2
143
+ structure_prompt: Optional[Union[str, List[str]]] = None,
144
+ appearance_prompt: Optional[Union[str, List[str]]] = None,
145
+ structure_image: Optional[PipelineImageInput] = None,
146
+ appearance_image: Optional[PipelineImageInput] = None,
147
+ num_inference_steps: int = 50,
148
+ timesteps: List[int] = None,
149
+ negative_prompt: Optional[Union[str, List[str]]] = None,
150
+ positive_prompt: Optional[Union[str, List[str]]] = None,
151
+ height: Optional[int] = None,
152
+ width: Optional[int] = None,
153
+ guidance_scale: float = 5.0,
154
+ structure_guidance_scale: Optional[float] = None,
155
+ appearance_guidance_scale: Optional[float] = None,
156
+ num_images_per_prompt: Optional[int] = 1,
157
+ eta: float = 0.0,
158
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
159
+ latents: Optional[torch.Tensor] = None,
160
+ structure_latents: Optional[torch.Tensor] = None,
161
+ appearance_latents: Optional[torch.Tensor] = None,
162
+ prompt_embeds: Optional[torch.Tensor] = None, # Positive prompt is concatenated with prompt, so no embeddings
163
+ structure_prompt_embeds: Optional[torch.Tensor] = None,
164
+ appearance_prompt_embeds: Optional[torch.Tensor] = None,
165
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
166
+ pooled_prompt_embeds: Optional[torch.Tensor] = None,
167
+ structure_pooled_prompt_embeds: Optional[torch.Tensor] = None,
168
+ appearance_pooled_prompt_embeds: Optional[torch.Tensor] = None,
169
+ negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
170
+ control_schedule: Optional[Dict] = None,
171
+ self_recurrence_schedule: Optional[List[int]] = [], # Format: [(start, end, num_repeat)]
172
+ decode_structure: Optional[bool] = True,
173
+ decode_appearance: Optional[bool] = True,
174
+ output_type: Optional[str] = "pil",
175
+ return_dict: bool = True,
176
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
177
+ guidance_rescale: float = 0.0,
178
+ original_size: Tuple[int, int] = None,
179
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
180
+ target_size: Tuple[int, int] = None,
181
+ clip_skip: Optional[int] = None,
182
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
183
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
184
+ **kwargs,
185
+ ):
186
+ # TODO: Add function argument documentation
187
+
188
+ callback = kwargs.pop("callback", None)
189
+ callback_steps = kwargs.pop("callback_steps", None)
190
+
191
+ if callback is not None:
192
+ deprecate(
193
+ "callback",
194
+ "1.0.0",
195
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
196
+ )
197
+ if callback_steps is not None:
198
+ deprecate(
199
+ "callback_steps",
200
+ "1.0.0",
201
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
202
+ )
203
+
204
+ # 0. Default height and width to U-Net
205
+ height = height or self.default_sample_size * self.vae_scale_factor
206
+ width = width or self.default_sample_size * self.vae_scale_factor
207
+ original_size = original_size or (height, width)
208
+ target_size = target_size or (height, width)
209
+
210
+ # 1. Check inputs. Raise error if not correct
211
+ self.check_inputs( # TODO: Custom check_inputs for our method
212
+ prompt,
213
+ None, # prompt_2
214
+ height,
215
+ width,
216
+ callback_steps,
217
+ negative_prompt = negative_prompt,
218
+ negative_prompt_2 = None, # negative_prompt_2
219
+ prompt_embeds = prompt_embeds,
220
+ negative_prompt_embeds = negative_prompt_embeds,
221
+ pooled_prompt_embeds = pooled_prompt_embeds,
222
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds,
223
+ callback_on_step_end_tensor_inputs = callback_on_step_end_tensor_inputs,
224
+ )
225
+
226
+ self._guidance_scale = guidance_scale
227
+ self._structure_guidance_scale = structure_guidance_scale
228
+ self._appearance_guidance_scale = appearance_guidance_scale
229
+ self._guidance_rescale = guidance_rescale
230
+ self._clip_skip = clip_skip
231
+ self._cross_attention_kwargs = cross_attention_kwargs
232
+ self._denoising_end = None # denoising_end
233
+ self._denoising_start = None # denoising_start
234
+ self._interrupt = False
235
+
236
+ # 2. Define call parameters
237
+ if prompt is not None and isinstance(prompt, str):
238
+ batch_size = 1
239
+ elif prompt is not None and isinstance(prompt, list):
240
+ batch_size = len(prompt)
241
+ else:
242
+ batch_size = prompt_embeds.shape[0]
243
+
244
+ if batch_size * num_images_per_prompt != 1:
245
+ raise ValueError(
246
+ f"Pipeline currently does not support batch_size={batch_size} and num_images_per_prompt=1. "
247
+ "Effective batch size (batch_size * num_images_per_prompt) must be 1."
248
+ )
249
+
250
+ device = self._execution_device
251
+
252
+ # 3. Encode input prompt
253
+ text_encoder_lora_scale = (
254
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
255
+ )
256
+
257
+ if positive_prompt is not None and positive_prompt != "":
258
+ prompt = prompt + ", " + positive_prompt # Add positive prompt with comma
259
+ # By default, only add positive prompt to the appearance prompt and not the structure prompt
260
+ if appearance_prompt is not None and appearance_prompt != "":
261
+ appearance_prompt = appearance_prompt + ", " + positive_prompt
262
+
263
+ (
264
+ prompt_embeds_,
265
+ negative_prompt_embeds,
266
+ pooled_prompt_embeds_,
267
+ negative_pooled_prompt_embeds,
268
+ ) = self.encode_prompt(
269
+ prompt = prompt,
270
+ prompt_2 = None, # prompt_2
271
+ device = device,
272
+ num_images_per_prompt = num_images_per_prompt,
273
+ do_classifier_free_guidance = True, # self.do_classifier_free_guidance, TODO: Support no CFG
274
+ negative_prompt = negative_prompt,
275
+ negative_prompt_2 = None, # negative_prompt_2
276
+ prompt_embeds = prompt_embeds,
277
+ negative_prompt_embeds = negative_prompt_embeds,
278
+ pooled_prompt_embeds = pooled_prompt_embeds,
279
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds,
280
+ lora_scale = text_encoder_lora_scale,
281
+ clip_skip = self.clip_skip,
282
+ )
283
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds_], dim=0).to(device)
284
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds_], dim=0).to(device)
285
+
286
+ # 3.1. Structure prompt embeddings
287
+ if structure_prompt is not None and structure_prompt != "":
288
+ (
289
+ structure_prompt_embeds,
290
+ negative_structure_prompt_embeds,
291
+ structure_pooled_prompt_embeds,
292
+ negative_structure_pooled_prompt_embeds,
293
+ ) = self.encode_prompt(
294
+ prompt = structure_prompt,
295
+ prompt_2 = None, # prompt_2
296
+ device = device,
297
+ num_images_per_prompt = num_images_per_prompt,
298
+ do_classifier_free_guidance = True, # self.do_classifier_free_guidance, TODO: Support no CFG
299
+ negative_prompt = negative_prompt if structure_image is None else "",
300
+ negative_prompt_2 = None, # negative_prompt_2
301
+ prompt_embeds = structure_prompt_embeds,
302
+ negative_prompt_embeds = None, # negative_prompt_embeds
303
+ pooled_prompt_embeds = structure_pooled_prompt_embeds,
304
+ negative_pooled_prompt_embeds = None, # negative_pooled_prompt_embeds
305
+ lora_scale = text_encoder_lora_scale,
306
+ clip_skip = self.clip_skip,
307
+ )
308
+ structure_prompt_embeds = torch.cat(
309
+ [negative_structure_prompt_embeds, structure_prompt_embeds], dim=0
310
+ ).to(device)
311
+ structure_add_text_embeds = torch.cat(
312
+ [negative_structure_pooled_prompt_embeds, structure_pooled_prompt_embeds], dim=0
313
+ ).to(device)
314
+ else:
315
+ structure_prompt_embeds = prompt_embeds
316
+ structure_add_text_embeds = add_text_embeds
317
+
318
+ # 3.2. Appearance prompt embeddings
319
+ if appearance_prompt is not None and appearance_prompt != "":
320
+ (
321
+ appearance_prompt_embeds,
322
+ negative_appearance_prompt_embeds,
323
+ appearance_pooled_prompt_embeds,
324
+ negative_appearance_pooled_prompt_embeds,
325
+ ) = self.encode_prompt(
326
+ prompt = appearance_prompt,
327
+ prompt_2 = None, # prompt_2
328
+ device = device,
329
+ num_images_per_prompt = num_images_per_prompt,
330
+ do_classifier_free_guidance = True, # self.do_classifier_free_guidance, TODO: Support no CFG
331
+ negative_prompt = negative_prompt if appearance_image is None else "",
332
+ negative_prompt_2 = None, # negative_prompt_2
333
+ prompt_embeds = appearance_prompt_embeds,
334
+ negative_prompt_embeds = None, # negative_prompt_embeds
335
+ pooled_prompt_embeds = appearance_pooled_prompt_embeds, # pooled_prompt_embeds
336
+ negative_pooled_prompt_embeds = None, # negative_pooled_prompt_embeds
337
+ lora_scale = text_encoder_lora_scale,
338
+ clip_skip = self.clip_skip,
339
+ )
340
+ appearance_prompt_embeds = torch.cat(
341
+ [negative_appearance_prompt_embeds, appearance_prompt_embeds], dim=0
342
+ ).to(device)
343
+ appearance_add_text_embeds = torch.cat(
344
+ [negative_appearance_pooled_prompt_embeds, appearance_pooled_prompt_embeds], dim=0
345
+ ).to(device)
346
+ else:
347
+ appearance_prompt_embeds = prompt_embeds
348
+ appearance_add_text_embeds = add_text_embeds
349
+
350
+ # 3.3. Prepare added time ids & embeddings, TODO: Support no CFG
351
+ if self.text_encoder_2 is None:
352
+ text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
353
+ else:
354
+ text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
355
+
356
+ add_time_ids = self._get_add_time_ids(
357
+ original_size,
358
+ crops_coords_top_left,
359
+ target_size,
360
+ dtype = prompt_embeds.dtype,
361
+ text_encoder_projection_dim = text_encoder_projection_dim,
362
+ )
363
+ negative_add_time_ids = add_time_ids
364
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0).to(device)
365
+
366
+ # 4. Prepare timesteps
367
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
368
+
369
+ # 5. Prepare latent variables
370
+ num_channels_latents = self.unet.config.in_channels
371
+
372
+ latents, _ = self.prepare_latents(
373
+ None, batch_size, num_images_per_prompt, num_channels_latents, height, width,
374
+ prompt_embeds.dtype, device, generator, latents
375
+ )
376
+
377
+ if structure_image is not None:
378
+ structure_image = preprocess( # Center crop + resize
379
+ structure_image, self.image_processor, height=height, width=width, resize_mode="crop"
380
+ )
381
+ _, clean_structure_latents = self.prepare_latents(
382
+ structure_image, batch_size, num_images_per_prompt, num_channels_latents, height, width,
383
+ prompt_embeds.dtype, device, generator, structure_latents,
384
+ )
385
+ else:
386
+ clean_structure_latents = None
387
+ structure_latents = latents if structure_latents is None else structure_latents
388
+
389
+ if appearance_image is not None:
390
+ appearance_image = preprocess( # Center crop + resize
391
+ appearance_image, self.image_processor, height=height, width=width, resize_mode="crop"
392
+ )
393
+ _, clean_appearance_latents = self.prepare_latents(
394
+ appearance_image, batch_size, num_images_per_prompt, num_channels_latents, height, width,
395
+ prompt_embeds.dtype, device, generator, appearance_latents,
396
+ )
397
+ else:
398
+ clean_appearance_latents = None
399
+ appearance_latents = latents if appearance_latents is None else appearance_latents
400
+
401
+ # 6. Prepare extra step kwargs
402
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
403
+
404
+ # 7. Denoising loop
405
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
406
+
407
+ # 7.1 Apply denoising_end
408
+ def denoising_value_valid(dnv):
409
+ return isinstance(self.denoising_end, float) and 0 < dnv < 1
410
+
411
+ if (
412
+ self.denoising_end is not None
413
+ and self.denoising_start is not None
414
+ and denoising_value_valid(self.denoising_end)
415
+ and denoising_value_valid(self.denoising_start)
416
+ and self.denoising_start >= self.denoising_end
417
+ ):
418
+ raise ValueError(
419
+ f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: "
420
+ + f" {self.denoising_end} when using type float."
421
+ )
422
+ elif self.denoising_end is not None and denoising_value_valid(self.denoising_end):
423
+ discrete_timestep_cutoff = int(
424
+ round(
425
+ self.scheduler.config.num_train_timesteps
426
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
427
+ )
428
+ )
429
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
430
+ timesteps = timesteps[:num_inference_steps]
431
+
432
+ # 7.2 Optionally get guidance scale embedding
433
+ timestep_cond = None
434
+ if self.unet.config.time_cond_proj_dim is not None: # TODO: Make guidance scale embedding work with batch_order
435
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
436
+ timestep_cond = self.get_guidance_scale_embedding(
437
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
438
+ ).to(device=device, dtype=latents.dtype)
439
+
440
+ # 7.3 Get batch order
441
+ batch_order = deepcopy(BATCH_ORDER)
442
+ if structure_image is not None: # If image is provided, not generating, so no CFG needed
443
+ batch_order.remove("structure_uncond")
444
+ if appearance_image is not None:
445
+ batch_order.remove("appearance_uncond")
446
+
447
+ structure_control_stop_i, appearance_control_stop_i = get_last_control_i(control_schedule, num_inference_steps)
448
+ if self_recurrence_schedule is None:
449
+ self_recurrence_schedule = [0] * num_inference_steps
450
+
451
+ self._num_timesteps = len(timesteps)
452
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
453
+ for i, t in enumerate(timesteps):
454
+ if self.interrupt:
455
+ continue
456
+
457
+ if i == structure_control_stop_i: # If not generating structure/appearance, drop after last control
458
+ if "structure_uncond" not in batch_order:
459
+ batch_order.remove("structure_cond")
460
+ if i == appearance_control_stop_i:
461
+ if "appearance_uncond" not in batch_order:
462
+ batch_order.remove("appearance_cond")
463
+
464
+ register_attr(self, t=t.item(), do_control=True, batch_order=batch_order)
465
+
466
+ # TODO: For now, assume we are doing classifier-free guidance, support no CF-guidance later
467
+ latent_model_input = self.scheduler.scale_model_input(latents, t)
468
+ structure_latent_model_input = self.scheduler.scale_model_input(structure_latents, t)
469
+ appearance_latent_model_input = self.scheduler.scale_model_input(appearance_latents, t)
470
+
471
+ all_latent_model_input = {
472
+ "structure_uncond": structure_latent_model_input[0:1],
473
+ "appearance_uncond": appearance_latent_model_input[0:1],
474
+ "uncond": latent_model_input[0:1],
475
+ "structure_cond": structure_latent_model_input[0:1],
476
+ "appearance_cond": appearance_latent_model_input[0:1],
477
+ "cond": latent_model_input[0:1],
478
+ }
479
+ all_prompt_embeds = {
480
+ "structure_uncond": structure_prompt_embeds[0:1],
481
+ "appearance_uncond": appearance_prompt_embeds[0:1],
482
+ "uncond": prompt_embeds[0:1],
483
+ "structure_cond": structure_prompt_embeds[1:2],
484
+ "appearance_cond": appearance_prompt_embeds[1:2],
485
+ "cond": prompt_embeds[1:2],
486
+ }
487
+ all_add_text_embeds = {
488
+ "structure_uncond": structure_add_text_embeds[0:1],
489
+ "appearance_uncond": appearance_add_text_embeds[0:1],
490
+ "uncond": add_text_embeds[0:1],
491
+ "structure_cond": structure_add_text_embeds[1:2],
492
+ "appearance_cond": appearance_add_text_embeds[1:2],
493
+ "cond": add_text_embeds[1:2],
494
+ }
495
+ all_time_ids = {
496
+ "structure_uncond": add_time_ids[0:1],
497
+ "appearance_uncond": add_time_ids[0:1],
498
+ "uncond": add_time_ids[0:1],
499
+ "structure_cond": add_time_ids[1:2],
500
+ "appearance_cond": add_time_ids[1:2],
501
+ "cond": add_time_ids[1:2],
502
+ }
503
+
504
+ concat_latent_model_input = batch_dict_to_tensor(all_latent_model_input, batch_order)
505
+ concat_prompt_embeds = batch_dict_to_tensor(all_prompt_embeds, batch_order)
506
+ concat_add_text_embeds = batch_dict_to_tensor(all_add_text_embeds, batch_order)
507
+ concat_add_time_ids = batch_dict_to_tensor(all_time_ids, batch_order)
508
+
509
+ # Predict the noise residual
510
+ added_cond_kwargs = {"text_embeds": concat_add_text_embeds, "time_ids": concat_add_time_ids}
511
+
512
+ concat_noise_pred = self.unet(
513
+ concat_latent_model_input,
514
+ t,
515
+ encoder_hidden_states = concat_prompt_embeds,
516
+ timestep_cond = timestep_cond,
517
+ cross_attention_kwargs = self.cross_attention_kwargs,
518
+ added_cond_kwargs = added_cond_kwargs,
519
+ ).sample
520
+ all_noise_pred = batch_tensor_to_dict(concat_noise_pred, batch_order)
521
+
522
+ # Classifier-free guidance, TODO: Support no CFG
523
+ noise_pred = all_noise_pred["uncond"] +\
524
+ self.guidance_scale * (all_noise_pred["cond"] - all_noise_pred["uncond"])
525
+
526
+ structure_noise_pred = all_noise_pred["structure_cond"]\
527
+ if "structure_cond" in batch_order else noise_pred
528
+ if "structure_uncond" in all_noise_pred:
529
+ structure_noise_pred = all_noise_pred["structure_uncond"] +\
530
+ self.structure_guidance_scale * (structure_noise_pred - all_noise_pred["structure_uncond"])
531
+
532
+ appearance_noise_pred = all_noise_pred["appearance_cond"]\
533
+ if "appearance_cond" in batch_order else noise_pred
534
+ if "appearance_uncond" in all_noise_pred:
535
+ appearance_noise_pred = all_noise_pred["appearance_uncond"] +\
536
+ self.appearance_guidance_scale * (appearance_noise_pred - all_noise_pred["appearance_uncond"])
537
+
538
+ if self.guidance_rescale > 0.0:
539
+ noise_pred = rescale_noise_cfg(
540
+ noise_pred, all_noise_pred["cond"], guidance_rescale=self.guidance_rescale
541
+ )
542
+ if "structure_uncond" in all_noise_pred:
543
+ structure_noise_pred = rescale_noise_cfg(
544
+ structure_noise_pred, all_noise_pred["structure_cond"],
545
+ guidance_rescale=self.guidance_rescale
546
+ )
547
+ if "appearance_uncond" in all_noise_pred:
548
+ appearance_noise_pred = rescale_noise_cfg(
549
+ appearance_noise_pred, all_noise_pred["appearance_cond"],
550
+ guidance_rescale=self.guidance_rescale
551
+ )
552
+
553
+ # Compute the previous noisy sample x_t -> x_t-1
554
+ concat_noise_pred = torch.cat(
555
+ [structure_noise_pred, appearance_noise_pred, noise_pred], dim=0,
556
+ )
557
+ concat_latents = torch.cat(
558
+ [structure_latents, appearance_latents, latents], dim=0,
559
+ )
560
+ structure_latents, appearance_latents, latents = self.scheduler.step(
561
+ concat_noise_pred, t, concat_latents, **extra_step_kwargs,
562
+ ).prev_sample.chunk(3)
563
+
564
+ if clean_structure_latents is not None:
565
+ structure_latents = noise_prev(self.scheduler, t, clean_structure_latents)
566
+ if clean_appearance_latents is not None:
567
+ appearance_latents = noise_prev(self.scheduler, t, clean_appearance_latents)
568
+
569
+ # Self-recurrence
570
+ for _ in range(self_recurrence_schedule[i]):
571
+ if hasattr(self.scheduler, "_step_index"): # For fancier schedulers
572
+ self.scheduler._step_index -= 1 # TODO: Does this actually work?
573
+
574
+ t_prev = 0 if i + 1 >= num_inference_steps else timesteps[i + 1]
575
+ latents = noise_t2t(self.scheduler, t_prev, t, latents)
576
+ latent_model_input = torch.cat([latents] * 2)
577
+
578
+ register_attr(self, t=t.item(), do_control=False, batch_order=["uncond", "cond"])
579
+
580
+ # Predict the noise residual
581
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
582
+ noise_pred_uncond, noise_pred_ = self.unet(
583
+ latent_model_input,
584
+ t,
585
+ encoder_hidden_states = prompt_embeds,
586
+ timestep_cond = timestep_cond,
587
+ cross_attention_kwargs = self.cross_attention_kwargs,
588
+ added_cond_kwargs = added_cond_kwargs,
589
+ ).sample.chunk(2)
590
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_ - noise_pred_uncond)
591
+
592
+ if self.guidance_rescale > 0.0:
593
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_, guidance_rescale=self.guidance_rescale)
594
+
595
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
596
+
597
+ # Callbacks
598
+ if callback_on_step_end is not None:
599
+ callback_kwargs = {}
600
+ for k in callback_on_step_end_tensor_inputs:
601
+ callback_kwargs[k] = locals()[k]
602
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
603
+
604
+ latents = callback_outputs.pop("latents", latents)
605
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
606
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
607
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
608
+ negative_pooled_prompt_embeds = callback_outputs.pop(
609
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
610
+ )
611
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
612
+ negative_add_time_ids = callback_outputs.pop("add_neg_time_ids", negative_add_time_ids)
613
+
614
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
615
+ progress_bar.update()
616
+ if callback is not None and i % callback_steps == 0:
617
+ step_idx = i // getattr(self.scheduler, "order", 1)
618
+ callback(step_idx, t, latents)
619
+
620
+ # "Reconstruction"
621
+ if clean_structure_latents is not None:
622
+ structure_latents = clean_structure_latents
623
+ if clean_appearance_latents is not None:
624
+ appearance_latents = clean_appearance_latents
625
+
626
+ # For passing important information onto the refiner
627
+ self.refiner_args = {"latents": latents.detach(), "prompt": prompt, "negative_prompt": negative_prompt}
628
+
629
+ if not output_type == "latent":
630
+ # Make sure the VAE is in float32 mode, as it overflows in float16
631
+ if self.vae.config.force_upcast:
632
+ self.vae.to(torch.float32) # self.upcast_vae() is buggy
633
+ latents = latents.to(torch.float32)
634
+ structure_latents = structure_latents.to(torch.float32)
635
+ appearance_latents = appearance_latents.to(torch.float32)
636
+
637
+ image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
638
+ image = self.image_processor.postprocess(image, output_type=output_type)
639
+ if decode_structure:
640
+ structure = self.vae.decode(structure_latents / self.vae.config.scaling_factor, return_dict=False)[0]
641
+ structure = self.image_processor.postprocess(structure, output_type=output_type)
642
+ else:
643
+ structure = structure_latents
644
+ if decode_appearance:
645
+ appearance = self.vae.decode(appearance_latents / self.vae.config.scaling_factor, return_dict=False)[0]
646
+ appearance = self.image_processor.postprocess(appearance, output_type=output_type)
647
+ else:
648
+ appearance = appearance_latents
649
+
650
+ # Cast back to fp16 if needed
651
+ if self.vae.config.force_upcast:
652
+ self.vae.to(dtype=torch.float16)
653
+
654
+ else:
655
+ return CtrlXStableDiffusionXLPipelineOutput(
656
+ images=latents, structures=structure_latents, appearances=appearance_latents
657
+ )
658
+
659
+ # Offload all models
660
+ self.maybe_free_model_hooks()
661
+
662
+ if not return_dict:
663
+ return (image, structure, appearance)
664
+
665
+ return CtrlXStableDiffusionXLPipelineOutput(images=image, structures=structure, appearances=appearance)
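Note: the loop above follows the standard SDXL denoising recipe after the self-recurrence re-noising (noise_t2t): one batched U-Net pass over the unconditional/conditional inputs, classifier-free guidance on the two noise predictions, an optional guidance rescale, and a scheduler step. A minimal standalone sketch of just the guidance arithmetic, with a hypothetical helper name and toy tensors (not part of the pipeline itself):

# Illustrative only: combines uncond/cond noise predictions with classifier-free
# guidance, then optionally rescales the result toward the conditional std.
import torch

def cfg_combine(noise_uncond, noise_cond, guidance_scale=5.0, guidance_rescale=0.0):
    noise_pred = noise_uncond + guidance_scale * (noise_cond - noise_uncond)
    if guidance_rescale > 0.0:
        std_cond = noise_cond.std(dim=list(range(1, noise_cond.ndim)), keepdim=True)
        std_cfg = noise_pred.std(dim=list(range(1, noise_pred.ndim)), keepdim=True)
        rescaled = noise_pred * (std_cond / std_cfg)
        noise_pred = guidance_rescale * rescaled + (1.0 - guidance_rescale) * noise_pred
    return noise_pred

uncond, cond = torch.randn(1, 4, 128, 128), torch.randn(1, 4, 128, 128)
noise_pred = cfg_combine(uncond, cond, guidance_scale=5.0, guidance_rescale=0.7)
print(noise_pred.shape)  # torch.Size([1, 4, 128, 128])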
ctrl_x/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .feature import *
2
+ from .media import *
3
+ from .utils import *
ctrl_x/utils/feature.py ADDED
@@ -0,0 +1,79 @@
1
+ import math
2
+
3
+ import torch.nn.functional as F
4
+
5
+ from .utils import *
6
+
7
+
8
+ def get_schedule(timesteps, schedule):
9
+ end = round(len(timesteps) * schedule)
10
+ timesteps = timesteps[:end]
11
+ return timesteps
12
+
13
+
14
+ def get_elem(l, i, default=0.0):
15
+ if i >= len(l):
16
+ return default
17
+ return l[i]
18
+
19
+
20
+ def pad_list(l_1, l_2, pad=0.0):
21
+ max_len = max(len(l_1), len(l_2))
22
+ l_1 = l_1 + [pad] * (max_len - len(l_1))
23
+ l_2 = l_2 + [pad] * (max_len - len(l_2))
24
+ return l_1, l_2
25
+
26
+
27
+ def normalize(x, dim):
28
+ x_mean = x.mean(dim=dim, keepdim=True)
29
+ x_std = x.std(dim=dim, keepdim=True)
30
+ x_normalized = (x - x_mean) / x_std
31
+ return x_normalized
32
+
33
+
34
+ # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
35
+ def appearance_mean_std(q_c_normed, k_s_normed, v_s): # c: content, s: style
36
+ q_c = q_c_normed # q_c and k_s must be projected from normalized features
37
+ k_s = k_s_normed
38
+ scale_factor = 1 / math.sqrt(q_c.shape[-1])
39
+
40
+ # Notation: D = (H W) is the number of tokens and C is the token (channel) dimension
41
+ # (this matches how self-attention tensor shapes are laid out in Stable Diffusion)
42
+ A = q_c @ k_s.mT # (B H D C/H) (B H C/H D)^T -> (B H D D)
43
+ A = F.softmax(A * scale_factor, dim=-1) # Softmax on last D in (B H D D)
44
+ mean = A @ v_s # (B H D D) (B H D C/H) -> (B H D C/H)
45
+ std = (A @ v_s.square() - mean.square()).relu().sqrt()
46
+
47
+ return mean, std
48
+
49
+
50
+ def feature_injection(features, batch_order):
51
+ assert features.shape[0] % len(batch_order) == 0
52
+ features_dict = batch_tensor_to_dict(features, batch_order)
53
+ features_dict["cond"] = features_dict["structure_cond"]
54
+ features = batch_dict_to_tensor(features_dict, batch_order)
55
+ return features
56
+
57
+
58
+ def appearance_transfer(features, q_normed, k_normed, batch_order, v=None, reshape_fn=None):
59
+ assert features.shape[0] % len(batch_order) == 0
60
+
61
+ features_dict = batch_tensor_to_dict(features, batch_order)
62
+ q_normed_dict = batch_tensor_to_dict(q_normed, batch_order)
63
+ k_normed_dict = batch_tensor_to_dict(k_normed, batch_order)
64
+ v_dict = features_dict
65
+ if v is not None:
66
+ v_dict = batch_tensor_to_dict(v, batch_order)
67
+
68
+ mean_cond, std_cond = appearance_mean_std(
69
+ q_normed_dict["cond"], k_normed_dict["appearance_cond"], v_dict["appearance_cond"],
70
+ )
71
+
72
+ if reshape_fn is not None:
73
+ mean_cond = reshape_fn(mean_cond)
74
+ std_cond = reshape_fn(std_cond)
75
+
76
+ features_dict["cond"] = std_cond * normalize(features_dict["cond"], dim=-2) + mean_cond
77
+
78
+ features = batch_dict_to_tensor(features_dict, batch_order)
79
+ return features
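Note: appearance_mean_std above is the core of the semantic-aware appearance transfer. It builds a softmax attention map between normalized content queries and style keys, then uses it to pull per-token mean/std statistics from the style values; appearance_transfer applies those statistics to re-normalize the content features. A small shape check on random tensors (a sketch, assuming the repo root is on PYTHONPATH):

import torch
from ctrl_x.utils.feature import appearance_mean_std

B, H, D, C = 1, 8, 64, 40  # batch, heads, tokens (H * W), channels per head
q_c = torch.randn(B, H, D, C)  # queries projected from normalized content features
k_s = torch.randn(B, H, D, C)  # keys projected from normalized style features
v_s = torch.randn(B, H, D, C)  # style values

mean, std = appearance_mean_std(q_c, k_s, v_s)
print(mean.shape, std.shape)  # both (B, H, D, C): per-content-token style statistics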
ctrl_x/utils/media.py ADDED
@@ -0,0 +1,21 @@
1
+ import numpy as np
2
+ import torch
3
+ import torchvision.transforms.functional as vF
4
+ import PIL
5
+
6
+
7
+ JPEG_QUALITY = 95
8
+
9
+
10
+ def preprocess(image, processor, **kwargs):
11
+ if isinstance(image, PIL.Image.Image):
12
+ pass
13
+ elif isinstance(image, np.ndarray):
14
+ image = PIL.Image.fromarray(image)
15
+ elif isinstance(image, torch.Tensor):
16
+ image = vF.to_pil_image(image)
17
+ else:
18
+ raise TypeError(f"Image must be of type PIL.Image, np.ndarray, or torch.Tensor, got {type(image)} instead.")
19
+
20
+ image = processor.preprocess(image, **kwargs)
21
+ return image
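Note: preprocess simply coerces the input to PIL before handing it to whatever image processor is supplied, for example a diffusers VaeImageProcessor like the one the SDXL pipeline holds. An illustrative call on a random array (a sketch with assumed default processor settings, not the project's exact invocation):

import numpy as np
from diffusers.image_processor import VaeImageProcessor
from ctrl_x.utils.media import preprocess  # assumes the repo root is on PYTHONPATH

processor = VaeImageProcessor()  # resizes and scales pixel values to [-1, 1] by default
image = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # exercises the np.ndarray branch
tensor = preprocess(image, processor, height=512, width=512)
print(tensor.shape)  # torch.Size([1, 3, 512, 512])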
ctrl_x/utils/sdxl.py ADDED
@@ -0,0 +1,274 @@
1
+ from types import MethodType
2
+ from typing import Optional
3
+
4
+ from diffusers.models.attention_processor import Attention
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from .feature import *
9
+ from .utils import *
10
+
11
+
12
+ def convolution_forward( # From <class 'diffusers.models.resnet.ResnetBlock2D'>, forward (diffusers==0.28.0)
13
+ self,
14
+ input_tensor: torch.Tensor,
15
+ temb: torch.Tensor,
16
+ *args,
17
+ **kwargs,
18
+ ) -> torch.Tensor:
19
+ do_structure_control = self.do_control and self.t in self.structure_schedule
20
+
21
+ hidden_states = input_tensor
22
+
23
+ hidden_states = self.norm1(hidden_states)
24
+ hidden_states = self.nonlinearity(hidden_states)
25
+
26
+ if self.upsample is not None:
27
+ # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
28
+ if hidden_states.shape[0] >= 64:
29
+ input_tensor = input_tensor.contiguous()
30
+ hidden_states = hidden_states.contiguous()
31
+ input_tensor = self.upsample(input_tensor)
32
+ hidden_states = self.upsample(hidden_states)
33
+ elif self.downsample is not None:
34
+ input_tensor = self.downsample(input_tensor)
35
+ hidden_states = self.downsample(hidden_states)
36
+
37
+ hidden_states = self.conv1(hidden_states)
38
+
39
+ if self.time_emb_proj is not None:
40
+ if not self.skip_time_act:
41
+ temb = self.nonlinearity(temb)
42
+ temb = self.time_emb_proj(temb)[:, :, None, None]
43
+
44
+ if self.time_embedding_norm == "default":
45
+ if temb is not None:
46
+ hidden_states = hidden_states + temb
47
+ hidden_states = self.norm2(hidden_states)
48
+ elif self.time_embedding_norm == "scale_shift":
49
+ if temb is None:
50
+ raise ValueError(
51
+ f" `temb` should not be None when `time_embedding_norm` is {self.time_embedding_norm}"
52
+ )
53
+ time_scale, time_shift = torch.chunk(temb, 2, dim=1)
54
+ hidden_states = self.norm2(hidden_states)
55
+ hidden_states = hidden_states * (1 + time_scale) + time_shift
56
+ else:
57
+ hidden_states = self.norm2(hidden_states)
58
+
59
+ hidden_states = self.nonlinearity(hidden_states)
60
+
61
+ hidden_states = self.dropout(hidden_states)
62
+ hidden_states = self.conv2(hidden_states)
63
+
64
+ # Feature injection and AdaIN (hidden_states)
65
+ if do_structure_control and "hidden_states" in self.structure_target:
66
+ hidden_states = feature_injection(hidden_states, batch_order=self.batch_order)
67
+
68
+ if self.conv_shortcut is not None:
69
+ input_tensor = self.conv_shortcut(input_tensor)
70
+
71
+ output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
72
+
73
+ # Feature injection and AdaIN (output_tensor)
74
+ if do_structure_control and "output_tensor" in self.structure_target:
75
+ output_tensor = feature_injection(output_tensor, batch_order=self.batch_order)
76
+
77
+ return output_tensor
78
+
79
+
80
+ class AttnProcessor2_0: # From <class 'diffusers.models.attention_processor.AttnProcessor2_0'> (diffusers==0.28.0)
81
+
82
+ def __init__(self):
83
+ if not hasattr(F, "scaled_dot_product_attention"):
84
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
85
+
86
+ def __call__(
87
+ self,
88
+ attn: Attention,
89
+ hidden_states: torch.FloatTensor,
90
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
91
+ attention_mask: Optional[torch.FloatTensor] = None,
92
+ temb: Optional[torch.FloatTensor] = None,
93
+ *args,
94
+ **kwargs,
95
+ ) -> torch.FloatTensor:
96
+ do_structure_control = attn.do_control and attn.t in attn.structure_schedule
97
+ do_appearance_control = attn.do_control and attn.t in attn.appearance_schedule
98
+
99
+ residual = hidden_states
100
+ if attn.spatial_norm is not None:
101
+ hidden_states = attn.spatial_norm(hidden_states, temb)
102
+
103
+ input_ndim = hidden_states.ndim
104
+
105
+ if input_ndim == 4:
106
+ batch_size, channel, height, width = hidden_states.shape
107
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
108
+
109
+ batch_size, sequence_length, _ = (
110
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
111
+ )
112
+
113
+ if attention_mask is not None:
114
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
115
+ # scaled_dot_product_attention expects attention_mask shape to be
116
+ # (batch, heads, source_length, target_length)
117
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
118
+
119
+ if attn.group_norm is not None:
120
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
121
+
122
+ no_encoder_hidden_states = encoder_hidden_states is None
123
+ if no_encoder_hidden_states:
124
+ encoder_hidden_states = hidden_states
125
+ elif attn.norm_cross:
126
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
127
+
128
+ if do_appearance_control: # Assume we only have this for self attention
129
+ hidden_states_normed = normalize(hidden_states, dim=-2) # B H D C
130
+ encoder_hidden_states_normed = normalize(encoder_hidden_states, dim=-2)
131
+
132
+ query_normed = attn.to_q(hidden_states_normed)
133
+ key_normed = attn.to_k(encoder_hidden_states_normed)
134
+
135
+ inner_dim = key_normed.shape[-1]
136
+ head_dim = inner_dim // attn.heads
137
+ query_normed = query_normed.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
138
+ key_normed = key_normed.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
139
+
140
+ # Match query and key injection with structure injection (if injection is happening this layer)
141
+ if do_structure_control:
142
+ if "query" in attn.structure_target:
143
+ query_normed = feature_injection(query_normed, batch_order=attn.batch_order)
144
+ if "key" in attn.structure_target:
145
+ key_normed = feature_injection(key_normed, batch_order=attn.batch_order)
146
+
147
+ # Appearance transfer (before)
148
+ if do_appearance_control and "before" in attn.appearance_target:
149
+ hidden_states = hidden_states.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
150
+ hidden_states = appearance_transfer(hidden_states, query_normed, key_normed, batch_order=attn.batch_order)
151
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
152
+
153
+ if no_encoder_hidden_states:
154
+ encoder_hidden_states = hidden_states
155
+ elif attn.norm_cross:
156
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
157
+
158
+ query = attn.to_q(hidden_states)
159
+
160
+ key = attn.to_k(encoder_hidden_states)
161
+ value = attn.to_v(encoder_hidden_states)
162
+
163
+ inner_dim = key.shape[-1]
164
+ head_dim = inner_dim // attn.heads
165
+
166
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
167
+
168
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
169
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
170
+
171
+ # Feature injection (query, key, and/or value)
172
+ if do_structure_control:
173
+ if "query" in attn.structure_target:
174
+ query = feature_injection(query, batch_order=attn.batch_order)
175
+ if "key" in attn.structure_target:
176
+ key = feature_injection(key, batch_order=attn.batch_order)
177
+ if "value" in attn.structure_target:
178
+ value = feature_injection(value, batch_order=attn.batch_order)
179
+
180
+ # Appearance transfer (value)
181
+ if do_appearance_control and "value" in attn.appearance_target:
182
+ value = appearance_transfer(value, query_normed, key_normed, batch_order=attn.batch_order)
183
+
184
+ # The output of sdp = (batch, num_heads, seq_len, head_dim)
185
+ # TODO: add support for attn.scale when we move to Torch 2.1
186
+ hidden_states = F.scaled_dot_product_attention(
187
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
188
+ )
189
+
190
+ # Appearance transfer (after)
191
+ if do_appearance_control and "after" in attn.appearance_target:
192
+ hidden_states = appearance_transfer(hidden_states, query_normed, key_normed, batch_order=attn.batch_order)
193
+
194
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
195
+ hidden_states = hidden_states.to(query.dtype)
196
+
197
+ # Linear projection
198
+ hidden_states = attn.to_out[0](hidden_states, *args)
199
+ # Dropout
200
+ hidden_states = attn.to_out[1](hidden_states)
201
+
202
+ if input_ndim == 4:
203
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
204
+
205
+ if attn.residual_connection:
206
+ hidden_states = hidden_states + residual
207
+
208
+ hidden_states = hidden_states / attn.rescale_output_factor
209
+
210
+ return hidden_states
211
+
212
+
213
+ def register_control(
214
+ model,
215
+ timesteps,
216
+ control_schedule, # structure_conv, structure_attn, appearance_attn
217
+ control_target = [["output_tensor"], ["query", "key"], ["before"]],
218
+ ):
219
+ # Assume timesteps in reverse order (T -> 0)
220
+ for block_type in ["encoder", "decoder", "middle"]:
221
+ blocks = {
222
+ "encoder": model.unet.down_blocks,
223
+ "decoder": model.unet.up_blocks,
224
+ "middle": [model.unet.mid_block],
225
+ }[block_type]
226
+
227
+ control_schedule_block = control_schedule[block_type]
228
+ if block_type == "middle":
229
+ control_schedule_block = [control_schedule_block]
230
+
231
+ for layer in range(len(control_schedule_block)):
232
+ # Convolution
233
+ num_blocks = len(blocks[layer].resnets) if hasattr(blocks[layer], "resnets") else 0
234
+ for block in range(num_blocks):
235
+ convolution = blocks[layer].resnets[block]
236
+ convolution.structure_target = control_target[0]
237
+ convolution.structure_schedule = get_schedule(
238
+ timesteps, get_elem(control_schedule_block[layer][0], block)
239
+ )
240
+ convolution.forward = MethodType(convolution_forward, convolution)
241
+
242
+ # Self-attention
243
+ num_blocks = len(blocks[layer].attentions) if hasattr(blocks[layer], "attentions") else 0
244
+ for block in range(num_blocks):
245
+ for transformer_block in blocks[layer].attentions[block].transformer_blocks:
246
+ attention = transformer_block.attn1
247
+ attention.structure_target = control_target[1]
248
+ attention.structure_schedule = get_schedule(
249
+ timesteps, get_elem(control_schedule_block[layer][1], block)
250
+ )
251
+ attention.appearance_target = control_target[2]
252
+ attention.appearance_schedule = get_schedule(
253
+ timesteps, get_elem(control_schedule_block[layer][2], block)
254
+ )
255
+ attention.processor = AttnProcessor2_0()
256
+
257
+
258
+ def register_attr(model, t, do_control, batch_order):
259
+ for layer_type in ["encoder", "decoder", "middle"]:
260
+ blocks = {"encoder": model.unet.down_blocks, "decoder": model.unet.up_blocks,
261
+ "middle": [model.unet.mid_block]}[layer_type]
262
+ for layer in blocks:
263
+ # Convolution
264
+ for module in layer.resnets:
265
+ module.t = t
266
+ module.do_control = do_control
267
+ module.batch_order = batch_order
268
+ # Self-attention
269
+ if hasattr(layer, "attentions"):
270
+ for block in layer.attentions:
271
+ for module in block.transformer_blocks:
272
+ module.attn1.t = t
273
+ module.attn1.do_control = do_control
274
+ module.attn1.batch_order = batch_order
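Note: register_control is called once to attach per-layer structure/appearance schedules and to swap in the patched ResnetBlock2D forward and AttnProcessor2_0 defined above, while register_attr is called every denoising step (as in the pipeline's register_attr(self, t=t.item(), ...) call) to broadcast the current timestep, control flag, and batch ordering to every hooked module. A rough usage sketch with a deliberately empty schedule and a hypothetical batch ordering, just to show the call shapes (the real schedules and ordering come from the project's configs):

import torch
from diffusers import StableDiffusionXLPipeline
from ctrl_x.utils.sdxl import register_control, register_attr  # repo root on PYTHONPATH

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.scheduler.set_timesteps(50)
timesteps = pipe.scheduler.timesteps  # reverse order (T -> 0), as register_control assumes

# Each layer entry is [conv_schedules, attn_structure_schedules, attn_appearance_schedules];
# empty lists mean "never control" (get_elem falls back to 0.0). Placeholder values only.
empty_layer = [[], [], []]
control_schedule = {"encoder": [empty_layer] * 3, "decoder": [empty_layer] * 3, "middle": empty_layer}

register_control(pipe, timesteps, control_schedule)

for t in timesteps:
    register_attr(pipe, t=t.item(), do_control=True,
                  batch_order=["structure_cond", "appearance_cond", "uncond", "cond"])  # hypothetical order
    ...  # batched U-Net forward + scheduler step (see pipeline_sdxl.py)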
ctrl_x/utils/utils.py ADDED
@@ -0,0 +1,88 @@
1
+ import torch
2
+
3
+
4
+ JPEG_QUALITY = 95
5
+
6
+
7
+ def exists(x):
8
+ return x is not None
9
+
10
+
11
+ def get(x, default):
12
+ if exists(x):
13
+ return x
14
+ return default
15
+
16
+
17
+ def get_self_recurrence_schedule(schedule, num_inference_steps):
18
+ self_recurrence_schedule = [0] * num_inference_steps
19
+ for schedule_current in reversed(schedule):
20
+ if schedule_current is None or len(schedule_current) == 0:
21
+ continue
22
+ [start, end, repeat] = schedule_current
23
+ start_i = round(num_inference_steps * start)
24
+ end_i = round(num_inference_steps * end)
25
+ for i in range(start_i, end_i):
26
+ self_recurrence_schedule[i] = repeat
27
+ return self_recurrence_schedule
28
+
29
+
30
+ def batch_dict_to_tensor(batch_dict, batch_order):
31
+ batch_tensor = []
32
+ for batch_type in batch_order:
33
+ batch_tensor.append(batch_dict[batch_type])
34
+ batch_tensor = torch.cat(batch_tensor, dim=0)
35
+ return batch_tensor
36
+
37
+
38
+ def batch_tensor_to_dict(batch_tensor, batch_order):
39
+ batch_tensor_chunk = batch_tensor.chunk(len(batch_order))
40
+ batch_dict = {}
41
+ for i, batch_type in enumerate(batch_order):
42
+ batch_dict[batch_type] = batch_tensor_chunk[i]
43
+ return batch_dict
44
+
45
+
46
+ def noise_prev(scheduler, timestep, x_0, noise=None):
47
+ if scheduler.num_inference_steps is None:
48
+ raise ValueError(
49
+ "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
50
+ )
51
+
52
+ if noise is None:
53
+ noise = torch.randn_like(x_0).to(x_0)
54
+
55
+ # Timestep index lookup, adapted from the DDIMScheduler step function
56
+ timestep_i = (scheduler.timesteps == timestep).nonzero(as_tuple=True)[0][0].item()
57
+ if timestep_i + 1 >= scheduler.timesteps.shape[0]: # We are at t = 0 (ish)
58
+ return x_0
59
+ prev_timestep = scheduler.timesteps[timestep_i + 1:timestep_i + 2] # Make sure t is not 0-dim
60
+
61
+ x_t_prev = scheduler.add_noise(x_0, noise, prev_timestep)
62
+ return x_t_prev
63
+
64
+
65
+ def noise_t2t(scheduler, timestep, timestep_target, x_t, noise=None):
66
+ assert timestep_target >= timestep
67
+ if noise is None:
68
+ noise = torch.randn_like(x_t).to(x_t)
69
+
70
+ alphas_cumprod = scheduler.alphas_cumprod.to(device=x_t.device, dtype=x_t.dtype)
71
+
72
+ timestep = timestep.to(torch.long)
73
+ timestep_target = timestep_target.to(torch.long)
74
+
75
+ alpha_prod_t = alphas_cumprod[timestep]
76
+ alpha_prod_tt = alphas_cumprod[timestep_target]
77
+ alpha_prod = alpha_prod_tt / alpha_prod_t
78
+
79
+ sqrt_alpha_prod = (alpha_prod ** 0.5).flatten()
80
+ while len(sqrt_alpha_prod.shape) < len(x_t.shape):
81
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
82
+
83
+ sqrt_one_minus_alpha_prod = ((1 - alpha_prod) ** 0.5).flatten()
84
+ while len(sqrt_one_minus_alpha_prod.shape) < len(x_t.shape):
85
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
86
+
87
+ x_tt = sqrt_alpha_prod * x_t + sqrt_one_minus_alpha_prod * noise
88
+ return x_tt
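Note: noise_t2t pushes a latent from timestep t to a noisier target timestep using the ratio of cumulative alphas, x_tt = sqrt(a_tt / a_t) * x_t + sqrt(1 - a_tt / a_t) * noise; this is what the self-recurrence step in the pipeline (noise_t2t(self.scheduler, t_prev, t, latents)) relies on. A quick sketch with an SDXL-style DDIM scheduler (beta values assumed here, not taken from the configs):

import torch
from diffusers import DDIMScheduler
from ctrl_x.utils.utils import noise_t2t  # assumes the repo root is on PYTHONPATH

scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085,
                          beta_end=0.012, beta_schedule="scaled_linear")

x_t = torch.randn(1, 4, 128, 128)                    # latent at timestep t
t, t_target = torch.tensor(481), torch.tensor(541)   # target must be noisier (>= t)
x_tt = noise_t2t(scheduler, t, t_target, x_t)
print(x_tt.shape)  # same shape as x_t, now consistent with timestep t_target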
docs/assets/bootstrap.min.css ADDED
The diff for this file is too large to render. See raw diff
 
docs/assets/cross_image_attention.jpg ADDED

Git LFS Details

  • SHA256: 74471768c9fff458ad3091524e97995ba1f7c2768b175026c3238a0f92f11ebe
  • Pointer size: 132 Bytes
  • Size of remote file: 2.3 MB
docs/assets/ctrl-x.jpg ADDED

Git LFS Details

  • SHA256: b5eee53a38a4a4c013487588a6ea771b85a8f3ef9cb6047da8550df731aba5a2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.85 MB
docs/assets/font.css ADDED
@@ -0,0 +1,37 @@
1
+ /* Homepage Font */
2
+
3
+ /* latin-ext */
4
+ @font-face {
5
+ font-family: 'Lato';
6
+ font-style: normal;
7
+ font-weight: 400;
8
+ src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjxAwXjeu.woff2) format('woff2');
9
+ unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
10
+ }
11
+
12
+ /* latin */
13
+ @font-face {
14
+ font-family: 'Lato';
15
+ font-style: normal;
16
+ font-weight: 400;
17
+ src: local('Lato Regular'), local('Lato-Regular'), url(https://fonts.gstatic.com/s/lato/v16/S6uyw4BMUTPHjx4wXg.woff2) format('woff2');
18
+ unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
19
+ }
20
+
21
+ /* latin-ext */
22
+ @font-face {
23
+ font-family: 'Lato';
24
+ font-style: normal;
25
+ font-weight: 700;
26
+ src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwaPGR_p.woff2) format('woff2');
27
+ unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
28
+ }
29
+
30
+ /* latin */
31
+ @font-face {
32
+ font-family: 'Lato';
33
+ font-style: normal;
34
+ font-weight: 700;
35
+ src: local('Lato Bold'), local('Lato-Bold'), url(https://fonts.gstatic.com/s/lato/v16/S6u9w4BMUTPHh6UVSwiPGQ.woff2) format('woff2');
36
+ unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
37
+ }
docs/assets/freecontrol.jpg ADDED

Git LFS Details

  • SHA256: dd3ecd3e30ab1bb2b2a4975cdc28cbc158147eb1a8281e11c24d3d1555d52162
  • Pointer size: 132 Bytes
  • Size of remote file: 1.19 MB
docs/assets/genforce.png ADDED
docs/assets/pipeline.jpg ADDED

Git LFS Details

  • SHA256: af6388fc737245419b8ac5a827802aba023433a5b13a9d4c4b88337938ac1a4c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.12 MB
docs/assets/results_animatediff.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e29629924da2f368048016b2bb4ee973d0d38dc6f868098b0d9fbd6ac2e8ea
3
+ size 20573323
docs/assets/results_multi_subject.jpg ADDED

Git LFS Details

  • SHA256: 4ef6fdeb2edb368677da193271af001db94509566fec6c9fce84d95c0ee3e893
  • Pointer size: 132 Bytes
  • Size of remote file: 2.82 MB
docs/assets/results_struct+app.jpg ADDED

Git LFS Details

  • SHA256: 0a92eb6caf1365b7877968b308638d33ca3a4fe440a62a244c9dee060a35f59f
  • Pointer size: 132 Bytes
  • Size of remote file: 3.44 MB
docs/assets/results_struct+app_2.jpg ADDED

Git LFS Details

  • SHA256: f8e2baf23f336abb76aeefa1d960378c8e01acc47194e59026914047004d1c1d
  • Pointer size: 132 Bytes
  • Size of remote file: 2.52 MB
docs/assets/results_struct+prompt.jpg ADDED

Git LFS Details

  • SHA256: 9e2de2a7e09ea9da9e962b4bcffaea69179bbb6470977353a172e55b06df3d20
  • Pointer size: 132 Bytes
  • Size of remote file: 3.53 MB
docs/assets/style.css ADDED
@@ -0,0 +1,139 @@
1
+ /* Body */
2
+ body {
3
+ background: #e3e5e8;
4
+ color: #ffffff;
5
+ font-family: 'Lato', Verdana, Helvetica, sans-serif;
6
+ font-weight: 300;
7
+ font-size: 14pt;
8
+ }
9
+
10
+ /* Hyperlinks */
11
+ a {text-decoration: none;}
12
+ a:link {color: #1772d0;}
13
+ a:visited {color: #1772d0;}
14
+ a:active {color: red;}
15
+ a:hover {color: #f09228;}
16
+
17
+ /* Pre-formatted Text */
18
+ pre {
19
+ margin: 5pt 0;
20
+ border: 0;
21
+ font-size: 12pt;
22
+ background: #fcfcfc;
23
+ }
24
+
25
+ /* Project Page Style */
26
+ /* Section */
27
+ .section {
28
+ width: 768pt;
29
+ min-height: 100pt;
30
+ margin: 15pt auto;
31
+ padding: 20pt 30pt;
32
+ border: 1pt hidden #000;
33
+ text-align: justify;
34
+ color: #000000;
35
+ background: #ffffff;
36
+ }
37
+
38
+ /* Header (Title and Logo) */
39
+ .section .header {
40
+ min-height: 80pt;
41
+ margin-top: 30pt;
42
+ }
43
+ .section .header .logo {
44
+ width: 80pt;
45
+ margin-left: 10pt;
46
+ float: left;
47
+ }
48
+ .section .header .logo img {
49
+ width: 80pt;
50
+ object-fit: cover;
51
+ }
52
+ .section .header .title {
53
+ margin: 0 120pt;
54
+ text-align: center;
55
+ font-size: 22pt;
56
+ }
57
+
58
+ /* Author */
59
+ .section .author {
60
+ margin: 5pt 0;
61
+ text-align: center;
62
+ font-size: 16pt;
63
+ }
64
+
65
+ /* Institution */
66
+ .section .institution {
67
+ margin: 5pt 0;
68
+ text-align: center;
69
+ font-size: 16pt;
70
+ }
71
+
72
+ /* Note */
73
+ .section .note {
74
+ margin: 5pt 0;
75
+ text-align: center;
76
+ font-size: 12pt;
77
+ }
78
+
79
+ /* Hyperlink (such as Paper and Code) */
80
+ .section .link {
81
+ margin: 5pt 0;
82
+ text-align: center;
83
+ font-size: 16pt;
84
+ }
85
+
86
+ /* Teaser */
87
+ .section .teaser {
88
+ margin: 20pt 0;
89
+ text-align: center;
90
+ }
91
+
92
+ /* Section Title */
93
+ .section .title {
94
+ text-align: center;
95
+ font-size: 22pt;
96
+ margin: 5pt 0 15pt 0; /* top right bottom left */
97
+ }
98
+
99
+ /* Section Body */
100
+ .section .body {
101
+ margin-bottom: 15pt;
102
+ text-align: justify;
103
+ font-size: 14pt;
104
+ }
105
+
106
+ /* BibTeX */
107
+ .section .bibtex {
108
+ margin: 5pt 0;
109
+ text-align: left;
110
+ font-size: 22pt;
111
+ }
112
+
113
+ /* Related Work */
114
+ .section .ref {
115
+ margin: 20pt 0 10pt 0; /* top right bottom left */
116
+ text-align: left;
117
+ font-size: 18pt;
118
+ font-weight: bold;
119
+ }
120
+
121
+ /* Citation */
122
+ .section .citation {
123
+ min-height: 60pt;
124
+ margin: 10pt 0;
125
+ }
126
+ .section .citation .image {
127
+ width: 120pt;
128
+ float: left;
129
+ }
130
+ .section .citation .image img {
131
+ max-height: 60pt;
132
+ width: 120pt;
133
+ object-fit: cover;
134
+ }
135
+ .section .citation .comment{
136
+ margin-left: 130pt;
137
+ text-align: left;
138
+ font-size: 14pt;
139
+ }
docs/assets/teaser_github.jpg ADDED

Git LFS Details

  • SHA256: 403e32b1fad7e2a24e47da71f345b9028f08f09419b309ad5c739db7a45564d3
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
docs/assets/teaser_small.jpg ADDED

Git LFS Details

  • SHA256: ceb5deec9fff40573b3b5dea7314854cd6d54e575af413f8b97a3feeaa4a1606
  • Pointer size: 132 Bytes
  • Size of remote file: 2.22 MB
docs/index.html ADDED
@@ -0,0 +1,186 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+
4
+
5
+ <!-- === Header Starts === -->
6
+ <head>
7
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
8
+
9
+ <title>Ctrl-X</title>
10
+
11
+ <link href="./assets/bootstrap.min.css" rel="stylesheet">
12
+ <link href="./assets/font.css" rel="stylesheet" type="text/css">
13
+ <link href="./assets/style.css" rel="stylesheet" type="text/css">
14
+ </head>
15
+ <!-- === Header Ends === -->
16
+
17
+
18
+ <body>
19
+
20
+
21
+ <!-- === Home Section Starts === -->
22
+ <div class="section">
23
+ <!-- === Title Starts === -->
24
+ <div class="header">
25
+ <div class="logo">
26
+ <a href="https://genforce.github.io/" target="_blank"><img src="./assets/genforce.png"></a>
27
+ </div>
28
+ <div class="title" style="padding-top: 25pt;"> <!-- Set padding-top to 10pt if the title spans two lines. -->
29
+ Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance
30
+ </div>
31
+ </div>
32
+ <!-- === Title Ends === -->
33
+ <div class="author">
34
+ <a href="https://kuanhenglin.github.io" target="_blank">Kuan Heng Lin</a><sup>1</sup>*&nbsp;&nbsp;&nbsp;
35
+ <a href="https://sichengmo.github.io/" target="_blank">Sicheng Mo</a><sup>1</sup>*&nbsp;&nbsp;&nbsp;
36
+ <a href="https://bklingher.github.io" target="_blank">Ben Klingher</a><sup>1</sup>&nbsp;&nbsp;&nbsp;
37
+ <a href="https://pages.cs.wisc.edu/~fmu/" target="_blank">Fangzhou Mu</a><sup>2</sup>&nbsp;&nbsp;&nbsp;
38
+ <a href="https://boleizhou.github.io/" target="_blank">Bolei Zhou</a><sup>1</sup>
39
+ </div>
40
+ <div class="institution">
41
+ <sup>1</sup>UCLA&nbsp;&nbsp;&nbsp;
42
+ <sup>2</sup>NVIDIA
43
+ </div>
44
+ <div class="note">
45
+ *Equal contribution
46
+ </div>
47
+ <div class="title" style="font-size: 18pt;margin: 15pt 0 15pt 0">
48
+ NeurIPS 2024
49
+ </div>
50
+ <div class="link">
51
+ [<a href="https://arxiv.org/abs/2406.07540" target="_blank">Paper</a>]&nbsp;&nbsp;&nbsp;
52
+ [<a href="https://github.com/genforce/ctrl-x" target="_blank">Code</a>]
53
+ </div>
54
+ <div class="teaser">
55
+ <img src="assets/ctrl-x.jpg" width="85%">
56
+ </div>
57
+ </div>
58
+ <!-- === Home Section Ends === -->
59
+
60
+
61
+ <!-- === Overview Section Starts === -->
62
+ <div class="section">
63
+ <div class="title">Overview</div>
64
+ <div class="body">
65
+ We present <b>Ctrl-X</b>, a simple <i>training-free</i> and <i>guidance-free</i> framework for text-to-image (T2I) generation with structure and appearance control. Given user-provided structure and appearance images, Ctrl-X applies feedforward structure control to align the output with the structure image and semantic-aware appearance transfer to carry the appearance of the appearance image onto the output. Ctrl-X supports novel structure control with arbitrary condition images of any modality, is significantly faster than prior training-free appearance transfer methods, and plugs directly into any T2I and text-to-video (T2V) diffusion model.
66
+ <table width="100%" style="margin: 20pt 0; text-align: center;">
67
+ <tr>
68
+ <td><img src="assets/pipeline.jpg" width="85%"></td>
69
+ </tr>
70
+ </table>
71
+
72
+ <b>How does it work?</b>&nbsp;&nbsp;&nbsp;Given clean structure and appearance latents, we first obtain noised structure and appearance latents via the diffusion forward process, and then extract their U-Net features from a pretrained T2I diffusion model. When denoising the output latent, we inject convolution and self-attention features from the structure latent and leverage self-attention correspondence to transfer spatially-aware appearance statistics from the appearance latent, achieving structure and appearance control. We name our method "Ctrl-X" because we reformulate the controllable generation problem by 'cutting' (and 'pasting') structure preservation and semantic-aware stylization together.
73
+ </div>
74
+ </div>
75
+ <!-- === Overview Section Ends === -->
76
+
77
+
78
+ <!-- === Result Section Starts === -->
79
+ <div class="section">
80
+ <div class="title">Results: Structure and appearance control</div>
81
+ <div class="body">
82
+ Results of training-free and guidance-free T2I diffusion with structure and appearance control, where Ctrl-X supports a wide variety of structure images, including natural images, ControlNet-supported conditions (e.g., canny maps, normal maps), and in-the-wild conditions (e.g., wireframes, 3D meshes). The base model here is <a href="https://arxiv.org/abs/2307.01952" target="_blank">Stable Diffusion XL v1.0</a>.
83
+
84
+ <!-- Adjust the number of rows and columns (EVERY project differs). -->
85
+ <table width="100%" style="margin: 20pt 0; text-align: center;">
86
+ <tr>
87
+ <td><img src="assets/results_struct+app.jpg" width="100%"></td>
88
+ </tr>
89
+ </table>
90
+ <table width="100%" style="margin: 20pt 0; text-align: center;">
91
+ <tr>
92
+ <td><img src="assets/results_struct+app_2.jpg" width="85%"></td>
93
+ </tr>
94
+ </table>
95
+ </div>
96
+ </div>
97
+
98
+ <div class="section">
99
+ <div class="title">Results: Multi-subject structure and appearance control</div>
100
+ <div class="body">
101
+ Ctrl-X is capable of multi-subject generation with semantic correspondence between appearance and structure images across both subjects and backgrounds. In comparison, <a href="https://arxiv.org/abs/2302.05543" target="_blank">ControlNet</a> + <a href="https://arxiv.org/abs/2308.06721" target="_blank">IP-Adapter</a> often fails at transferring all subject and background appearances.
102
+
103
+ <!-- Adjust the number of rows and columns (EVERY project differs). -->
104
+ <table width="100%" style="margin: 20pt 0; text-align: center;">
105
+ <tr>
106
+ <td><img src="assets/results_multi_subject.jpg" width="90%"></td>
107
+ </tr>
108
+ </table>
109
+ </div>
110
+ </div>
111
+
112
+ <div class="section">
113
+ <div class="title">Results: Prompt-driven conditional generation</div>
114
+ <div class="body">
115
+ Ctrl-X also supports prompt-driven conditional generation, where it generates an output image complying with the given text prompt while aligning with the structure of the structure image. Ctrl-X continues to support any structure image/condition type here as well. The base model here is <a href="https://arxiv.org/abs/2307.01952" target="_blank">Stable Diffusion XL v1.0</a>.
116
+
117
+ <!-- Adjust the number of rows and columns (EVERY project differs). -->
118
+ <table width="100%" style="margin: 20pt 0; text-align: center;">
119
+ <tr>
120
+ <td><img src="assets/results_struct+prompt.jpg" width="100%"></td>
121
+ </tr>
122
+ </table>
123
+ </div>
124
+ </div>
125
+
126
+ <div class="section">
127
+ <div class="title">Results: Extension to video generation</div>
128
+ <div class="body">
129
+ We can directly apply Ctrl-X to text-to-video (T2V) models. We show results of <a href="https://animatediff.github.io/" target="_blank">AnimateDiff v1.5.3</a> (with base model <a href="https://huggingface.co/SG161222/Realistic_Vision_V5.1_noVAE" target="_blank">Realistic Vision v5.1</a>) here.
130
+
131
+ <!-- Demo video here. Adjust the frame size based on the demo (EVERY project differs). -->
132
+ <div style="position: relative; padding-top: 50%; margin: 20pt 0; text-align: center;">
133
+ <iframe src="assets/results_animatediff.mp4" frameborder=0
134
+ style="position: absolute; top: 2.5%; left: 0%; width: 100%; height: 100%;"
135
+ allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture"
136
+ allowfullscreen></iframe>
137
+ </div>
138
+ </div>
139
+ </div>
140
+
141
+ <!-- === Result Section Ends === -->
142
+
143
+
144
+ <!-- === Reference Section Starts === -->
145
+ <div class="section">
146
+ <div class="bibtex">BibTeX</div>
147
+ <pre>
148
+ @inproceedings{lin2024ctrlx,
149
+ author = {Lin, {Kuan Heng} and Mo, Sicheng and Klingher, Ben and Mu, Fangzhou and Zhou, Bolei},
150
+ booktitle = {Advances in Neural Information Processing Systems},
151
+ title = {Ctrl-X: Controlling Structure and Appearance for Text-To-Image Generation Without Guidance},
152
+ year = {2024}
153
+ }
154
+ </pre>
155
+
156
+ <!-- BZ: we should give other related work enough credits, -->
157
+ <!-- so please include some most relevant work and leave some comment to summarize work and the difference. -->
158
+ <div class="ref">Related Work</div>
159
+ <div class="citation">
160
+ <div class="image"><img src="assets/freecontrol.jpg"></div>
161
+ <div class="comment">
162
+ <a href="https://genforce.github.io/freecontrol/" target="_blank">
163
+ Sicheng Mo, Fangzhou Mu, Kuan Heng Lin, Yanli Liu, Bochen Guan, Yin Li, Bolei Zhou.
164
+ FreeControl: Training-Free Spatial Control of Any Text-to-Image Diffusion Model with Any Condition.
165
+ CVPR 2024.</a><br>
166
+ <b>Comment:</b>
167
+ Training-free conditional generation by guidance in diffusion U-Net subspaces for structure control and appearance regularization.
168
+ </div>
169
+ </div>
170
+ <div class="citation">
171
+ <div class="image"><img src="assets/cross_image_attention.jpg"></div>
172
+ <div class="comment">
173
+ <a href="https://garibida.github.io/cross-image-attention/" target="_blank">
174
+ Yuval Alaluf, Daniel Garibi, Or Patashnik, Hadar Averbuch-Elor, Daniel Cohen-Or.
175
+ Cross-Image Attention for Zero-Shot Appearance Transfer.
176
+ SIGGRAPH 2024.</a><br>
177
+ <b>Comment:</b>
178
+ Guidance-free appearance transfer to natural images with self-attention key + value swaps via cross-image correspondence.
179
+ </div>
180
+ </div>
181
+ </div>
182
+ <!-- === Reference Section Ends === -->
183
+
184
+
185
+ </body>
186
+ </html>
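Note: the "How does it work?" paragraph on the project page above corresponds directly to the code added earlier in this commit (pipeline_sdxl.py and ctrl_x/utils/). A purely conceptual sketch of one denoising step, with a stub U-Net and illustrative names/shapes rather than the actual pipeline API:

import torch
from diffusers import DDIMScheduler

# Conceptual sketch only: the clean structure/appearance latents are forward-diffused to
# the current timestep, run through the (patched) U-Net together with the output latent,
# and only the output latent takes the scheduler step.
scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=0.00085,
                          beta_end=0.012, beta_schedule="scaled_linear")
scheduler.set_timesteps(50)
t = scheduler.timesteps[10]

latents = torch.randn(1, 4, 128, 128)        # output latent being denoised
structure_x0 = torch.randn(1, 4, 128, 128)   # clean structure latent
appearance_x0 = torch.randn(1, 4, 128, 128)  # clean appearance latent

structure_t = scheduler.add_noise(structure_x0, torch.randn_like(structure_x0), t.view(1))
appearance_t = scheduler.add_noise(appearance_x0, torch.randn_like(appearance_x0), t.view(1))

def unet_stub(x, t):  # stand-in for the feature-injecting, appearance-transferring U-Net
    return torch.randn_like(x)

noise_pred = unet_stub(torch.cat([structure_t, appearance_t, latents]), t)[-1:]
latents = scheduler.step(noise_pred, t, latents).prev_sample
print(latents.shape)  # torch.Size([1, 4, 128, 128])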
environment.yaml ADDED
@@ -0,0 +1,125 @@
1
+ name: ctrlx
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.3.11=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_1
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.13=h7f8727e_2
17
+ - pip=24.0=py310h06a4308_0
18
+ - python=3.10.14=h955ad1f_1
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=69.5.1=py310h06a4308_0
21
+ - sqlite=3.45.3=h5eee18b_0
22
+ - tk=8.6.14=h39e8969_0
23
+ - wheel=0.43.0=py310h06a4308_0
24
+ - xz=5.4.6=h5eee18b_1
25
+ - zlib=1.2.13=h5eee18b_1
26
+ - pip:
27
+ - aiofiles==23.2.1
28
+ - altair==5.3.0
29
+ - annotated-types==0.7.0
30
+ - anyio==4.4.0
31
+ - attrs==23.2.0
32
+ - certifi==2024.2.2
33
+ - charset-normalizer==3.3.2
34
+ - click==8.1.7
35
+ - contourpy==1.2.1
36
+ - cycler==0.12.1
37
+ - diffusers==0.28.0
38
+ - dnspython==2.6.1
39
+ - einops==0.8.0
40
+ - email-validator==2.1.1
41
+ - exceptiongroup==1.2.1
42
+ - fastapi==0.111.0
43
+ - fastapi-cli==0.0.4
44
+ - ffmpy==0.3.2
45
+ - filelock==3.14.0
46
+ - fonttools==4.52.4
47
+ - fsspec==2024.5.0
48
+ - gradio==4.31.5
49
+ - gradio-client==0.16.4
50
+ - h11==0.14.0
51
+ - httpcore==1.0.5
52
+ - httptools==0.6.1
53
+ - httpx==0.27.0
54
+ - huggingface-hub==0.23.2
55
+ - idna==3.7
56
+ - importlib-metadata==7.1.0
57
+ - importlib-resources==6.4.0
58
+ - jinja2==3.1.4
59
+ - jsonschema==4.22.0
60
+ - jsonschema-specifications==2023.12.1
61
+ - kiwisolver==1.4.5
62
+ - markdown-it-py==3.0.0
63
+ - markupsafe==2.1.5
64
+ - matplotlib==3.9.0
65
+ - mdurl==0.1.2
66
+ - mpmath==1.3.0
67
+ - networkx==3.3
68
+ - numpy==1.26.4
69
+ - nvidia-cublas-cu12==12.1.3.1
70
+ - nvidia-cuda-cupti-cu12==12.1.105
71
+ - nvidia-cuda-nvrtc-cu12==12.1.105
72
+ - nvidia-cuda-runtime-cu12==12.1.105
73
+ - nvidia-cudnn-cu12==8.9.2.26
74
+ - nvidia-cufft-cu12==11.0.2.54
75
+ - nvidia-curand-cu12==10.3.2.106
76
+ - nvidia-cusolver-cu12==11.4.5.107
77
+ - nvidia-cusparse-cu12==12.1.0.106
78
+ - nvidia-nccl-cu12==2.20.5
79
+ - nvidia-nvjitlink-cu12==12.5.40
80
+ - nvidia-nvtx-cu12==12.1.105
81
+ - orjson==3.10.3
82
+ - packaging==24.0
83
+ - pandas==2.2.2
84
+ - pillow==10.3.0
85
+ - pydantic==2.7.2
86
+ - pydantic-core==2.18.3
87
+ - pydub==0.25.1
88
+ - pygments==2.18.0
89
+ - pyparsing==3.1.2
90
+ - python-dateutil==2.9.0.post0
91
+ - python-dotenv==1.0.1
92
+ - python-multipart==0.0.9
93
+ - pytz==2024.1
94
+ - pyyaml==6.0.1
95
+ - referencing==0.35.1
96
+ - regex==2024.5.15
97
+ - requests==2.32.2
98
+ - rich==13.7.1
99
+ - rpds-py==0.18.1
100
+ - ruff==0.4.6
101
+ - safetensors==0.4.3
102
+ - semantic-version==2.10.0
103
+ - shellingham==1.5.4
104
+ - six==1.16.0
105
+ - sniffio==1.3.1
106
+ - starlette==0.37.2
107
+ - sympy==1.12
108
+ - tokenizers==0.19.1
109
+ - tomlkit==0.12.0
110
+ - toolz==0.12.1
111
+ - torch==2.3.0
112
+ - torchvision==0.18.0
113
+ - tqdm==4.66.4
114
+ - transformers==4.41.1
115
+ - triton==2.3.0
116
+ - typer==0.12.3
117
+ - typing-extensions==4.12.0
118
+ - tzdata==2024.1
119
+ - ujson==5.10.0
120
+ - urllib3==2.2.1
121
+ - uvicorn==0.30.0
122
+ - uvloop==0.19.0
123
+ - watchfiles==0.22.0
124
+ - websockets==11.0.3
125
+ - zipp==3.19.0