first
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +2 -0
- ComfyUI/.gitignore +19 -0
- ComfyUI/CODEOWNERS +1 -0
- ComfyUI/LICENSE +674 -0
- ComfyUI/README.md +224 -0
- ComfyUI/app/app_settings.py +54 -0
- ComfyUI/app/user_manager.py +140 -0
- ComfyUI/comfy/checkpoint_pickle.py +13 -0
- ComfyUI/comfy/cldm/cldm.py +312 -0
- ComfyUI/comfy/cli_args.py +136 -0
- ComfyUI/comfy/clip_config_bigg.json +23 -0
- ComfyUI/comfy/clip_model.py +194 -0
- ComfyUI/comfy/clip_vision.py +117 -0
- ComfyUI/comfy/clip_vision_config_g.json +18 -0
- ComfyUI/comfy/clip_vision_config_h.json +18 -0
- ComfyUI/comfy/clip_vision_config_vitl.json +18 -0
- ComfyUI/comfy/conds.py +78 -0
- ComfyUI/comfy/controlnet.py +550 -0
- ComfyUI/comfy/diffusers_convert.py +266 -0
- ComfyUI/comfy/diffusers_load.py +36 -0
- ComfyUI/comfy/extra_samplers/uni_pc.py +875 -0
- ComfyUI/comfy/gligen.py +343 -0
- ComfyUI/comfy/k_diffusion/sampling.py +810 -0
- ComfyUI/comfy/k_diffusion/utils.py +313 -0
- ComfyUI/comfy/latent_formats.py +104 -0
- ComfyUI/comfy/ldm/cascade/common.py +161 -0
- ComfyUI/comfy/ldm/cascade/controlnet.py +93 -0
- ComfyUI/comfy/ldm/cascade/stage_a.py +258 -0
- ComfyUI/comfy/ldm/cascade/stage_b.py +257 -0
- ComfyUI/comfy/ldm/cascade/stage_c.py +274 -0
- ComfyUI/comfy/ldm/cascade/stage_c_coder.py +95 -0
- ComfyUI/comfy/ldm/models/autoencoder.py +228 -0
- ComfyUI/comfy/ldm/modules/attention.py +801 -0
- ComfyUI/comfy/ldm/modules/diffusionmodules/__init__.py +0 -0
- ComfyUI/comfy/ldm/modules/diffusionmodules/model.py +651 -0
- ComfyUI/comfy/ldm/modules/diffusionmodules/openaimodel.py +890 -0
- ComfyUI/comfy/ldm/modules/diffusionmodules/upscaling.py +85 -0
- ComfyUI/comfy/ldm/modules/diffusionmodules/util.py +306 -0
- ComfyUI/comfy/ldm/modules/distributions/__init__.py +0 -0
- ComfyUI/comfy/ldm/modules/distributions/distributions.py +92 -0
- ComfyUI/comfy/ldm/modules/ema.py +80 -0
- ComfyUI/comfy/ldm/modules/encoders/__init__.py +0 -0
- ComfyUI/comfy/ldm/modules/encoders/noise_aug_modules.py +35 -0
- ComfyUI/comfy/ldm/modules/sub_quadratic_attention.py +274 -0
- ComfyUI/comfy/ldm/modules/temporal_ae.py +245 -0
- ComfyUI/comfy/ldm/util.py +197 -0
- ComfyUI/comfy/lora.py +235 -0
- ComfyUI/comfy/model_base.py +492 -0
- ComfyUI/comfy/model_detection.py +364 -0
- ComfyUI/comfy/model_management.py +832 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.log
|
ComfyUI/.gitignore
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__/
|
2 |
+
*.py[cod]
|
3 |
+
/output/
|
4 |
+
/input/
|
5 |
+
!/input/example.png
|
6 |
+
/models/
|
7 |
+
/temp/
|
8 |
+
/custom_nodes/
|
9 |
+
!custom_nodes/example_node.py.example
|
10 |
+
extra_model_paths.yaml
|
11 |
+
/.vs
|
12 |
+
.idea/
|
13 |
+
venv/
|
14 |
+
/web/extensions/*
|
15 |
+
!/web/extensions/logging.js.example
|
16 |
+
!/web/extensions/core/
|
17 |
+
/tests-ui/data/object_info.json
|
18 |
+
/user/
|
19 |
+
ComfyUI-to-Python-Extension
|
ComfyUI/CODEOWNERS
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
* @comfyanonymous
|
ComfyUI/LICENSE
ADDED
@@ -0,0 +1,674 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
GNU GENERAL PUBLIC LICENSE
|
2 |
+
Version 3, 29 June 2007
|
3 |
+
|
4 |
+
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
5 |
+
Everyone is permitted to copy and distribute verbatim copies
|
6 |
+
of this license document, but changing it is not allowed.
|
7 |
+
|
8 |
+
Preamble
|
9 |
+
|
10 |
+
The GNU General Public License is a free, copyleft license for
|
11 |
+
software and other kinds of works.
|
12 |
+
|
13 |
+
The licenses for most software and other practical works are designed
|
14 |
+
to take away your freedom to share and change the works. By contrast,
|
15 |
+
the GNU General Public License is intended to guarantee your freedom to
|
16 |
+
share and change all versions of a program--to make sure it remains free
|
17 |
+
software for all its users. We, the Free Software Foundation, use the
|
18 |
+
GNU General Public License for most of our software; it applies also to
|
19 |
+
any other work released this way by its authors. You can apply it to
|
20 |
+
your programs, too.
|
21 |
+
|
22 |
+
When we speak of free software, we are referring to freedom, not
|
23 |
+
price. Our General Public Licenses are designed to make sure that you
|
24 |
+
have the freedom to distribute copies of free software (and charge for
|
25 |
+
them if you wish), that you receive source code or can get it if you
|
26 |
+
want it, that you can change the software or use pieces of it in new
|
27 |
+
free programs, and that you know you can do these things.
|
28 |
+
|
29 |
+
To protect your rights, we need to prevent others from denying you
|
30 |
+
these rights or asking you to surrender the rights. Therefore, you have
|
31 |
+
certain responsibilities if you distribute copies of the software, or if
|
32 |
+
you modify it: responsibilities to respect the freedom of others.
|
33 |
+
|
34 |
+
For example, if you distribute copies of such a program, whether
|
35 |
+
gratis or for a fee, you must pass on to the recipients the same
|
36 |
+
freedoms that you received. You must make sure that they, too, receive
|
37 |
+
or can get the source code. And you must show them these terms so they
|
38 |
+
know their rights.
|
39 |
+
|
40 |
+
Developers that use the GNU GPL protect your rights with two steps:
|
41 |
+
(1) assert copyright on the software, and (2) offer you this License
|
42 |
+
giving you legal permission to copy, distribute and/or modify it.
|
43 |
+
|
44 |
+
For the developers' and authors' protection, the GPL clearly explains
|
45 |
+
that there is no warranty for this free software. For both users' and
|
46 |
+
authors' sake, the GPL requires that modified versions be marked as
|
47 |
+
changed, so that their problems will not be attributed erroneously to
|
48 |
+
authors of previous versions.
|
49 |
+
|
50 |
+
Some devices are designed to deny users access to install or run
|
51 |
+
modified versions of the software inside them, although the manufacturer
|
52 |
+
can do so. This is fundamentally incompatible with the aim of
|
53 |
+
protecting users' freedom to change the software. The systematic
|
54 |
+
pattern of such abuse occurs in the area of products for individuals to
|
55 |
+
use, which is precisely where it is most unacceptable. Therefore, we
|
56 |
+
have designed this version of the GPL to prohibit the practice for those
|
57 |
+
products. If such problems arise substantially in other domains, we
|
58 |
+
stand ready to extend this provision to those domains in future versions
|
59 |
+
of the GPL, as needed to protect the freedom of users.
|
60 |
+
|
61 |
+
Finally, every program is threatened constantly by software patents.
|
62 |
+
States should not allow patents to restrict development and use of
|
63 |
+
software on general-purpose computers, but in those that do, we wish to
|
64 |
+
avoid the special danger that patents applied to a free program could
|
65 |
+
make it effectively proprietary. To prevent this, the GPL assures that
|
66 |
+
patents cannot be used to render the program non-free.
|
67 |
+
|
68 |
+
The precise terms and conditions for copying, distribution and
|
69 |
+
modification follow.
|
70 |
+
|
71 |
+
TERMS AND CONDITIONS
|
72 |
+
|
73 |
+
0. Definitions.
|
74 |
+
|
75 |
+
"This License" refers to version 3 of the GNU General Public License.
|
76 |
+
|
77 |
+
"Copyright" also means copyright-like laws that apply to other kinds of
|
78 |
+
works, such as semiconductor masks.
|
79 |
+
|
80 |
+
"The Program" refers to any copyrightable work licensed under this
|
81 |
+
License. Each licensee is addressed as "you". "Licensees" and
|
82 |
+
"recipients" may be individuals or organizations.
|
83 |
+
|
84 |
+
To "modify" a work means to copy from or adapt all or part of the work
|
85 |
+
in a fashion requiring copyright permission, other than the making of an
|
86 |
+
exact copy. The resulting work is called a "modified version" of the
|
87 |
+
earlier work or a work "based on" the earlier work.
|
88 |
+
|
89 |
+
A "covered work" means either the unmodified Program or a work based
|
90 |
+
on the Program.
|
91 |
+
|
92 |
+
To "propagate" a work means to do anything with it that, without
|
93 |
+
permission, would make you directly or secondarily liable for
|
94 |
+
infringement under applicable copyright law, except executing it on a
|
95 |
+
computer or modifying a private copy. Propagation includes copying,
|
96 |
+
distribution (with or without modification), making available to the
|
97 |
+
public, and in some countries other activities as well.
|
98 |
+
|
99 |
+
To "convey" a work means any kind of propagation that enables other
|
100 |
+
parties to make or receive copies. Mere interaction with a user through
|
101 |
+
a computer network, with no transfer of a copy, is not conveying.
|
102 |
+
|
103 |
+
An interactive user interface displays "Appropriate Legal Notices"
|
104 |
+
to the extent that it includes a convenient and prominently visible
|
105 |
+
feature that (1) displays an appropriate copyright notice, and (2)
|
106 |
+
tells the user that there is no warranty for the work (except to the
|
107 |
+
extent that warranties are provided), that licensees may convey the
|
108 |
+
work under this License, and how to view a copy of this License. If
|
109 |
+
the interface presents a list of user commands or options, such as a
|
110 |
+
menu, a prominent item in the list meets this criterion.
|
111 |
+
|
112 |
+
1. Source Code.
|
113 |
+
|
114 |
+
The "source code" for a work means the preferred form of the work
|
115 |
+
for making modifications to it. "Object code" means any non-source
|
116 |
+
form of a work.
|
117 |
+
|
118 |
+
A "Standard Interface" means an interface that either is an official
|
119 |
+
standard defined by a recognized standards body, or, in the case of
|
120 |
+
interfaces specified for a particular programming language, one that
|
121 |
+
is widely used among developers working in that language.
|
122 |
+
|
123 |
+
The "System Libraries" of an executable work include anything, other
|
124 |
+
than the work as a whole, that (a) is included in the normal form of
|
125 |
+
packaging a Major Component, but which is not part of that Major
|
126 |
+
Component, and (b) serves only to enable use of the work with that
|
127 |
+
Major Component, or to implement a Standard Interface for which an
|
128 |
+
implementation is available to the public in source code form. A
|
129 |
+
"Major Component", in this context, means a major essential component
|
130 |
+
(kernel, window system, and so on) of the specific operating system
|
131 |
+
(if any) on which the executable work runs, or a compiler used to
|
132 |
+
produce the work, or an object code interpreter used to run it.
|
133 |
+
|
134 |
+
The "Corresponding Source" for a work in object code form means all
|
135 |
+
the source code needed to generate, install, and (for an executable
|
136 |
+
work) run the object code and to modify the work, including scripts to
|
137 |
+
control those activities. However, it does not include the work's
|
138 |
+
System Libraries, or general-purpose tools or generally available free
|
139 |
+
programs which are used unmodified in performing those activities but
|
140 |
+
which are not part of the work. For example, Corresponding Source
|
141 |
+
includes interface definition files associated with source files for
|
142 |
+
the work, and the source code for shared libraries and dynamically
|
143 |
+
linked subprograms that the work is specifically designed to require,
|
144 |
+
such as by intimate data communication or control flow between those
|
145 |
+
subprograms and other parts of the work.
|
146 |
+
|
147 |
+
The Corresponding Source need not include anything that users
|
148 |
+
can regenerate automatically from other parts of the Corresponding
|
149 |
+
Source.
|
150 |
+
|
151 |
+
The Corresponding Source for a work in source code form is that
|
152 |
+
same work.
|
153 |
+
|
154 |
+
2. Basic Permissions.
|
155 |
+
|
156 |
+
All rights granted under this License are granted for the term of
|
157 |
+
copyright on the Program, and are irrevocable provided the stated
|
158 |
+
conditions are met. This License explicitly affirms your unlimited
|
159 |
+
permission to run the unmodified Program. The output from running a
|
160 |
+
covered work is covered by this License only if the output, given its
|
161 |
+
content, constitutes a covered work. This License acknowledges your
|
162 |
+
rights of fair use or other equivalent, as provided by copyright law.
|
163 |
+
|
164 |
+
You may make, run and propagate covered works that you do not
|
165 |
+
convey, without conditions so long as your license otherwise remains
|
166 |
+
in force. You may convey covered works to others for the sole purpose
|
167 |
+
of having them make modifications exclusively for you, or provide you
|
168 |
+
with facilities for running those works, provided that you comply with
|
169 |
+
the terms of this License in conveying all material for which you do
|
170 |
+
not control copyright. Those thus making or running the covered works
|
171 |
+
for you must do so exclusively on your behalf, under your direction
|
172 |
+
and control, on terms that prohibit them from making any copies of
|
173 |
+
your copyrighted material outside their relationship with you.
|
174 |
+
|
175 |
+
Conveying under any other circumstances is permitted solely under
|
176 |
+
the conditions stated below. Sublicensing is not allowed; section 10
|
177 |
+
makes it unnecessary.
|
178 |
+
|
179 |
+
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
180 |
+
|
181 |
+
No covered work shall be deemed part of an effective technological
|
182 |
+
measure under any applicable law fulfilling obligations under article
|
183 |
+
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
184 |
+
similar laws prohibiting or restricting circumvention of such
|
185 |
+
measures.
|
186 |
+
|
187 |
+
When you convey a covered work, you waive any legal power to forbid
|
188 |
+
circumvention of technological measures to the extent such circumvention
|
189 |
+
is effected by exercising rights under this License with respect to
|
190 |
+
the covered work, and you disclaim any intention to limit operation or
|
191 |
+
modification of the work as a means of enforcing, against the work's
|
192 |
+
users, your or third parties' legal rights to forbid circumvention of
|
193 |
+
technological measures.
|
194 |
+
|
195 |
+
4. Conveying Verbatim Copies.
|
196 |
+
|
197 |
+
You may convey verbatim copies of the Program's source code as you
|
198 |
+
receive it, in any medium, provided that you conspicuously and
|
199 |
+
appropriately publish on each copy an appropriate copyright notice;
|
200 |
+
keep intact all notices stating that this License and any
|
201 |
+
non-permissive terms added in accord with section 7 apply to the code;
|
202 |
+
keep intact all notices of the absence of any warranty; and give all
|
203 |
+
recipients a copy of this License along with the Program.
|
204 |
+
|
205 |
+
You may charge any price or no price for each copy that you convey,
|
206 |
+
and you may offer support or warranty protection for a fee.
|
207 |
+
|
208 |
+
5. Conveying Modified Source Versions.
|
209 |
+
|
210 |
+
You may convey a work based on the Program, or the modifications to
|
211 |
+
produce it from the Program, in the form of source code under the
|
212 |
+
terms of section 4, provided that you also meet all of these conditions:
|
213 |
+
|
214 |
+
a) The work must carry prominent notices stating that you modified
|
215 |
+
it, and giving a relevant date.
|
216 |
+
|
217 |
+
b) The work must carry prominent notices stating that it is
|
218 |
+
released under this License and any conditions added under section
|
219 |
+
7. This requirement modifies the requirement in section 4 to
|
220 |
+
"keep intact all notices".
|
221 |
+
|
222 |
+
c) You must license the entire work, as a whole, under this
|
223 |
+
License to anyone who comes into possession of a copy. This
|
224 |
+
License will therefore apply, along with any applicable section 7
|
225 |
+
additional terms, to the whole of the work, and all its parts,
|
226 |
+
regardless of how they are packaged. This License gives no
|
227 |
+
permission to license the work in any other way, but it does not
|
228 |
+
invalidate such permission if you have separately received it.
|
229 |
+
|
230 |
+
d) If the work has interactive user interfaces, each must display
|
231 |
+
Appropriate Legal Notices; however, if the Program has interactive
|
232 |
+
interfaces that do not display Appropriate Legal Notices, your
|
233 |
+
work need not make them do so.
|
234 |
+
|
235 |
+
A compilation of a covered work with other separate and independent
|
236 |
+
works, which are not by their nature extensions of the covered work,
|
237 |
+
and which are not combined with it such as to form a larger program,
|
238 |
+
in or on a volume of a storage or distribution medium, is called an
|
239 |
+
"aggregate" if the compilation and its resulting copyright are not
|
240 |
+
used to limit the access or legal rights of the compilation's users
|
241 |
+
beyond what the individual works permit. Inclusion of a covered work
|
242 |
+
in an aggregate does not cause this License to apply to the other
|
243 |
+
parts of the aggregate.
|
244 |
+
|
245 |
+
6. Conveying Non-Source Forms.
|
246 |
+
|
247 |
+
You may convey a covered work in object code form under the terms
|
248 |
+
of sections 4 and 5, provided that you also convey the
|
249 |
+
machine-readable Corresponding Source under the terms of this License,
|
250 |
+
in one of these ways:
|
251 |
+
|
252 |
+
a) Convey the object code in, or embodied in, a physical product
|
253 |
+
(including a physical distribution medium), accompanied by the
|
254 |
+
Corresponding Source fixed on a durable physical medium
|
255 |
+
customarily used for software interchange.
|
256 |
+
|
257 |
+
b) Convey the object code in, or embodied in, a physical product
|
258 |
+
(including a physical distribution medium), accompanied by a
|
259 |
+
written offer, valid for at least three years and valid for as
|
260 |
+
long as you offer spare parts or customer support for that product
|
261 |
+
model, to give anyone who possesses the object code either (1) a
|
262 |
+
copy of the Corresponding Source for all the software in the
|
263 |
+
product that is covered by this License, on a durable physical
|
264 |
+
medium customarily used for software interchange, for a price no
|
265 |
+
more than your reasonable cost of physically performing this
|
266 |
+
conveying of source, or (2) access to copy the
|
267 |
+
Corresponding Source from a network server at no charge.
|
268 |
+
|
269 |
+
c) Convey individual copies of the object code with a copy of the
|
270 |
+
written offer to provide the Corresponding Source. This
|
271 |
+
alternative is allowed only occasionally and noncommercially, and
|
272 |
+
only if you received the object code with such an offer, in accord
|
273 |
+
with subsection 6b.
|
274 |
+
|
275 |
+
d) Convey the object code by offering access from a designated
|
276 |
+
place (gratis or for a charge), and offer equivalent access to the
|
277 |
+
Corresponding Source in the same way through the same place at no
|
278 |
+
further charge. You need not require recipients to copy the
|
279 |
+
Corresponding Source along with the object code. If the place to
|
280 |
+
copy the object code is a network server, the Corresponding Source
|
281 |
+
may be on a different server (operated by you or a third party)
|
282 |
+
that supports equivalent copying facilities, provided you maintain
|
283 |
+
clear directions next to the object code saying where to find the
|
284 |
+
Corresponding Source. Regardless of what server hosts the
|
285 |
+
Corresponding Source, you remain obligated to ensure that it is
|
286 |
+
available for as long as needed to satisfy these requirements.
|
287 |
+
|
288 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
289 |
+
you inform other peers where the object code and Corresponding
|
290 |
+
Source of the work are being offered to the general public at no
|
291 |
+
charge under subsection 6d.
|
292 |
+
|
293 |
+
A separable portion of the object code, whose source code is excluded
|
294 |
+
from the Corresponding Source as a System Library, need not be
|
295 |
+
included in conveying the object code work.
|
296 |
+
|
297 |
+
A "User Product" is either (1) a "consumer product", which means any
|
298 |
+
tangible personal property which is normally used for personal, family,
|
299 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
300 |
+
into a dwelling. In determining whether a product is a consumer product,
|
301 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
302 |
+
product received by a particular user, "normally used" refers to a
|
303 |
+
typical or common use of that class of product, regardless of the status
|
304 |
+
of the particular user or of the way in which the particular user
|
305 |
+
actually uses, or expects or is expected to use, the product. A product
|
306 |
+
is a consumer product regardless of whether the product has substantial
|
307 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
308 |
+
the only significant mode of use of the product.
|
309 |
+
|
310 |
+
"Installation Information" for a User Product means any methods,
|
311 |
+
procedures, authorization keys, or other information required to install
|
312 |
+
and execute modified versions of a covered work in that User Product from
|
313 |
+
a modified version of its Corresponding Source. The information must
|
314 |
+
suffice to ensure that the continued functioning of the modified object
|
315 |
+
code is in no case prevented or interfered with solely because
|
316 |
+
modification has been made.
|
317 |
+
|
318 |
+
If you convey an object code work under this section in, or with, or
|
319 |
+
specifically for use in, a User Product, and the conveying occurs as
|
320 |
+
part of a transaction in which the right of possession and use of the
|
321 |
+
User Product is transferred to the recipient in perpetuity or for a
|
322 |
+
fixed term (regardless of how the transaction is characterized), the
|
323 |
+
Corresponding Source conveyed under this section must be accompanied
|
324 |
+
by the Installation Information. But this requirement does not apply
|
325 |
+
if neither you nor any third party retains the ability to install
|
326 |
+
modified object code on the User Product (for example, the work has
|
327 |
+
been installed in ROM).
|
328 |
+
|
329 |
+
The requirement to provide Installation Information does not include a
|
330 |
+
requirement to continue to provide support service, warranty, or updates
|
331 |
+
for a work that has been modified or installed by the recipient, or for
|
332 |
+
the User Product in which it has been modified or installed. Access to a
|
333 |
+
network may be denied when the modification itself materially and
|
334 |
+
adversely affects the operation of the network or violates the rules and
|
335 |
+
protocols for communication across the network.
|
336 |
+
|
337 |
+
Corresponding Source conveyed, and Installation Information provided,
|
338 |
+
in accord with this section must be in a format that is publicly
|
339 |
+
documented (and with an implementation available to the public in
|
340 |
+
source code form), and must require no special password or key for
|
341 |
+
unpacking, reading or copying.
|
342 |
+
|
343 |
+
7. Additional Terms.
|
344 |
+
|
345 |
+
"Additional permissions" are terms that supplement the terms of this
|
346 |
+
License by making exceptions from one or more of its conditions.
|
347 |
+
Additional permissions that are applicable to the entire Program shall
|
348 |
+
be treated as though they were included in this License, to the extent
|
349 |
+
that they are valid under applicable law. If additional permissions
|
350 |
+
apply only to part of the Program, that part may be used separately
|
351 |
+
under those permissions, but the entire Program remains governed by
|
352 |
+
this License without regard to the additional permissions.
|
353 |
+
|
354 |
+
When you convey a copy of a covered work, you may at your option
|
355 |
+
remove any additional permissions from that copy, or from any part of
|
356 |
+
it. (Additional permissions may be written to require their own
|
357 |
+
removal in certain cases when you modify the work.) You may place
|
358 |
+
additional permissions on material, added by you to a covered work,
|
359 |
+
for which you have or can give appropriate copyright permission.
|
360 |
+
|
361 |
+
Notwithstanding any other provision of this License, for material you
|
362 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
363 |
+
that material) supplement the terms of this License with terms:
|
364 |
+
|
365 |
+
a) Disclaiming warranty or limiting liability differently from the
|
366 |
+
terms of sections 15 and 16 of this License; or
|
367 |
+
|
368 |
+
b) Requiring preservation of specified reasonable legal notices or
|
369 |
+
author attributions in that material or in the Appropriate Legal
|
370 |
+
Notices displayed by works containing it; or
|
371 |
+
|
372 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
373 |
+
requiring that modified versions of such material be marked in
|
374 |
+
reasonable ways as different from the original version; or
|
375 |
+
|
376 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
377 |
+
authors of the material; or
|
378 |
+
|
379 |
+
e) Declining to grant rights under trademark law for use of some
|
380 |
+
trade names, trademarks, or service marks; or
|
381 |
+
|
382 |
+
f) Requiring indemnification of licensors and authors of that
|
383 |
+
material by anyone who conveys the material (or modified versions of
|
384 |
+
it) with contractual assumptions of liability to the recipient, for
|
385 |
+
any liability that these contractual assumptions directly impose on
|
386 |
+
those licensors and authors.
|
387 |
+
|
388 |
+
All other non-permissive additional terms are considered "further
|
389 |
+
restrictions" within the meaning of section 10. If the Program as you
|
390 |
+
received it, or any part of it, contains a notice stating that it is
|
391 |
+
governed by this License along with a term that is a further
|
392 |
+
restriction, you may remove that term. If a license document contains
|
393 |
+
a further restriction but permits relicensing or conveying under this
|
394 |
+
License, you may add to a covered work material governed by the terms
|
395 |
+
of that license document, provided that the further restriction does
|
396 |
+
not survive such relicensing or conveying.
|
397 |
+
|
398 |
+
If you add terms to a covered work in accord with this section, you
|
399 |
+
must place, in the relevant source files, a statement of the
|
400 |
+
additional terms that apply to those files, or a notice indicating
|
401 |
+
where to find the applicable terms.
|
402 |
+
|
403 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
404 |
+
form of a separately written license, or stated as exceptions;
|
405 |
+
the above requirements apply either way.
|
406 |
+
|
407 |
+
8. Termination.
|
408 |
+
|
409 |
+
You may not propagate or modify a covered work except as expressly
|
410 |
+
provided under this License. Any attempt otherwise to propagate or
|
411 |
+
modify it is void, and will automatically terminate your rights under
|
412 |
+
this License (including any patent licenses granted under the third
|
413 |
+
paragraph of section 11).
|
414 |
+
|
415 |
+
However, if you cease all violation of this License, then your
|
416 |
+
license from a particular copyright holder is reinstated (a)
|
417 |
+
provisionally, unless and until the copyright holder explicitly and
|
418 |
+
finally terminates your license, and (b) permanently, if the copyright
|
419 |
+
holder fails to notify you of the violation by some reasonable means
|
420 |
+
prior to 60 days after the cessation.
|
421 |
+
|
422 |
+
Moreover, your license from a particular copyright holder is
|
423 |
+
reinstated permanently if the copyright holder notifies you of the
|
424 |
+
violation by some reasonable means, this is the first time you have
|
425 |
+
received notice of violation of this License (for any work) from that
|
426 |
+
copyright holder, and you cure the violation prior to 30 days after
|
427 |
+
your receipt of the notice.
|
428 |
+
|
429 |
+
Termination of your rights under this section does not terminate the
|
430 |
+
licenses of parties who have received copies or rights from you under
|
431 |
+
this License. If your rights have been terminated and not permanently
|
432 |
+
reinstated, you do not qualify to receive new licenses for the same
|
433 |
+
material under section 10.
|
434 |
+
|
435 |
+
9. Acceptance Not Required for Having Copies.
|
436 |
+
|
437 |
+
You are not required to accept this License in order to receive or
|
438 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
439 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
440 |
+
to receive a copy likewise does not require acceptance. However,
|
441 |
+
nothing other than this License grants you permission to propagate or
|
442 |
+
modify any covered work. These actions infringe copyright if you do
|
443 |
+
not accept this License. Therefore, by modifying or propagating a
|
444 |
+
covered work, you indicate your acceptance of this License to do so.
|
445 |
+
|
446 |
+
10. Automatic Licensing of Downstream Recipients.
|
447 |
+
|
448 |
+
Each time you convey a covered work, the recipient automatically
|
449 |
+
receives a license from the original licensors, to run, modify and
|
450 |
+
propagate that work, subject to this License. You are not responsible
|
451 |
+
for enforcing compliance by third parties with this License.
|
452 |
+
|
453 |
+
An "entity transaction" is a transaction transferring control of an
|
454 |
+
organization, or substantially all assets of one, or subdividing an
|
455 |
+
organization, or merging organizations. If propagation of a covered
|
456 |
+
work results from an entity transaction, each party to that
|
457 |
+
transaction who receives a copy of the work also receives whatever
|
458 |
+
licenses to the work the party's predecessor in interest had or could
|
459 |
+
give under the previous paragraph, plus a right to possession of the
|
460 |
+
Corresponding Source of the work from the predecessor in interest, if
|
461 |
+
the predecessor has it or can get it with reasonable efforts.
|
462 |
+
|
463 |
+
You may not impose any further restrictions on the exercise of the
|
464 |
+
rights granted or affirmed under this License. For example, you may
|
465 |
+
not impose a license fee, royalty, or other charge for exercise of
|
466 |
+
rights granted under this License, and you may not initiate litigation
|
467 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
468 |
+
any patent claim is infringed by making, using, selling, offering for
|
469 |
+
sale, or importing the Program or any portion of it.
|
470 |
+
|
471 |
+
11. Patents.
|
472 |
+
|
473 |
+
A "contributor" is a copyright holder who authorizes use under this
|
474 |
+
License of the Program or a work on which the Program is based. The
|
475 |
+
work thus licensed is called the contributor's "contributor version".
|
476 |
+
|
477 |
+
A contributor's "essential patent claims" are all patent claims
|
478 |
+
owned or controlled by the contributor, whether already acquired or
|
479 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
480 |
+
by this License, of making, using, or selling its contributor version,
|
481 |
+
but do not include claims that would be infringed only as a
|
482 |
+
consequence of further modification of the contributor version. For
|
483 |
+
purposes of this definition, "control" includes the right to grant
|
484 |
+
patent sublicenses in a manner consistent with the requirements of
|
485 |
+
this License.
|
486 |
+
|
487 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
488 |
+
patent license under the contributor's essential patent claims, to
|
489 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
490 |
+
propagate the contents of its contributor version.
|
491 |
+
|
492 |
+
In the following three paragraphs, a "patent license" is any express
|
493 |
+
agreement or commitment, however denominated, not to enforce a patent
|
494 |
+
(such as an express permission to practice a patent or covenant not to
|
495 |
+
sue for patent infringement). To "grant" such a patent license to a
|
496 |
+
party means to make such an agreement or commitment not to enforce a
|
497 |
+
patent against the party.
|
498 |
+
|
499 |
+
If you convey a covered work, knowingly relying on a patent license,
|
500 |
+
and the Corresponding Source of the work is not available for anyone
|
501 |
+
to copy, free of charge and under the terms of this License, through a
|
502 |
+
publicly available network server or other readily accessible means,
|
503 |
+
then you must either (1) cause the Corresponding Source to be so
|
504 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
505 |
+
patent license for this particular work, or (3) arrange, in a manner
|
506 |
+
consistent with the requirements of this License, to extend the patent
|
507 |
+
license to downstream recipients. "Knowingly relying" means you have
|
508 |
+
actual knowledge that, but for the patent license, your conveying the
|
509 |
+
covered work in a country, or your recipient's use of the covered work
|
510 |
+
in a country, would infringe one or more identifiable patents in that
|
511 |
+
country that you have reason to believe are valid.
|
512 |
+
|
513 |
+
If, pursuant to or in connection with a single transaction or
|
514 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
515 |
+
covered work, and grant a patent license to some of the parties
|
516 |
+
receiving the covered work authorizing them to use, propagate, modify
|
517 |
+
or convey a specific copy of the covered work, then the patent license
|
518 |
+
you grant is automatically extended to all recipients of the covered
|
519 |
+
work and works based on it.
|
520 |
+
|
521 |
+
A patent license is "discriminatory" if it does not include within
|
522 |
+
the scope of its coverage, prohibits the exercise of, or is
|
523 |
+
conditioned on the non-exercise of one or more of the rights that are
|
524 |
+
specifically granted under this License. You may not convey a covered
|
525 |
+
work if you are a party to an arrangement with a third party that is
|
526 |
+
in the business of distributing software, under which you make payment
|
527 |
+
to the third party based on the extent of your activity of conveying
|
528 |
+
the work, and under which the third party grants, to any of the
|
529 |
+
parties who would receive the covered work from you, a discriminatory
|
530 |
+
patent license (a) in connection with copies of the covered work
|
531 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
532 |
+
for and in connection with specific products or compilations that
|
533 |
+
contain the covered work, unless you entered into that arrangement,
|
534 |
+
or that patent license was granted, prior to 28 March 2007.
|
535 |
+
|
536 |
+
Nothing in this License shall be construed as excluding or limiting
|
537 |
+
any implied license or other defenses to infringement that may
|
538 |
+
otherwise be available to you under applicable patent law.
|
539 |
+
|
540 |
+
12. No Surrender of Others' Freedom.
|
541 |
+
|
542 |
+
If conditions are imposed on you (whether by court order, agreement or
|
543 |
+
otherwise) that contradict the conditions of this License, they do not
|
544 |
+
excuse you from the conditions of this License. If you cannot convey a
|
545 |
+
covered work so as to satisfy simultaneously your obligations under this
|
546 |
+
License and any other pertinent obligations, then as a consequence you may
|
547 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
548 |
+
to collect a royalty for further conveying from those to whom you convey
|
549 |
+
the Program, the only way you could satisfy both those terms and this
|
550 |
+
License would be to refrain entirely from conveying the Program.
|
551 |
+
|
552 |
+
13. Use with the GNU Affero General Public License.
|
553 |
+
|
554 |
+
Notwithstanding any other provision of this License, you have
|
555 |
+
permission to link or combine any covered work with a work licensed
|
556 |
+
under version 3 of the GNU Affero General Public License into a single
|
557 |
+
combined work, and to convey the resulting work. The terms of this
|
558 |
+
License will continue to apply to the part which is the covered work,
|
559 |
+
but the special requirements of the GNU Affero General Public License,
|
560 |
+
section 13, concerning interaction through a network will apply to the
|
561 |
+
combination as such.
|
562 |
+
|
563 |
+
14. Revised Versions of this License.
|
564 |
+
|
565 |
+
The Free Software Foundation may publish revised and/or new versions of
|
566 |
+
the GNU General Public License from time to time. Such new versions will
|
567 |
+
be similar in spirit to the present version, but may differ in detail to
|
568 |
+
address new problems or concerns.
|
569 |
+
|
570 |
+
Each version is given a distinguishing version number. If the
|
571 |
+
Program specifies that a certain numbered version of the GNU General
|
572 |
+
Public License "or any later version" applies to it, you have the
|
573 |
+
option of following the terms and conditions either of that numbered
|
574 |
+
version or of any later version published by the Free Software
|
575 |
+
Foundation. If the Program does not specify a version number of the
|
576 |
+
GNU General Public License, you may choose any version ever published
|
577 |
+
by the Free Software Foundation.
|
578 |
+
|
579 |
+
If the Program specifies that a proxy can decide which future
|
580 |
+
versions of the GNU General Public License can be used, that proxy's
|
581 |
+
public statement of acceptance of a version permanently authorizes you
|
582 |
+
to choose that version for the Program.
|
583 |
+
|
584 |
+
Later license versions may give you additional or different
|
585 |
+
permissions. However, no additional obligations are imposed on any
|
586 |
+
author or copyright holder as a result of your choosing to follow a
|
587 |
+
later version.
|
588 |
+
|
589 |
+
15. Disclaimer of Warranty.
|
590 |
+
|
591 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
592 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
593 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
594 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
595 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
596 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
597 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
598 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
599 |
+
|
600 |
+
16. Limitation of Liability.
|
601 |
+
|
602 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
603 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
604 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
605 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
606 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
607 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
608 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
609 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
610 |
+
SUCH DAMAGES.
|
611 |
+
|
612 |
+
17. Interpretation of Sections 15 and 16.
|
613 |
+
|
614 |
+
If the disclaimer of warranty and limitation of liability provided
|
615 |
+
above cannot be given local legal effect according to their terms,
|
616 |
+
reviewing courts shall apply local law that most closely approximates
|
617 |
+
an absolute waiver of all civil liability in connection with the
|
618 |
+
Program, unless a warranty or assumption of liability accompanies a
|
619 |
+
copy of the Program in return for a fee.
|
620 |
+
|
621 |
+
END OF TERMS AND CONDITIONS
|
622 |
+
|
623 |
+
How to Apply These Terms to Your New Programs
|
624 |
+
|
625 |
+
If you develop a new program, and you want it to be of the greatest
|
626 |
+
possible use to the public, the best way to achieve this is to make it
|
627 |
+
free software which everyone can redistribute and change under these terms.
|
628 |
+
|
629 |
+
To do so, attach the following notices to the program. It is safest
|
630 |
+
to attach them to the start of each source file to most effectively
|
631 |
+
state the exclusion of warranty; and each file should have at least
|
632 |
+
the "copyright" line and a pointer to where the full notice is found.
|
633 |
+
|
634 |
+
<one line to give the program's name and a brief idea of what it does.>
|
635 |
+
Copyright (C) <year> <name of author>
|
636 |
+
|
637 |
+
This program is free software: you can redistribute it and/or modify
|
638 |
+
it under the terms of the GNU General Public License as published by
|
639 |
+
the Free Software Foundation, either version 3 of the License, or
|
640 |
+
(at your option) any later version.
|
641 |
+
|
642 |
+
This program is distributed in the hope that it will be useful,
|
643 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
644 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
645 |
+
GNU General Public License for more details.
|
646 |
+
|
647 |
+
You should have received a copy of the GNU General Public License
|
648 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
649 |
+
|
650 |
+
Also add information on how to contact you by electronic and paper mail.
|
651 |
+
|
652 |
+
If the program does terminal interaction, make it output a short
|
653 |
+
notice like this when it starts in an interactive mode:
|
654 |
+
|
655 |
+
<program> Copyright (C) <year> <name of author>
|
656 |
+
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
657 |
+
This is free software, and you are welcome to redistribute it
|
658 |
+
under certain conditions; type `show c' for details.
|
659 |
+
|
660 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
661 |
+
parts of the General Public License. Of course, your program's commands
|
662 |
+
might be different; for a GUI interface, you would use an "about box".
|
663 |
+
|
664 |
+
You should also get your employer (if you work as a programmer) or school,
|
665 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
666 |
+
For more information on this, and how to apply and follow the GNU GPL, see
|
667 |
+
<https://www.gnu.org/licenses/>.
|
668 |
+
|
669 |
+
The GNU General Public License does not permit incorporating your program
|
670 |
+
into proprietary programs. If your program is a subroutine library, you
|
671 |
+
may consider it more useful to permit linking proprietary applications with
|
672 |
+
the library. If this is what you want to do, use the GNU Lesser General
|
673 |
+
Public License instead of this License. But first, please read
|
674 |
+
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
ComfyUI/README.md
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ComfyUI
|
2 |
+
=======
|
3 |
+
The most powerful and modular stable diffusion GUI and backend.
|
4 |
+
-----------
|
5 |
+
![ComfyUI Screenshot](comfyui_screenshot.png)
|
6 |
+
|
7 |
+
This UI will let you design and execute advanced Stable Diffusion pipelines using a graph/nodes/flowchart-based interface. For some workflow examples, and to see what ComfyUI can do, check out:
|
8 |
+
### [ComfyUI Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
|
9 |
+
|
10 |
+
### [Installing ComfyUI](#installing)
|
11 |
+
|
12 |
+
## Features
|
13 |
+
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
|
14 |
+
- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/) and [Stable Cascade](https://comfyanonymous.github.io/ComfyUI_examples/stable_cascade/)
|
15 |
+
- Asynchronous Queue system
|
16 |
+
- Many optimizations: Only re-executes the parts of the workflow that change between executions.
|
17 |
+
- Command line option: ```--lowvram``` to make it work on GPUs with less than 3GB vram (enabled automatically on GPUs with low vram)
|
18 |
+
- Works even if you don't have a GPU with: ```--cpu``` (slow)
|
19 |
+
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs and CLIP models.
|
20 |
+
- Embeddings/Textual inversion
|
21 |
+
- [Loras (regular, locon and loha)](https://comfyanonymous.github.io/ComfyUI_examples/lora/)
|
22 |
+
- [Hypernetworks](https://comfyanonymous.github.io/ComfyUI_examples/hypernetworks/)
|
23 |
+
- Loading full workflows (with seeds) from generated PNG files.
|
24 |
+
- Saving/Loading workflows as Json files.
|
25 |
+
- Nodes interface can be used to create complex workflows like one for [Hires fix](https://comfyanonymous.github.io/ComfyUI_examples/2_pass_txt2img/) or much more advanced ones.
|
26 |
+
- [Area Composition](https://comfyanonymous.github.io/ComfyUI_examples/area_composition/)
|
27 |
+
- [Inpainting](https://comfyanonymous.github.io/ComfyUI_examples/inpaint/) with both regular and inpainting models.
|
28 |
+
- [ControlNet and T2I-Adapter](https://comfyanonymous.github.io/ComfyUI_examples/controlnet/)
|
29 |
+
- [Upscale Models (ESRGAN, ESRGAN variants, SwinIR, Swin2SR, etc...)](https://comfyanonymous.github.io/ComfyUI_examples/upscale_models/)
|
30 |
+
- [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)
|
31 |
+
- [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
|
32 |
+
- [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
|
33 |
+
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
|
34 |
+
- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
|
35 |
+
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
|
36 |
+
- Starts up very fast.
|
37 |
+
- Works fully offline: will never download anything.
|
38 |
+
- [Config file](extra_model_paths.yaml.example) to set the search paths for models.
|
39 |
+
|
40 |
+
Workflow examples can be found on the [Examples page](https://comfyanonymous.github.io/ComfyUI_examples/)
|
41 |
+
|
42 |
+
## Shortcuts
|
43 |
+
|
44 |
+
| Keybind | Explanation |
|
45 |
+
|---------------------------|--------------------------------------------------------------------------------------------------------------------|
|
46 |
+
| Ctrl + Enter | Queue up current graph for generation |
|
47 |
+
| Ctrl + Shift + Enter | Queue up current graph as first for generation |
|
48 |
+
| Ctrl + Z/Ctrl + Y | Undo/Redo |
|
49 |
+
| Ctrl + S | Save workflow |
|
50 |
+
| Ctrl + O | Load workflow |
|
51 |
+
| Ctrl + A | Select all nodes |
|
52 |
+
| Alt + C | Collapse/uncollapse selected nodes |
|
53 |
+
| Ctrl + M | Mute/unmute selected nodes |
|
54 |
+
| Ctrl + B | Bypass selected nodes (acts like the node was removed from the graph and the wires reconnected through) |
|
55 |
+
| Delete/Backspace | Delete selected nodes |
|
56 |
+
| Ctrl + Delete/Backspace | Delete the current graph |
|
57 |
+
| Space | Move the canvas around when held and moving the cursor |
|
58 |
+
| Ctrl/Shift + Click | Add clicked node to selection |
|
59 |
+
| Ctrl + C/Ctrl + V | Copy and paste selected nodes (without maintaining connections to outputs of unselected nodes) |
|
60 |
+
| Ctrl + C/Ctrl + Shift + V | Copy and paste selected nodes (maintaining connections from outputs of unselected nodes to inputs of pasted nodes) |
|
61 |
+
| Shift + Drag | Move multiple selected nodes at the same time |
|
62 |
+
| Ctrl + D | Load default graph |
|
63 |
+
| Q | Toggle visibility of the queue |
|
64 |
+
| H | Toggle visibility of history |
|
65 |
+
| R | Refresh graph |
|
66 |
+
| Double-Click LMB | Open node quick search palette |
|
67 |
+
|
68 |
+
Ctrl can be replaced with Cmd for macOS users.
|
69 |
+
|
70 |
+
# Installing
|
71 |
+
|
72 |
+
## Windows
|
73 |
+
|
74 |
+
There is a portable standalone build for Windows on the [releases page](https://github.com/comfyanonymous/ComfyUI/releases) that should work for running on Nvidia GPUs or on your CPU only.
|
75 |
+
|
76 |
+
### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/download/latest/ComfyUI_windows_portable_nvidia_cu121_or_cpu.7z)
|
77 |
+
|
78 |
+
Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
|
79 |
+
|
80 |
+
If you have trouble extracting it, right click the file -> properties -> unblock
|
81 |
+
|
82 |
+
#### How do I share models between another UI and ComfyUI?
|
83 |
+
|
84 |
+
See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.
|
85 |
+
|
86 |
+
## Jupyter Notebook
|
87 |
+
|
88 |
+
To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)
|
89 |
+
|
90 |
+
## Manual Install (Windows, Linux)
|
91 |
+
|
92 |
+
Git clone this repo.
|
93 |
+
|
94 |
+
Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints
|
95 |
+
|
96 |
+
Put your VAE in: models/vae
|
97 |
+
|
98 |
+
|
99 |
+
### AMD GPUs (Linux only)
|
100 |
+
AMD users can install ROCm and PyTorch with pip if they are not already installed. This is the command to install the stable version:
|
101 |
+
|
102 |
+
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7```
|
103 |
+
|
104 |
+
This is the command to install the nightly with ROCm 6.0 which might have some performance improvements:
|
105 |
+
|
106 |
+
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.0```
|
107 |
+
|
108 |
+
### NVIDIA
|
109 |
+
|
110 |
+
Nvidia users should install stable pytorch using this command:
|
111 |
+
|
112 |
+
```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu121```
|
113 |
+
|
114 |
+
This is the command to install pytorch nightly instead which might have performance improvements:
|
115 |
+
|
116 |
+
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121```
|
117 |
+
|
118 |
+
#### Troubleshooting
|
119 |
+
|
120 |
+
If you get the "Torch not compiled with CUDA enabled" error, uninstall torch with:
|
121 |
+
|
122 |
+
```pip uninstall torch```
|
123 |
+
|
124 |
+
And install it again with the command above.
|
125 |
+
|
126 |
+
### Dependencies
|
127 |
+
|
128 |
+
Install the dependencies by opening your terminal inside the ComfyUI folder and:
|
129 |
+
|
130 |
+
```pip install -r requirements.txt```
|
131 |
+
|
132 |
+
After this you should have everything installed and can proceed to running ComfyUI.
|
133 |
+
|
134 |
+
### Others:
|
135 |
+
|
136 |
+
#### [Intel Arc](https://github.com/comfyanonymous/ComfyUI/discussions/476)
|
137 |
+
|
138 |
+
#### Apple Mac silicon
|
139 |
+
|
140 |
+
You can install ComfyUI on Apple Mac silicon (M1 or M2) with any recent macOS version.
|
141 |
+
|
142 |
+
1. Install pytorch nightly. For instructions, read the [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) Apple Developer guide (make sure to install the latest pytorch nightly).
|
143 |
+
1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux.
|
144 |
+
1. Install the ComfyUI [dependencies](#dependencies). If you have another Stable Diffusion UI [you might be able to reuse the dependencies](#i-already-have-another-ui-for-stable-diffusion-installed-do-i-really-have-to-install-all-of-these-dependencies).
|
145 |
+
1. Launch ComfyUI by running `python main.py --force-fp16`. Note that --force-fp16 will only work if you installed the latest pytorch nightly.
|
146 |
+
|
147 |
+
> **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux).
|
148 |
+
|
149 |
+
#### DirectML (AMD Cards on Windows)
|
150 |
+
|
151 |
+
```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml```
|
152 |
+
|
153 |
+
### I already have another UI for Stable Diffusion installed do I really have to install all of these dependencies?
|
154 |
+
|
155 |
+
You don't. If you have another UI installed and working with its own python venv you can use that venv to run ComfyUI. You can open up your favorite terminal and activate it:
|
156 |
+
|
157 |
+
```source path_to_other_sd_gui/venv/bin/activate```
|
158 |
+
|
159 |
+
or on Windows:
|
160 |
+
|
161 |
+
With Powershell: ```"path_to_other_sd_gui\venv\Scripts\Activate.ps1"```
|
162 |
+
|
163 |
+
With cmd.exe: ```"path_to_other_sd_gui\venv\Scripts\activate.bat"```
|
164 |
+
|
165 |
+
And then you can use that terminal to run ComfyUI without installing any dependencies. Note that the venv folder might be called something else depending on the SD UI.
|
166 |
+
|
167 |
+
# Running
|
168 |
+
|
169 |
+
```python main.py```
|
170 |
+
|
171 |
+
### For AMD cards not officially supported by ROCm
|
172 |
+
|
173 |
+
Try running it with this command if you have issues:
|
174 |
+
|
175 |
+
For 6700, 6600 and maybe other RDNA2 or older: ```HSA_OVERRIDE_GFX_VERSION=10.3.0 python main.py```
|
176 |
+
|
177 |
+
For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 python main.py```
|
178 |
+
|
179 |
+
# Notes
|
180 |
+
|
181 |
+
Only parts of the graph that have an output with all the correct inputs will be executed.
|
182 |
+
|
183 |
+
Only parts of the graph that change from one execution to the next will be executed. If you submit the same graph twice, only the first will be executed. If you change the last part of the graph, only the part you changed and the parts that depend on it will be executed.
|
184 |
+
|
185 |
+
Dragging a generated png on the webpage or loading one will give you the full workflow including seeds that were used to create it.
|
186 |
+
|
187 |
+
You can use () to change emphasis of a word or phrase like: (good code:1.2) or (bad code:0.8). The default emphasis for () is 1.1. To use () characters in your actual prompt escape them like \\( or \\).
|
188 |
+
|
189 |
+
You can use {day|night} for wildcard/dynamic prompts. With this syntax "{wild|card|test}" will be randomly replaced by either "wild", "card" or "test" by the frontend every time you queue the prompt. To use {} characters in your actual prompt escape them like: \\{ or \\}.
|
190 |
+
|
191 |
+
Dynamic prompts also support C-style comments, like `// comment` or `/* comment */`.
|
192 |
+
|
193 |
+
To use textual inversion concepts/embeddings in a text prompt, put them in the models/embeddings directory and use them in the CLIPTextEncode node like this (you can omit the .pt extension):
|
194 |
+
|
195 |
+
```embedding:embedding_filename.pt```
|
196 |
+
|
197 |
+
|
198 |
+
## How to increase generation speed?
|
199 |
+
|
200 |
+
Make sure you use the regular loaders/Load Checkpoint node to load checkpoints. It will auto pick the right settings depending on your GPU.
|
201 |
+
|
202 |
+
You can set this command line setting to disable the upcasting to fp32 in some cross attention operations which will increase your speed. Note that this will very likely give you black images on SD2.x models. If you use xformers or pytorch attention this option does not do anything.
|
203 |
+
|
204 |
+
```--dont-upcast-attention```
|
205 |
+
|
206 |
+
## How to show high-quality previews?
|
207 |
+
|
208 |
+
Use ```--preview-method auto``` to enable previews.
|
209 |
+
|
210 |
+
The default installation includes a fast latent preview method that's low-resolution. To enable higher-quality previews with [TAESD](https://github.com/madebyollin/taesd), download the [taesd_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesd_decoder.pth) (for SD1.x and SD2.x) and [taesdxl_decoder.pth](https://github.com/madebyollin/taesd/raw/main/taesdxl_decoder.pth) (for SDXL) models and place them in the `models/vae_approx` folder. Once they're installed, restart ComfyUI to enable high-quality previews.
|
211 |
+
|
212 |
+
## Support and dev channel
|
213 |
+
|
214 |
+
[Matrix space: #comfyui_space:matrix.org](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) (it's like discord but open source).
|
215 |
+
|
216 |
+
# QA
|
217 |
+
|
218 |
+
### Why did you make this?
|
219 |
+
|
220 |
+
I wanted to learn how Stable Diffusion worked in detail. I also wanted something clean and powerful that would let me experiment with SD without restrictions.
|
221 |
+
|
222 |
+
### Who is this for?
|
223 |
+
|
224 |
+
This is for anyone who wants to make complex workflows with SD or who wants to learn more about how SD works. The interface closely follows how SD works, and the code should be much simpler to understand than other SD UIs.
|
ComfyUI/app/app_settings.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
from aiohttp import web
|
4 |
+
|
5 |
+
|
6 |
+
class AppSettings():
    """Per-user settings stored as a JSON file named ``comfy.settings.json``.

    The file location is resolved for each request through the user manager's
    ``get_request_user_filepath``.
    """

    def __init__(self, user_manager):
        # Object used to turn a request into the path of that user's settings file.
        self.user_manager = user_manager

    def get_settings(self, request):
        """Return the stored settings for the requesting user, or ``{}`` if no file exists."""
        path = self.user_manager.get_request_user_filepath(
            request, "comfy.settings.json")
        if not os.path.isfile(path):
            return {}
        with open(path) as handle:
            return json.load(handle)

    def save_settings(self, request, settings):
        """Overwrite the requesting user's settings file with ``settings`` (indented JSON)."""
        path = self.user_manager.get_request_user_filepath(
            request, "comfy.settings.json")
        with open(path, "w") as handle:
            handle.write(json.dumps(settings, indent=4))

    def add_routes(self, routes):
        """Register the GET/POST ``/settings`` and ``/settings/{id}`` handlers on ``routes``."""

        @routes.get("/settings")
        async def get_settings(request):
            # Whole settings object as JSON.
            stored = self.get_settings(request)
            return web.json_response(stored)

        @routes.get("/settings/{id}")
        async def get_setting(request):
            # Single setting; responds with JSON null when the id is unknown.
            stored = self.get_settings(request)
            key = request.match_info.get("id", None)
            found = stored[key] if key and key in stored else None
            return web.json_response(found)

        @routes.post("/settings")
        async def post_settings(request):
            # Shallow merge: posted keys win over stored keys.
            stored = self.get_settings(request)
            posted = await request.json()
            self.save_settings(request, {**stored, **posted})
            return web.Response(status=200)

        @routes.post("/settings/{id}")
        async def post_setting(request):
            # Replace one setting with the posted JSON value.
            key = request.match_info.get("id", None)
            if not key:
                return web.Response(status=400)
            stored = self.get_settings(request)
            stored[key] = await request.json()
            self.save_settings(request, stored)
            return web.Response(status=200)
|
ComfyUI/app/user_manager.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import uuid
|
5 |
+
from aiohttp import web
|
6 |
+
from comfy.cli_args import args
|
7 |
+
from folder_paths import user_directory
|
8 |
+
from .app_settings import AppSettings
|
9 |
+
|
10 |
+
# Id of the default user profile.
default_user = "default"
# JSON file, inside the user directory, in which the user table is persisted.
users_file = os.path.join(user_directory, "users.json")
|
12 |
+
|
13 |
+
|
14 |
+
class UserManager():
    """Keeps the table of known users and resolves per-user file paths.

    ``self.users`` maps a user id to a display name. In multi-user mode
    (``args.multi_user``) it is loaded from / persisted to ``users_file``;
    otherwise it only contains the ``"default"`` user.
    """

    def __init__(self):
        self.settings = AppSettings(self)
        if not os.path.exists(user_directory):
            # makedirs also creates missing parents and, with exist_ok, tolerates
            # another process creating the directory between the check and the call.
            os.makedirs(user_directory, exist_ok=True)
            if not args.multi_user:
                print("****** User settings have been changed to be stored on the server instead of browser storage. ******")
                print("****** For multi-user setups add the --multi-user CLI argument to enable multiple user profiles. ******")

        if args.multi_user:
            if os.path.isfile(users_file):
                with open(users_file) as f:
                    self.users = json.load(f)
            else:
                self.users = {}
        else:
            self.users = {"default": "default"}

    def get_request_user_id(self, request):
        """Return the user id for ``request``.

        In multi-user mode the id is taken from the ``comfy-user`` header when
        present; otherwise it is ``"default"``.

        Raises:
            KeyError: the id is not in ``self.users``.
        """
        user = "default"
        if args.multi_user and "comfy-user" in request.headers:
            user = request.headers["comfy-user"]

        if user not in self.users:
            raise KeyError("Unknown user: " + user)

        return user

    def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
        """Resolve ``file`` inside the requesting user's directory.

        Args:
            request: request used to determine the user.
            file: path relative to the user's directory, or ``None`` to get
                the user's directory itself.
            type: only ``"userdata"`` is supported.
            create_dir: create the user's directory if it does not exist.

        Returns:
            The absolute path, or ``None`` if the result would fall outside
            the allowed directory (path traversal attempt).

        Raises:
            KeyError: unknown ``type`` or unknown user.
        """
        if type == "userdata":
            # Normalise first: commonpath() below is compared against this value,
            # so a relative or non-normalised directory would never match.
            root_dir = os.path.abspath(user_directory)
        else:
            raise KeyError("Unknown filepath type:" + type)

        user = self.get_request_user_id(request)
        path = user_root = os.path.abspath(os.path.join(root_dir, user))

        # prevent leaving /{type}
        if os.path.commonpath((root_dir, user_root)) != root_dir:
            return None

        if file is not None:
            # prevent leaving /{type}/{user}
            path = os.path.abspath(os.path.join(user_root, file))
            if os.path.commonpath((user_root, path)) != user_root:
                return None

        if create_dir:
            # Only the user's root directory is created, not sub-directories of ``file``.
            os.makedirs(user_root, exist_ok=True)

        return path

    def add_user(self, name):
        """Register a new user and persist the user table.

        The id is the name with every run of characters outside
        ``[a-zA-Z0-9-_]`` replaced by ``-``, followed by ``_`` and a random UUID.

        Returns:
            The generated user id.

        Raises:
            ValueError: ``name`` is empty after stripping whitespace.
        """
        name = name.strip()
        if not name:
            raise ValueError("username not provided")
        user_id = re.sub(r"[^a-zA-Z0-9-_]+", '-', name)
        user_id = user_id + "_" + str(uuid.uuid4())

        self.users[user_id] = name

        with open(users_file, "w") as f:
            json.dump(self.users, f)

        return user_id

    def add_routes(self, routes):
        """Register the settings routes plus ``/users`` and ``/userdata/{file}`` on ``routes``."""
        self.settings.add_routes(routes)

        @routes.get("/users")
        async def get_users(request):
            if args.multi_user:
                return web.json_response({"storage": "server", "users": self.users})
            else:
                user_dir = self.get_request_user_filepath(request, None, create_dir=False)
                return web.json_response({
                    "storage": "server",
                    "migrated": os.path.exists(user_dir)
                })

        @routes.post("/users")
        async def post_users(request):
            body = await request.json()
            username = body.get("username") if isinstance(body, dict) else None
            # Reject missing / non-string / blank names with 400 instead of
            # letting KeyError or ValueError surface as a server error.
            if not isinstance(username, str) or not username.strip():
                return web.json_response({"error": "Username not provided."}, status=400)

            # add_user() stores the stripped name, so compare the stripped name too.
            username = username.strip()
            if username in self.users.values():
                return web.json_response({"error": "Duplicate username."}, status=400)

            user_id = self.add_user(username)
            return web.json_response(user_id)

        @routes.get("/userdata/{file}")
        async def getuserdata(request):
            file = request.match_info.get("file", None)
            if not file:
                return web.Response(status=400)

            path = self.get_request_user_filepath(request, file)
            if not path:
                return web.Response(status=403)

            if not os.path.exists(path):
                return web.Response(status=404)

            return web.FileResponse(path)

        @routes.post("/userdata/{file}")
        async def post_userdata(request):
            file = request.match_info.get("file", None)
            if not file:
                return web.Response(status=400)

            path = self.get_request_user_filepath(request, file)
            if not path:
                return web.Response(status=403)

            body = await request.read()
            with open(path, "wb") as f:
                f.write(body)

            return web.Response(status=200)
|
ComfyUI/comfy/checkpoint_pickle.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
|
3 |
+
# Module-level alias of the standard pickle loader.
load = pickle.load
|
4 |
+
|
5 |
+
class Empty:
    """Attribute-less placeholder class."""
|
7 |
+
|
8 |
+
class Unpickler(pickle.Unpickler):
    """Unpickler that substitutes ``Empty`` for anything from ``pytorch_lightning*`` modules."""

    def find_class(self, module, name):
        #TODO: safe unpickle
        # NOTE: every other module is still resolved by the stock unpickler.
        from_lightning = module.startswith("pytorch_lightning")
        return Empty if from_lightning else super().find_class(module, name)
|
ComfyUI/comfy/cldm/cldm.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#taken from: https://github.com/lllyasviel/ControlNet
|
2 |
+
#and modified
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch as th
|
6 |
+
import torch.nn as nn
|
7 |
+
|
8 |
+
from ..ldm.modules.diffusionmodules.util import (
|
9 |
+
zero_module,
|
10 |
+
timestep_embedding,
|
11 |
+
)
|
12 |
+
|
13 |
+
from ..ldm.modules.attention import SpatialTransformer
|
14 |
+
from ..ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample
|
15 |
+
from ..ldm.util import exists
|
16 |
+
import comfy.ops
|
17 |
+
|
18 |
+
class ControlledUnetModel(UNetModel):
    """Plain ``UNetModel`` subclass that adds nothing of its own."""
    #implemented in the ldm unet
|
21 |
+
|
22 |
+
class ControlNet(nn.Module):
    """Control network assembled from ``ResBlock``, ``SpatialTransformer`` and ``Downsample``.

    The module consists of a time embedding, an optional label embedding, a
    convolutional hint encoder, a list of input blocks and a middle block.
    Every input block and the middle block is paired with a 1x1 "zero conv"
    (see ``make_zero_conv``); ``forward`` returns the outputs of those convs.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        hint_channels,
        num_res_blocks,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        dtype=torch.float32,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        adm_in_channels=None,
        transformer_depth_middle=None,
        transformer_depth_output=None,
        device=None,
        operations=comfy.ops.disable_weight_init,
        **kwargs,
    ):
        """Build all sub-modules.

        Notable arguments (the rest are forwarded to the building blocks):
            num_res_blocks: an int (same count on every level) or a sequence
                with one entry per ``channel_mult`` level.
            transformer_depth: a sequence with one entry per residual block,
                consumed front to back; an int is expanded to that many
                identical entries. Entries ``<= 0`` add no transformer.
            transformer_depth_middle: depth of the middle transformer; a
                negative value leaves the middle block as a single ResBlock.
                ``None`` falls back to the last ``transformer_depth`` entry.
            num_classes: ``None``, an int, ``"continuous"`` or ``"sequential"``
                (the latter requires ``adm_in_channels``).
            use_spatial_transformer / context_dim: must be true / not ``None``.

        Raises:
            ValueError: ``num_res_blocks`` length mismatch or unsupported
                ``num_classes``.
            AssertionError: inconsistent attention configuration.
        """
        super().__init__()
        assert use_spatial_transformer == True, "use_spatial_transformer has to be true"
        assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels

        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks

        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))

        # One depth entry is popped per residual block below, so work on a private
        # list. An int (including the declared default) is expanded to one entry
        # per residual block; list() also accepts tuples, which have no pop().
        if isinstance(transformer_depth, int):
            transformer_depth = [transformer_depth] * sum(self.num_res_blocks)
        else:
            transformer_depth = list(transformer_depth)

        if transformer_depth_middle is None:
            # The declared default cannot be compared with ">= 0" further down.
            # NOTE(review): falling back to the last per-block depth is assumed to
            # match what the UNet this mirrors does - confirm.
            transformer_depth_middle = transformer_depth[-1]

        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = dtype
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            operations.Linear(model_channels, time_embed_dim, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
        )

        if self.num_classes is not None:
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
                self.label_emb = nn.Sequential(
                    nn.Sequential(
                        operations.Linear(adm_in_channels, time_embed_dim, dtype=self.dtype, device=device),
                        nn.SiLU(),
                        operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
                    )
                )
            else:
                raise ValueError()

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    operations.conv_nd(dims, in_channels, model_channels, 3, padding=1, dtype=self.dtype, device=device)
                )
            ]
        )
        self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels, operations=operations, dtype=self.dtype, device=device)])

        # Hint encoder: eight convolutions (three of them with stride 2) ending
        # at model_channels so the result can be added to the first block's output.
        self.input_hint_block = TimestepEmbedSequential(
            operations.conv_nd(dims, hint_channels, 16, 3, padding=1, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 16, 16, 3, padding=1, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 16, 32, 3, padding=1, stride=2, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 32, 32, 3, padding=1, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 32, 96, 3, padding=1, stride=2, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 96, 96, 3, padding=1, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 96, 256, 3, padding=1, stride=2, dtype=self.dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, 256, model_channels, 3, padding=1, dtype=self.dtype, device=device)
        )

        self._feature_size = model_channels
        ch = model_channels
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                        dtype=self.dtype,
                        device=device,
                        operations=operations,
                    )
                ]
                ch = mult * model_channels
                num_transformers = transformer_depth.pop(0)
                if num_transformers > 0:
                    # num_heads is deliberately re-bound here: the value carries
                    # over to later blocks and to the middle block.
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(
                            SpatialTransformer(
                                ch, num_heads, dim_head, depth=num_transformers, context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint, dtype=self.dtype, device=device, operations=operations
                            )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
                self._feature_size += ch
            if level != len(channel_mult) - 1:
                # Every level except the last ends with a downsampling block.
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                            dtype=self.dtype,
                            device=device,
                            operations=operations
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch, dtype=self.dtype, device=device, operations=operations
                        )
                    )
                )
                ch = out_ch
                self.zero_convs.append(self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device))
                self._feature_size += ch

        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            #num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        mid_block = [
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
                dtype=self.dtype,
                device=device,
                operations=operations
            )]
        if transformer_depth_middle >= 0:
            mid_block += [SpatialTransformer(  # always uses a self-attn
                            ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
                            disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                            use_checkpoint=use_checkpoint, dtype=self.dtype, device=device, operations=operations
                        ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
                dtype=self.dtype,
                device=device,
                operations=operations
            )]
        self.middle_block = TimestepEmbedSequential(*mid_block)
        self.middle_block_out = self.make_zero_conv(ch, operations=operations, dtype=self.dtype, device=device)
        self._feature_size += ch

    def make_zero_conv(self, channels, operations=None, dtype=None, device=None):
        """Return a 1x1, padding-free ``channels -> channels`` conv wrapped in ``TimestepEmbedSequential``."""
        return TimestepEmbedSequential(operations.conv_nd(self.dims, channels, channels, 1, padding=0, dtype=dtype, device=device))

    def forward(self, x, hint, timesteps, context, y=None, **kwargs):
        """Run the control network.

        Args:
            x: input passed through the input blocks.
            hint: conditioning input passed through ``input_hint_block``.
            timesteps: values turned into the time embedding.
            context: passed to every block together with the embedding.
            y: label input; required when ``num_classes`` was set, and its
                first dimension must match that of ``x``.

        Returns:
            A list with one zero-conv output per input block, followed by the
            zero-conv output of the middle block.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
        emb = self.time_embed(t_emb)

        guided_hint = self.input_hint_block(hint, emb, context)

        outs = []

        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

        h = x
        for module, zero_conv in zip(self.input_blocks, self.zero_convs):
            h = module(h, emb, context)
            if guided_hint is not None:
                # The encoded hint is injected exactly once, after the first input block.
                h += guided_hint
                guided_hint = None
            outs.append(zero_conv(h, emb, context))

        h = self.middle_block(h, emb, context)
        outs.append(self.middle_block_out(h, emb, context))

        return outs
|
312 |
+
|
ComfyUI/comfy/cli_args.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import enum
|
3 |
+
import comfy.options
|
4 |
+
|
5 |
+
class EnumAction(argparse.Action):
    """
    Argparse action for handling Enums

    Pass the Enum class as ``type=``. The accepted command-line values are the
    string forms of the members' values, and the parsed result stored on the
    namespace is the matching Enum member.
    """
    def __init__(self, **kwargs):
        # Pop off the type value so argparse does not try to call it as a converter
        enum_type = kwargs.pop("type", None)

        # Ensure an Enum subclass is provided
        if enum_type is None:
            raise ValueError("type must be assigned an Enum when using EnumAction")
        # isinstance guard: issubclass() itself raises an unrelated TypeError
        # when handed something that is not a class.
        if not (isinstance(enum_type, type) and issubclass(enum_type, enum.Enum)):
            raise TypeError("type must be an Enum when using EnumAction")

        # Command-line values are always text, so choices are keyed by the string
        # form of each value; for string-valued Enums this is the value itself.
        members_by_text = {str(member.value): member for member in enum_type}

        # Generate choices from the Enum
        choices = tuple(members_by_text)
        kwargs.setdefault("choices", choices)
        kwargs.setdefault("metavar", f"[{','.join(choices)}]")

        super().__init__(**kwargs)

        self._enum = enum_type
        self._members_by_text = members_by_text

    def __call__(self, parser, namespace, values, option_string=None):
        # Convert value back into an Enum
        try:
            value = self._members_by_text[values]
        except (KeyError, TypeError):
            # Not one of the generated choices (e.g. caller-supplied ``choices``)
            # or unhashable: fall back to a lookup by value.
            value = self._enum(values)
        setattr(namespace, self.dest, value)
|
32 |
+
|
33 |
+
|
34 |
+
# Command-line options. They are registered in the same order as before so the
# --help output is unchanged.
parser = argparse.ArgumentParser()

parser.add_argument(
    "--listen",
    nargs="?",
    const="0.0.0.0",
    default="127.0.0.1",
    type=str,
    metavar="IP",
    help="Specify the IP address to listen on (default: 127.0.0.1). If --listen is provided without an argument, it defaults to 0.0.0.0. (listens on all)",
)
parser.add_argument("--port", default=8188, type=int, help="Set the listen port.")
parser.add_argument(
    "--enable-cors-header",
    nargs="?",
    const="*",
    default=None,
    type=str,
    metavar="ORIGIN",
    help="Enable CORS (Cross-Origin Resource Sharing) with optional origin or allow all with default '*'.",
)
parser.add_argument("--max-upload-size", default=100, type=float, help="Set the maximum upload size in MB.")

# nargs='+' combined with action='append' yields a list of lists.
parser.add_argument(
    "--extra-model-paths-config",
    action='append',
    nargs='+',
    default=None,
    type=str,
    metavar="PATH",
    help="Load one or more extra_model_paths.yaml files.",
)
for _flag, _help in (
    ("--output-directory", "Set the ComfyUI output directory."),
    ("--temp-directory", "Set the ComfyUI temp directory (default is in the ComfyUI directory)."),
    ("--input-directory", "Set the ComfyUI input directory."),
):
    parser.add_argument(_flag, type=str, default=None, help=_help)
for _flag, _help in (
    ("--auto-launch", "Automatically launch ComfyUI in the default browser."),
    ("--disable-auto-launch", "Disable auto launching the browser."),
):
    parser.add_argument(_flag, action="store_true", help=_help)
parser.add_argument(
    "--cuda-device",
    default=None,
    type=int,
    metavar="DEVICE_ID",
    help="Set the id of the cuda device this instance will use.",
)

cm_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--cuda-malloc", "Enable cudaMallocAsync (enabled by default for torch 2.0 and up)."),
    ("--disable-cuda-malloc", "Disable cudaMallocAsync."),
):
    cm_group.add_argument(_flag, action="store_true", help=_help)

parser.add_argument(
    "--dont-upcast-attention",
    action="store_true",
    help="Disable upcasting of attention. Can boost speed but increase the chances of black images.",
)

fp_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--force-fp32", "Force fp32 (If this makes your GPU work better please report it)."),
    ("--force-fp16", "Force fp16."),
):
    fp_group.add_argument(_flag, action="store_true", help=_help)

fpunet_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--bf16-unet", "Run the UNET in bf16. This should only be used for testing stuff."),
    ("--fp16-unet", "Store unet weights in fp16."),
    ("--fp8_e4m3fn-unet", "Store unet weights in fp8_e4m3fn."),
    ("--fp8_e5m2-unet", "Store unet weights in fp8_e5m2."),
):
    fpunet_group.add_argument(_flag, action="store_true", help=_help)

fpvae_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--fp16-vae", "Run the VAE in fp16, might cause black images."),
    ("--fp32-vae", "Run the VAE in full precision fp32."),
    ("--bf16-vae", "Run the VAE in bf16."),
):
    fpvae_group.add_argument(_flag, action="store_true", help=_help)

parser.add_argument("--cpu-vae", action="store_true", help="Run the VAE on the CPU.")

fpte_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--fp8_e4m3fn-text-enc", "Store text encoder weights in fp8 (e4m3fn variant)."),
    ("--fp8_e5m2-text-enc", "Store text encoder weights in fp8 (e5m2 variant)."),
    ("--fp16-text-enc", "Store text encoder weights in fp16."),
    ("--fp32-text-enc", "Store text encoder weights in fp32."),
):
    fpte_group.add_argument(_flag, action="store_true", help=_help)

# A bare --directml yields -1; leaving the flag out yields None.
parser.add_argument(
    "--directml",
    nargs="?",
    const=-1,
    type=int,
    metavar="DIRECTML_DEVICE",
    help="Use torch-directml.",
)

parser.add_argument(
    "--disable-ipex-optimize",
    action="store_true",
    help="Disables ipex.optimize when loading models with Intel GPUs.",
)
|
81 |
+
|
82 |
+
class LatentPreviewMethod(enum.Enum):
    """Values accepted by the --preview-method command-line option.

    Each member's value is the string the user types on the command line.
    """

    NoPreviews = "none"
    Auto = "auto"
    Latent2RGB = "latent2rgb"
    TAESD = "taesd"
|
87 |
+
|
88 |
+
# EnumAction turns the typed string into a LatentPreviewMethod member.
parser.add_argument(
    "--preview-method",
    action=EnumAction,
    type=LatentPreviewMethod,
    default=LatentPreviewMethod.NoPreviews,
    help="Default preview method for sampler nodes.",
)

attn_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--use-split-cross-attention", "Use the split cross attention optimization. Ignored when xformers is used."),
    ("--use-quad-cross-attention", "Use the sub-quadratic cross attention optimization . Ignored when xformers is used."),
    ("--use-pytorch-cross-attention", "Use the new pytorch 2.0 cross attention function."),
):
    attn_group.add_argument(_flag, action="store_true", help=_help)

parser.add_argument("--disable-xformers", action="store_true", help="Disable xformers.")

vram_group = parser.add_mutually_exclusive_group()
for _flag, _help in (
    ("--gpu-only", "Store and run everything (text encoders/CLIP models, etc... on the GPU)."),
    ("--highvram", "By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory."),
    ("--normalvram", "Used to force normal vram use if lowvram gets automatically enabled."),
    ("--lowvram", "Split the unet in parts to use less vram."),
    ("--novram", "When lowvram isn't enough."),
    ("--cpu", "To use the CPU for everything (slow)."),
):
    vram_group.add_argument(_flag, action="store_true", help=_help)

# Remaining independent on/off switches.
for _flag, _help in (
    ("--disable-smart-memory", "Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can."),
    ("--deterministic", "Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases."),
    ("--dont-print-server", "Don't print server output."),
    ("--quick-test-for-ci", "Quick test for CI."),
    ("--windows-standalone-build", "Windows standalone build: Enable convenient things that most people using the standalone windows build will probably enjoy (like auto opening the page on startup)."),
    ("--disable-metadata", "Disable saving prompt metadata in files."),
    ("--multi-user", "Enables per-user storage."),
    ("--verbose", "Enables more debug prints."),
):
    parser.add_argument(_flag, action="store_true", help=_help)
|
118 |
+
|
119 |
+
|
120 |
+
# Read the real command line only when comfy.options.args_parsing is set;
# otherwise parse an empty argument list so every option takes its default.
args = parser.parse_args() if comfy.options.args_parsing else parser.parse_args([])

# The standalone build turns auto-launch on. --disable-auto-launch is checked
# afterwards, so it takes precedence.
if args.windows_standalone_build:
    args.auto_launch = True
if args.disable_auto_launch:
    args.auto_launch = False

import logging

# --verbose lowers the root logger threshold from INFO to DEBUG.
logging_level = logging.DEBUG if args.verbose else logging.INFO
logging.basicConfig(format="%(message)s", level=logging_level)
|
ComfyUI/comfy/clip_config_bigg.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"CLIPTextModel"
|
4 |
+
],
|
5 |
+
"attention_dropout": 0.0,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"dropout": 0.0,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_size": 1280,
|
11 |
+
"initializer_factor": 1.0,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 5120,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 77,
|
16 |
+
"model_type": "clip_text_model",
|
17 |
+
"num_attention_heads": 20,
|
18 |
+
"num_hidden_layers": 32,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"projection_dim": 1280,
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"vocab_size": 49408
|
23 |
+
}
|
ComfyUI/comfy/clip_model.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from comfy.ldm.modules.attention import optimized_attention_for_device
|
3 |
+
|
4 |
+
class CLIPAttention(torch.nn.Module):
    """Self-attention block with separate q/k/v projections and an output projection.

    ``operations`` is any namespace providing a ``Linear`` layer class. The
    attention computation itself is delegated to the callable passed to
    ``forward``.
    """

    def __init__(self, embed_dim, heads, dtype, device, operations):
        super().__init__()
        self.heads = heads

        def make_projection():
            return operations.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)

        # Creation order is kept (q, k, v, out) so submodule registration
        # order is unchanged.
        self.q_proj = make_projection()
        self.k_proj = make_projection()
        self.v_proj = make_projection()

        self.out_proj = make_projection()

    def forward(self, x, mask=None, optimized_attention=None):
        """Project ``x`` to q/k/v, run ``optimized_attention`` and project the result.

        ``optimized_attention`` is called as ``fn(q, k, v, heads, mask)``; it
        must be supplied even though its default is None.
        """
        query, key, value = self.q_proj(x), self.k_proj(x), self.v_proj(x)
        attended = optimized_attention(query, key, value, self.heads, mask)
        return self.out_proj(attended)
|
22 |
+
|
23 |
+
# Activation functions looked up by name; CLIPMLP indexes this table with the
# activation string it is given.
ACTIVATIONS = dict(
    quick_gelu=lambda a: a * torch.sigmoid(1.702 * a),
    gelu=torch.nn.functional.gelu,
)
|
26 |
+
|
27 |
+
class CLIPMLP(torch.nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> Linear.

    ``activation`` is a key of ``ACTIVATIONS``; an unknown name raises KeyError.
    """

    def __init__(self, embed_dim, intermediate_size, activation, dtype, device, operations):
        super().__init__()
        linear_kwargs = dict(bias=True, dtype=dtype, device=device)
        self.fc1 = operations.Linear(embed_dim, intermediate_size, **linear_kwargs)
        self.activation = ACTIVATIONS[activation]
        self.fc2 = operations.Linear(intermediate_size, embed_dim, **linear_kwargs)

    def forward(self, x):
        """Expand to ``intermediate_size``, apply the activation, project back."""
        return self.fc2(self.activation(self.fc1(x)))
|
39 |
+
|
40 |
+
class CLIPLayer(torch.nn.Module):
    """One pre-norm transformer layer: self-attention, then an MLP, each with a residual add."""

    def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations):
        super().__init__()
        norm_kwargs = dict(dtype=dtype, device=device)
        self.layer_norm1 = operations.LayerNorm(embed_dim, **norm_kwargs)
        self.self_attn = CLIPAttention(embed_dim, heads, dtype, device, operations)
        self.layer_norm2 = operations.LayerNorm(embed_dim, **norm_kwargs)
        self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device, operations)

    def forward(self, x, mask=None, optimized_attention=None):
        """Apply the layer to ``x`` and return it.

        NOTE: both residual additions are in-place, so the tensor passed in as
        ``x`` is modified and is also the returned object.
        """
        attn_out = self.self_attn(self.layer_norm1(x), mask, optimized_attention)
        x += attn_out
        mlp_out = self.mlp(self.layer_norm2(x))
        x += mlp_out
        return x
|
52 |
+
|
53 |
+
|
54 |
+
class CLIPEncoder(torch.nn.Module):
    """Stack of ``num_layers`` CLIPLayer blocks applied in sequence."""

    def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations):
        super().__init__()
        blocks = [
            CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device, operations)
            for _ in range(num_layers)
        ]
        self.layers = torch.nn.ModuleList(blocks)

    def forward(self, x, mask=None, intermediate_output=None):
        """Run every layer and optionally capture one layer's output.

        Args:
            x: input hidden states.
            mask: optional attention mask forwarded to each layer.
            intermediate_output: index of the layer whose output should also
                be returned; negative values count from the end.

        Returns:
            ``(x, intermediate)`` where ``intermediate`` is a clone of the
            selected layer's output, or None if no layer index matched.
        """
        optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)

        if intermediate_output is not None and intermediate_output < 0:
            intermediate_output = len(self.layers) + intermediate_output

        intermediate = None
        for index, layer in enumerate(self.layers):
            x = layer(x, mask, optimized_attention)
            if index == intermediate_output:
                # Clone: later layers update x in place.
                intermediate = x.clone()
        return x, intermediate
|
72 |
+
|
73 |
+
class CLIPEmbeddings(torch.nn.Module):
    """Token embedding plus a learned position embedding table."""

    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
        super().__init__()
        factory_kwargs = dict(dtype=dtype, device=device)
        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, **factory_kwargs)
        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, **factory_kwargs)

    def forward(self, input_tokens):
        """Return token embeddings with the whole position table added.

        The full ``position_embedding.weight`` is added, so the token sequence
        length must broadcast against ``num_positions``.
        """
        token_embeds = self.token_embedding(input_tokens)
        return token_embeds + self.position_embedding.weight
|
81 |
+
|
82 |
+
|
83 |
+
class CLIPTextModel_(torch.nn.Module):
    """Text transformer: embeddings, causal encoder stack and final layer norm.

    Sizes are read from ``config_dict`` keys ``num_hidden_layers``,
    ``hidden_size``, ``num_attention_heads``, ``intermediate_size`` and
    ``hidden_act``.
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        depth = config_dict["num_hidden_layers"]
        width = config_dict["hidden_size"]
        head_count = config_dict["num_attention_heads"]
        mlp_width = config_dict["intermediate_size"]
        mlp_activation = config_dict["hidden_act"]

        # The embedding tables are always float32, independent of `dtype`.
        self.embeddings = CLIPEmbeddings(width, dtype=torch.float32, device=device)
        self.encoder = CLIPEncoder(depth, width, head_count, mlp_width, mlp_activation, dtype, device, operations)
        self.final_layer_norm = operations.LayerNorm(width, dtype=dtype, device=device)

    def forward(self, input_tokens, attention_mask=None, intermediate_output=None, final_layer_norm_intermediate=True):
        """Encode ``input_tokens``.

        Returns:
            ``(x, i, pooled_output)``: the normalized final hidden states, the
            requested intermediate hidden states (or None), and one pooled
            vector per batch row.
        """
        x = self.embeddings(input_tokens)
        seq_len = x.shape[1]

        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere.
        causal_mask = torch.empty(seq_len, seq_len, dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)

        if attention_mask is None:
            mask = causal_mask
        else:
            batch = attention_mask.shape[0]
            key_len = attention_mask.shape[-1]
            keep = attention_mask.to(x.dtype).reshape((batch, 1, -1, key_len)).expand(batch, 1, key_len, key_len)
            mask = 1.0 - keep
            # Every non-zero entry of (1 - attention_mask) becomes -inf.
            mask = mask.masked_fill(mask.to(torch.bool), float("-inf"))
            mask += causal_mask

        x, i = self.encoder(x, mask=mask, intermediate_output=intermediate_output)
        x = self.final_layer_norm(x)
        if i is not None and final_layer_norm_intermediate:
            i = self.final_layer_norm(i)

        # Pool at the position holding the largest token id in each row.
        batch_index = torch.arange(x.shape[0], device=x.device)
        token_index = input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1)
        pooled_output = x[batch_index, token_index]
        return x, i, pooled_output
|
116 |
+
|
117 |
+
class CLIPTextModel(torch.nn.Module):
    """CLIPTextModel_ wrapped with a bias-free square text projection.

    The projection weight is initialised to the identity matrix.
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.num_layers = config_dict["num_hidden_layers"]
        self.text_model = CLIPTextModel_(config_dict, dtype, device, operations)
        embed_dim = config_dict["hidden_size"]
        self.text_projection = operations.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
        # An in-place write to a parameter that requires grad raises
        # RuntimeError outside a no-grad context. Guard it so construction
        # works whether or not the caller already disabled autograd.
        with torch.no_grad():
            self.text_projection.weight.copy_(torch.eye(embed_dim))
        self.dtype = dtype

    def get_input_embeddings(self):
        """Return the token embedding module of the wrapped text model."""
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, embeddings):
        """Replace the token embedding module of the wrapped text model."""
        self.text_model.embeddings.token_embedding = embeddings

    def forward(self, *args, **kwargs):
        """Run the text model and project its pooled output.

        Returns:
            ``(hidden_states, intermediate, projected_pooled, pooled)``.
        """
        x = self.text_model(*args, **kwargs)
        out = self.text_projection(x[2])
        return (x[0], x[1], out, x[2])
|
137 |
+
|
138 |
+
|
139 |
+
class CLIPVisionEmbeddings(torch.nn.Module):
    """Patch embedding, class token and position embedding for images.

    ``operations`` must provide a ``Conv2d`` class. The class token is
    allocated with ``torch.empty`` and is therefore uninitialised until
    weights are loaded.
    """

    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, dtype=None, device=None, operations=None):
        super().__init__()
        self.class_embedding = torch.nn.Parameter(torch.empty(embed_dim, dtype=dtype, device=device))

        # Non-overlapping patches: stride equals kernel size.
        self.patch_embedding = operations.Conv2d(
            in_channels=num_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
            dtype=dtype,
            device=device
        )

        patches_per_side = image_size // patch_size
        # One extra position for the class token.
        num_positions = patches_per_side ** 2 + 1
        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)

    def forward(self, pixel_values):
        """Return ``[class token, patch embeddings] + position embeddings``."""
        embeds = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
        batch = pixel_values.shape[0]
        class_tokens = self.class_embedding.to(embeds.device).expand(batch, 1, -1)
        positions = self.position_embedding.weight.to(embeds.device)
        return torch.cat([class_tokens, embeds], dim=1) + positions
|
161 |
+
|
162 |
+
|
163 |
+
class CLIPVision(torch.nn.Module):
    """Vision transformer: embeddings, pre-norm, encoder stack and post-norm.

    Sizes are read from ``config_dict``. The attribute name ``pre_layrnorm``
    is spelled this way on purpose: attribute names form the state-dict keys.
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        depth = config_dict["num_hidden_layers"]
        width = config_dict["hidden_size"]
        head_count = config_dict["num_attention_heads"]
        mlp_width = config_dict["intermediate_size"]
        mlp_activation = config_dict["hidden_act"]

        # The embeddings are always float32, independent of `dtype`.
        self.embeddings = CLIPVisionEmbeddings(
            width,
            config_dict["num_channels"],
            config_dict["patch_size"],
            config_dict["image_size"],
            dtype=torch.float32,
            device=device,
            operations=operations,
        )
        self.pre_layrnorm = operations.LayerNorm(width)
        self.encoder = CLIPEncoder(depth, width, head_count, mlp_width, mlp_activation, dtype, device, operations)
        self.post_layernorm = operations.LayerNorm(width)

    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
        """Encode ``pixel_values``.

        ``attention_mask`` is accepted but currently ignored.

        Returns:
            ``(x, i, pooled_output)``: final hidden states, the requested
            intermediate hidden states (or None), and the post-normalized
            first token of each sequence.
        """
        hidden = self.pre_layrnorm(self.embeddings(pixel_values))
        #TODO: attention_mask?
        x, i = self.encoder(hidden, mask=None, intermediate_output=intermediate_output)
        pooled_output = self.post_layernorm(x[:, 0, :])
        return x, i, pooled_output
|
184 |
+
|
185 |
+
class CLIPVisionModelProjection(torch.nn.Module):
    """CLIPVision followed by a bias-free projection of the pooled output.

    The projection maps ``hidden_size`` to ``projection_dim`` (both read from
    ``config_dict``).
    """

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        self.vision_model = CLIPVision(config_dict, dtype, device, operations)
        in_features = config_dict["hidden_size"]
        out_features = config_dict["projection_dim"]
        self.visual_projection = operations.Linear(in_features, out_features, bias=False)

    def forward(self, *args, **kwargs):
        """Return ``(hidden_states, intermediate, projected_pooled_output)``."""
        hidden, intermediate, pooled = self.vision_model(*args, **kwargs)
        return (hidden, intermediate, self.visual_projection(pooled))
|
ComfyUI/comfy/clip_vision.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
import json
|
5 |
+
import logging
|
6 |
+
|
7 |
+
import comfy.ops
|
8 |
+
import comfy.model_patcher
|
9 |
+
import comfy.model_management
|
10 |
+
import comfy.utils
|
11 |
+
import comfy.clip_model
|
12 |
+
|
13 |
+
class Output:
    """Plain attribute container that also supports ``obj[key]`` access.

    Item reads and writes are forwarded to ``getattr`` / ``setattr``, so a
    missing key raises AttributeError rather than KeyError.
    """

    def __getitem__(self, key):
        """Return the attribute named ``key``."""
        return getattr(self, key)

    def __setitem__(self, key, item):
        """Store ``item`` as the attribute named ``key``."""
        setattr(self, key, item)
|
18 |
+
|
19 |
+
def clip_preprocess(image, size=224):
    """Resize, center-crop and normalize a channels-last image batch.

    The last dimension of ``image`` is moved to position 1 and must hold 3
    channels. If the spatial size is not already ``size`` x ``size``, the
    shorter side is scaled to ``size`` (bicubic, antialiased) and the result
    is center-cropped. Values are then quantized to 1/255 steps within [0, 1]
    and normalized with fixed per-channel mean and std.

    Returns:
        Tensor with channels in dimension 1 and spatial size ``size`` x ``size``.
    """
    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=image.device, dtype=image.dtype)
    std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=image.device, dtype=image.dtype)

    image = image.movedim(-1, 1)
    rows, cols = image.shape[2], image.shape[3]
    if rows != size or cols != size:
        scale = (size / min(rows, cols))
        target = (round(scale * rows), round(scale * cols))
        image = torch.nn.functional.interpolate(image, size=target, mode="bicubic", antialias=True)
        top = (image.shape[2] - size) // 2
        left = (image.shape[3] - size) // 2
        image = image[:, :, top:top + size, left:left + size]

    # Snap to the 8-bit grid before normalizing.
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3, 1, 1])) / std.view([3, 1, 1])
|
31 |
+
|
32 |
+
class ClipVisionModel():
    """Builds a CLIP vision model from a JSON config file and runs image encoding.

    Devices and dtype are taken from ``comfy.model_management``; the model is
    created on the offload device and wrapped in a ``ModelPatcher``.
    """

    def __init__(self, json_config):
        with open(json_config) as f:
            config = json.load(f)

        management = comfy.model_management
        self.load_device = management.text_encoder_device()
        offload_device = management.text_encoder_offload_device()
        self.dtype = management.text_encoder_dtype(self.load_device)

        self.model = comfy.clip_model.CLIPVisionModelProjection(config, self.dtype, offload_device, comfy.ops.manual_cast)
        self.model.eval()

        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)

    def load_sd(self, sd):
        """Load ``sd`` non-strictly and return the result of ``load_state_dict``."""
        return self.model.load_state_dict(sd, strict=False)

    def get_sd(self):
        """Return the model's state dict."""
        return self.model.state_dict()

    def encode_image(self, image):
        """Encode ``image`` and return an ``Output`` with three tensors.

        The result holds ``last_hidden_state``, ``image_embeds`` and
        ``penultimate_hidden_states`` (taken from layer index -2), each moved
        to the intermediate device.
        """
        comfy.model_management.load_model_gpu(self.patcher)
        pixel_values = clip_preprocess(image.to(self.load_device)).float()
        out = self.model(pixel_values=pixel_values, intermediate_output=-2)

        outputs = Output()
        # (output name, index into the model's result tuple)
        for name, index in (
            ("last_hidden_state", 0),
            ("image_embeds", 2),
            ("penultimate_hidden_states", 1),
        ):
            outputs[name] = out[index].to(comfy.model_management.intermediate_device())
        return outputs
|
61 |
+
|
62 |
+
def convert_to_transformers(sd, prefix):
    """Rename vision-tower keys in ``sd`` to the ``vision_model.*`` layout.

    If ``sd`` contains ``<prefix>transformer.resblocks.0.attn.in_proj_weight``
    the known top-level keys are renamed in place, the projection matrix is
    transposed into ``visual_projection.weight`` and the rest is handed to
    ``transformers_convert``. Otherwise only ``prefix`` is stripped from the
    keys.

    Returns:
        The converted state dict.
    """
    if f"{prefix}transformer.resblocks.0.attn.in_proj_weight" not in sd:
        return state_dict_prefix_replace(sd, {prefix: ""})

    # (key suffix after `prefix`, destination key)
    renames = (
        ("class_embedding", "vision_model.embeddings.class_embedding"),
        ("conv1.weight", "vision_model.embeddings.patch_embedding.weight"),
        ("positional_embedding", "vision_model.embeddings.position_embedding.weight"),
        ("ln_post.bias", "vision_model.post_layernorm.bias"),
        ("ln_post.weight", "vision_model.post_layernorm.weight"),
        ("ln_pre.bias", "vision_model.pre_layrnorm.bias"),
        ("ln_pre.weight", "vision_model.pre_layrnorm.weight"),
    )
    for suffix, new_key in renames:
        old_key = f"{prefix}{suffix}"
        if old_key in sd:
            sd[new_key] = sd.pop(old_key)

    proj_key = f"{prefix}proj"
    if proj_key in sd:
        sd['visual_projection.weight'] = sd.pop(proj_key).transpose(0, 1)

    return transformers_convert(sd, prefix, "vision_model.", 48)
|
87 |
+
|
88 |
+
def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    """Build a ClipVisionModel from a state dict, choosing the config by its keys.

    The config file is selected by probing for a characteristic encoder layer
    key. Keys that were loaded into the model are removed from ``sd``; only
    keys reported as unexpected remain.

    Returns:
        The loaded ``ClipVisionModel``, or None if no known layout matches.
    """
    if convert_keys:
        sd = convert_to_transformers(sd, prefix)

    # Checked in order; the first probe key present wins.
    candidates = (
        ("vision_model.encoder.layers.47.layer_norm1.weight", "clip_vision_config_g.json"),
        ("vision_model.encoder.layers.30.layer_norm1.weight", "clip_vision_config_h.json"),
        ("vision_model.encoder.layers.22.layer_norm1.weight", "clip_vision_config_vitl.json"),
    )
    for probe_key, config_name in candidates:
        if probe_key in sd:
            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), config_name)
            break
    else:
        return None

    clip = ClipVisionModel(json_config)
    missing, unexpected = clip.load_sd(sd)
    if len(missing) > 0:
        logging.warning("missing clip vision: {}".format(missing))

    unexpected = set(unexpected)
    # Drop every entry the model consumed so the caller's dict no longer
    # holds a reference to it.
    for key in list(sd.keys()):
        if key not in unexpected:
            sd.pop(key)
    return clip
|
111 |
+
|
112 |
+
def load(ckpt_path):
    """Load a CLIP vision checkpoint from ``ckpt_path``.

    Checkpoints containing ``visual.transformer.resblocks.*`` keys are
    converted with the ``visual.`` prefix; all others are loaded as they are.
    """
    sd = load_torch_file(ckpt_path)
    if "visual.transformer.resblocks.0.attn.in_proj_weight" not in sd:
        return load_clipvision_from_sd(sd)
    return load_clipvision_from_sd(sd, prefix="visual.", convert_keys=True)
|
ComfyUI/comfy/clip_vision_config_g.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"attention_dropout": 0.0,
|
3 |
+
"dropout": 0.0,
|
4 |
+
"hidden_act": "gelu",
|
5 |
+
"hidden_size": 1664,
|
6 |
+
"image_size": 224,
|
7 |
+
"initializer_factor": 1.0,
|
8 |
+
"initializer_range": 0.02,
|
9 |
+
"intermediate_size": 8192,
|
10 |
+
"layer_norm_eps": 1e-05,
|
11 |
+
"model_type": "clip_vision_model",
|
12 |
+
"num_attention_heads": 16,
|
13 |
+
"num_channels": 3,
|
14 |
+
"num_hidden_layers": 48,
|
15 |
+
"patch_size": 14,
|
16 |
+
"projection_dim": 1280,
|
17 |
+
"torch_dtype": "float32"
|
18 |
+
}
|
ComfyUI/comfy/clip_vision_config_h.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"attention_dropout": 0.0,
|
3 |
+
"dropout": 0.0,
|
4 |
+
"hidden_act": "gelu",
|
5 |
+
"hidden_size": 1280,
|
6 |
+
"image_size": 224,
|
7 |
+
"initializer_factor": 1.0,
|
8 |
+
"initializer_range": 0.02,
|
9 |
+
"intermediate_size": 5120,
|
10 |
+
"layer_norm_eps": 1e-05,
|
11 |
+
"model_type": "clip_vision_model",
|
12 |
+
"num_attention_heads": 16,
|
13 |
+
"num_channels": 3,
|
14 |
+
"num_hidden_layers": 32,
|
15 |
+
"patch_size": 14,
|
16 |
+
"projection_dim": 1024,
|
17 |
+
"torch_dtype": "float32"
|
18 |
+
}
|
ComfyUI/comfy/clip_vision_config_vitl.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"attention_dropout": 0.0,
|
3 |
+
"dropout": 0.0,
|
4 |
+
"hidden_act": "quick_gelu",
|
5 |
+
"hidden_size": 1024,
|
6 |
+
"image_size": 224,
|
7 |
+
"initializer_factor": 1.0,
|
8 |
+
"initializer_range": 0.02,
|
9 |
+
"intermediate_size": 4096,
|
10 |
+
"layer_norm_eps": 1e-05,
|
11 |
+
"model_type": "clip_vision_model",
|
12 |
+
"num_attention_heads": 16,
|
13 |
+
"num_channels": 3,
|
14 |
+
"num_hidden_layers": 24,
|
15 |
+
"patch_size": 14,
|
16 |
+
"projection_dim": 768,
|
17 |
+
"torch_dtype": "float32"
|
18 |
+
}
|
ComfyUI/comfy/conds.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import math
|
3 |
+
import comfy.utils
|
4 |
+
|
5 |
+
|
6 |
+
def lcm(a, b): #TODO: eventually replace by math.lcm (added in python3.9)
    """Return the least common multiple of integers ``a`` and ``b``.

    The result is non-negative. If either argument is 0 the result is 0,
    which matches ``math.lcm``.
    """
    # gcd(0, 0) is 0, so the division below would raise ZeroDivisionError.
    if a == 0 or b == 0:
        return 0
    return abs(a*b) // math.gcd(a, b)
|
8 |
+
|
9 |
+
class CONDRegular:
    """Wrapper around one conditioning value stored in ``self.cond``.

    Provides batching, a compatibility check and concatenation. Subclasses
    override these to change how the value is prepared or merged.
    """

    def __init__(self, cond):
        self.cond = cond

    def _copy_with(self, cond):
        """Return a new instance of the same class wrapping ``cond``."""
        return self.__class__(cond)

    def process_cond(self, batch_size, device, **kwargs):
        """Return a copy whose value is repeated to ``batch_size`` and moved to ``device``."""
        batched = comfy.utils.repeat_to_batch_size(self.cond, batch_size)
        return self._copy_with(batched.to(device))

    def can_concat(self, other):
        """Return True when both values have exactly the same shape."""
        return self.cond.shape == other.cond.shape

    def concat(self, others):
        """Concatenate this value with those of ``others`` using ``torch.cat``."""
        return torch.cat([self.cond] + [item.cond for item in others])
|
29 |
+
|
30 |
+
class CONDNoiseShape(CONDRegular):
    """Conditioning that is cropped to ``area`` before batching."""

    def process_cond(self, batch_size, device, area, **kwargs):
        """Crop dimensions 2 and 3 of the value to ``area``, then batch and move it.

        ``area`` is indexed as (extent on dim 2, extent on dim 3,
        offset on dim 2, offset on dim 3).
        """
        extent_2, extent_3 = area[0], area[1]
        offset_2, offset_3 = area[2], area[3]
        data = self.cond[:, :, offset_2:extent_2 + offset_2, offset_3:extent_3 + offset_3]
        batched = comfy.utils.repeat_to_batch_size(data, batch_size)
        return self._copy_with(batched.to(device))
|
34 |
+
|
35 |
+
|
36 |
+
class CONDCrossAttn(CONDRegular):
    """Conditioning whose dimension 1 may differ in length between items.

    Items of different length are made compatible by repeating each along
    dimension 1 up to the least common multiple of the lengths.
    """

    def can_concat(self, other):
        """Return True if ``other`` can be merged with this value.

        Equal shapes always match. Otherwise dimensions 0 and 2 must agree and
        the shorter item must need at most 4 repeats to reach the common length.
        """
        s1 = self.cond.shape
        s2 = other.cond.shape
        if s1 == s2:
            return True

        if s1[0] != s2[0] or s1[2] != s2[2]: #these 2 cases should not happen
            return False

        common_len = lcm(s1[1], s2[1])
        repeats = common_len // min(s1[1], s2[1])
        # arbitrary limit on the padding because it's probably going to impact
        # performance negatively if it's too much
        return repeats <= 4

    def concat(self, others):
        """Repeat every value to the common length and concatenate them."""
        conds = [self.cond] + [item.cond for item in others]

        crossattn_max_len = self.cond.shape[1]
        for c in conds[1:]:
            crossattn_max_len = lcm(crossattn_max_len, c.shape[1])

        out = [
            # padding with repeat doesn't change result
            c.repeat(1, crossattn_max_len // c.shape[1], 1) if c.shape[1] < crossattn_max_len else c
            for c in conds
        ]
        return torch.cat(out)
|
64 |
+
|
65 |
+
class CONDConstant(CONDRegular):
    """Conditioning value passed through unchanged: no batching, no device move."""

    def __init__(self, cond):
        self.cond = cond

    def process_cond(self, batch_size, device, **kwargs):
        """Return a copy wrapping the same value; ``batch_size`` and ``device`` are ignored."""
        return self._copy_with(self.cond)

    def can_concat(self, other):
        """Return True only when both values compare equal."""
        return not (self.cond != other.cond)

    def concat(self, others):
        """Return this value; ``others`` is not inspected."""
        return self.cond
|
ComfyUI/comfy/controlnet.py
ADDED
@@ -0,0 +1,550 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import math
|
3 |
+
import os
|
4 |
+
import logging
|
5 |
+
import comfy.utils
|
6 |
+
import comfy.model_management
|
7 |
+
import comfy.model_detection
|
8 |
+
import comfy.model_patcher
|
9 |
+
import comfy.ops
|
10 |
+
|
11 |
+
import comfy.cldm.cldm
|
12 |
+
import comfy.t2i_adapter.adapter
|
13 |
+
import comfy.ldm.cascade.controlnet
|
14 |
+
|
15 |
+
|
16 |
+
def broadcast_image_to(tensor, target_batch_size, batched_number):
    """Expand ``tensor`` along dim 0 so it can serve a batch of ``target_batch_size``.

    A tensor with a single entry on dim 0 is returned untouched. Otherwise the
    tensor is trimmed or cyclically repeated to ``target_batch_size //
    batched_number`` entries; if that is still not ``target_batch_size`` the
    result is tiled ``batched_number`` times along dim 0.
    """
    if tensor.shape[0] == 1:
        return tensor

    per_batch = target_batch_size // batched_number
    tensor = tensor[:per_batch]

    available = tensor.shape[0]
    if available < per_batch:
        # Not enough entries: repeat whole copies, then top up with a partial one.
        whole_copies, remainder = divmod(per_batch, available)
        pieces = [tensor] * whole_copies
        pieces.append(tensor[:remainder])
        tensor = torch.cat(pieces, dim=0)

    if tensor.shape[0] == target_batch_size:
        return tensor
    return torch.cat([tensor] * batched_number, dim=0)
|
33 |
+
|
34 |
+
class ControlBase:
    """Common state and helpers for control models that can be chained.

    Each instance may point at a ``previous_controlnet``; most operations
    recurse into it so a chain behaves as one unit.
    """

    def __init__(self, device=None):
        self.cond_hint_original = None
        self.cond_hint = None
        self.strength = 1.0
        self.timestep_percent_range = (0.0, 1.0)
        self.global_average_pooling = False
        self.timestep_range = None
        self.compression_ratio = 8
        self.upscale_algorithm = 'nearest-exact'

        # Fall back to the device chosen by model management when none is given.
        self.device = comfy.model_management.get_torch_device() if device is None else device
        self.previous_controlnet = None

    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)):
        """Store the hint, strength and percent range; returns ``self`` for chaining."""
        self.cond_hint_original = cond_hint
        self.strength = strength
        self.timestep_percent_range = timestep_percent_range
        return self

    def pre_run(self, model, percent_to_timestep_function):
        """Convert the percent range to ``timestep_range`` here and in every previous control."""
        self.timestep_range = tuple(
            percent_to_timestep_function(self.timestep_percent_range[position])
            for position in (0, 1)
        )
        previous = self.previous_controlnet
        if previous is not None:
            previous.pre_run(model, percent_to_timestep_function)

    def set_previous_controlnet(self, controlnet):
        """Link ``controlnet`` as the previous control; returns ``self`` for chaining."""
        self.previous_controlnet = controlnet
        return self

    def cleanup(self):
        """Drop the cached hint and timestep range, after cleaning up the previous control."""
        previous = self.previous_controlnet
        if previous is not None:
            previous.cleanup()
        if self.cond_hint is not None:
            self.cond_hint = None
        self.timestep_range = None

    def get_models(self):
        """Return a new list with the models reported by the previous control, if any."""
        previous = self.previous_controlnet
        if previous is None:
            return []
        return list(previous.get_models())

    def copy_to(self, c):
        """Copy the hint and tuning attributes onto ``c`` (``previous_controlnet`` is not copied)."""
        for attribute in (
            'cond_hint_original',
            'strength',
            'timestep_percent_range',
            'global_average_pooling',
            'compression_ratio',
            'upscale_algorithm',
        ):
            setattr(c, attribute, getattr(self, attribute))

    def inference_memory_requirements(self, dtype):
        """Return the previous control's requirement, or 0 when there is none."""
        previous = self.previous_controlnet
        if previous is None:
            return 0
        return previous.inference_memory_requirements(dtype)

    def control_merge(self, control_input, control_output, control_prev, output_dtype):
        """Scale this control's tensors and merge them with ``control_prev``.

        Returns a dict with ``'input'``, ``'middle'`` and ``'output'`` lists.
        ``control_input`` entries land in ``'input'`` in reverse order. The
        last ``control_output`` entry lands in ``'middle'`` and the rest in
        ``'output'``. Non-None tensors are multiplied by ``self.strength``
        in place and cast to ``output_dtype``; ``None`` entries are kept so
        positions stay aligned. Entries of ``control_prev`` are then added
        position by position.
        """
        merged = {'input': [], 'middle': [], 'output': []}

        if control_input is not None:
            for tensor in control_input:
                if tensor is not None:
                    tensor *= self.strength
                    if tensor.dtype != output_dtype:
                        tensor = tensor.to(output_dtype)
                merged['input'].insert(0, tensor)

        if control_output is not None:
            last_position = len(control_output) - 1
            for position, tensor in enumerate(control_output):
                bucket = 'middle' if position == last_position else 'output'
                if tensor is not None:
                    if self.global_average_pooling:
                        # Replace every value by the mean over dims 2 and 3, keeping the shape.
                        tensor = torch.mean(tensor, dim=(2, 3), keepdim=True).repeat(1, 1, tensor.shape[2], tensor.shape[3])

                    tensor *= self.strength
                    if tensor.dtype != output_dtype:
                        tensor = tensor.to(output_dtype)

                merged[bucket].append(tensor)

        if control_prev is not None:
            for bucket in ('input', 'middle', 'output'):
                current = merged[bucket]
                for position, prev_val in enumerate(control_prev[bucket]):
                    if position >= len(current):
                        current.append(prev_val)
                    elif prev_val is None:
                        continue
                    elif current[position] is None:
                        current[position] = prev_val
                    elif current[position].shape[0] < prev_val.shape[0]:
                        # Out-of-place add so the result takes the larger dim-0 size.
                        current[position] = prev_val + current[position]
                    else:
                        current[position] += prev_val
        return merged
|
139 |
+
|
140 |
+
class ControlNet(ControlBase):
    """Control backed by a control model that is run on every sampling step."""

    def __init__(self, control_model, global_average_pooling=False, device=None, load_device=None, manual_cast_dtype=None):
        super().__init__(device)
        self.control_model = control_model
        self.load_device = load_device
        self.control_model_wrapped = comfy.model_patcher.ModelPatcher(
            self.control_model,
            load_device=load_device,
            offload_device=comfy.model_management.unet_offload_device(),
        )
        self.global_average_pooling = global_average_pooling
        self.model_sampling_current = None
        self.manual_cast_dtype = manual_cast_dtype

    def get_control(self, x_noisy, t, cond, batched_number):
        """Run the control model for this step and merge with the previous control.

        When ``t[0]`` lies outside ``self.timestep_range`` the previous
        control's result (possibly ``None``) is returned unchanged. The hint
        is re-upscaled whenever its dims 2/3 do not equal the matching dims of
        ``x_noisy`` times ``compression_ratio``, and broadcast when its batch
        size differs from that of ``x_noisy``.
        """
        control_prev = None
        if self.previous_controlnet is not None:
            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)

        if self.timestep_range is not None:
            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
                return control_prev

        dtype = self.control_model.dtype
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

        output_dtype = x_noisy.dtype
        ratio = self.compression_ratio
        hint_stale = (
            self.cond_hint is None
            or x_noisy.shape[2] * ratio != self.cond_hint.shape[2]
            or x_noisy.shape[3] * ratio != self.cond_hint.shape[3]
        )
        if hint_stale:
            # Release the old hint before building the replacement.
            self.cond_hint = None
            upscaled = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[3] * ratio, x_noisy.shape[2] * ratio, self.upscale_algorithm, "center")
            self.cond_hint = upscaled.to(dtype).to(self.device)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)

        context = cond.get('crossattn_controlnet', cond['c_crossattn'])
        y = cond.get('y', None)
        if y is not None:
            y = y.to(dtype)
        sampling = self.model_sampling_current
        timestep = sampling.timestep(t)
        x_noisy = sampling.calculate_input(t, x_noisy)

        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.float(), context=context.to(dtype), y=y)
        return self.control_merge(None, control, control_prev, output_dtype)

    def copy(self):
        """Return a new ControlNet sharing the control model, with hint settings copied over."""
        duplicate = ControlNet(
            self.control_model,
            global_average_pooling=self.global_average_pooling,
            load_device=self.load_device,
            manual_cast_dtype=self.manual_cast_dtype,
        )
        self.copy_to(duplicate)
        return duplicate

    def get_models(self):
        """Return the previous controls' models followed by this control's wrapped model."""
        models = super().get_models()
        models.append(self.control_model_wrapped)
        return models

    def pre_run(self, model, percent_to_timestep_function):
        """Run the base preparation and remember ``model.model_sampling`` for this run."""
        super().pre_run(model, percent_to_timestep_function)
        self.model_sampling_current = model.model_sampling

    def cleanup(self):
        """Forget the sampling object, then run the base cleanup."""
        self.model_sampling_current = None
        super().cleanup()
|
202 |
+
|
203 |
+
class ControlLoraOps:
    """Layer classes whose weights start as ``None`` and accept an optional up/down pair.

    When ``up`` is set, ``forward`` adds ``up @ down`` (both flattened from
    dim 1, reshaped to the weight's shape, cast to the input dtype) to the
    cast weight before applying the layer.
    """

    class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
        def __init__(self, in_features: int, out_features: int, bias: bool = True,
                     device=None, dtype=None) -> None:
            # bias, device and dtype are accepted for signature compatibility
            # but are not used: all tensors are assigned after construction.
            super().__init__()
            self.in_features = in_features
            self.out_features = out_features
            self.weight = None
            self.up = None
            self.down = None
            self.bias = None

        def forward(self, input):
            weight, bias = comfy.ops.cast_bias_weight(self, input)
            if self.up is None:
                return torch.nn.functional.linear(input, weight, bias)
            delta = torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))
            delta = delta.reshape(self.weight.shape).type(input.dtype)
            return torch.nn.functional.linear(input, weight + delta, bias)

    class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp):
        def __init__(
            self,
            in_channels,
            out_channels,
            kernel_size,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=True,
            padding_mode='zeros',
            device=None,
            dtype=None
        ):
            # bias, device and dtype are accepted but not used here.
            super().__init__()
            self.in_channels = in_channels
            self.out_channels = out_channels
            self.kernel_size = kernel_size
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.transposed = False
            self.output_padding = 0
            self.groups = groups
            self.padding_mode = padding_mode

            self.weight = None
            self.bias = None
            self.up = None
            self.down = None

        def forward(self, input):
            weight, bias = comfy.ops.cast_bias_weight(self, input)
            if self.up is None:
                return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
            delta = torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))
            delta = delta.reshape(self.weight.shape).type(input.dtype)
            return torch.nn.functional.conv2d(input, weight + delta, bias, self.stride, self.padding, self.dilation, self.groups)
|
262 |
+
|
263 |
+
|
264 |
+
class ControlLora(ControlNet):
    """ControlNet variant whose control model is built in ``pre_run``.

    The control model is created from the running model's UNet config, seeded
    with the diffusion model's weights where the names match, and then
    overlaid with ``control_weights``.
    """

    def __init__(self, control_weights, global_average_pooling=False, device=None):
        # Only ControlBase.__init__ runs here; ControlNet.__init__ needs a
        # control model, which does not exist until pre_run().
        ControlBase.__init__(self, device)
        self.control_weights = control_weights
        self.global_average_pooling = global_average_pooling
        # Set by pre_run(); defined up front so cleanup() is safe to call on
        # an instance that never ran.
        self.control_model = None
        self.manual_cast_dtype = None
        self.model_sampling_current = None

    def pre_run(self, model, percent_to_timestep_function):
        """Build ``self.control_model`` for ``model`` and load its weights."""
        super().pre_run(model, percent_to_timestep_function)
        controlnet_config = model.model_config.unet_config.copy()
        controlnet_config.pop("out_channels")
        controlnet_config["hint_channels"] = self.control_weights["input_hint_block.0.weight"].shape[1]
        self.manual_cast_dtype = model.manual_cast_dtype
        dtype = model.get_dtype()
        if self.manual_cast_dtype is None:
            class control_lora_ops(ControlLoraOps, comfy.ops.disable_weight_init):
                pass
        else:
            class control_lora_ops(ControlLoraOps, comfy.ops.manual_cast):
                pass
            dtype = self.manual_cast_dtype

        controlnet_config["operations"] = control_lora_ops
        controlnet_config["dtype"] = dtype
        torch_device = comfy.model_management.get_torch_device()
        self.control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
        self.control_model.to(torch_device)

        # Best effort: copy every diffusion-model weight the control model has
        # a matching attribute for, and skip the rest.
        for name, weight in model.diffusion_model.state_dict().items():
            try:
                comfy.utils.set_attr_param(self.control_model, name, weight)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt/SystemExit
                # still propagate.
                pass

        for name in self.control_weights:
            if name != "lora_controlnet":
                comfy.utils.set_attr_param(self.control_model, name, self.control_weights[name].to(dtype).to(torch_device))

    def copy(self):
        """Return a new ControlLora sharing the control weights, with hint settings copied over."""
        c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
        self.copy_to(c)
        return c

    def cleanup(self):
        """Drop the control model built by ``pre_run``, then run the inherited cleanup."""
        self.control_model = None
        super().cleanup()

    def get_models(self):
        """Return only the previous controls' models (this class wraps no model of its own)."""
        return ControlBase.get_models(self)

    def inference_memory_requirements(self, dtype):
        """Return the size of the control weights in ``dtype`` plus the previous controls' needs."""
        own = comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype)
        return own + ControlBase.inference_memory_requirements(self, dtype)
|
320 |
+
|
321 |
+
def load_controlnet(ckpt_path, model=None):
    """Load a control model from the checkpoint at ``ckpt_path``.

    Returns a ``ControlLora`` when the file contains a ``lora_controlnet``
    key, a ``ControlNet`` when it contains ControlNet weights (diffusers
    naming is converted first), and otherwise whatever ``load_t2i_adapter``
    returns, which may be ``None``. ``model`` is only used for checkpoints
    that contain a ``difference`` key, where the matching diffusion-model
    weights are added to the stored tensors in place.
    """
    controlnet_data = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
    if "lora_controlnet" in controlnet_data:
        return ControlLora(controlnet_data)

    controlnet_config = None
    supported_inference_dtypes = None

    if "controlnet_cond_embedding.conv_in.weight" in controlnet_data: #diffusers format
        controlnet_config = comfy.model_detection.unet_config_from_diffusers_unet(controlnet_data)
        diffusers_keys = comfy.utils.unet_to_diffusers(controlnet_config)
        diffusers_keys["controlnet_mid_block.weight"] = "middle_block_out.0.weight"
        diffusers_keys["controlnet_mid_block.bias"] = "middle_block_out.0.bias"

        # Map controlnet_down_blocks.N.* until the first index that is absent.
        block_index = 0
        scanning = True
        while scanning:
            for suffix in (".weight", ".bias"):
                source_key = "controlnet_down_blocks.{}{}".format(block_index, suffix)
                if source_key not in controlnet_data:
                    scanning = False
                    break
                diffusers_keys[source_key] = "zero_convs.{}.0{}".format(block_index, suffix)
            block_index += 1

        # Map the hint embedding: conv_in, then blocks.N, and conv_out for the
        # first slot whose block key is absent.
        hint_index = 0
        scanning = True
        while scanning:
            for suffix in (".weight", ".bias"):
                if hint_index == 0:
                    source_key = "controlnet_cond_embedding.conv_in{}".format(suffix)
                else:
                    source_key = "controlnet_cond_embedding.blocks.{}{}".format(hint_index - 1, suffix)
                if source_key not in controlnet_data:
                    source_key = "controlnet_cond_embedding.conv_out{}".format(suffix)
                    scanning = False
                diffusers_keys[source_key] = "input_hint_block.{}{}".format(hint_index * 2, suffix)
            hint_index += 1

        converted = {}
        for source_key, target_key in diffusers_keys.items():
            if source_key in controlnet_data:
                converted[target_key] = controlnet_data.pop(source_key)

        leftover_keys = controlnet_data.keys()
        if len(leftover_keys) > 0:
            logging.warning("leftover keys: {}".format(leftover_keys))
        controlnet_data = converted

    pth = 'control_model.zero_convs.0.0.weight' in controlnet_data
    if pth:
        prefix = "control_model."
    elif 'zero_convs.0.0.weight' in controlnet_data:
        prefix = ""
    else:
        # No ControlNet keys at all: try the T2I adapter loader instead.
        net = load_t2i_adapter(controlnet_data)
        if net is None:
            logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
        return net

    if controlnet_config is None:
        model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
        supported_inference_dtypes = model_config.supported_inference_dtypes
        controlnet_config = model_config.unet_config

    load_device = comfy.model_management.get_torch_device()
    dtype_kwargs = {}
    if supported_inference_dtypes is not None:
        dtype_kwargs["supported_dtypes"] = supported_inference_dtypes
    unet_dtype = comfy.model_management.unet_dtype(**dtype_kwargs)

    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
    if manual_cast_dtype is not None:
        controlnet_config["operations"] = comfy.ops.manual_cast
    controlnet_config["dtype"] = unet_dtype
    controlnet_config.pop("out_channels")
    controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
    control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)

    if pth:
        if 'difference' in controlnet_data:
            if model is None:
                logging.warning("WARNING: Loaded a diff controlnet without a model. It will very likely not work.")
            else:
                comfy.model_management.load_models_gpu([model])
                model_sd = model.model_state_dict()
                control_prefix = "control_model."
                for name in controlnet_data:
                    if not name.startswith(control_prefix):
                        continue
                    sd_key = "diffusion_model.{}".format(name[len(control_prefix):])
                    if sd_key in model_sd:
                        stored = controlnet_data[name]
                        stored += model_sd[sd_key].type(stored.dtype).to(stored.device)

        # Wrapper module so the "control_model."-prefixed keys resolve.
        class WeightsLoader(torch.nn.Module):
            pass
        loader = WeightsLoader()
        loader.control_model = control_model
        missing, unexpected = loader.load_state_dict(controlnet_data, strict=False)
    else:
        missing, unexpected = control_model.load_state_dict(controlnet_data, strict=False)

    if len(missing) > 0:
        logging.warning("missing controlnet keys: {}".format(missing))

    if len(unexpected) > 0:
        logging.debug("unexpected controlnet keys: {}".format(unexpected))

    filename = os.path.splitext(ckpt_path)[0]
    #TODO: smarter way of enabling global_average_pooling
    global_average_pooling = filename.endswith(("_shuffle", "_shuffle_fp16"))

    return ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
|
444 |
+
|
445 |
+
class T2IAdapter(ControlBase):
    """Control whose model is run once per hint and whose output is reused across steps."""

    def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
        super().__init__(device)
        self.t2i_model = t2i_model
        self.channels_in = channels_in
        self.control_input = None
        self.compression_ratio = compression_ratio
        self.upscale_algorithm = upscale_algorithm

    def scale_image_to(self, width, height):
        """Round ``width`` and ``height`` up to multiples of the model's ``unshuffle_amount``."""
        step = self.t2i_model.unshuffle_amount
        return math.ceil(width / step) * step, math.ceil(height / step) * step

    def get_control(self, x_noisy, t, cond, batched_number):
        """Return this adapter's control merged with the previous control's.

        When ``t[0]`` lies outside ``self.timestep_range`` the previous
        control's result (possibly ``None``) is returned unchanged. The model
        output is cached in ``self.control_input`` and recomputed only after
        the hint has been rebuilt; clones of the cached tensors are handed to
        ``control_merge``.
        """
        control_prev = None
        if self.previous_controlnet is not None:
            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number)

        if self.timestep_range is not None:
            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
                return control_prev

        ratio = self.compression_ratio
        hint_stale = (
            self.cond_hint is None
            or x_noisy.shape[2] * ratio != self.cond_hint.shape[2]
            or x_noisy.shape[3] * ratio != self.cond_hint.shape[3]
        )
        if hint_stale:
            # Drop the old hint and the model output derived from it.
            self.control_input = None
            self.cond_hint = None
            width, height = self.scale_image_to(x_noisy.shape[3] * ratio, x_noisy.shape[2] * ratio)
            upscaled = comfy.utils.common_upscale(self.cond_hint_original, width, height, self.upscale_algorithm, "center")
            self.cond_hint = upscaled.float().to(self.device)
            if self.channels_in == 1 and self.cond_hint.shape[1] > 1:
                # Single-channel model: average dim 1 down to one channel.
                self.cond_hint = torch.mean(self.cond_hint, 1, keepdim=True)
        if x_noisy.shape[0] != self.cond_hint.shape[0]:
            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
        if self.control_input is None:
            self.t2i_model.to(x_noisy.dtype)
            self.t2i_model.to(self.device)
            self.control_input = self.t2i_model(self.cond_hint.to(x_noisy.dtype))
            self.t2i_model.cpu()

        # Clone because control_merge scales its inputs in place.
        control_input = [None if entry is None else entry.clone() for entry in self.control_input]
        mid = None
        if self.t2i_model.xl == True:
            mid = control_input[-1:]
            control_input = control_input[:-1]
        return self.control_merge(control_input, mid, control_prev, x_noisy.dtype)

    def copy(self):
        """Return a new T2IAdapter sharing the model, with hint settings copied over."""
        duplicate = T2IAdapter(self.t2i_model, self.channels_in, self.compression_ratio, self.upscale_algorithm)
        self.copy_to(duplicate)
        return duplicate
|
500 |
+
|
501 |
+
def load_t2i_adapter(t2i_data):
    """Build a ``T2IAdapter`` from the state dict ``t2i_data``.

    The model class is chosen from the keys present; diffusers-style
    ``adapter.*`` prefixes are renamed first. Returns ``None`` when none of
    the recognised keys is found.
    """
    compression_ratio = 8
    upscale_algorithm = 'nearest-exact'

    if 'adapter' in t2i_data:
        t2i_data = t2i_data['adapter']
    if 'adapter.body.0.resnets.0.block1.weight' in t2i_data: #diffusers format
        # Insertion order is kept as in the original: per-resnet prefixes,
        # then the per-body prefix, and the bare "adapter." prefix last.
        prefix_replace = {}
        for body in range(4):
            for resnet in range(2):
                prefix_replace["adapter.body.{}.resnets.{}.".format(body, resnet)] = "body.{}.".format(body * 2 + resnet)
            prefix_replace["adapter.body.{}.".format(body)] = "body.{}.".format(body * 2)
        prefix_replace["adapter."] = ""
        t2i_data = comfy.utils.state_dict_prefix_replace(t2i_data, prefix_replace)
    keys = t2i_data.keys()

    if "body.0.in_conv.weight" in keys:
        cin = t2i_data['body.0.in_conv.weight'].shape[1]
        model_ad = comfy.t2i_adapter.adapter.Adapter_light(cin=cin, channels=[320, 640, 1280, 1280], nums_rb=4)
    elif 'conv_in.weight' in keys:
        conv_in_shape = t2i_data['conv_in.weight'].shape
        cin = conv_in_shape[1]
        channel = conv_in_shape[0]
        ksize = t2i_data['body.0.block2.weight'].shape[2]
        use_conv = any(name.endswith("down_opt.op.weight") for name in keys)
        xl = cin in (256, 768)
        model_ad = comfy.t2i_adapter.adapter.Adapter(cin=cin, channels=[channel, channel*2, channel*4, channel*4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, xl=xl)
    elif "backbone.0.0.weight" in keys:
        model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.0.weight'].shape[1], proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
        compression_ratio = 32
        upscale_algorithm = 'bilinear'
    elif "backbone.10.blocks.0.weight" in keys:
        model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.weight'].shape[1], bottleneck_mode="large", proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
        compression_ratio = 1
        upscale_algorithm = 'nearest-exact'
    else:
        return None

    missing, unexpected = model_ad.load_state_dict(t2i_data)
    if len(missing) > 0:
        logging.warning("t2i missing {}".format(missing))

    if len(unexpected) > 0:
        logging.debug("t2i unexpected {}".format(unexpected))

    return T2IAdapter(model_ad, model_ad.input_channels, compression_ratio, upscale_algorithm)
|
ComfyUI/comfy/diffusers_convert.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import torch
|
3 |
+
import logging
|
4 |
+
|
5 |
+
# conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
|
6 |
+
|
7 |
+
# =================#
|
8 |
+
# UNet Conversion #
|
9 |
+
# =================#
|
10 |
+
|
11 |
+
# Whole-key renames. Every entry is (stable-diffusion name, HF Diffusers name).
unet_conversion_map = [
    ("time_embed.0.weight", "time_embedding.linear_1.weight"),
    ("time_embed.0.bias", "time_embedding.linear_1.bias"),
    ("time_embed.2.weight", "time_embedding.linear_2.weight"),
    ("time_embed.2.bias", "time_embedding.linear_2.bias"),
    ("input_blocks.0.0.weight", "conv_in.weight"),
    ("input_blocks.0.0.bias", "conv_in.bias"),
    ("out.0.weight", "conv_norm_out.weight"),
    ("out.0.bias", "conv_norm_out.bias"),
    ("out.2.weight", "conv_out.weight"),
    ("out.2.bias", "conv_out.bias"),
]

# Substring renames applied inside resnet keys.
# Every entry is (stable-diffusion part, HF Diffusers part).
unet_conversion_map_resnet = [
    ("in_layers.0", "norm1"),
    ("in_layers.2", "conv1"),
    ("out_layers.0", "norm2"),
    ("out_layers.3", "conv2"),
    ("emb_layers.1", "time_emb_proj"),
    ("skip_connection", "conv_shortcut"),
]

# Prefix renames for whole layers, as (stable-diffusion prefix, HF Diffusers prefix).
# The block counts are hardcoded (4 down/up blocks, 2 or 3 resnets each), so
# other network layouts would need smarter logic. The append order below is
# significant to convert_unet_state_dict and must not be rearranged.
unet_conversion_map_layer = []
for i in range(4):
    # down blocks: two resnets each, attentions in all but down_blocks.3
    for j in range(2):
        hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
        sd_down_res_prefix = f"input_blocks.{3 * i + j + 1}.0."
        unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))

        if i < 3:
            hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
            sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
            unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))

    # up blocks: three resnets each, attentions in all but up_blocks.0
    for j in range(3):
        hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
        sd_up_res_prefix = f"output_blocks.{3 * i + j}.0."
        unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))

        if i > 0:
            hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
            sd_up_atn_prefix = f"output_blocks.{3 * i + j}.1."
            unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))

    if i < 3:
        # neither a downsampler in down_blocks.3 nor an upsampler in up_blocks.3
        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
        sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
        unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))

        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
        sd_upsample_prefix = f"output_blocks.{3 * i + 2}.{1 if i == 0 else 2}."
        unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))

# middle block: one attention between two resnets
hf_mid_atn_prefix = "mid_block.attentions.0."
sd_mid_atn_prefix = "middle_block.1."
unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))

for j in range(2):
    hf_mid_res_prefix = f"mid_block.resnets.{j}."
    sd_mid_res_prefix = f"middle_block.{2 * j}."
    unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
|
84 |
+
|
85 |
+
|
86 |
+
def convert_unet_state_dict(unet_state_dict):
    """Rename the keys of an HF Diffusers UNet state dict to stable-diffusion names.

    The renaming is driven by the module-level tables ``unet_conversion_map``,
    ``unet_conversion_map_resnet`` and ``unet_conversion_map_layer``. The three
    passes are order-sensitive and must run exactly in the sequence below.

    Args:
        unet_state_dict: mapping from HF Diffusers parameter names to values.

    Returns:
        A new dict holding the same values under the converted names.

    Raises:
        KeyError: if an HF name listed in ``unet_conversion_map`` is absent
            from ``unet_state_dict`` (the final lookup uses every mapped key).
    """
    # Pass 1: identity mapping, overridden by the fixed whole-key renames.
    renamed = {hf_key: hf_key for hf_key in unet_state_dict}
    renamed.update((hf_name, sd_name) for sd_name, hf_name in unet_conversion_map)

    # Pass 2: resnet sub-part substitutions, only for keys that name a resnet.
    for hf_key in renamed:
        if "resnets" not in hf_key:
            continue
        target = renamed[hf_key]
        for sd_part, hf_part in unet_conversion_map_resnet:
            target = target.replace(hf_part, sd_part)
        renamed[hf_key] = target

    # Pass 3: layer-prefix substitutions, applied to every key.
    for hf_key in renamed:
        target = renamed[hf_key]
        for sd_part, hf_part in unet_conversion_map_layer:
            target = target.replace(hf_part, sd_part)
        renamed[hf_key] = target

    return {sd_key: unet_state_dict[hf_key] for hf_key, sd_key in renamed.items()}
|
104 |
+
|
105 |
+
|
106 |
+
# ================#
|
107 |
+
# VAE Conversion #
|
108 |
+
# ================#
|
109 |
+
|
110 |
+
# Every entry is (stable-diffusion fragment, HF Diffusers fragment).
# The substitutions are applied in list order, so the order of entries matters.
vae_conversion_map = [
    ("nin_shortcut", "conv_shortcut"),
    ("norm_out", "conv_norm_out"),
    ("mid.attn_1.", "mid_block.attentions.0."),
]

for i in range(4):
    # each encoder down block carries two resnets
    for j in (0, 1):
        sd_down_prefix = "encoder.down.{}.block.{}.".format(i, j)
        hf_down_prefix = "encoder.down_blocks.{}.resnets.{}.".format(i, j)
        vae_conversion_map += [(sd_down_prefix, hf_down_prefix)]

    if i < 3:
        # samplers are only registered for the first three block indices
        sd_downsample_prefix = "down.{}.downsample.".format(i)
        hf_downsample_prefix = "down_blocks.{}.downsamplers.0.".format(i)
        vae_conversion_map += [(sd_downsample_prefix, hf_downsample_prefix)]

        sd_upsample_prefix = "up.{}.upsample.".format(3 - i)
        hf_upsample_prefix = "up_blocks.{}.upsamplers.0.".format(i)
        vae_conversion_map += [(sd_upsample_prefix, hf_upsample_prefix)]

    # each decoder up block carries three resnets;
    # HF numbers the up blocks in reverse relative to SD (hence 3 - i)
    for j in (0, 1, 2):
        sd_up_prefix = "decoder.up.{}.block.{}.".format(3 - i, j)
        hf_up_prefix = "decoder.up_blocks.{}.resnets.{}.".format(i, j)
        vae_conversion_map += [(sd_up_prefix, hf_up_prefix)]

# mid-block resnets; these fragments carry no encoder/decoder prefix, so they cover both
for i in (0, 1):
    sd_mid_res_prefix = "mid.block_{}.".format(i + 1)
    hf_mid_res_prefix = "mid_block.resnets.{}.".format(i)
    vae_conversion_map += [(sd_mid_res_prefix, hf_mid_res_prefix)]

# Attention sub-module renames, again as (stable-diffusion, HF Diffusers).
# Several HF spellings map onto the same SD name.
vae_conversion_map_attn = [
    ("norm.", "group_norm."),
    ("q.", "query."),
    ("k.", "key."),
    ("v.", "value."),
    ("q.", "to_q."),
    ("k.", "to_k."),
    ("v.", "to_v."),
    ("proj_out.", "to_out.0."),
    ("proj_out.", "proj_attn."),
]
|
158 |
+
|
159 |
+
|
160 |
+
def reshape_weight_for_sd(w):
    """Append two singleton axes to ``w`` (HF linear weight -> SD conv2d weight).

    Args:
        w: array-like exposing ``shape`` and ``reshape``.

    Returns:
        ``w`` reshaped to ``(*w.shape, 1, 1)``.
    """
    target_shape = tuple(w.shape) + (1, 1)
    return w.reshape(target_shape)
|
163 |
+
|
164 |
+
|
165 |
+
def convert_vae_state_dict(vae_state_dict):
    """Rename the keys of an HF Diffusers VAE state dict to stable-diffusion names.

    Every key goes through the ``vae_conversion_map`` substitutions; keys whose
    original name contains ``"attentions"`` additionally go through
    ``vae_conversion_map_attn``. After renaming, the mid-block attention
    q/k/v/proj_out weights are reshaped with ``reshape_weight_for_sd``.

    Args:
        vae_state_dict: mapping from HF Diffusers parameter names to tensors.

    Returns:
        A new dict keyed by the converted names.
    """

    def _substitute(name, table):
        # Apply every (sd_part, hf_part) replacement of `table`, in table order.
        for sd_part, hf_part in table:
            name = name.replace(hf_part, sd_part)
        return name

    new_state_dict = {}
    for hf_key, tensor in vae_state_dict.items():
        sd_key = _substitute(hf_key, vae_conversion_map)
        # The attention table is gated on the *original* HF key.
        if "attentions" in hf_key:
            sd_key = _substitute(sd_key, vae_conversion_map_attn)
        new_state_dict[sd_key] = tensor

    attn_weight_markers = [f"mid.attn_1.{name}.weight" for name in ("q", "k", "v", "proj_out")]
    # Only existing keys are reassigned, so the dict can be updated while iterating.
    for sd_key, tensor in new_state_dict.items():
        for marker in attn_weight_markers:
            if marker in sd_key:
                logging.debug(f"Reshaping {sd_key} for SD format")
                new_state_dict[sd_key] = reshape_weight_for_sd(tensor)
    return new_state_dict
|
184 |
+
|
185 |
+
|
186 |
+
# =========================#
|
187 |
+
# Text Encoder Conversion #
|
188 |
+
# =========================#
|
189 |
+
|
190 |
+
|
191 |
+
# Every entry is (stable-diffusion fragment, HF Diffusers fragment).
textenc_conversion_lst = [
    ("resblocks.", "text_model.encoder.layers."),
    ("ln_1", "layer_norm1"),
    ("ln_2", "layer_norm2"),
    (".c_fc.", ".fc1."),
    (".c_proj.", ".fc2."),
    (".attn", ".self_attn"),
    ("ln_final.", "transformer.text_model.final_layer_norm."),
    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
]
# Regex-escaped HF fragment -> SD fragment; the keys double as the alternatives of the pattern.
protected = {re.escape(hf_part): sd_part for sd_part, hf_part in textenc_conversion_lst}
textenc_pattern = re.compile("|".join(protected))

# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
code2idx = {code: idx for idx, code in enumerate("qkv")}
|
208 |
+
|
209 |
+
|
210 |
+
def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
    """Convert an HF-style text encoder state dict to the stable-diffusion key layout.

    Keys are relabelled with ``textenc_pattern`` / ``protected``. The separate
    ``q_proj`` / ``k_proj`` / ``v_proj`` weights and biases of each
    ``self_attn`` module are concatenated, in q, k, v order, into a single
    ``in_proj_weight`` / ``in_proj_bias`` entry. A key ending in
    ``transformer.text_projection.weight`` is stored transposed under
    ``text_projection``.

    Args:
        text_enc_dict: mapping from parameter names to tensors.
        prefix: only keys starting with this string are converted; all other
            keys are dropped. The prefix itself is not stripped from the keys.

    Returns:
        A new dict with the converted keys.

    Raises:
        Exception: if any attention module is missing one of its q/k/v
            weights or biases.
    """
    text_proj = "transformer.text_projection.weight"
    qkv_suffixes = {
        "weight": (
            ".self_attn.q_proj.weight",
            ".self_attn.k_proj.weight",
            ".self_attn.v_proj.weight",
        ),
        "bias": (
            ".self_attn.q_proj.bias",
            ".self_attn.k_proj.bias",
            ".self_attn.v_proj.bias",
        ),
    }
    # kind -> {attention-module key prefix -> [q, k, v]}
    captured = {"weight": {}, "bias": {}}

    def relabel(name):
        # Replace each HF fragment matched by the pattern with its SD counterpart.
        return textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], name)

    new_state_dict = {}
    for k, v in text_enc_dict.items():
        if not k.startswith(prefix):
            continue

        kind = next((name for name, suffixes in qkv_suffixes.items() if k.endswith(suffixes)), None)
        if kind is not None:
            # The q/k/v suffixes share one length, so slicing by the "q" form works for all three.
            tail = f".q_proj.{kind}"
            k_pre = k[: -len(tail)]
            k_code = k[-len(tail) + 1]  # the single character "q", "k" or "v"
            captured[kind].setdefault(k_pre, [None, None, None])[code2idx[k_code]] = v
            continue

        if k.endswith(text_proj):
            new_state_dict[k.replace(text_proj, "text_projection")] = v.transpose(0, 1).contiguous()
        else:
            new_state_dict[relabel(k)] = v

    # Weights first, then biases, matching the original insertion order.
    for kind in ("weight", "bias"):
        for k_pre, tensors in captured[kind].items():
            # Identity check: `None in tensors` would fall back to tensor `==` comparisons.
            if any(t is None for t in tensors):
                raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
            new_state_dict[relabel(k_pre) + ".in_proj_" + kind] = torch.cat(tensors)

    return new_state_dict
|
261 |
+
|
262 |
+
|
263 |
+
def convert_text_enc_state_dict(text_enc_dict):
    """Return ``text_enc_dict`` itself; this function applies no key conversion."""
    unchanged = text_enc_dict
    return unchanged
|
265 |
+
|
266 |
+
|
ComfyUI/comfy/diffusers_load.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import comfy.sd
|
4 |
+
|
5 |
+
def first_file(path, filenames):
    """Return the first ``path/filename`` that exists on disk.

    Args:
        path: directory to look in.
        filenames: candidate file names, checked in order.

    Returns:
        The joined path of the first existing candidate, or ``None`` when
        none of them exists.
    """
    candidates = (os.path.join(path, name) for name in filenames)
    return next((candidate for candidate in candidates if os.path.exists(candidate)), None)
|
11 |
+
|
12 |
+
def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_directory=None):
    """Load a UNet, and optionally CLIP and a VAE, from a model folder.

    Weight files are looked up in the ``unet``, ``vae``, ``text_encoder`` and
    ``text_encoder_2`` subfolders of ``model_path``; in each subfolder the
    first existing candidate file name wins.

    Args:
        model_path: root folder of the model.
        output_vae: when false, the VAE is not loaded and ``None`` is returned for it.
        output_clip: when false, CLIP is not loaded and ``None`` is returned for it.
        embedding_directory: forwarded to ``comfy.sd.load_clip``.

    Returns:
        The tuple ``(unet, clip, vae)``.
    """

    def locate(subfolder, candidates):
        # First existing candidate inside model_path/subfolder, or None.
        return first_file(os.path.join(model_path, subfolder), candidates)

    diffusion_model_names = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.fp16.bin", "diffusion_pytorch_model.bin"]
    text_encoder_model_names = ["model.fp16.safetensors", "model.safetensors", "pytorch_model.fp16.bin", "pytorch_model.bin"]

    unet_path = locate("unet", diffusion_model_names)
    vae_path = locate("vae", diffusion_model_names)

    # The first text encoder is always listed; the second only if its file was found.
    text_encoder_paths = [locate("text_encoder", text_encoder_model_names)]
    text_encoder2_path = locate("text_encoder_2", text_encoder_model_names)
    if text_encoder2_path is not None:
        text_encoder_paths.append(text_encoder2_path)

    unet = comfy.sd.load_unet(unet_path)
    clip = comfy.sd.load_clip(text_encoder_paths, embedding_directory=embedding_directory) if output_clip else None
    vae = comfy.sd.VAE(sd=comfy.utils.load_torch_file(vae_path)) if output_vae else None

    return (unet, clip, vae)
|
ComfyUI/comfy/extra_samplers/uni_pc.py
ADDED
@@ -0,0 +1,875 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#code taken from: https://github.com/wl-zhao/UniPC and modified
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import math
|
6 |
+
|
7 |
+
from tqdm.auto import trange, tqdm
|
8 |
+
|
9 |
+
|
10 |
+
class NoiseScheduleVP:
    """Noise schedule of a VP-type forward SDE (discrete, linear or cosine)."""

    def __init__(
            self,
            schedule='discrete',
            betas=None,
            alphas_cumprod=None,
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
        ):
        # Raw docstring: it contains LaTeX-style backslashes (\hat, \sqrt) that
        # would otherwise be invalid string escape sequences.
        r"""Create a wrapper class for the forward SDE (VP type).

        ***
        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
                We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
        ***

        The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:

            log_alpha_t = self.marginal_log_mean_coeff(t)
            sigma_t = self.marginal_std(t)
            lambda_t = self.marginal_lambda(t)

        Moreover, as lambda(t) is an invertible function, we also support its inverse function:

            t = self.inverse_lambda(lambda_t)

        ===============================================================

        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).

        1. For discrete-time DPMs:

            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
                t_i = (i + 1) / N
            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.

            Args:
                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)

            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
            When both are given, `betas` takes precedence.

            **Important**:  Please pay special attention to the args for `alphas_cumprod`:
                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
                and
                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).


        2. For continuous-time DPMs:

            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM).

            Args:
                continuous_beta_0: A `float` number. The smallest beta for the linear schedule.
                continuous_beta_1: A `float` number. The largest beta for the linear schedule.

            The cosine hyperparameters are fixed here (cosine_s = 0.008, cosine_beta_max = 999.),
            and the ending time T is 0.9946 for the cosine schedule and 1. otherwise.

        ===============================================================

        Args:
            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
                    'linear' or 'cosine' for continuous-time DPMs.
        Returns:
            A wrapper object of the forward SDE (VP type).

        Raises:
            ValueError: if `schedule` is not 'discrete', 'linear' or 'cosine'.

        ===============================================================

        Example:

        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', betas=betas)

        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)

        # For continuous-time DPMs (VPSDE), linear schedule:
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)

        """

        if schedule not in ['discrete', 'linear', 'cosine']:
            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))

        self.schedule = schedule
        if schedule == 'discrete':
            if betas is not None:
                # log(alpha_t) = 0.5 * log(cumprod(1 - betas)), accumulated in log space
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.total_N = len(log_alphas)
            self.T = 1.
            # t_i = (i + 1) / N, stored as a (1, N) row for interpolation
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
            self.log_alpha_array = log_alphas.reshape((1, -1,))
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1
            self.cosine_s = 0.008
            self.cosine_beta_max = 999.
            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
            if schedule == 'cosine':
                # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
                # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
                self.T = 0.9946
            else:
                self.T = 1.

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        if self.schedule == 'discrete':
            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
        elif self.schedule == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == 'cosine':
            # Normalised so that log(alpha_0) = 0.
            log_alpha_t = torch.log(torch.cos((t + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
            return log_alpha_t - self.cosine_log_alpha_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == 'linear':
            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0**2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == 'discrete':
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
            # Both arrays are flipped along dim 1 before interpolating t from log_alpha.
            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            # Invert the cosine log-alpha formula used by marginal_log_mean_coeff.
            t = torch.arccos(torch.exp(log_alpha + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
            return t
|
179 |
+
|
180 |
+
|
181 |
+
def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs=None,
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.,
    classifier_fn=None,
    classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.

    We support four types of the diffusion model by setting `model_type`:

        1. "noise": noise prediction model. (Trained by predicting noise).

        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).

        3. "v": velocity prediction model. (Trained by predicting the velocity).
            The "v" prediction derivation is detailed in Appendix D of [1], and is used in Imagen-Video [2].

            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
                arXiv preprint arXiv:2202.00512 (2022).
            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
                arXiv preprint arXiv:2210.02303 (2022).

        4. "score": marginal score function. (Trained by denoising score matching).
            Note that the score function and the noise prediction model follow a simple relationship:
            ```
                noise(x_t, t) = -sigma_t * score(x_t, t)
            ```

    We support three types of guided sampling by DPMs by setting `guidance_type`:
        1. "uncond": unconditional sampling by DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
            The input `model` has the following format:
            ``
                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
            ``

            The input `classifier_fn` has the following format:
            ``
                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
            ``

            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.

        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
            The input `model` has the following format:
            ``
                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
            ``
            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.

            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
                arXiv preprint arXiv:2207.12598 (2022).


    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
    or continuous-time labels (i.e. epsilon to T).

    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
    ``
        def model_fn(x, t_continuous) -> noise:
            t_input = get_model_input_time(t_continuous)
            return noise_pred(model, x, t_input, **model_kwargs)
    ``
    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.

    ===============================================================

    Args:
        model: A diffusion model with the corresponding format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. The parameterization type of the diffusion model.
                    "noise" or "x_start" or "v" or "score".
        model_kwargs: A `dict`. A dict for the other inputs of the model function.
                    `None` (the default) means no extra inputs.
        guidance_type: A `str`. The type of the guidance for sampling.
                    "uncond" or "classifier" or "classifier-free".
        condition: A pytorch tensor. The condition for the guided sampling.
                    Only used for "classifier" or "classifier-free" guidance type.
        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
                    Only used for "classifier-free" guidance type.
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for the classifier guidance.
        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
                    `None` (the default) means no extra inputs.
    Returns:
        A noise prediction model that accepts the noised data and the continuous time as the inputs.
    Raises:
        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
    """
    # "score" is documented above and handled by noise_pred_fn, so it must be accepted here.
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    # Avoid shared mutable default arguments.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        # A single time value is broadcast over the batch dimension of x.
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        t_input = get_model_input_time(t_continuous)
        # Forward the condition when one is given; otherwise classifier-free
        # guidance would evaluate the very same unconditioned output twice.
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return -expand_dims(sigma_t, dims) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            else:
                # One batched call: first half unconditional, second half conditional.
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t_continuous] * 2)
                c_in = torch.cat([unconditional_condition, condition])
                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
                return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
|
350 |
+
|
351 |
+
|
352 |
+
class UniPC:
|
353 |
+
def __init__(
|
354 |
+
self,
|
355 |
+
model_fn,
|
356 |
+
noise_schedule,
|
357 |
+
predict_x0=True,
|
358 |
+
thresholding=False,
|
359 |
+
max_val=1.,
|
360 |
+
variant='bh1',
|
361 |
+
):
|
362 |
+
"""Construct a UniPC.
|
363 |
+
|
364 |
+
We support both data_prediction and noise_prediction.
|
365 |
+
"""
|
366 |
+
self.model = model_fn
|
367 |
+
self.noise_schedule = noise_schedule
|
368 |
+
self.variant = variant
|
369 |
+
self.predict_x0 = predict_x0
|
370 |
+
self.thresholding = thresholding
|
371 |
+
self.max_val = max_val
|
372 |
+
|
373 |
+
def dynamic_thresholding_fn(self, x0, t=None):
    """
    The dynamic thresholding method.

    Clamps each sample of `x0` to [-s, s] and divides by s, where s is the
    `p`-quantile of the sample's absolute values, floored at a maximum value.

    `__init__` only stores `max_val`, so `dynamic_thresholding_ratio` and
    `thresholding_max_val` are optional overrides here: when they are not set
    on the instance, the ratio falls back to 0.995 and the floor to
    `self.max_val` (the values `data_prediction_fn` uses). `t` is unused.
    """
    dims = x0.dim()
    p = getattr(self, "dynamic_thresholding_ratio", 0.995)
    max_val = getattr(self, "thresholding_max_val", None)
    if max_val is None:
        max_val = self.max_val
    # per-sample quantile over all non-batch elements
    s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
    s = expand_dims(torch.maximum(s, max_val * torch.ones_like(s).to(s.device)), dims)
    x0 = torch.clamp(x0, -s, s) / s
    return x0
|
383 |
+
|
384 |
+
def noise_prediction_fn(self, x, t):
|
385 |
+
"""
|
386 |
+
Return the noise prediction model.
|
387 |
+
"""
|
388 |
+
return self.model(x, t)
|
389 |
+
|
390 |
+
def data_prediction_fn(self, x, t):
    """
    Return the data prediction x0 derived from the noise prediction (with optional thresholding).
    """
    schedule = self.noise_schedule
    noise = self.noise_prediction_fn(x, t)
    dims = x.dim()
    alpha_t = schedule.marginal_alpha(t)
    sigma_t = schedule.marginal_std(t)
    # x0 = (x - sigma_t * noise) / alpha_t
    x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
    if not self.thresholding:
        return x0

    p = 0.995   # A hyperparameter in the paper of "Imagen" [1].
    flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
    s = torch.quantile(flat_abs, p, dim=1)
    # never let the clamp range drop below max_val
    floor = self.max_val * torch.ones_like(s).to(s.device)
    s = expand_dims(torch.maximum(s, floor), dims)
    return torch.clamp(x0, -s, s) / s
|
404 |
+
|
405 |
+
def model_fn(self, x, t):
    """
    Dispatch to the data-prediction or noise-prediction function.

    `self.predict_x0` selects `data_prediction_fn`; otherwise
    `noise_prediction_fn` is used.
    """
    predictor = self.data_prediction_fn if self.predict_x0 else self.noise_prediction_fn
    return predictor(x, t)
|
413 |
+
|
414 |
+
def get_time_steps(self, skip_type, t_T, t_0, N, device):
    """Build the `N + 1` time points running from `t_T` to `t_0`.

    Args:
        skip_type: spacing rule. 'time_uniform' is linear in t,
            'time_quadratic' is linear in sqrt(t), and 'logSNR' is linear in
            the schedule's `marginal_lambda` and mapped back through
            `inverse_lambda`.
        t_T: first time point.
        t_0: last time point.
        N: number of intervals.
        device: device the returned tensor is moved to.
    Raises:
        ValueError: for any other `skip_type`.
    """
    num_points = N + 1

    if skip_type == 'time_uniform':
        return torch.linspace(t_T, t_0, num_points).to(device)

    if skip_type == 'time_quadratic':
        t_order = 2
        root = 1. / t_order
        grid = torch.linspace(t_T**root, t_0**root, num_points)
        return grid.pow(t_order).to(device)

    if skip_type == 'logSNR':
        schedule = self.noise_schedule
        lambda_T = schedule.marginal_lambda(torch.tensor(t_T).to(device))
        lambda_0 = schedule.marginal_lambda(torch.tensor(t_0).to(device))
        start, stop = lambda_T.cpu().item(), lambda_0.cpu().item()
        logSNR_steps = torch.linspace(start, stop, num_points).to(device)
        return schedule.inverse_lambda(logSNR_steps)

    raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
|
430 |
+
|
431 |
+
def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
    """
    Split `steps` model evaluations into singlestep solver stages.

    Returns `(timesteps_outer, orders)`, where `orders` lists the order of
    each stage (its entries sum to `steps` for `steps >= order`) and
    `timesteps_outer` holds the stage boundaries.

    Raises:
        ValueError: if `order` is not 1, 2 or 3.
    """
    if order == 3:
        full, remainder = divmod(steps, 3)
        K = full + 1
        if remainder == 0:
            orders = [3,] * (K - 2) + [2, 1]
        elif remainder == 1:
            orders = [3,] * (K - 1) + [1]
        else:
            orders = [3,] * (K - 1) + [2]
    elif order == 2:
        half, remainder = divmod(steps, 2)
        if remainder == 0:
            K = half
            orders = [2,] * K
        else:
            K = half + 1
            orders = [2,] * (K - 1) + [1]
    elif order == 1:
        K = steps
        orders = [1,] * steps
    else:
        raise ValueError("'order' must be '1' or '2' or '3'.")

    if skip_type == 'logSNR':
        # To reproduce the results in DPM-Solver paper
        return self.get_time_steps(skip_type, t_T, t_0, K, device), orders

    # Otherwise build the fine grid with one point per evaluation and keep
    # only the points that start/end a stage.
    fine_grid = self.get_time_steps(skip_type, t_T, t_0, steps, device)
    boundaries = torch.cumsum(torch.tensor([0,] + orders), 0).to(device)
    timesteps_outer = fine_grid[boundaries]
    return timesteps_outer, orders
|
461 |
+
|
462 |
+
def denoise_to_zero_fn(self, x, s):
    """
    Final denoising step: return the data prediction at `(x, s)`.

    This is equivalent to solving the ODE from lambda_s to infinity with a
    first-order discretization.
    """
    denoised = self.data_prediction_fn(x, s)
    return denoised
|
467 |
+
|
468 |
+
def multistep_uni_pc_update(self, x, model_prev_list, t_prev_list, t, order, **kwargs):
    """Run one multistep UniPC update with the configured variant.

    A 0-dim `t` is first reshaped to a 1-element vector. Variants whose
    name contains 'bh' go to `multistep_uni_pc_bh_update`; the only other
    accepted variant is 'vary_coeff'. Extra keyword arguments are forwarded
    unchanged.
    """
    is_scalar_time = len(t.shape) == 0
    if is_scalar_time:
        t = t.view(-1)

    if 'bh' not in self.variant:
        assert self.variant == 'vary_coeff'
        return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)

    return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
|
476 |
+
|
477 |
+
def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
    """One predictor(-corrector) step of the 'vary_coeff' UniPC variant.

    Advances `x` from time `t_prev_list[-1]` to `t` using the last `order`
    entries of `model_prev_list` / `t_prev_list`.

    Args:
        x: current sample.
        model_prev_list: previous model outputs, most recent last; must hold
            at least `order` entries.
        t_prev_list: times matching `model_prev_list`.
        t: target time.
        order: number of history entries used.
        use_corrector: when True, the model is evaluated at the predicted
            sample and the result is used to correct the step.
    Returns:
        `(x_t, model_t)`; `model_t` is the model output at the predicted
        sample, or None when the corrector is not run.

    Note: prints a status line to stdout on every call.
    NOTE(review): the einsum signature 'bkchw' implies 4-D (b, c, h, w) model
    outputs — confirm against callers.
    """
    print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
    ns = self.noise_schedule
    assert order <= len(model_prev_list)

    # first compute rks
    t_prev_0 = t_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    model_prev_0 = model_prev_list[-1]
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_t = ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    # Step size measured in lambda (as returned by marginal_lambda).
    h = lambda_t - lambda_prev_0

    rks = []
    D1s = []
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        # Position of the i-th older point relative to the current step, in units of h.
        rk = (lambda_prev_i - lambda_prev_0) / h
        rks.append(rk)
        # Difference of model outputs, scaled by that relative position.
        D1s.append((model_prev_i - model_prev_0) / rk)

    # The target point itself sits at relative position 1.
    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    K = len(rks)
    # build C matrix
    C = []

    # Column k (1-based) holds rks**(k-1) / k!.
    col = torch.ones_like(rks)
    for k in range(1, K + 1):
        C.append(col)
        col = col * rks / (k + 1)
    C = torch.stack(C, dim=1)

    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1) # (B, K)
        # Predictor coefficients only use the history points (target row/column dropped).
        C_inv_p = torch.linalg.inv(C[:-1, :-1])
        A_p = C_inv_p

    if use_corrector:
        print('using corrector')
        C_inv = torch.linalg.inv(C)
        A_c = C_inv

    # Data prediction uses the negated step.
    hh = -h if self.predict_x0 else h
    h_phi_1 = torch.expm1(hh)
    h_phi_ks = []
    factorial_k = 1
    h_phi_k = h_phi_1
    # Recurrence: next = current / hh - 1 / k!, starting from expm1(hh).
    for k in range(1, K + 2):
        h_phi_ks.append(h_phi_k)
        h_phi_k = h_phi_k / hh - 1 / factorial_k
        factorial_k *= (k + 1)

    model_t = None
    if self.predict_x0:
        # First-order part of the update, shared by predictor and corrector.
        x_t_ = (
            sigma_t / sigma_prev_0 * x
            - alpha_t * h_phi_1 * model_prev_0
        )
        # now predictor
        x_t = x_t_
        if len(D1s) > 0:
            # compute the residuals for predictor
            for k in range(K - 1):
                x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
        # now corrector
        if use_corrector:
            model_t = self.model_fn(x_t, t)
            D1_t = (model_t - model_prev_0)
            # Restart from the first-order part; the predictor result is only used to evaluate the model.
            x_t = x_t_
            k = 0
            for k in range(K - 1):
                x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
            # NOTE(review): `k` keeps its last loop value here (K - 2, or 0 when the
            # loop body never runs), so this reads row k of A_c — confirm intended.
            x_t = x_t - alpha_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
    else:
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        # Same structure as above with the noise-prediction scaling factors.
        x_t_ = (
            (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
            - (sigma_t * h_phi_1) * model_prev_0
        )
        # now predictor
        x_t = x_t_
        if len(D1s) > 0:
            # compute the residuals for predictor
            for k in range(K - 1):
                x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
        # now corrector
        if use_corrector:
            model_t = self.model_fn(x_t, t)
            D1_t = (model_t - model_prev_0)
            x_t = x_t_
            k = 0
            for k in range(K - 1):
                x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
            # NOTE(review): same stale-`k` indexing as in the data-prediction branch.
            x_t = x_t - sigma_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
    return x_t, model_t
|
579 |
+
|
580 |
+
def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
    """One predictor(-corrector) step of the B(h) UniPC variants ('bh1' / 'bh2').

    Advances `x` from time `t_prev_list[-1]` to `t` using the last `order`
    entries of `model_prev_list` / `t_prev_list`.

    Args:
        x: current sample.
        model_prev_list: previous model outputs, most recent last; must hold
            at least `order` entries.
        t_prev_list: times matching `model_prev_list`; each is indexed with
            `[0]` below, so they are expected to be at least 1-D.
        t: target time (at least 1-D, see above).
        order: number of history entries used.
        x_t: optional precomputed prediction; when given, the predictor stage
            is skipped and this value is fed to the corrector instead.
        use_corrector: when True, the model is evaluated at the predicted
            sample and the result is used to correct the step.
    Returns:
        `(x_t, model_t)`; `model_t` is the model output at the predicted
        sample, or None when the corrector is not run.
    Raises:
        NotImplementedError: if `self.variant` is neither 'bh1' nor 'bh2'.

    NOTE(review): the einsum signature 'bkchw' implies 4-D (b, c, h, w) model
    outputs — confirm against callers.
    """
    # print(f'using unified predictor-corrector with order {order} (solver type: B(h))')
    ns = self.noise_schedule
    assert order <= len(model_prev_list)
    dims = x.dim()

    # first compute rks
    t_prev_0 = t_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    model_prev_0 = model_prev_list[-1]
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    # Step size measured in lambda (as returned by marginal_lambda).
    h = lambda_t - lambda_prev_0

    rks = []
    D1s = []
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        # Only the first element is used, i.e. the ratio is taken from batch entry 0.
        rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
        rks.append(rk)
        D1s.append((model_prev_i - model_prev_0) / rk)

    # The target point itself sits at relative position 1.
    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    R = []
    b = []

    # Data prediction uses the negated step; again only batch entry 0 is used.
    hh = -h[0] if self.predict_x0 else h[0]
    h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
    h_phi_k = h_phi_1 / hh - 1

    factorial_i = 1

    # B(h) normaliser: the two variants differ only in this choice.
    if self.variant == 'bh1':
        B_h = hh
    elif self.variant == 'bh2':
        B_h = torch.expm1(hh)
    else:
        raise NotImplementedError()

    # Assemble the linear system R @ rhos = b: row i of R holds rks**(i-1).
    for i in range(1, order + 1):
        R.append(torch.pow(rks, i - 1))
        b.append(h_phi_k * factorial_i / B_h)
        factorial_i *= (i + 1)
        h_phi_k = h_phi_k / hh - 1 / factorial_i

    R = torch.stack(R)
    b = torch.tensor(b, device=x.device)

    # now predictor
    use_predictor = len(D1s) > 0 and x_t is None
    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1) # (B, K)
        if x_t is None:
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], device=b.device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
    else:
        # No history differences available (order 1).
        D1s = None

    if use_corrector:
        # print('using corrector')
        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], device=b.device)
        else:
            rhos_c = torch.linalg.solve(R, b)

    model_t = None
    if self.predict_x0:
        # First-order part of the update, shared by predictor and corrector.
        x_t_ = (
            expand_dims(sigma_t / sigma_prev_0, dims) * x
            - expand_dims(alpha_t * h_phi_1, dims)* model_prev_0
        )

        if x_t is None:
            if use_predictor:
                pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
            else:
                pred_res = 0
            x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res

        if use_corrector:
            model_t = self.model_fn(x_t, t)
            if D1s is not None:
                corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = (model_t - model_prev_0)
            # The corrected sample restarts from x_t_; the predicted x_t is only used to evaluate the model.
            x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
    else:
        # Same structure as above with the noise-prediction scaling factors.
        x_t_ = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
        )
        if x_t is None:
            if use_predictor:
                pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
            else:
                pred_res = 0
            x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * pred_res

        if use_corrector:
            model_t = self.model_fn(x_t, t)
            if D1s is not None:
                corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = (model_t - model_prev_0)
            x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
    return x_t, model_t
|
699 |
+
|
700 |
+
|
701 |
+
def sample(self, x, timesteps, t_start=None, t_end=None, order=3, skip_type='time_uniform',
    method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
    atol=0.0078, rtol=0.05, corrector=False, callback=None, disable_pbar=False
):
    """Run the multistep UniPC sampler over a precomputed time grid.

    Args:
        x: initial sample; its first dimension is used as the batch size.
        timesteps: 1-D tensor of time points; `len(timesteps) - 1` steps are run.
        order: maximum solver order; must not exceed the number of steps.
        method: only 'multistep' is implemented. Note that the default
            ('singlestep') raises NotImplementedError.
        lower_order_final: when True, the order is reduced towards the end
            of the grid via `min(order, steps + 1 - step)`.
        callback: optional callable invoked once per outer iteration with a
            dict holding 'x', 'i' (the iteration index) and 'denoised' (the
            most recent stored model output).
        disable_pbar: forwarded to `trange` as `disable`.
        t_start, t_end, skip_type, denoise_to_zero, solver_type, atol, rtol,
        corrector: not referenced by the active code in this method.
    Returns:
        The sample after the last step.
    Raises:
        NotImplementedError: if `method` is not 'multistep'.
    """
    # t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
    # t_T = self.noise_schedule.T if t_start is None else t_start
    device = x.device
    steps = len(timesteps) - 1
    if method == 'multistep':
        assert steps >= order
        # timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
        assert timesteps.shape[0] - 1 == steps
        # with torch.no_grad():
        for step_index in trange(steps, disable=disable_pbar):
            if step_index == 0:
                # First iteration only seeds the history with the model output at the starting time.
                vec_t = timesteps[0].expand((x.shape[0]))
                model_prev_list = [self.model_fn(x, vec_t)]
                t_prev_list = [vec_t]
            elif step_index < order:
                # Warm-up: the history is still shorter than `order`, so step with the order available so far.
                init_order = step_index
                # Init the first `order` values by lower order multistep DPM-Solver.
                # for init_order in range(1, order):
                vec_t = timesteps[init_order].expand(x.shape[0])
                x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True)
                if model_x is None:
                    model_x = self.model_fn(x, vec_t)
                model_prev_list.append(model_x)
                t_prev_list.append(vec_t)
            else:
                # The last iteration also performs the step to timesteps[steps],
                # because the loop itself only runs up to steps - 1.
                extra_final_step = 0
                if step_index == (steps - 1):
                    extra_final_step = 1
                for step in range(step_index, step_index + 1 + extra_final_step):
                    vec_t = timesteps[step].expand(x.shape[0])
                    if lower_order_final:
                        step_order = min(order, steps + 1 - step)
                    else:
                        step_order = order
                    # print('this step order:', step_order)
                    if step == steps:
                        # print('do not run corrector at the last step')
                        use_corrector = False
                    else:
                        use_corrector = True
                    x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector)
                    # Shift the history window left by one to make room for the new entry.
                    for i in range(order - 1):
                        t_prev_list[i] = t_prev_list[i + 1]
                        model_prev_list[i] = model_prev_list[i + 1]
                    t_prev_list[-1] = vec_t
                    # We do not need to evaluate the final model value.
                    if step < steps:
                        if model_x is None:
                            model_x = self.model_fn(x, vec_t)
                        model_prev_list[-1] = model_x
            if callback is not None:
                # NOTE(review): 'denoised' is the latest model_fn output, which is an
                # x0 prediction only when predict_x0 is True — confirm callers rely on that.
                callback({'x': x, 'i': step_index, 'denoised': model_prev_list[-1]})
    else:
        raise NotImplementedError()
    # if denoise_to_zero:
    #     x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
    return x
|
762 |
+
|
763 |
+
|
764 |
+
#############################################################
|
765 |
+
# other utility functions
|
766 |
+
#############################################################
|
767 |
+
|
768 |
+
def interpolate_fn(x, xp, yp):
    """
    Piecewise-linear interpolation y = f(x) through the keypoints (xp, yp).

    Built only from differentiable tensor ops, so it can be used under
    autograd. Queries outside the range of `xp` are extrapolated along the
    outermost segment.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]

    # Put each query in front of its channel's keypoints and sort, so the
    # query's rank tells us which segment it falls into.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, sort_order = torch.sort(merged, dim=2)
    # The query had index 0 before sorting, so argmin finds where it landed.
    query_rank = torch.argmin(sort_order, dim=2)
    below = query_rank - 1

    before_first = torch.eq(query_rank, 0)
    after_last = torch.eq(query_rank, num_keys)
    # Shared by both index computations: interior queries use the keypoint
    # just below, queries past the end use the second-to-last keypoint.
    interior_or_last = torch.where(
        after_last, torch.tensor(num_keys - 2, device=x.device), below,
    )

    # Segment end points, as positions inside the merged (sorted) array.
    left_pos = torch.where(before_first, torch.tensor(1, device=x.device), interior_or_last)
    # Skip over the query itself when it sits between the two keypoints.
    right_pos = torch.where(torch.eq(left_pos, below), left_pos + 2, left_pos + 1)
    left_x = torch.gather(merged_sorted, dim=2, index=left_pos.unsqueeze(2)).squeeze(2)
    right_x = torch.gather(merged_sorted, dim=2, index=right_pos.unsqueeze(2)).squeeze(2)

    # Same segment, as positions inside yp (which has no query inserted).
    key_pos = torch.where(before_first, torch.tensor(0, device=x.device), interior_or_last)
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    left_y = torch.gather(yp_batched, dim=2, index=key_pos.unsqueeze(2)).squeeze(2)
    right_y = torch.gather(yp_batched, dim=2, index=(key_pos + 1).unsqueeze(2)).squeeze(2)

    return left_y + (x - left_x) * (right_y - left_y) / (right_x - left_x)
|
808 |
+
|
809 |
+
|
810 |
+
def expand_dims(v, dims):
    """
    Append singleton axes to `v` until it has `dims` dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: a `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    trailing_axes = (None,) * (dims - 1)
    index = (Ellipsis,) + trailing_axes
    return v[index]
|
821 |
+
|
822 |
+
|
823 |
+
class SigmaConvert:
    """Noise-schedule adapter whose 'time' argument is a sigma value.

    `marginal_log_mean_coeff` is defined directly from sigma as
    0.5 * log(1 / (sigma^2 + 1)); alpha, std and lambda are derived from it.
    """
    schedule = ""

    def marginal_log_mean_coeff(self, sigma):
        """Return log(alpha) = 0.5 * log(1 / (sigma^2 + 1))."""
        inverse_total = 1 / ((sigma * sigma) + 1)
        return 0.5 * torch.log(inverse_total)

    def marginal_alpha(self, t):
        """Return alpha_t = exp(log_mean_coeff(t))."""
        log_alpha = self.marginal_log_mean_coeff(t)
        return torch.exp(log_alpha)

    def marginal_std(self, t):
        """Return sigma_t = sqrt(1 - alpha_t^2)."""
        alpha_squared = torch.exp(2. * self.marginal_log_mean_coeff(t))
        return torch.sqrt(1. - alpha_squared)

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_alpha = self.marginal_log_mean_coeff(t)
        log_sigma = 0.5 * torch.log(1. - torch.exp(2. * log_alpha))
        return log_alpha - log_sigma
|
841 |
+
|
842 |
+
def predict_eps_sigma(model, input, sigma_in, **kwargs):
    """Return `(scaled - model(scaled, sigma_in)) / sigma`.

    `scaled` is `input * sqrt(sigma^2 + 1)`, and `sigma` is `sigma_in`
    reshaped to broadcast over the non-batch axes of `input`. Extra keyword
    arguments are forwarded to `model`.
    """
    broadcast_shape = sigma_in.shape[:1] + (1,) * (input.ndim - 1)
    sigma = sigma_in.view(broadcast_shape)
    scaled = input * ((sigma ** 2 + 1.0) ** 0.5)
    model_out = model(scaled, sigma_in, **kwargs)
    return (scaled - model_out) / sigma
|
846 |
+
|
847 |
+
|
848 |
+
def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
    """Sample with the multistep UniPC solver over a sigma schedule.

    Args:
        model: callable invoked as `model(input, sigma, **kwargs)`; it is
            wrapped by `predict_eps_sigma` to obtain a noise prediction.
        noise: starting tensor; it is divided by sqrt(1 + sigmas[0]^2) before
            sampling.
        sigmas: 1-D tensor of sigma values used as the solver's time grid.
            It is never modified by this function.
        extra_args: forwarded to `model_wrapper` as `model_kwargs`.
        callback: forwarded to `UniPC.sample`.
        disable: disables the progress bar.
        variant: UniPC variant passed to `UniPC`.
    Returns:
        The final sample, divided by `marginal_alpha` of the last time point.
    """
    # Always work on a private copy. The previous code used `sigmas[:]` when
    # the schedule ended in 0; on a tensor that is a view sharing storage, so
    # the assignment below silently overwrote the caller's last sigma.
    timesteps = sigmas.clone()
    if timesteps[-1] == 0:
        # Replace a terminal 0 by a small positive value so the final step
        # does not hit sigma == 0 (which predict_eps_sigma divides by).
        timesteps[-1] = 0.001

    ns = SigmaConvert()

    noise = noise / torch.sqrt(1.0 + timesteps[0] ** 2.0)
    model_type = "noise"

    model_fn = model_wrapper(
        lambda input, sigma, **kwargs: predict_eps_sigma(model, input, sigma, **kwargs),
        ns,
        model_type=model_type,
        guidance_type="uncond",
        model_kwargs=extra_args,
    )

    # At most third order, and lower for very short schedules.
    order = min(3, len(timesteps) - 2)
    uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, variant=variant)
    x = uni_pc.sample(noise, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable)
    x /= ns.marginal_alpha(timesteps[-1])
    return x
|
873 |
+
|
874 |
+
def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False):
    """Run `sample_unipc` with the 'bh2' solver variant."""
    return sample_unipc(
        model,
        noise,
        sigmas,
        extra_args=extra_args,
        callback=callback,
        disable=disable,
        variant='bh2',
    )
|
ComfyUI/comfy/gligen.py
ADDED
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from .ldm.modules.attention import CrossAttention
|
4 |
+
from inspect import isfunction
|
5 |
+
import comfy.ops
|
6 |
+
ops = comfy.ops.manual_cast
|
7 |
+
|
8 |
+
def exists(val):
    """Return True for any value other than None (falsy values included)."""
    if val is None:
        return False
    return True
|
10 |
+
|
11 |
+
|
12 |
+
def uniq(arr):
    """Return a keys view of the distinct elements of `arr`, in first-seen order."""
    return dict.fromkeys(arr).keys()
|
14 |
+
|
15 |
+
|
16 |
+
def default(val, d):
    """Return `val` unless it is None, otherwise the fallback `d`.

    When `d` is a plain Python function it is called and its result is
    returned; any other `d` is returned as-is.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
|
20 |
+
|
21 |
+
|
22 |
+
# feedforward
|
23 |
+
class GEGLU(nn.Module):
    """GELU-gated linear unit.

    A single linear layer produces `2 * dim_out` features; the first half is
    the value and the second half, passed through GELU, is the gate.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        doubled_width = dim_out * 2
        self.proj = ops.Linear(dim_in, doubled_width)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * torch.nn.functional.gelu(gate)
|
31 |
+
|
32 |
+
|
33 |
+
class FeedForward(nn.Module):
    """Input projection, dropout, then output projection.

    Args:
        dim: input feature size.
        dim_out: output feature size; defaults to `dim`.
        mult: hidden width multiplier (`inner_dim = int(dim * mult)`).
        glu: use a `GEGLU` input projection instead of Linear + GELU.
        dropout: dropout probability between the two projections.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, inner_dim)
        else:
            project_in = nn.Sequential(
                ops.Linear(dim, inner_dim),
                nn.GELU()
            )

        layers = [
            project_in,
            nn.Dropout(dropout),
            ops.Linear(inner_dim, dim_out),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out
|
51 |
+
|
52 |
+
|
53 |
+
class GatedCrossAttentionDense(nn.Module):
    """Residual cross-attention + feed-forward block with learned tanh gates.

    Both gates (`alpha_attn`, `alpha_dense`) start at 0, so tanh(alpha) is 0
    and a freshly constructed block returns its input unchanged.
    """

    def __init__(self, query_dim, context_dim, n_heads, d_head):
        super().__init__()

        attn_config = dict(
            query_dim=query_dim,
            context_dim=context_dim,
            heads=n_heads,
            dim_head=d_head,
            operations=ops,
        )
        self.attn = CrossAttention(**attn_config)
        self.ff = FeedForward(query_dim, glu=True)

        self.norm1 = ops.LayerNorm(query_dim)
        self.norm2 = ops.LayerNorm(query_dim)

        self.alpha_attn = nn.Parameter(torch.tensor(0.))
        self.alpha_dense = nn.Parameter(torch.tensor(0.))

        # this can be useful: we can externally change magnitude of tanh(alpha)
        # for example, when it is set to 0, then the entire model is same as
        # original one
        self.scale = 1

    def forward(self, x, objs):
        # Attention branch: queries from x, keys/values from objs.
        attn_gate = self.scale * torch.tanh(self.alpha_attn)
        x = x + attn_gate * self.attn(self.norm1(x), objs, objs)

        # Feed-forward branch.
        dense_gate = self.scale * torch.tanh(self.alpha_dense)
        x = x + dense_gate * self.ff(self.norm2(x))

        return x
|
84 |
+
|
85 |
+
|
86 |
+
class GatedSelfAttentionDense(nn.Module):
    """Gated self-attention over visual tokens concatenated with object tokens.

    Object tokens are projected to the visual width, concatenated to the
    visual tokens along dim 1, attended jointly, and only the visual part of
    the result is added back through a tanh gate. A gated feed-forward
    branch follows. Both gates start at 0.
    """

    def __init__(self, query_dim, context_dim, n_heads, d_head):
        super().__init__()

        # we need a linear projection since we need cat visual feature and obj
        # feature
        self.linear = ops.Linear(context_dim, query_dim)

        attn_config = dict(
            query_dim=query_dim,
            context_dim=query_dim,
            heads=n_heads,
            dim_head=d_head,
            operations=ops,
        )
        self.attn = CrossAttention(**attn_config)
        self.ff = FeedForward(query_dim, glu=True)

        self.norm1 = ops.LayerNorm(query_dim)
        self.norm2 = ops.LayerNorm(query_dim)

        self.alpha_attn = nn.Parameter(torch.tensor(0.))
        self.alpha_dense = nn.Parameter(torch.tensor(0.))

        # this can be useful: we can externally change magnitude of tanh(alpha)
        # for example, when it is set to 0, then the entire model is same as
        # original one
        self.scale = 1

    def forward(self, x, objs):
        num_visual = x.shape[1]
        objs = self.linear(objs)

        # Attend over the joint sequence, then keep the visual tokens only.
        joint = torch.cat([x, objs], dim=1)
        attended = self.attn(self.norm1(joint))[:, 0:num_visual, :]
        attn_gate = self.scale * torch.tanh(self.alpha_attn)
        x = x + attn_gate * attended

        dense_gate = self.scale * torch.tanh(self.alpha_dense)
        x = x + dense_gate * self.ff(self.norm2(x))

        return x
|
124 |
+
|
125 |
+
|
126 |
+
class GatedSelfAttentionDense2(nn.Module):
    """Gated self-attention that feeds the *grounding* tokens back as a residual.

    Visual and (projected) grounding tokens are attended jointly; the
    grounding part of the output is reshaped to a square grid, resized
    bicubically to the visual grid size, and added to the visual tokens
    through a tanh gate. A gated feed-forward branch follows. Both gates
    start at 0.

    Both token counts must be perfect squares.
    """

    def __init__(self, query_dim, context_dim, n_heads, d_head):
        super().__init__()

        # we need a linear projection since we need cat visual feature and obj
        # feature
        self.linear = ops.Linear(context_dim, query_dim)

        # NOTE(review): `n_heads` is accepted but not forwarded here, so
        # CrossAttention uses its own default head count — confirm intended.
        self.attn = CrossAttention(
            query_dim=query_dim, context_dim=query_dim, dim_head=d_head, operations=ops)
        self.ff = FeedForward(query_dim, glu=True)

        self.norm1 = ops.LayerNorm(query_dim)
        self.norm2 = ops.LayerNorm(query_dim)

        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))

        # this can be useful: we can externally change magnitude of tanh(alpha)
        # for example, when it is set to 0, then the entire model is same as
        # original one
        self.scale = 1

    def forward(self, x, objs):
        # `math` is not imported at module level in this file, so the
        # previous version raised NameError on the first call. Import it
        # locally to keep this block self-contained.
        import math

        B, N_visual, _ = x.shape
        B, N_ground, _ = objs.shape

        objs = self.linear(objs)

        # sanity check
        size_v = math.sqrt(N_visual)
        size_g = math.sqrt(N_ground)
        assert int(size_v) == size_v, "Visual tokens must be square rootable"
        assert int(size_g) == size_g, "Grounding tokens must be square rootable"
        size_v = int(size_v)
        size_g = int(size_g)

        # select grounding token and resize it to visual token size as residual
        out = self.attn(self.norm1(torch.cat([x, objs], dim=1)))[
            :, N_visual:, :]
        # (B, N_ground, C) -> (B, C, size_g, size_g) so it can be resized as an image.
        out = out.permute(0, 2, 1).reshape(B, -1, size_g, size_g)
        out = torch.nn.functional.interpolate(
            out, (size_v, size_v), mode='bicubic')
        # Back to token layout: (B, C, size_v, size_v) -> (B, N_visual, C).
        residual = out.reshape(B, -1, N_visual).permute(0, 2, 1)

        # add residual to visual feature
        x = x + self.scale * torch.tanh(self.alpha_attn) * residual
        x = x + self.scale * \
            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))

        return x
|
178 |
+
|
179 |
+
|
180 |
+
class FourierEmbedder():
    """Sin/cos Fourier features at `num_freqs` geometrically spaced frequencies.

    The frequencies are `temperature ** (i / num_freqs)` for
    `i = 0 .. num_freqs - 1`.
    """

    def __init__(self, num_freqs=64, temperature=100):
        self.num_freqs = num_freqs
        self.temperature = temperature
        exponents = torch.arange(num_freqs) / num_freqs
        self.freq_bands = temperature ** exponents

    @torch.no_grad()
    def __call__(self, x, cat_dim=-1):
        "x: arbitrary shape of tensor. dim: cat dim"
        # For every frequency emit sin then cos, and join them along cat_dim.
        features = [
            fn(freq * x)
            for freq in self.freq_bands
            for fn in (torch.sin, torch.cos)
        ]
        return torch.cat(features, cat_dim)
|
195 |
+
|
196 |
+
|
197 |
+
class PositionNet(nn.Module):
    """Embed (box, text-embedding) pairs into object tokens.

    Boxes are Fourier-embedded, concatenated with the positive embeddings,
    and passed through a 3-layer MLP. Slots whose mask is 0 are replaced by
    learnable null features before the MLP.
    """

    def __init__(self, in_dim, out_dim, fourier_freqs=8):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
        self.position_dim = fourier_freqs * 2 * 4  # 2 is sin&cos, 4 is xyxy

        hidden = 512
        self.linears = nn.Sequential(
            ops.Linear(self.in_dim + self.position_dim, hidden),
            nn.SiLU(),
            ops.Linear(hidden, hidden),
            nn.SiLU(),
            ops.Linear(hidden, out_dim),
        )

        self.null_positive_feature = torch.nn.Parameter(
            torch.zeros([self.in_dim]))
        self.null_position_feature = torch.nn.Parameter(
            torch.zeros([self.position_dim]))

    def forward(self, boxes, masks, positive_embeddings):
        B, N, _ = boxes.shape
        # Add a trailing axis so the mask broadcasts over the feature dimension.
        keep = masks.unsqueeze(-1)
        drop = 1 - keep

        # embedding position (it may includes padding as placeholder)
        xyxy_embedding = self.fourier_embedder(boxes)  # B*N*4 --> B*N*C

        # learnable null embedding
        cast = dict(device=boxes.device, dtype=boxes.dtype)
        positive_null = self.null_positive_feature.to(**cast).view(1, 1, -1)
        xyxy_null = self.null_position_feature.to(**cast).view(1, 1, -1)

        # replace padding with learnable null embedding
        positive_embeddings = positive_embeddings * keep + drop * positive_null
        xyxy_embedding = xyxy_embedding * keep + drop * xyxy_null

        features = torch.cat([positive_embeddings, xyxy_embedding], dim=-1)
        objs = self.linears(features)
        assert objs.shape == torch.Size([B, N, self.out_dim])
        return objs
|
240 |
+
|
241 |
+
|
242 |
+
class Gligen(nn.Module):
    """Container for the GLIGEN fuser modules and their position network.

    `set_position` / `set_empty` build the padded box, mask and embedding
    tensors (always `max_objs` slots), run the position network once, and
    return a function that applies the fuser module selected by
    `extra_options["transformer_index"]`.
    """

    def __init__(self, modules, position_net, key_dim):
        super().__init__()
        self.module_list = nn.ModuleList(modules)
        self.position_net = position_net
        self.key_dim = key_dim
        self.max_objs = 30
        self.current_device = torch.device("cpu")

    def _set_position(self, boxes, masks, positive_embeddings):
        """Compute the object tokens once and return the patch function."""
        objs = self.position_net(boxes, masks, positive_embeddings)

        def func(x, extra_options):
            index = extra_options["transformer_index"]
            fuser = self.module_list[index]
            return fuser(x, objs.to(device=x.device, dtype=x.dtype))

        return func

    def set_position(self, latent_image_shape, position_params, device):
        """Build the patch function for the given boxes.

        Each entry of `position_params` is indexed as
        `(embedding, height, width, y, x)`; coordinates are divided by the
        latent height/width to obtain (x1, y1, x2, y2).
        """
        batch, _channels, h, w = latent_image_shape
        masks = torch.zeros([self.max_objs], device="cpu")
        boxes = []
        positive_embeddings = []
        for slot, p in enumerate(position_params):
            left, top = p[4], p[3]
            corners = (
                left / w,
                top / h,
                (left + p[2]) / w,
                (top + p[1]) / h,
            )
            masks[slot] = 1.0
            boxes.append(torch.tensor(corners).unsqueeze(0))
            positive_embeddings.append(p[0])

        # Pad the unused slots with zeros up to max_objs.
        pad_boxes = []
        pad_conds = []
        missing = self.max_objs - len(boxes)
        if missing > 0:
            pad_boxes = [torch.zeros([missing, 4], device="cpu")]
            pad_conds = [torch.zeros([missing, self.key_dim], device="cpu")]

        box_out = torch.cat(boxes + pad_boxes).unsqueeze(0).repeat(batch, 1, 1)
        masks = masks.unsqueeze(0).repeat(batch, 1)
        conds = torch.cat(positive_embeddings + pad_conds).unsqueeze(0).repeat(batch, 1, 1)
        return self._set_position(
            box_out.to(device),
            masks.to(device),
            conds.to(device))

    def set_empty(self, latent_image_shape, device):
        """Build the patch function with every object slot masked out."""
        batch, _channels, _h, _w = latent_image_shape
        slots = self.max_objs
        masks = torch.zeros([slots], device="cpu").repeat(batch, 1)
        box_out = torch.zeros([slots, 4], device="cpu").repeat(batch, 1, 1)
        conds = torch.zeros([slots, self.key_dim], device="cpu").repeat(batch, 1, 1)
        return self._set_position(
            box_out.to(device),
            masks.to(device),
            conds.to(device))
|
301 |
+
|
302 |
+
|
303 |
+
def load_gligen(sd):
    """Assemble a ``Gligen`` object from a flat state dict.

    For every ``<section>.<index>.`` prefix (sections ``input_blocks``,
    ``middle_block`` and ``output_blocks``, indices 0-19) the ``.fuser.``
    weights are collected, a ``GatedSelfAttentionDense`` sized from
    ``linear.weight`` is created and the weights are loaded into it. The
    position net is loaded from the ``position_net.*`` entries.

    Args:
        sd: Mapping from parameter name to tensor.

    Returns:
        The constructed ``Gligen`` instance.
    """
    sd_k = sd.keys()
    output_list = []
    key_dim = 768
    for section in ("input_blocks", "middle_block", "output_blocks"):
        for index in range(20):
            prefix = "{}.{}.".format(section, index)
            # Strip everything up to and including ".fuser." from the key.
            n_sd = {
                name.split(".fuser.")[-1]: sd[name]
                for name in sd_k
                if prefix in name and ".fuser." in name
            }
            if not n_sd:
                continue

            query_dim = n_sd["linear.weight"].shape[0]
            key_dim = n_sd["linear.weight"].shape[1]

            if key_dim == 768:  # SD1.x
                n_heads = 8
                d_head = query_dim // n_heads
            else:
                d_head = 64
                n_heads = query_dim // d_head

            gated = GatedSelfAttentionDense(
                query_dim, key_dim, n_heads, d_head)
            gated.load_state_dict(n_sd, strict=False)
            output_list.append(gated)

    # NOTE(review): ``w`` is only bound when the position-net key is present;
    # a state dict without it fails at the ``Gligen(...)`` call below.
    if "position_net.null_positive_feature" in sd_k:
        in_dim = sd["position_net.null_positive_feature"].shape[0]
        out_dim = sd["position_net.linears.4.weight"].shape[0]

        class WeightsLoader(torch.nn.Module):
            pass

        # Wrapper module so the "position_net." key prefix matches on load.
        w = WeightsLoader()
        w.position_net = PositionNet(in_dim, out_dim)
        w.load_state_dict(sd, strict=False)

    return Gligen(output_list, w.position_net, key_dim)
|
ComfyUI/comfy/k_diffusion/sampling.py
ADDED
@@ -0,0 +1,810 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
|
3 |
+
from scipy import integrate
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
import torchsde
|
7 |
+
from tqdm.auto import trange, tqdm
|
8 |
+
|
9 |
+
from . import utils
|
10 |
+
|
11 |
+
|
12 |
+
def append_zero(x):
    """Return ``x`` with a single zero appended (same dtype and device as ``x``)."""
    trailing_zero = x.new_zeros([1])
    return torch.cat((x, trailing_zero))
|
14 |
+
|
15 |
+
|
16 |
+
def get_sigmas_karras(n, sigma_min, sigma_max, rho=7., device='cpu'):
    """Constructs the noise schedule of Karras et al. (2022).

    Interpolates linearly between ``sigma_max ** (1 / rho)`` and
    ``sigma_min ** (1 / rho)`` over ``n`` points, raises the result to the
    power ``rho`` and appends a final zero.
    """
    ramp = torch.linspace(0, 1, n, device=device)
    inv_rho = 1 / rho
    low = sigma_min ** inv_rho
    high = sigma_max ** inv_rho
    sigmas = (high + ramp * (low - high)) ** rho
    return append_zero(sigmas).to(device)
|
23 |
+
|
24 |
+
|
25 |
+
def get_sigmas_exponential(n, sigma_min, sigma_max, device='cpu'):
    """Constructs an exponential noise schedule.

    The ``n`` sigmas are evenly spaced in log space from ``sigma_max`` down
    to ``sigma_min``; a final zero is appended.
    """
    log_max = math.log(sigma_max)
    log_min = math.log(sigma_min)
    log_sigmas = torch.linspace(log_max, log_min, n, device=device)
    return append_zero(log_sigmas.exp())
|
29 |
+
|
30 |
+
|
31 |
+
def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'):
    """Constructs a noise schedule that is polynomial in log sigma.

    A ramp from 1 to 0 is raised to the power ``rho`` and used to blend
    ``log(sigma_max)`` and ``log(sigma_min)``; a final zero is appended.
    """
    ramp = torch.linspace(1, 0, n, device=device) ** rho
    log_max = math.log(sigma_max)
    log_min = math.log(sigma_min)
    sigmas = torch.exp(ramp * (log_max - log_min) + log_min)
    return append_zero(sigmas)
|
36 |
+
|
37 |
+
|
38 |
+
def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'):
    """Constructs a continuous VP noise schedule.

    ``t`` runs linearly from 1 down to ``eps_s`` over ``n`` points and
    ``sigma = sqrt(exp(beta_d * t**2 / 2 + beta_min * t) - 1)``; a final
    zero is appended.
    """
    t = torch.linspace(1, eps_s, n, device=device)
    exponent = beta_d * t ** 2 / 2 + beta_min * t
    sigmas = torch.sqrt(torch.exp(exponent) - 1)
    return append_zero(sigmas)
|
43 |
+
|
44 |
+
|
45 |
+
def to_d(x, sigma, denoised):
    """Converts a denoiser output to a Karras ODE derivative.

    Returns ``(x - denoised) / sigma`` with ``sigma`` expanded via
    ``utils.append_dims`` to ``x.ndim`` dimensions.
    """
    residual = x - denoised
    return residual / utils.append_dims(sigma, x.ndim)
|
48 |
+
|
49 |
+
|
50 |
+
def get_ancestral_step(sigma_from, sigma_to, eta=1.):
    """Compute the noise split for one ancestral sampling step.

    Returns:
        ``(sigma_down, sigma_up)``: the noise level to step down to and the
        amount of noise to add back afterwards. With a falsy ``eta`` the step
        is deterministic and ``(sigma_to, 0.)`` is returned.
    """
    if not eta:
        return sigma_to, 0.
    variance = sigma_to ** 2 * (sigma_from ** 2 - sigma_to ** 2) / sigma_from ** 2
    # The added noise can never exceed the target noise level.
    sigma_up = min(sigma_to, eta * variance ** 0.5)
    sigma_down = (sigma_to ** 2 - sigma_up ** 2) ** 0.5
    return sigma_down, sigma_up
|
58 |
+
|
59 |
+
|
60 |
+
def default_noise_sampler(x):
    """Return a sampler that ignores its sigma arguments and draws ``randn_like(x)``."""

    def sample(sigma, sigma_next):
        return torch.randn_like(x)

    return sample
|
62 |
+
|
63 |
+
|
64 |
+
class BatchedBrownianTree:
    """A wrapper around torchsde.BrownianTree that enables batches of entropy.

    Args:
        x (Tensor): Template tensor; the default ``w0`` is ``zeros_like(x)``.
        t0, t1: Interval end points; they are sorted internally and the
            orientation is remembered in ``self.sign``.
        seed: ``None`` (a random seed is drawn), a single seed, or a sequence
            with one seed per batch item of ``x``.
        **kwargs: ``cpu`` (default ``True``) selects whether the trees are
            built and queried on the CPU; ``w0`` overrides the initial value;
            everything else is forwarded to ``torchsde.BrownianTree``.

    Raises:
        ValueError: If a sequence of seeds does not match ``x.shape[0]``.
    """

    def __init__(self, x, t0, t1, seed=None, **kwargs):
        self.cpu_tree = kwargs.pop("cpu", True)
        t0, t1, self.sign = self.sort(t0, t1)
        # Pop (not get) so ``w0`` is not also forwarded through **kwargs,
        # which would collide with the positional ``w0`` argument below.
        w0 = kwargs.pop('w0', torch.zeros_like(x))
        if seed is None:
            seed = torch.randint(0, 2 ** 63 - 1, []).item()
        # Detect scalar vs per-item seeds outside of an ``assert`` so the
        # detection still works when Python runs with assertions disabled.
        try:
            num_seeds = len(seed)
        except TypeError:
            seed = [seed]
            self.batched = False
        else:
            if num_seeds != x.shape[0]:
                raise ValueError(
                    f'expected {x.shape[0]} seeds (one per batch item), got {num_seeds}')
            w0 = w0[0]
            self.batched = True
        if self.cpu_tree:
            self.trees = [torchsde.BrownianTree(t0.cpu(), w0.cpu(), t1.cpu(), entropy=s, **kwargs) for s in seed]
        else:
            self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed]

    @staticmethod
    def sort(a, b):
        """Return ``(low, high, sign)``; ``sign`` is 1 only when ``a < b``."""
        return (a, b, 1) if a < b else (b, a, -1)

    def __call__(self, t0, t1):
        """Return the Brownian increment between ``t0`` and ``t1``.

        The result is stacked over the trees when per-item seeds were given,
        otherwise the single tree's increment is returned.
        """
        t0, t1, sign = self.sort(t0, t1)
        orientation = self.sign * sign
        if self.cpu_tree:
            # Query on the CPU in float32, then restore t0's dtype and device.
            w = torch.stack([tree(t0.cpu().float(), t1.cpu().float()).to(t0.dtype).to(t0.device) for tree in self.trees]) * orientation
        else:
            w = torch.stack([tree(t0, t1) for tree in self.trees]) * orientation

        return w if self.batched else w[0]
|
99 |
+
|
100 |
+
|
101 |
+
class BrownianTreeNoiseSampler:
    """A noise sampler backed by a torchsde.BrownianTree.

    Args:
        x (Tensor): The tensor whose shape, device and dtype to use to generate
            random samples.
        sigma_min (float): The low end of the valid interval.
        sigma_max (float): The high end of the valid interval.
        seed (int or List[int]): The random seed. If a list of seeds is
            supplied instead of a single integer, then the noise sampler will
            use one BrownianTree per batch item, each with its own seed.
        transform (callable): A function that maps sigma to the sampler's
            internal timestep.
        cpu (bool): Passed through as the ``cpu`` option of
            ``BatchedBrownianTree``.
    """

    def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x, cpu=False):
        self.transform = transform
        t_low = self.transform(torch.as_tensor(sigma_min))
        t_high = self.transform(torch.as_tensor(sigma_max))
        self.tree = BatchedBrownianTree(x, t_low, t_high, seed, cpu=cpu)

    def __call__(self, sigma, sigma_next):
        """Return the tree increment between the two sigmas, divided by sqrt(|t1 - t0|)."""
        t0 = self.transform(torch.as_tensor(sigma))
        t1 = self.transform(torch.as_tensor(sigma_next))
        increment = self.tree(t0, t1)
        return increment / (t1 - t0).abs().sqrt()
|
124 |
+
|
125 |
+
|
126 |
+
@torch.no_grad()
def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """Implements Algorithm 2 (Euler steps) from Karras et al. (2022).

    ``model(x, sigma_batch, **extra_args)`` must return the denoised
    prediction. When ``s_churn`` is positive and the current sigma lies in
    ``[s_tmin, s_tmax]``, fresh noise scaled by ``s_noise`` is mixed in
    before the step. ``callback`` receives a dict per step.
    """
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma = sigmas[i]
        if s_tmin <= sigma <= s_tmax:
            gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1)
        else:
            gamma = 0.
        sigma_hat = sigma * (gamma + 1)
        if gamma > 0:
            # Raise the noise level from sigma to sigma_hat.
            noise = torch.randn_like(x) * s_noise
            x = x + noise * (sigma_hat ** 2 - sigma ** 2) ** 0.5
        denoised = model(x, sigma_hat * s_in, **extra_args)
        d = to_d(x, sigma_hat, denoised)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma_hat, 'denoised': denoised})
        # Euler method
        step = sigmas[i + 1] - sigma_hat
        x = x + d * step
    return x
|
145 |
+
|
146 |
+
|
147 |
+
@torch.no_grad()
def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with Euler method steps.

    Each step moves deterministically to ``sigma_down`` and then, unless the
    next sigma is zero, adds ``sigma_up`` worth of noise drawn from
    ``noise_sampler`` (``default_noise_sampler(x)`` when not given).
    """
    if extra_args is None:
        extra_args = {}
    if noise_sampler is None:
        noise_sampler = default_noise_sampler(x)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        denoised = model(x, sigma * s_in, **extra_args)
        sigma_down, sigma_up = get_ancestral_step(sigma, sigma_next, eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma, 'denoised': denoised})
        d = to_d(x, sigma, denoised)
        # Euler method
        x = x + d * (sigma_down - sigma)
        if sigma_next > 0:
            x = x + noise_sampler(sigma, sigma_next) * s_noise * sigma_up
    return x
|
165 |
+
|
166 |
+
|
167 |
+
@torch.no_grad()
def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """Implements Algorithm 2 (Heun steps) from Karras et al. (2022).

    Uses a second model evaluation at the next sigma to average the two
    slopes; the step onto a zero sigma falls back to a plain Euler step.
    """
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        if s_tmin <= sigma <= s_tmax:
            gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1)
        else:
            gamma = 0.
        sigma_hat = sigma * (gamma + 1)
        if gamma > 0:
            # Raise the noise level from sigma to sigma_hat.
            noise = torch.randn_like(x) * s_noise
            x = x + noise * (sigma_hat ** 2 - sigma ** 2) ** 0.5
        denoised = model(x, sigma_hat * s_in, **extra_args)
        d = to_d(x, sigma_hat, denoised)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma_hat, 'denoised': denoised})
        dt = sigma_next - sigma_hat
        if sigma_next == 0:
            # Euler method
            x = x + d * dt
            continue
        # Heun's method
        x_2 = x + d * dt
        denoised_2 = model(x_2, sigma_next * s_in, **extra_args)
        d_2 = to_d(x_2, sigma_next, denoised_2)
        d_prime = (d + d_2) / 2
        x = x + d_prime * dt
    return x
|
194 |
+
|
195 |
+
|
196 |
+
@torch.no_grad()
def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022).

    Evaluates the model a second time at the log-space midpoint between the
    current and next sigma; the step onto a zero sigma is a plain Euler step.
    """
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        if s_tmin <= sigma <= s_tmax:
            gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1)
        else:
            gamma = 0.
        sigma_hat = sigma * (gamma + 1)
        if gamma > 0:
            # Raise the noise level from sigma to sigma_hat.
            noise = torch.randn_like(x) * s_noise
            x = x + noise * (sigma_hat ** 2 - sigma ** 2) ** 0.5
        denoised = model(x, sigma_hat * s_in, **extra_args)
        d = to_d(x, sigma_hat, denoised)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma_hat, 'denoised': denoised})
        if sigma_next == 0:
            # Euler method
            x = x + d * (sigma_next - sigma_hat)
            continue
        # DPM-Solver-2
        sigma_mid = sigma_hat.log().lerp(sigma_next.log(), 0.5).exp()
        dt_1 = sigma_mid - sigma_hat
        dt_2 = sigma_next - sigma_hat
        x_2 = x + d * dt_1
        denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
        d_2 = to_d(x_2, sigma_mid, denoised_2)
        x = x + d_2 * dt_2
    return x
|
225 |
+
|
226 |
+
|
227 |
+
@torch.no_grad()
def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver second-order steps.

    Each step moves to ``sigma_down`` using a midpoint evaluation in log
    sigma and then adds ``sigma_up`` worth of noise; when ``sigma_down`` is
    zero a plain Euler step is taken instead.
    """
    if extra_args is None:
        extra_args = {}
    if noise_sampler is None:
        noise_sampler = default_noise_sampler(x)
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        denoised = model(x, sigma * s_in, **extra_args)
        sigma_down, sigma_up = get_ancestral_step(sigma, sigma_next, eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma, 'denoised': denoised})
        d = to_d(x, sigma, denoised)
        if sigma_down == 0:
            # Euler method
            x = x + d * (sigma_down - sigma)
        else:
            # DPM-Solver-2
            sigma_mid = sigma.log().lerp(sigma_down.log(), 0.5).exp()
            dt_1 = sigma_mid - sigma
            dt_2 = sigma_down - sigma
            x_2 = x + d * dt_1
            denoised_2 = model(x_2, sigma_mid * s_in, **extra_args)
            d_2 = to_d(x_2, sigma_mid, denoised_2)
            x = x + d_2 * dt_2
            x = x + noise_sampler(sigma, sigma_next) * s_noise * sigma_up
    return x
|
254 |
+
|
255 |
+
|
256 |
+
def linear_multistep_coeff(order, t, i, j):
    """Integrate the ``j``-th Lagrange basis polynomial over ``[t[i], t[i + 1]]``.

    The basis is built on the nodes ``t[i], t[i - 1], ..., t[i - order + 1]``
    and the integral is evaluated numerically with ``scipy.integrate.quad``.

    Raises:
        ValueError: If fewer than ``order`` nodes are available at step ``i``.
    """
    if order - 1 > i:
        raise ValueError(f'Order {order} too high for step {i}')

    def basis(tau):
        value = 1.
        for k in range(order):
            if k != j:
                value *= (tau - t[i - k]) / (t[i - j] - t[i - k])
        return value

    integral, _abs_err = integrate.quad(basis, t[i], t[i + 1], epsrel=1e-4)
    return integral
|
267 |
+
|
268 |
+
|
269 |
+
@torch.no_grad()
def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, order=4):
    """Linear multistep sampler.

    Keeps the last ``order`` derivatives and combines them with the
    coefficients from ``linear_multistep_coeff``; the effective order is
    limited by the number of steps taken so far.
    """
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])
    # The coefficient integration runs on a NumPy copy of the schedule.
    sigmas_cpu = sigmas.detach().cpu().numpy()
    history = []
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        history.append(to_d(x, sigmas[i], denoised))
        if len(history) > order:
            history.pop(0)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        cur_order = min(i + 1, order)
        coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)]
        # Newest derivative pairs with coefficient j = 0.
        update = sum(coeff * d for coeff, d in zip(coeffs, reversed(history)))
        x = x + update
    return x
|
287 |
+
|
288 |
+
|
289 |
+
class PIDStepSizeController:
    """A PID controller for ODE adaptive step size control.

    ``h`` holds the current step size and is rescaled on every call to
    ``propose_step``, whether or not the step is accepted.
    """

    def __init__(self, h, pcoeff, icoeff, dcoeff, order=1, accept_safety=0.81, eps=1e-8):
        self.h = h
        # Exponents applied to the three most recent inverse errors.
        self.b1 = (pcoeff + icoeff + dcoeff) / order
        self.b2 = -(pcoeff + 2 * dcoeff) / order
        self.b3 = dcoeff / order
        self.accept_safety = accept_safety
        self.eps = eps
        self.errs = []

    def limiter(self, x):
        """Squash the raw step factor with an arctangent centred on 1."""
        return 1 + math.atan(x - 1)

    def propose_step(self, error):
        """Update ``self.h`` from ``error`` and return whether the step is accepted."""
        inv_error = 1 / (float(error) + self.eps)
        if self.errs:
            self.errs[0] = inv_error
        else:
            # First call: seed the whole history with the current value.
            self.errs = [inv_error, inv_error, inv_error]
        newest, previous, oldest = self.errs
        raw_factor = newest ** self.b1 * previous ** self.b2 * oldest ** self.b3
        factor = self.limiter(raw_factor)
        accept = factor >= self.accept_safety
        if accept:
            # Shift the history only for accepted steps.
            self.errs[2] = self.errs[1]
            self.errs[1] = self.errs[0]
        self.h *= factor
        return accept
|
316 |
+
|
317 |
+
|
318 |
+
class DPMSolver(nn.Module):
    """DPM-Solver. See https://arxiv.org/abs/2206.00927.

    Works in the time variable ``t = -log(sigma)``. ``model(x, sigma_batch,
    **extra_args)`` must return the denoised prediction. ``eps_callback`` is
    invoked after every model evaluation and ``info_callback`` receives a
    dict per solver step.
    """

    def __init__(self, model, extra_args=None, eps_callback=None, info_callback=None):
        super().__init__()
        self.model = model
        self.extra_args = extra_args if extra_args is not None else {}
        self.eps_callback = eps_callback
        self.info_callback = info_callback

    def t(self, sigma):
        """Map a noise level to solver time, ``-log(sigma)``."""
        return -sigma.log()

    def sigma(self, t):
        """Map solver time back to a noise level, ``exp(-t)``."""
        return t.neg().exp()

    def eps(self, eps_cache, key, x, t, *args, **kwargs):
        """Return ``(eps, cache)`` for ``x`` at time ``t``, reusing ``eps_cache[key]`` if present."""
        if key in eps_cache:
            return eps_cache[key], eps_cache
        sigma = self.sigma(t) * x.new_ones([x.shape[0]])
        denoised = self.model(x, sigma, *args, **self.extra_args, **kwargs)
        eps = (x - denoised) / self.sigma(t)
        if self.eps_callback is not None:
            self.eps_callback()
        # A new dict is returned; the caller's cache is not mutated.
        return eps, {key: eps, **eps_cache}

    def dpm_solver_1_step(self, x, t, t_next, eps_cache=None):
        """Take one first-order step from ``t`` to ``t_next``."""
        if eps_cache is None:
            eps_cache = {}
        h = t_next - t
        eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
        x_1 = x - self.sigma(t_next) * h.expm1() * eps
        return x_1, eps_cache

    def dpm_solver_2_step(self, x, t, t_next, r1=1 / 2, eps_cache=None):
        """Take one second-order step with an intermediate evaluation at ``t + r1 * h``."""
        if eps_cache is None:
            eps_cache = {}
        h = t_next - t
        eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
        s1 = t + r1 * h
        u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps
        eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1)
        sigma_next = self.sigma(t_next)
        x_2 = x - sigma_next * h.expm1() * eps - sigma_next / (2 * r1) * h.expm1() * (eps_r1 - eps)
        return x_2, eps_cache

    def dpm_solver_3_step(self, x, t, t_next, r1=1 / 3, r2=2 / 3, eps_cache=None):
        """Take one third-order step with intermediate evaluations at ``r1`` and ``r2``."""
        if eps_cache is None:
            eps_cache = {}
        h = t_next - t
        eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
        s1 = t + r1 * h
        s2 = t + r2 * h
        u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps
        eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1)
        sigma_s2 = self.sigma(s2)
        u2 = x - sigma_s2 * (r2 * h).expm1() * eps - sigma_s2 * (r2 / r1) * ((r2 * h).expm1() / (r2 * h) - 1) * (eps_r1 - eps)
        eps_r2, eps_cache = self.eps(eps_cache, 'eps_r2', u2, s2)
        sigma_next = self.sigma(t_next)
        x_3 = x - sigma_next * h.expm1() * eps - sigma_next / r2 * (h.expm1() / h - 1) * (eps_r2 - eps)
        return x_3, eps_cache

    def dpm_solver_fast(self, x, t_start, t_end, nfe, eta=0., s_noise=1., noise_sampler=None):
        """Integrate from ``t_start`` to ``t_end`` on a uniform time grid.

        The per-step orders are chosen from ``nfe`` so that the final steps
        use lower-order updates. ``eta`` must be 0 unless ``t_end > t_start``.
        """
        if noise_sampler is None:
            noise_sampler = default_noise_sampler(x)
        if not t_end > t_start and eta:
            raise ValueError('eta must be 0 for reverse sampling')

        m = math.floor(nfe / 3) + 1
        ts = torch.linspace(t_start, t_end, m + 1, device=x.device)

        remainder = nfe % 3
        if remainder == 0:
            orders = [3] * (m - 2) + [2, 1]
        else:
            orders = [3] * (m - 1) + [remainder]

        for i, step_order in enumerate(orders):
            eps_cache = {}
            t, t_next = ts[i], ts[i + 1]
            if eta:
                sd, su = get_ancestral_step(self.sigma(t), self.sigma(t_next), eta)
                t_next_ = torch.minimum(t_end, self.t(sd))
                su = (self.sigma(t_next) ** 2 - self.sigma(t_next_) ** 2) ** 0.5
            else:
                t_next_, su = t_next, 0.

            eps, eps_cache = self.eps(eps_cache, 'eps', x, t)
            denoised = x - self.sigma(t) * eps
            if self.info_callback is not None:
                self.info_callback({'x': x, 'i': i, 't': ts[i], 't_up': t, 'denoised': denoised})

            if step_order == 1:
                x, eps_cache = self.dpm_solver_1_step(x, t, t_next_, eps_cache=eps_cache)
            elif step_order == 2:
                x, eps_cache = self.dpm_solver_2_step(x, t, t_next_, eps_cache=eps_cache)
            else:
                x, eps_cache = self.dpm_solver_3_step(x, t, t_next_, eps_cache=eps_cache)

            x = x + su * s_noise * noise_sampler(self.sigma(t), self.sigma(t_next))

        return x

    def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None):
        """Integrate from ``t_start`` to ``t_end`` with PID-controlled step sizes.

        A lower- and a higher-order estimate are compared on every trial
        step; the step is accepted or rejected by a ``PIDStepSizeController``.

        Returns:
            ``(x, info)`` where ``info`` counts steps, model evaluations and
            accepted/rejected steps.

        Raises:
            ValueError: If ``order`` is not 2 or 3, or ``eta`` is non-zero
                when ``t_end`` is not greater than ``t_start``.
        """
        if noise_sampler is None:
            noise_sampler = default_noise_sampler(x)
        if order not in {2, 3}:
            raise ValueError('order should be 2 or 3')
        forward = t_end > t_start
        if not forward and eta:
            raise ValueError('eta must be 0 for reverse sampling')
        # The initial step size takes the sign of the integration direction.
        h_init = abs(h_init) * (1 if forward else -1)
        atol = torch.tensor(atol)
        rtol = torch.tensor(rtol)
        s = t_start
        x_prev = x
        accept = True
        pid = PIDStepSizeController(h_init, pcoeff, icoeff, dcoeff, 1.5 if eta else order, accept_safety)
        info = {'steps': 0, 'nfe': 0, 'n_accept': 0, 'n_reject': 0}

        while (s < t_end - 1e-5) if forward else (s > t_end + 1e-5):
            eps_cache = {}
            # Clamp the trial step so it never passes t_end.
            if forward:
                t = torch.minimum(t_end, s + pid.h)
            else:
                t = torch.maximum(t_end, s + pid.h)
            if eta:
                sd, su = get_ancestral_step(self.sigma(s), self.sigma(t), eta)
                t_ = torch.minimum(t_end, self.t(sd))
                su = (self.sigma(t) ** 2 - self.sigma(t_) ** 2) ** 0.5
            else:
                t_, su = t, 0.

            eps, eps_cache = self.eps(eps_cache, 'eps', x, s)
            denoised = x - self.sigma(s) * eps

            if order == 2:
                x_low, eps_cache = self.dpm_solver_1_step(x, s, t_, eps_cache=eps_cache)
                x_high, eps_cache = self.dpm_solver_2_step(x, s, t_, eps_cache=eps_cache)
            else:
                x_low, eps_cache = self.dpm_solver_2_step(x, s, t_, r1=1 / 3, eps_cache=eps_cache)
                x_high, eps_cache = self.dpm_solver_3_step(x, s, t_, eps_cache=eps_cache)
            delta = torch.maximum(atol, rtol * torch.maximum(x_low.abs(), x_prev.abs()))
            error = torch.linalg.norm((x_low - x_high) / delta) / x.numel() ** 0.5
            accept = pid.propose_step(error)
            if accept:
                x_prev = x_low
                x = x_high + su * s_noise * noise_sampler(self.sigma(s), self.sigma(t))
                s = t
                info['n_accept'] += 1
            else:
                info['n_reject'] += 1
            info['nfe'] += order
            info['steps'] += 1

            if self.info_callback is not None:
                self.info_callback({'x': x, 'i': info['steps'] - 1, 't': s, 't_up': s, 'denoised': denoised, 'error': error, 'h': pid.h, **info})

        return x, info
|
464 |
+
|
465 |
+
|
466 |
+
@torch.no_grad()
def sample_dpm_fast(model, x, sigma_min, sigma_max, n, extra_args=None, callback=None, disable=None, eta=0., s_noise=1., noise_sampler=None):
    """DPM-Solver-Fast (fixed step size). See https://arxiv.org/abs/2206.00927.

    Integrates from ``sigma_max`` down to ``sigma_min`` with ``n`` function
    evaluations; the progress bar advances once per model evaluation.

    Raises:
        ValueError: If ``sigma_min`` or ``sigma_max`` is not positive.
    """
    if sigma_min <= 0 or sigma_max <= 0:
        raise ValueError('sigma_min and sigma_max must not be 0')
    with tqdm(total=n, disable=disable) as pbar:
        dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update)
        if callback is not None:
            def report(info):
                # Add sigma-space values alongside the solver's time-space info.
                return callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})

            dpm_solver.info_callback = report
        t_start = dpm_solver.t(torch.tensor(sigma_max))
        t_end = dpm_solver.t(torch.tensor(sigma_min))
        return dpm_solver.dpm_solver_fast(x, t_start, t_end, n, eta, s_noise, noise_sampler)
|
476 |
+
|
477 |
+
|
478 |
+
@torch.no_grad()
def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callback=None, disable=None, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None, return_info=False):
    """DPM-Solver-12 and 23 (adaptive step size). See https://arxiv.org/abs/2206.00927.

    Integrates from ``sigma_max`` down to ``sigma_min``. Returns ``x``, or
    ``(x, info)`` when ``return_info`` is true.

    Raises:
        ValueError: If ``sigma_min`` or ``sigma_max`` is not positive.
    """
    if sigma_min <= 0 or sigma_max <= 0:
        raise ValueError('sigma_min and sigma_max must not be 0')
    with tqdm(disable=disable) as pbar:
        dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update)
        if callback is not None:
            def report(info):
                # Add sigma-space values alongside the solver's time-space info.
                return callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info})

            dpm_solver.info_callback = report
        t_start = dpm_solver.t(torch.tensor(sigma_max))
        t_end = dpm_solver.t(torch.tensor(sigma_min))
        x, info = dpm_solver.dpm_solver_adaptive(x, t_start, t_end, order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise, noise_sampler)
    if return_info:
        return x, info
    return x
|
491 |
+
|
492 |
+
|
493 |
+
@torch.no_grad()
def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Ancestral sampling with DPM-Solver++(2S) second-order steps.

    Each step moves to ``sigma_down`` with a midpoint evaluation in
    ``t = -log(sigma)`` (plain Euler when ``sigma_down`` is zero) and then,
    unless the next sigma is zero, adds ``sigma_up`` worth of noise.
    """
    if extra_args is None:
        extra_args = {}
    if noise_sampler is None:
        noise_sampler = default_noise_sampler(x)
    s_in = x.new_ones([x.shape[0]])

    def sigma_fn(t):
        return t.neg().exp()

    def t_fn(sigma):
        return sigma.log().neg()

    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        denoised = model(x, sigma * s_in, **extra_args)
        sigma_down, sigma_up = get_ancestral_step(sigma, sigma_next, eta=eta)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigma, 'sigma_hat': sigma, 'denoised': denoised})
        if sigma_down == 0:
            # Euler method
            d = to_d(x, sigma, denoised)
            x = x + d * (sigma_down - sigma)
        else:
            # DPM-Solver++(2S)
            t, t_next = t_fn(sigma), t_fn(sigma_down)
            r = 1 / 2
            h = t_next - t
            s = t + r * h
            x_2 = (sigma_fn(s) / sigma_fn(t)) * x - (-h * r).expm1() * denoised
            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)
            x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_2
        # Noise addition
        if sigma_next > 0:
            x = x + noise_sampler(sigma, sigma_next) * s_noise * sigma_up
    return x
|
525 |
+
|
526 |
+
|
527 |
+
@torch.no_grad()
def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """DPM-Solver++ (stochastic).

    Args:
        model: Callable ``model(x, sigma_batch, **extra_args)`` returning the
            denoised prediction.
        x: Initial noisy sample.
        sigmas: Noise schedule; at least one entry must be positive.
        extra_args: Extra keyword arguments for ``model``. An optional
            ``"seed"`` entry seeds the default Brownian-tree noise sampler.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Passed to ``trange`` to switch the progress bar off.
        eta: Ancestral noise factor passed to ``get_ancestral_step``.
        s_noise: Multiplier applied to the injected noise.
        noise_sampler: Optional ``(sigma, sigma_next) -> noise`` callable;
            defaults to a CPU-backed ``BrownianTreeNoiseSampler``.
        r: Position of the intermediate evaluation within each step.

    Returns:
        The sample after the last step of the schedule.
    """
    # Normalise extra_args before reading the seed so that the default
    # ``extra_args=None`` no longer fails on ``None.get``.
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    if noise_sampler is None:
        noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True)
    s_in = x.new_ones([x.shape[0]])

    def sigma_fn(t):
        return t.neg().exp()

    def t_fn(sigma):
        return sigma.log().neg()

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            # Euler method
            d = to_d(x, sigmas[i], denoised)
            dt = sigmas[i + 1] - sigmas[i]
            x = x + d * dt
        else:
            # DPM-Solver++
            t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
            h = t_next - t
            s = t + h * r
            fac = 1 / (2 * r)

            # Step 1
            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(s), eta)
            s_ = t_fn(sd)
            x_2 = (sigma_fn(s_) / sigma_fn(t)) * x - (t - s_).expm1() * denoised
            x_2 = x_2 + noise_sampler(sigma_fn(t), sigma_fn(s)) * s_noise * su
            denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args)

            # Step 2
            sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(t_next), eta)
            t_next_ = t_fn(sd)
            denoised_d = (1 - fac) * denoised + fac * denoised_2
            x = (sigma_fn(t_next_) / sigma_fn(t)) * x - (t - t_next_).expm1() * denoised_d
            x = x + noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * su
    return x
|
568 |
+
|
569 |
+
|
570 |
+
@torch.no_grad()
def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, disable=None):
    """DPM-Solver++(2M).

    A two-step multistep method in ``t = -log(sigma)``: the previous
    denoised prediction is reused to extrapolate, except on the first step
    and on the step onto a zero sigma, which use the first-order update.
    """
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])

    def sigma_fn(t):
        return t.neg().exp()

    def t_fn(sigma):
        return sigma.log().neg()

    old_denoised = None

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1])
        h = t_next - t
        first_order = old_denoised is None or sigmas[i + 1] == 0
        if first_order:
            target = denoised
        else:
            # Extrapolate using the previous step's prediction.
            h_last = t - t_fn(sigmas[i - 1])
            r = h_last / h
            target = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised
        x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * target
        old_denoised = denoised
    return x
|
594 |
+
|
595 |
+
@torch.no_grad()
def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    """DPM-Solver++(2M) SDE.

    Args:
        model: Denoiser, called as ``model(x, sigma * s_in, **extra_args)``.
        x: Starting latent; ``x.shape[0]`` is used as the batch size.
        sigmas: Tensor of noise levels, indexed once per step. A step whose
            next sigma is 0 returns the model prediction directly.
        extra_args: Extra keyword arguments for ``model``. Its optional
            ``"seed"`` entry seeds the default noise sampler.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Forwarded to ``trange`` as ``disable``.
        eta: Scales the step size used for the stochastic terms; noise is
            only injected when it is truthy.
        s_noise: Multiplier applied to the injected noise.
        noise_sampler: Callable ``(sigma, sigma_next) -> noise``. A CPU
            ``BrownianTreeNoiseSampler`` is built when omitted.
        solver_type: ``'heun'`` or ``'midpoint'``; selects the second-order
            correction term.

    Returns:
        The latent after the final step.

    Raises:
        ValueError: If ``solver_type`` is not ``'heun'`` or ``'midpoint'``.
    """

    if solver_type not in {'heun', 'midpoint'}:
        raise ValueError('solver_type must be \'heun\' or \'midpoint\'')

    # Normalise first: the seed lookup below would raise AttributeError on the
    # default ``extra_args=None`` otherwise.
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])

    old_denoised = None
    h_last = None
    # Must exist before the loop: ``h_last = h`` runs even on a final
    # (sigma_next == 0) step, which never assigns ``h`` itself.
    h = None

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            # DPM-Solver++(2M) SDE
            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
            h = s - t
            eta_h = eta * h

            x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised

            if old_denoised is not None:
                # Second-order correction using the previous prediction.
                r = h_last / h
                if solver_type == 'heun':
                    x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised)
                elif solver_type == 'midpoint':
                    x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised)

            if eta:
                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise

        old_denoised = denoised
        h_last = h
    return x
|
640 |
+
|
641 |
+
@torch.no_grad()
def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """DPM-Solver++(3M) SDE.

    Keeps the two previous model predictions and step sizes. The correction
    uses both when available, only the most recent one on the second step,
    and none on the first step.

    Args:
        model: Denoiser, called as ``model(x, sigma * s_in, **extra_args)``.
        x: Starting latent; ``x.shape[0]`` is used as the batch size.
        sigmas: Tensor of noise levels, indexed once per step. A step whose
            next sigma is 0 returns the model prediction directly.
        extra_args: Extra keyword arguments for ``model``. Its optional
            ``"seed"`` entry seeds the default noise sampler.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Forwarded to ``trange`` as ``disable``.
        eta: Scales the step size (``h_eta = h * (eta + 1)``); noise is only
            injected when it is truthy.
        s_noise: Multiplier applied to the injected noise.
        noise_sampler: Callable ``(sigma, sigma_next) -> noise``. A CPU
            ``BrownianTreeNoiseSampler`` is built when omitted.

    Returns:
        The latent after the final step.
    """

    # Normalise first: the seed lookup below would raise AttributeError on the
    # default ``extra_args=None`` otherwise.
    extra_args = {} if extra_args is None else extra_args
    seed = extra_args.get("seed", None)
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=seed, cpu=True) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])

    denoised_1, denoised_2 = None, None
    h, h_1, h_2 = None, None, None

    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        if sigmas[i + 1] == 0:
            # Denoising step
            x = denoised
        else:
            t, s = -sigmas[i].log(), -sigmas[i + 1].log()
            h = s - t
            h_eta = h * (eta + 1)

            x = torch.exp(-h_eta) * x + (-h_eta).expm1().neg() * denoised

            if h_2 is not None:
                # Two previous steps available: use both difference terms.
                r0 = h_1 / h
                r1 = h_2 / h
                d1_0 = (denoised - denoised_1) / r0
                d1_1 = (denoised_1 - denoised_2) / r1
                d1 = d1_0 + (d1_0 - d1_1) * r0 / (r0 + r1)
                d2 = (d1_0 - d1_1) / (r0 + r1)
                phi_2 = h_eta.neg().expm1() / h_eta + 1
                phi_3 = phi_2 / h_eta - 0.5
                x = x + phi_2 * d1 - phi_3 * d2
            elif h_1 is not None:
                # Only one previous step available: single difference term.
                r = h_1 / h
                d = (denoised - denoised_1) / r
                phi_2 = h_eta.neg().expm1() / h_eta + 1
                x = x + phi_2 * d

            if eta:
                x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * h * eta).expm1().neg().sqrt() * s_noise

        denoised_1, denoised_2 = denoised, denoised_1
        h_1, h_2 = h, h_1
    return x
|
690 |
+
|
691 |
+
@torch.no_grad()
def sample_dpmpp_3m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None):
    """Run ``sample_dpmpp_3m_sde`` with a ``cpu=False`` Brownian noise sampler.

    All arguments are forwarded unchanged to ``sample_dpmpp_3m_sde``. When
    ``noise_sampler`` is omitted, a ``BrownianTreeNoiseSampler`` is created
    with ``cpu=False`` and seeded from ``extra_args["seed"]`` if present.
    """
    # The default ``extra_args=None`` has no ``.get``; normalise before the
    # seed lookup and forward the normalised dict.
    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_3m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler)
|
696 |
+
|
697 |
+
@torch.no_grad()
def sample_dpmpp_2m_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'):
    """Run ``sample_dpmpp_2m_sde`` with a ``cpu=False`` Brownian noise sampler.

    All arguments are forwarded unchanged to ``sample_dpmpp_2m_sde``. When
    ``noise_sampler`` is omitted, a ``BrownianTreeNoiseSampler`` is created
    with ``cpu=False`` and seeded from ``extra_args["seed"]`` if present.
    """
    # The default ``extra_args=None`` has no ``.get``; normalise before the
    # seed lookup and forward the normalised dict.
    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_2m_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, solver_type=solver_type)
|
702 |
+
|
703 |
+
@torch.no_grad()
def sample_dpmpp_sde_gpu(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2):
    """Run ``sample_dpmpp_sde`` with a ``cpu=False`` Brownian noise sampler.

    All arguments are forwarded unchanged to ``sample_dpmpp_sde``. When
    ``noise_sampler`` is omitted, a ``BrownianTreeNoiseSampler`` is created
    with ``cpu=False`` and seeded from ``extra_args["seed"]`` if present.
    """
    # The default ``extra_args=None`` has no ``.get``; normalise before the
    # seed lookup and forward the normalised dict.
    extra_args = {} if extra_args is None else extra_args
    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max()
    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max, seed=extra_args.get("seed", None), cpu=False) if noise_sampler is None else noise_sampler
    return sample_dpmpp_sde(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, eta=eta, s_noise=s_noise, noise_sampler=noise_sampler, r=r)
|
708 |
+
|
709 |
+
|
710 |
+
def DDPMSampler_step(x, sigma, sigma_prev, noise, noise_sampler):
    """Compute one DDPM-style update from ``sigma`` to ``sigma_prev``.

    Args:
        x: Current sample.
        sigma: Current noise level (tensor; ``.sqrt()`` is called on values
            derived from it).
        sigma_prev: Target noise level. Fresh noise is only added when it is
            greater than 0.
        noise: Noise estimate subtracted from ``x``.
        noise_sampler: Callable ``(sigma, sigma_prev) -> noise`` used for the
            stochastic term.

    Returns:
        The updated sample.
    """
    acp_now = 1 / ((sigma * sigma) + 1)
    acp_prev = 1 / ((sigma_prev * sigma_prev) + 1)
    alpha = (acp_now / acp_prev)
    beta = 1 - alpha

    mu = (1.0 / alpha).sqrt() * (x - beta * noise / (1 - acp_now).sqrt())
    if sigma_prev > 0:
        noise_scale = (beta * (1. - acp_prev) / (1. - acp_now)).sqrt()
        mu += noise_scale * noise_sampler(sigma, sigma_prev)
    return mu
|
719 |
+
|
720 |
+
def generic_step_sampler(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, step_function=None):
    """Drive a sampling loop that delegates each update to ``step_function``.

    Each step calls the model, then calls
    ``step_function(x_scaled, sigma, sigma_next, eps, noise_sampler)`` where
    ``x_scaled = x / sqrt(1 + sigma**2)`` and ``eps = (x - denoised) / sigma``.
    The result is multiplied by ``sqrt(1 + sigma_next**2)`` unless
    ``sigma_next`` is 0.

    Args:
        model: Denoiser, called as ``model(x, sigma * s_in, **extra_args)``.
        x: Starting latent; ``x.shape[0]`` is used as the batch size.
        sigmas: Tensor of noise levels, indexed once per step.
        extra_args: Extra keyword arguments for ``model``.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Forwarded to ``trange`` as ``disable``.
        noise_sampler: Noise callable handed to ``step_function``; defaults
            to ``default_noise_sampler(x)``.
        step_function: The per-step update rule.

    Returns:
        The latent after the final step.
    """
    if extra_args is None:
        extra_args = {}
    if noise_sampler is None:
        noise_sampler = default_noise_sampler(x)
    s_in = x.new_ones([x.shape[0]])

    for i in trange(len(sigmas) - 1, disable=disable):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]
        denoised = model(x, sigma * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
        x_scaled = x / torch.sqrt(1.0 + sigma ** 2.0)
        eps = (x - denoised) / sigma
        x = step_function(x_scaled, sigma, sigma_next, eps, noise_sampler)
        if sigma_next != 0:
            x *= torch.sqrt(1.0 + sigma_next ** 2.0)
    return x
|
733 |
+
|
734 |
+
|
735 |
+
@torch.no_grad()
def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    """Sample by running ``generic_step_sampler`` with ``DDPMSampler_step``."""
    return generic_step_sampler(
        model,
        x,
        sigmas,
        extra_args,
        callback,
        disable,
        noise_sampler,
        DDPMSampler_step,
    )
|
738 |
+
|
739 |
+
@torch.no_grad()
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
    """LCM-style sampling: take the model prediction, then re-noise it.

    After every model call the sample becomes the prediction plus
    ``sigma_next * noise``. No noise is added when ``sigma_next`` is not
    greater than 0.

    Args:
        model: Denoiser, called as ``model(x, sigma * s_in, **extra_args)``.
        x: Starting latent; ``x.shape[0]`` is used as the batch size.
        sigmas: Tensor of noise levels, indexed once per step.
        extra_args: Extra keyword arguments for ``model``.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Forwarded to ``trange`` as ``disable``.
        noise_sampler: Callable ``(sigma, sigma_next) -> noise``; defaults to
            ``default_noise_sampler(x)``.

    Returns:
        The latent after the final step.
    """
    extra_args = {} if extra_args is None else extra_args
    noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler
    s_in = x.new_ones([x.shape[0]])
    for i in trange(len(sigmas) - 1, disable=disable):
        denoised = model(x, sigmas[i] * s_in, **extra_args)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})

        if sigmas[i + 1] > 0:
            # Work on a copy: the callback was just handed ``denoised`` and
            # may still hold a reference, so it must not be mutated in place.
            x = denoised.clone()
            x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
        else:
            x = denoised
    return x
|
753 |
+
|
754 |
+
|
755 |
+
|
756 |
+
@torch.no_grad()
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
    """Heun++ sampler with one, two or three derivative evaluations per step.

    The step that lands on the final sigma is a plain Euler step, the step
    before it blends two derivatives, and all earlier steps blend three. The
    blend weights are taken from the sigmas relative to ``sigmas[0]``.

    Args:
        model: Denoiser, called as ``model(x, sigma * s_in, **extra_args)``.
        x: Starting latent; ``x.shape[0]`` is used as the batch size.
        sigmas: Tensor of noise levels; ``sigmas[-1]`` marks the end.
        extra_args: Extra keyword arguments for ``model``.
        callback: Optional per-step callable receiving a dict with ``x``,
            ``i``, ``sigma``, ``sigma_hat`` and ``denoised``.
        disable: Forwarded to ``trange`` as ``disable``.
        s_churn: Controls ``gamma``, the relative sigma increase applied
            before a step.
        s_tmin: Lower sigma bound for applying ``gamma``.
        s_tmax: Upper sigma bound for applying ``gamma``.
        s_noise: Multiplier on the noise added when ``gamma > 0``.

    Returns:
        The latent after the final step.
    """
    # From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
    if extra_args is None:
        extra_args = {}
    s_in = x.new_ones([x.shape[0]])
    s_end = sigmas[-1]
    for i in trange(len(sigmas) - 1, disable=disable):
        gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
        # Drawn on every step, even when unused, so the RNG stream is the same
        # regardless of gamma.
        eps = torch.randn_like(x) * s_noise
        sigma_hat = sigmas[i] * (gamma + 1)
        if gamma > 0:
            x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
        denoised = model(x, sigma_hat * s_in, **extra_args)
        d = to_d(x, sigma_hat, denoised)
        if callback is not None:
            callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
        dt = sigmas[i + 1] - sigma_hat

        if sigmas[i + 1] == s_end:
            # Euler method
            x = x + d * dt
            continue

        # Both remaining variants need a second derivative at sigmas[i + 1].
        x_2 = x + d * dt
        denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
        d_2 = to_d(x_2, sigmas[i + 1], denoised_2)

        if sigmas[i + 2] == s_end:
            # Heun's method
            w = 2 * sigmas[0]
            w2 = sigmas[i + 1] / w
            w1 = 1 - w2

            d_prime = d * w1 + d_2 * w2
        else:
            # Heun++
            dt_2 = sigmas[i + 2] - sigmas[i + 1]

            x_3 = x_2 + d_2 * dt_2
            denoised_3 = model(x_3, sigmas[i + 2] * s_in, **extra_args)
            d_3 = to_d(x_3, sigmas[i + 2], denoised_3)

            w = 3 * sigmas[0]
            w2 = sigmas[i + 1] / w
            w3 = sigmas[i + 2] / w
            w1 = 1 - w2 - w3

            d_prime = w1 * d + w2 * d_2 + w3 * d_3
        x = x + d_prime * dt
    return x
|
ComfyUI/comfy/k_diffusion/utils.py
ADDED
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from contextlib import contextmanager
|
2 |
+
import hashlib
|
3 |
+
import math
|
4 |
+
from pathlib import Path
|
5 |
+
import shutil
|
6 |
+
import urllib
|
7 |
+
import warnings
|
8 |
+
|
9 |
+
from PIL import Image
|
10 |
+
import torch
|
11 |
+
from torch import nn, optim
|
12 |
+
from torch.utils import data
|
13 |
+
|
14 |
+
|
15 |
+
def hf_datasets_augs_helper(examples, transform, image_key, mode='RGB'):
    """Convert and transform every image stored under ``image_key``.

    Each image in ``examples[image_key]`` is converted with
    ``image.convert(mode)`` and passed through ``transform``.

    Returns:
        A dict with the single key ``image_key`` mapping to the list of
        transformed images.
    """
    transformed = []
    for image in examples[image_key]:
        transformed.append(transform(image.convert(mode)))
    return {image_key: transformed}
|
19 |
+
|
20 |
+
|
21 |
+
def append_dims(x, target_dims):
    """Append trailing singleton axes until ``x`` has ``target_dims`` dimensions.

    Raises:
        ValueError: If ``x`` already has more than ``target_dims`` dimensions.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
    index = (...,) + (None,) * missing
    expanded = x[index]
    # MPS will get inf values if it tries to index into the new axes, but detaching fixes this.
    # https://github.com/pytorch/pytorch/issues/84364
    if expanded.device.type == 'mps':
        return expanded.detach().clone()
    return expanded
|
30 |
+
|
31 |
+
|
32 |
+
def n_params(module):
    """Returns the total element count over everything in ``module.parameters()``.

    Every parameter is counted, whether or not it requires gradients.
    """
    total = 0
    for param in module.parameters():
        total += param.numel()
    return total
|
35 |
+
|
36 |
+
|
37 |
+
def download_file(path, url, digest=None):
    """Downloads a file if it does not exist, optionally checking its SHA-256 hash.

    Args:
        path: Destination path; missing parent directories are created.
        url: URL to fetch when ``path`` does not exist yet.
        digest: Optional expected SHA-256 hex digest of the file. It is
            checked whether or not a download took place.

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        OSError: If ``digest`` is given and does not match the file.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if not path.exists():
        with urllib.request.urlopen(url) as response, open(path, 'wb') as f:
            shutil.copyfileobj(response, f)
    if digest is not None:
        # Hash in chunks and close the handle deterministically, instead of
        # reading the whole file through an unclosed file object.
        hasher = hashlib.sha256()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                hasher.update(chunk)
        if digest != hasher.hexdigest():
            raise OSError(f'hash of {path} (url: {url}) failed to validate')
    return path
|
49 |
+
|
50 |
+
|
51 |
+
@contextmanager
def train_mode(model, mode=True):
    """Context manager that switches ``model`` via ``model.train(mode)``.

    The ``training`` flag of every submodule is recorded on entry and
    written back on exit, including when the body raises. Yields the value
    returned by ``model.train(mode)``.
    """
    saved_flags = [submodule.training for submodule in model.modules()]
    try:
        yield model.train(mode)
    finally:
        # Restore by position, walking model.modules() again.
        for index, submodule in enumerate(model.modules()):
            submodule.training = saved_flags[index]
|
61 |
+
|
62 |
+
|
63 |
+
def eval_mode(model):
    """Context manager for evaluation mode.

    Equivalent to ``train_mode(model, False)``; the previous mode is restored
    on exit.
    """
    return train_mode(model, mode=False)
|
67 |
+
|
68 |
+
|
69 |
+
@torch.no_grad()
def ema_update(model, averaged_model, decay):
    """Fold the current weights of ``model`` into ``averaged_model`` in place.

    Each averaged parameter becomes ``decay * averaged + (1 - decay) * param``.
    Buffers are copied over unchanged. Both models must expose the same
    parameter and buffer names; this is checked with ``assert``. It should be
    called after each optimizer step.
    """
    source_params = dict(model.named_parameters())
    target_params = dict(averaged_model.named_parameters())
    assert source_params.keys() == target_params.keys()

    for name in source_params:
        target_params[name].mul_(decay).add_(source_params[name], alpha=1 - decay)

    source_buffers = dict(model.named_buffers())
    target_buffers = dict(averaged_model.named_buffers())
    assert source_buffers.keys() == target_buffers.keys()

    for name in source_buffers:
        target_buffers[name].copy_(source_buffers[name])
|
86 |
+
|
87 |
+
|
88 |
+
class EMAWarmup:
    """Implements an EMA warmup using an inverse decay schedule.
    If inv_gamma=1 and power=1, implements a simple average. inv_gamma=1, power=2/3 are
    good values for models you plan to train for a million or more steps (reaches decay
    factor 0.999 at 31.6K steps, 0.9999 at 1M steps), inv_gamma=1, power=3/4 for models
    you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at
    215.4k steps).
    Args:
        inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
        power (float): Exponential factor of EMA warmup. Default: 1.
        min_value (float): The minimum EMA decay rate. Default: 0.
        max_value (float): The maximum EMA decay rate. Default: 1.
        start_at (int): The epoch to start averaging at. Default: 0.
        last_epoch (int): The index of last epoch. Default: 0.
    """

    def __init__(self, inv_gamma=1., power=1., min_value=0., max_value=1., start_at=0,
                 last_epoch=0):
        self.inv_gamma = inv_gamma
        self.power = power
        self.min_value = min_value
        self.max_value = max_value
        self.start_at = start_at
        self.last_epoch = last_epoch

    def state_dict(self):
        """Returns the state of the class as a :class:`dict`."""
        return dict(self.__dict__.items())

    def load_state_dict(self, state_dict):
        """Loads the class's state.
        Args:
            state_dict (dict): EMA warmup state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_value(self):
        """Gets the current EMA decay rate.

        The number of epochs since ``start_at`` is clamped at 0, so before
        ``start_at`` the schedule value is 0 and the result is ``min_value``
        (or ``max_value`` if that is smaller).
        """
        # ``epoch`` is never negative after the clamp, so no separate
        # "before start" branch is needed.
        epoch = max(0, self.last_epoch - self.start_at)
        value = 1 - (1 + epoch / self.inv_gamma) ** -self.power
        return min(self.max_value, max(self.min_value, value))

    def step(self):
        """Updates the step count."""
        self.last_epoch += 1
|
134 |
+
|
135 |
+
|
136 |
+
class InverseLR(optim.lr_scheduler._LRScheduler):
    """Implements an inverse decay learning rate schedule with an optional exponential
    warmup. When last_epoch=-1, sets initial lr as lr.
    inv_gamma is the number of steps/epochs required for the learning rate to decay to
    (1 / 2)**power of its original value.
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        inv_gamma (float): Inverse multiplicative factor of learning rate decay. Default: 1.
        power (float): Exponential factor of learning rate decay. Default: 1.
        warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable)
            Default: 0.
        min_lr (float): The minimum learning rate. Default: 0.
        last_epoch (int): The index of last epoch. Default: -1.
        verbose (bool): If ``True``, prints a message to stdout for
            each update. Default: ``False``.
    Raises:
        ValueError: If ``warmup`` is outside ``[0, 1)``.
    """

    def __init__(self, optimizer, inv_gamma=1., power=1., warmup=0., min_lr=0.,
                 last_epoch=-1, verbose=False):
        self.inv_gamma = inv_gamma
        self.power = power
        if not 0. <= warmup < 1:
            raise ValueError('Invalid value for warmup')
        self.warmup = warmup
        self.min_lr = min_lr
        # The base-class ``verbose`` argument is deprecated since PyTorch 2.2;
        # only forward it when the caller explicitly asked for verbose output.
        if verbose:
            super().__init__(optimizer, last_epoch, verbose)
        else:
            super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the current learning rates, warning if called outside ``step()``."""
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.")

        return self._get_closed_form_lr()

    def _get_closed_form_lr(self):
        """Compute one learning rate per base lr for ``self.last_epoch``."""
        warmup = 1 - self.warmup ** (self.last_epoch + 1)
        lr_mult = (1 + self.last_epoch / self.inv_gamma) ** -self.power
        return [warmup * max(self.min_lr, base_lr * lr_mult)
                for base_lr in self.base_lrs]
|
175 |
+
|
176 |
+
|
177 |
+
class ExponentialLR(optim.lr_scheduler._LRScheduler):
    """Implements an exponential learning rate schedule with an optional exponential
    warmup. When last_epoch=-1, sets initial lr as lr. Decays the learning rate
    continuously by decay (default 0.5) every num_steps steps.
    Args:
        optimizer (Optimizer): Wrapped optimizer.
        num_steps (float): The number of steps to decay the learning rate by decay in.
        decay (float): The factor by which to decay the learning rate every num_steps
            steps. Default: 0.5.
        warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable)
            Default: 0.
        min_lr (float): The minimum learning rate. Default: 0.
        last_epoch (int): The index of last epoch. Default: -1.
        verbose (bool): If ``True``, prints a message to stdout for
            each update. Default: ``False``.
    Raises:
        ValueError: If ``warmup`` is outside ``[0, 1)``.
    """

    def __init__(self, optimizer, num_steps, decay=0.5, warmup=0., min_lr=0.,
                 last_epoch=-1, verbose=False):
        self.num_steps = num_steps
        self.decay = decay
        if not 0. <= warmup < 1:
            raise ValueError('Invalid value for warmup')
        self.warmup = warmup
        self.min_lr = min_lr
        # The base-class ``verbose`` argument is deprecated since PyTorch 2.2;
        # only forward it when the caller explicitly asked for verbose output.
        if verbose:
            super().__init__(optimizer, last_epoch, verbose)
        else:
            super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the current learning rates, warning if called outside ``step()``."""
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.")

        return self._get_closed_form_lr()

    def _get_closed_form_lr(self):
        """Compute one learning rate per base lr for ``self.last_epoch``."""
        warmup = 1 - self.warmup ** (self.last_epoch + 1)
        lr_mult = (self.decay ** (1 / self.num_steps)) ** self.last_epoch
        return [warmup * max(self.min_lr, base_lr * lr_mult)
                for base_lr in self.base_lrs]
|
216 |
+
|
217 |
+
|
218 |
+
def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32):
    """Draws lognormal samples: ``exp(loc + scale * z)`` with ``z`` standard normal."""
    gaussian = torch.randn(shape, device=device, dtype=dtype)
    log_samples = gaussian * scale + loc
    return log_samples.exp()
|
221 |
+
|
222 |
+
|
223 |
+
def rand_log_logistic(shape, loc=0., scale=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32):
    """Draws samples from an optionally truncated log-logistic distribution.

    A uniform draw is restricted to the CDF range spanned by ``min_value``
    and ``max_value`` and mapped back through the inverse CDF. The
    computation runs in float64 and the result is cast to ``dtype``.
    """
    lower = torch.as_tensor(min_value, device=device, dtype=torch.float64)
    upper = torch.as_tensor(max_value, device=device, dtype=torch.float64)
    cdf_lower = lower.log().sub(loc).div(scale).sigmoid()
    cdf_upper = upper.log().sub(loc).div(scale).sigmoid()
    uniform = torch.rand(shape, device=device, dtype=torch.float64)
    u = uniform * (cdf_upper - cdf_lower) + cdf_lower
    log_samples = u.logit().mul(scale).add(loc)
    return log_samples.exp().to(dtype)
|
231 |
+
|
232 |
+
|
233 |
+
def rand_log_uniform(shape, min_value, max_value, device='cpu', dtype=torch.float32):
    """Draws samples whose logarithm is uniform on ``[log(min_value), log(max_value)]``."""
    log_min = math.log(min_value)
    log_max = math.log(max_value)
    uniform = torch.rand(shape, device=device, dtype=dtype)
    return (uniform * (log_max - log_min) + log_min).exp()
|
238 |
+
|
239 |
+
|
240 |
+
def rand_v_diffusion(shape, sigma_data=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32):
    """Draws samples from a truncated v-diffusion training timestep distribution.

    A uniform draw is restricted to ``[atan(min/sigma_data), atan(max/sigma_data)]``
    (scaled by ``2 / pi``) and mapped back with ``tan``.
    """
    two_over_pi = 2 / math.pi
    cdf_lower = math.atan(min_value / sigma_data) * two_over_pi
    cdf_upper = math.atan(max_value / sigma_data) * two_over_pi
    uniform = torch.rand(shape, device=device, dtype=dtype)
    u = uniform * (cdf_upper - cdf_lower) + cdf_lower
    return torch.tan(u * math.pi / 2) * sigma_data
|
246 |
+
|
247 |
+
|
248 |
+
def rand_split_log_normal(shape, loc, scale_1, scale_2, device='cpu', dtype=torch.float32):
    """Draws samples from a split lognormal distribution.

    A half-normal magnitude is placed below ``loc`` (scaled by ``scale_1``)
    with probability ``scale_1 / (scale_1 + scale_2)`` and above ``loc``
    (scaled by ``scale_2``) otherwise; the result is exponentiated.
    """
    magnitude = torch.randn(shape, device=device, dtype=dtype).abs()
    side = torch.rand(shape, device=device, dtype=dtype)
    below = magnitude * -scale_1 + loc
    above = magnitude * scale_2 + loc
    left_prob = scale_1 / (scale_1 + scale_2)
    log_samples = torch.where(side < left_prob, below, above)
    return log_samples.exp()
|
256 |
+
|
257 |
+
|
258 |
+
class FolderOfImages(data.Dataset):
    """Dataset over every image file found recursively under a directory.

    Files are selected by lower-cased suffix and kept in sorted order. It
    does not support classes/targets; each item is a one-element tuple.
    """

    IMG_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'}

    def __init__(self, root, transform=None):
        super().__init__()
        self.root = Path(root)
        if transform is None:
            transform = nn.Identity()
        self.transform = transform
        candidates = self.root.rglob('*')
        self.paths = sorted(p for p in candidates if p.suffix.lower() in self.IMG_EXTENSIONS)

    def __repr__(self):
        return f'FolderOfImages(root="{self.root}", len: {len(self)})'

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, key):
        # Decode to RGB while the file handle is still open.
        with open(self.paths[key], 'rb') as handle:
            image = Image.open(handle).convert('RGB')
        return (self.transform(image),)
|
282 |
+
|
283 |
+
|
284 |
+
class CSVLogger:
    """Append comma-separated rows to a file, writing a header for new files.

    If ``filename`` already exists it is opened for appending and no header
    is written; otherwise it is created and ``columns`` is written as the
    first row. The logger can be used as a context manager, or closed
    explicitly with :meth:`close`.
    """

    def __init__(self, filename, columns):
        self.filename = Path(filename)
        self.columns = columns
        if self.filename.exists():
            self.file = open(self.filename, 'a')
        else:
            self.file = open(self.filename, 'w')
            self.write(*self.columns)

    def write(self, *args):
        """Write one row; values are joined with ``,`` and flushed immediately."""
        print(*args, sep=',', file=self.file, flush=True)

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if not self.file.closed:
            self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
|
296 |
+
|
297 |
+
|
298 |
+
@contextmanager
def tf32_mode(cudnn=None, matmul=None):
    """A context manager that sets whether TF32 is allowed on cuDNN or matmul.

    An argument left as ``None`` leaves the corresponding flag untouched.
    Flags that were changed are restored to their previous values on exit.
    """
    previous_cudnn = torch.backends.cudnn.allow_tf32
    previous_matmul = torch.backends.cuda.matmul.allow_tf32
    set_cudnn = cudnn is not None
    set_matmul = matmul is not None
    try:
        if set_cudnn:
            torch.backends.cudnn.allow_tf32 = cudnn
        if set_matmul:
            torch.backends.cuda.matmul.allow_tf32 = matmul
        yield
    finally:
        if set_cudnn:
            torch.backends.cudnn.allow_tf32 = previous_cudnn
        if set_matmul:
            torch.backends.cuda.matmul.allow_tf32 = previous_matmul
|
ComfyUI/comfy/latent_formats.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class LatentFormat:
    """Base latent format: scales latents by ``scale_factor`` in both directions."""

    # Multiplier applied by process_in and divided out by process_out.
    scale_factor = 1.0
    # Unset in the base class; subclasses assign per-channel RGB weights.
    latent_rgb_factors = None
    # Unset in the base class; subclasses may assign a decoder name.
    taesd_decoder_name = None

    def process_in(self, latent):
        """Return ``latent`` multiplied by ``scale_factor``."""
        scaled = latent * self.scale_factor
        return scaled

    def process_out(self, latent):
        """Return ``latent`` divided by ``scale_factor``."""
        unscaled = latent / self.scale_factor
        return unscaled
|
13 |
+
|
14 |
+
class SD15(LatentFormat):
    """Latent format with a configurable scale factor and the ``taesd_decoder`` preview decoder."""

    def __init__(self, scale_factor=0.18215):
        self.scale_factor = scale_factor
        self.taesd_decoder_name = "taesd_decoder"
        # One row per latent channel; columns are R, G, B.
        self.latent_rgb_factors = [
            [ 0.3512,  0.2297,  0.3227],
            [ 0.3250,  0.4974,  0.2350],
            [-0.2829,  0.1762,  0.2721],
            [-0.2120, -0.2616, -0.7177],
        ]
|
25 |
+
|
26 |
+
class SDXL(LatentFormat):
    """Latent format with scale factor 0.13025 and the ``taesdxl_decoder`` preview decoder."""

    def __init__(self):
        self.scale_factor = 0.13025
        self.taesd_decoder_name = "taesdxl_decoder"
        # One row per latent channel; columns are R, G, B.
        self.latent_rgb_factors = [
            [ 0.3920,  0.4054,  0.4549],
            [-0.2634, -0.0196,  0.0653],
            [ 0.0568,  0.1687, -0.0755],
            [-0.3112, -0.2359, -0.2076],
        ]
|
37 |
+
|
38 |
+
class SDXL_Playground_2_5(LatentFormat):
    """Latent format that normalises with per-channel mean and std.

    ``process_in`` computes ``(latent - mean) * scale_factor / std`` and
    ``process_out`` inverts it.
    """

    def __init__(self):
        self.scale_factor = 0.5
        # Four statistics each, reshaped to (1, 4, 1, 1) for broadcasting.
        stats_shape = (1, 4, 1, 1)
        self.latents_mean = torch.tensor([-1.6574, 1.886, -1.383, 2.5155]).view(*stats_shape)
        self.latents_std = torch.tensor([8.4927, 5.9022, 6.5498, 5.2299]).view(*stats_shape)

        self.taesd_decoder_name = "taesdxl_decoder"
        # One row per latent channel; columns are R, G, B.
        self.latent_rgb_factors = [
            [ 0.3920,  0.4054,  0.4549],
            [-0.2634, -0.0196,  0.0653],
            [ 0.0568,  0.1687, -0.0755],
            [-0.3112, -0.2359, -0.2076],
        ]

    def process_in(self, latent):
        """Normalise ``latent`` using the stored mean/std and the scale factor."""
        mean = self.latents_mean.to(latent.device, latent.dtype)
        std = self.latents_std.to(latent.device, latent.dtype)
        return (latent - mean) * self.scale_factor / std

    def process_out(self, latent):
        """Undo ``process_in``."""
        mean = self.latents_mean.to(latent.device, latent.dtype)
        std = self.latents_std.to(latent.device, latent.dtype)
        return latent * std / self.scale_factor + mean
|
62 |
+
|
63 |
+
|
64 |
+
class SD_X4(LatentFormat):
    """Latent format with scale factor 0.08333 and its own RGB preview weights."""

    def __init__(self):
        self.scale_factor = 0.08333
        # One row per latent channel, three weights per row.
        rgb_rows = [
            [-0.2340, -0.3863, -0.3257],
            [ 0.0994,  0.0885, -0.0908],
            [-0.2833, -0.2349, -0.3741],
            [ 0.2523, -0.0055, -0.1651],
        ]
        self.latent_rgb_factors = rgb_rows
|
73 |
+
|
74 |
+
class SC_Prior(LatentFormat):
    """Latent format with scale factor 1.0 and sixteen rows of RGB preview weights."""

    def __init__(self):
        self.scale_factor = 1.0
        # One row per latent channel, three weights per row.
        rgb_rows = [
            [-0.0326, -0.0204, -0.0127],
            [-0.1592, -0.0427,  0.0216],
            [ 0.0873,  0.0638, -0.0020],
            [-0.0602,  0.0442,  0.1304],
            [ 0.0800, -0.0313, -0.1796],
            [-0.0810, -0.0638, -0.1581],
            [ 0.1791,  0.1180,  0.0967],
            [ 0.0740,  0.1416,  0.0432],
            [-0.1745, -0.1888, -0.1373],
            [ 0.2412,  0.1577,  0.0928],
            [ 0.1908,  0.0998,  0.0682],
            [ 0.0209,  0.0365, -0.0092],
            [ 0.0448, -0.0650, -0.1728],
            [-0.1658, -0.1045, -0.1308],
            [ 0.0542,  0.1545,  0.1325],
            [-0.0352, -0.1672, -0.2541],
        ]
        self.latent_rgb_factors = rgb_rows
|
95 |
+
|
96 |
+
class SC_B(LatentFormat):
    """Latent format with scale factor 1.0 and four rows of RGB preview weights."""

    def __init__(self):
        self.scale_factor = 1.0
        # One row per latent channel, three weights per row.
        rgb_rows = [
            [ 0.1121,  0.2006,  0.1023],
            [-0.2093, -0.0222, -0.0195],
            [-0.3087, -0.1535,  0.0366],
            [ 0.0290, -0.1574, -0.4078],
        ]
        self.latent_rgb_factors = rgb_rows
|
ComfyUI/comfy/ldm/cascade/common.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.nn as nn
|
21 |
+
from comfy.ldm.modules.attention import optimized_attention
|
22 |
+
|
23 |
+
class Linear(torch.nn.Linear):
    """``torch.nn.Linear`` whose ``reset_parameters`` is a no-op.

    ``torch.nn.Linear`` calls ``reset_parameters()`` from its constructor, so
    overriding it here leaves the freshly allocated weight and bias untouched.
    """

    def reset_parameters(self):
        # Deliberately do nothing; returns None like the original.
        pass
|
26 |
+
|
27 |
+
class Conv2d(torch.nn.Conv2d):
    """``torch.nn.Conv2d`` whose ``reset_parameters`` is a no-op.

    ``torch.nn.Conv2d`` calls ``reset_parameters()`` from its constructor, so
    overriding it here leaves the freshly allocated weight and bias untouched.
    """

    def reset_parameters(self):
        # Deliberately do nothing; returns None like the original.
        pass
|
30 |
+
|
31 |
+
class OptimizedAttention(nn.Module):
    """Attention layer: q/k/v projections, ``optimized_attention``, output projection.

    Args:
        c: input and output width of each of the four linear projections.
        nhead: head count handed to ``optimized_attention``.
        dropout: accepted but not used by this class.
        dtype, device: forwarded to every ``operations.Linear``.
        operations: namespace that provides the ``Linear`` class to use.
    """

    def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
        super().__init__()
        self.heads = nhead

        linear_kwargs = dict(bias=True, dtype=dtype, device=device)
        # Creation order is kept (q, k, v, out) so parameter registration
        # order does not change.
        self.to_q = operations.Linear(c, c, **linear_kwargs)
        self.to_k = operations.Linear(c, c, **linear_kwargs)
        self.to_v = operations.Linear(c, c, **linear_kwargs)

        self.out_proj = operations.Linear(c, c, **linear_kwargs)

    def forward(self, q, k, v):
        """Project q, k and v, attend, and project the result back."""
        attended = optimized_attention(self.to_q(q), self.to_k(k), self.to_v(v), self.heads)
        return self.out_proj(attended)
|
50 |
+
|
51 |
+
class Attention2D(nn.Module):
    """Apply ``OptimizedAttention`` to a 4-D feature map.

    Args:
        c, nhead, dtype, device, operations: forwarded to ``OptimizedAttention``.
        dropout: accepted but not forwarded anywhere.
    """

    def __init__(self, c, nhead, dropout=0.0, dtype=None, device=None, operations=None):
        super().__init__()
        self.attn = OptimizedAttention(c, nhead, dtype=dtype, device=device, operations=operations)

    def forward(self, x, kv, self_attn=False):
        """Attend from the flattened map ``x`` to ``kv``.

        ``x`` is flattened from (B, C, H, W) to (B, H*W, C). When ``self_attn``
        is true the flattened ``x`` is prepended to ``kv`` along dim 1 before
        it is used as key and value. The result is restored to ``x``'s
        original shape.
        """
        spatial_shape = x.shape
        tokens = x.view(x.size(0), x.size(1), -1).permute(0, 2, 1)  # (B, C, H, W) -> (B, H*W, C)
        context = torch.cat([tokens, kv], dim=1) if self_attn else kv
        attended = self.attn(tokens, context, context)
        return attended.permute(0, 2, 1).view(*spatial_shape)
|
66 |
+
|
67 |
+
|
68 |
+
def LayerNorm2d_op(operations):
    """Return a ``LayerNorm2d`` class derived from ``operations.LayerNorm``.

    The returned class takes the same constructor arguments as its base and
    normalises a channels-first 4-D input by moving dim 1 to the end, running
    the base ``forward``, and moving it back.
    """

    class LayerNorm2d(operations.LayerNorm):
        def forward(self, x):
            channels_last = x.permute(0, 2, 3, 1)
            normalised = super().forward(channels_last)
            return normalised.permute(0, 3, 1, 2)

    return LayerNorm2d
|
76 |
+
|
77 |
+
class GlobalResponseNorm(nn.Module):
    """Global response normalisation with zero-initialised gamma and beta.

    from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
    """

    def __init__(self, dim, dtype=None, device=None):
        super().__init__()
        param_shape = (1, 1, 1, dim)
        self.gamma = nn.Parameter(torch.zeros(param_shape, dtype=dtype, device=device))
        self.beta = nn.Parameter(torch.zeros(param_shape, dtype=dtype, device=device))

    def forward(self, x):
        """Return ``gamma * (x * Nx) + beta + x``.

        ``Nx`` is the L2 norm of ``x`` over dims 1 and 2, divided by its mean
        over the last dim (plus 1e-6). The parameters are cast to ``x``'s
        device and dtype first.
        """
        norms = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
        relative = norms / (norms.mean(dim=-1, keepdim=True) + 1e-6)
        gamma = self.gamma.to(device=x.device, dtype=x.dtype)
        beta = self.beta.to(device=x.device, dtype=x.dtype)
        return gamma * (x * relative) + beta + x
|
88 |
+
|
89 |
+
|
90 |
+
class ResBlock(nn.Module):
    """Residual block: depthwise conv, layer norm, then a channelwise MLP.

    Args:
        c: channel count of the input and output.
        c_skip: extra channels concatenated before the MLP when a skip
            tensor is given to ``forward``.
        kernel_size: depthwise convolution kernel size (padding is
            ``kernel_size // 2``).
        dropout: probability for the ``nn.Dropout`` inside the MLP.
        dtype, device: forwarded to every layer built here.
        operations: namespace providing ``Conv2d``, ``Linear`` and ``LayerNorm``.
    """

    def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        hidden = c * 4
        self.depthwise = operations.Conv2d(
            c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c, **factory
        )
        self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, **factory)
        self.channelwise = nn.Sequential(
            operations.Linear(c + c_skip, hidden, **factory),
            nn.GELU(),
            GlobalResponseNorm(hidden, **factory),
            nn.Dropout(dropout),
            operations.Linear(hidden, c, **factory),
        )

    def forward(self, x, x_skip=None):
        """Return ``x`` plus the block's transformation of ``x`` (and ``x_skip``)."""
        residual = x
        hidden = self.norm(self.depthwise(x))
        if x_skip is not None:
            hidden = torch.cat([hidden, x_skip], dim=1)
        # The MLP works on the last dim, so run it channels-last.
        hidden = self.channelwise(hidden.permute(0, 2, 3, 1))
        return hidden.permute(0, 3, 1, 2) + residual
|
111 |
+
|
112 |
+
|
113 |
+
class AttnBlock(nn.Module):
    """Residual attention block over a 4-D feature map.

    Args:
        c: channel count of the feature map.
        c_cond: width of the conditioning fed to ``kv_mapper``.
        nhead: head count for ``Attention2D``.
        self_attn: stored and passed to ``Attention2D.forward`` on every call.
        dropout: passed positionally to ``Attention2D``.
        dtype, device, operations: forwarded to the layers built here.
    """

    def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0, dtype=None, device=None, operations=None):
        super().__init__()
        self.self_attn = self_attn
        self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.attention = Attention2D(c, nhead, dropout, dtype=dtype, device=device, operations=operations)
        cond_projection = operations.Linear(c_cond, c, dtype=dtype, device=device)
        self.kv_mapper = nn.Sequential(nn.SiLU(), cond_projection)

    def forward(self, x, kv):
        """Map ``kv`` to width ``c`` and add the attention output to ``x``."""
        mapped_kv = self.kv_mapper(kv)
        attended = self.attention(self.norm(x), mapped_kv, self_attn=self.self_attn)
        return x + attended
|
128 |
+
|
129 |
+
|
130 |
+
class FeedForwardBlock(nn.Module):
    """Residual feed-forward block: layer norm followed by a channelwise MLP.

    Args:
        c: channel count of the input and output.
        dropout: probability for the ``nn.Dropout`` inside the MLP.
        dtype, device: forwarded to every layer built here.
        operations: namespace providing ``Linear`` and ``LayerNorm``.
    """

    def __init__(self, c, dropout=0.0, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        hidden = c * 4
        self.norm = LayerNorm2d_op(operations)(c, elementwise_affine=False, eps=1e-6, **factory)
        self.channelwise = nn.Sequential(
            operations.Linear(c, hidden, **factory),
            nn.GELU(),
            GlobalResponseNorm(hidden, **factory),
            nn.Dropout(dropout),
            operations.Linear(hidden, c, **factory),
        )

    def forward(self, x):
        """Return ``x`` plus the MLP applied to the normalised ``x``."""
        # The MLP works on the last dim, so run it channels-last.
        channels_last = self.norm(x).permute(0, 2, 3, 1)
        update = self.channelwise(channels_last).permute(0, 3, 1, 2)
        return x + update
|
145 |
+
|
146 |
+
|
147 |
+
class TimestepBlock(nn.Module):
    """Scale-and-shift modulation of a feature map from an embedding.

    One ``Linear(c_timestep, 2 * c)`` mapper is built for the base embedding
    and one more, named ``mapper_<cond>``, for every entry of ``conds``.

    Args:
        c: channel count of the feature map being modulated.
        c_timestep: width of each embedding chunk.
        conds: names of the additional conditioning chunks. The list object
            is stored as-is on ``self.conds``.
        dtype, device: forwarded to every mapper.
        operations: namespace providing ``Linear``.
    """

    def __init__(self, c, c_timestep, conds=['sca'], dtype=None, device=None, operations=None):
        super().__init__()

        def build_mapper():
            return operations.Linear(c_timestep, c * 2, dtype=dtype, device=device)

        self.mapper = build_mapper()
        self.conds = conds
        for cond_name in conds:
            setattr(self, f"mapper_{cond_name}", build_mapper())

    def forward(self, x, t):
        """Return ``x * (1 + a) + b``, where ``a``/``b`` sum over all mappers.

        ``t`` is split along dim 1 into ``len(conds) + 1`` chunks: chunk 0
        feeds ``self.mapper`` and chunk ``i + 1`` feeds the mapper of
        ``conds[i]``.
        """
        chunks = t.chunk(len(self.conds) + 1, dim=1)
        scale, shift = self.mapper(chunks[0])[:, :, None, None].chunk(2, dim=1)
        for position, cond_name in enumerate(self.conds, start=1):
            cond_mapper = getattr(self, f"mapper_{cond_name}")
            cond_scale, cond_shift = cond_mapper(chunks[position])[:, :, None, None].chunk(2, dim=1)
            scale = scale + cond_scale
            shift = shift + cond_shift
        return x * (1 + scale) + shift
|
ComfyUI/comfy/ldm/cascade/controlnet.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torchvision
|
21 |
+
from torch import nn
|
22 |
+
from .common import LayerNorm2d_op
|
23 |
+
|
24 |
+
|
25 |
+
class CNetResBlock(nn.Module):
    """Residual block of two (LayerNorm2d -> GELU -> 3x3 conv) stages.

    Args:
        c: channel count of the input and output.
        dtype, device: forwarded to every norm and convolution built here.
        operations: namespace providing ``Conv2d`` and ``LayerNorm``.
    """

    def __init__(self, c, dtype=None, device=None, operations=None):
        super().__init__()
        factory = dict(dtype=dtype, device=device)
        norm_cls = LayerNorm2d_op(operations)
        # The convolutions now receive the same dtype/device as the norms;
        # previously they silently fell back to the framework defaults.
        self.blocks = nn.Sequential(
            norm_cls(c, **factory),
            nn.GELU(),
            operations.Conv2d(c, c, kernel_size=3, padding=1, **factory),
            norm_cls(c, **factory),
            nn.GELU(),
            operations.Conv2d(c, c, kernel_size=3, padding=1, **factory),
        )

    def forward(self, x):
        """Return ``x`` plus the output of the two-stage stack."""
        return x + self.blocks(x)
|
39 |
+
|
40 |
+
|
41 |
+
class ControlNet(nn.Module):
    """Backbone plus one 1x1 projection head per entry of ``proj_blocks``.

    Args:
        c_in: number of input channels.
        c_proj: output channels of every projection head.
        proj_blocks: indices at which ``forward`` places projection outputs.
        bottleneck_mode: ``'effnet'`` (used when ``None``), ``'simple'`` or
            ``'large'``; selects the backbone.
        dtype, device: forwarded to the ``operations`` layers built here.
        operations: namespace providing ``Conv2d``.

    Raises:
        ValueError: if ``bottleneck_mode`` is not one of the known modes.
    """

    def __init__(self, c_in=3, c_proj=2048, proj_blocks=None, bottleneck_mode=None, dtype=None, device=None, operations=nn):
        super().__init__()
        bottleneck_mode = 'effnet' if bottleneck_mode is None else bottleneck_mode
        self.proj_blocks = proj_blocks
        factory = dict(dtype=dtype, device=device)

        if bottleneck_mode == 'effnet':
            embd_channels = 1280
            self.backbone = torchvision.models.efficientnet_v2_s().features.eval()
            if c_in != 3:
                # Swap the stem convolution for one with c_in input channels,
                # carrying over as many of the old input-channel weights as fit.
                old_stem_weight = self.backbone[0][0].weight.data
                self.backbone[0][0] = operations.Conv2d(c_in, 24, kernel_size=3, stride=2, bias=False, **factory)
                new_stem = self.backbone[0][0]
                if c_in > 3:
                    new_stem.weight.data[:, :3] = old_stem_weight[:, :3].clone()
                else:
                    new_stem.weight.data = old_stem_weight[:, :c_in].clone()
        elif bottleneck_mode == 'simple':
            embd_channels = c_in
            self.backbone = nn.Sequential(
                operations.Conv2d(embd_channels, embd_channels * 4, kernel_size=3, padding=1, **factory),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(embd_channels * 4, embd_channels, kernel_size=3, padding=1, **factory),
            )
        elif bottleneck_mode == 'large':
            res_blocks = [CNetResBlock(1024, dtype=dtype, device=device, operations=operations) for _ in range(8)]
            self.backbone = nn.Sequential(
                operations.Conv2d(c_in, 4096 * 4, kernel_size=1, **factory),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(4096 * 4, 1024, kernel_size=1, **factory),
                *res_blocks,
                operations.Conv2d(1024, 1280, kernel_size=1, **factory),
            )
            embd_channels = 1280
        else:
            raise ValueError(f'Unknown bottleneck mode: {bottleneck_mode}')

        self.projections = nn.ModuleList()
        for _ in range(len(proj_blocks)):
            head = nn.Sequential(
                operations.Conv2d(embd_channels, embd_channels, kernel_size=1, bias=False, **factory),
                nn.LeakyReLU(0.2, inplace=True),
                operations.Conv2d(embd_channels, c_proj, kernel_size=1, bias=False, **factory),
            )
            self.projections.append(head)

        self.xl = False
        self.input_channels = c_in
        self.unshuffle_amount = 8

    def forward(self, x):
        """Run the backbone and return a list of projection outputs.

        The list has ``max(proj_blocks) + 1`` slots; slot ``proj_blocks[i]``
        holds the output of head ``i`` and every other slot is ``None``.
        """
        features = self.backbone(x)
        outputs = [None] * (max(self.proj_blocks) + 1)
        for head_index, slot in enumerate(self.proj_blocks):
            outputs[slot] = self.projections[head_index](features)
        return outputs
|
ComfyUI/comfy/ldm/cascade/stage_a.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import torch
|
20 |
+
from torch import nn
|
21 |
+
from torch.autograd import Function
|
22 |
+
|
23 |
+
class vector_quantize(Function):
    """Autograd function mapping each input row to its nearest codebook row.

    ``forward`` returns the selected codebook rows and their indices.
    ``backward`` copies the incoming gradient to the input unchanged and
    accumulates it per selected index for the codebook.
    """

    @staticmethod
    def forward(ctx, x, codebook):
        with torch.no_grad():
            codebook_sq_norm = (codebook ** 2).sum(dim=1)
            input_sq_norm = (x ** 2).sum(dim=1, keepdim=True)

            # Squared distance: |x|^2 + |e|^2 - 2 * x @ e^T for every (row, entry) pair.
            distances = torch.addmm(codebook_sq_norm + input_sq_norm, x, codebook.t(), alpha=-2.0, beta=1.0)
            indices = distances.min(dim=1).indices

            ctx.save_for_backward(indices, codebook)
            ctx.mark_non_differentiable(indices)

            nearest = torch.index_select(codebook, 0, indices)
            return nearest, indices

    @staticmethod
    def backward(ctx, grad_output, grad_indices):
        grad_inputs = grad_output.clone() if ctx.needs_input_grad[0] else None

        grad_codebook = None
        if ctx.needs_input_grad[1]:
            # Gradient wrt. the codebook: sum the output gradients of all
            # rows that selected the same entry.
            indices, codebook = ctx.saved_tensors
            grad_codebook = torch.zeros_like(codebook)
            grad_codebook.index_add_(0, indices, grad_output)

        return (grad_inputs, grad_codebook)
|
53 |
+
|
54 |
+
|
55 |
+
class VectorQuantize(nn.Module):
    def __init__(self, embedding_size, k, ema_decay=0.99, ema_loss=False):
        """
        Quantizer over a codebook of ``k`` embeddings of size ``embedding_size``.

        Takes an input of variable size (as long as the last dimension matches the embedding size).
        Returns one tensor containing the nearest neighbour embeddings to each of the inputs,
        with the same size as the input, vq and commitment components for the loss as a tuple
        in the second output and the indices of the quantized vectors in the third:
        quantized, (vq_loss, commit_loss), indices
        """
        super().__init__()

        self.codebook = nn.Embedding(k, embedding_size)
        self.codebook.weight.data.uniform_(-1./k, 1./k)
        self.vq = vector_quantize.apply

        self.ema_decay = ema_decay
        self.ema_loss = ema_loss
        if ema_loss:
            # Running statistics used by _updateEMA to rebuild the codebook.
            self.register_buffer('ema_element_count', torch.ones(k))
            self.register_buffer('ema_weight_sum', torch.zeros_like(self.codebook.weight))

    def _laplace_smoothing(self, x, epsilon):
        """Return ``(x + epsilon) / (sum(x) + len(x) * epsilon) * sum(x)``."""
        total = torch.sum(x)
        smoothed = (x + epsilon) / (total + x.size(0) * epsilon)
        return smoothed * total

    def _updateEMA(self, z_e_x, indices):
        """Update the running counts and sums, then overwrite the codebook."""
        decay = self.ema_decay
        assignment = nn.functional.one_hot(indices, self.ema_element_count.size(0)).float()
        batch_count = assignment.sum(dim=0)
        batch_sum = torch.mm(assignment.t(), z_e_x)

        self.ema_element_count = (decay * self.ema_element_count) + ((1-decay) * batch_count)
        self.ema_element_count = self._laplace_smoothing(self.ema_element_count, 1e-5)
        self.ema_weight_sum = (decay * self.ema_weight_sum) + ((1-decay) * batch_sum)

        self.codebook.weight.data = self.ema_weight_sum / self.ema_element_count.unsqueeze(-1)

    def idx2vq(self, idx, dim=-1):
        """Look up codebook rows for ``idx``; move the embedding axis to ``dim``."""
        embedded = self.codebook(idx)
        if dim == -1:
            return embedded
        return embedded.movedim(-1, dim)

    def forward(self, x, get_losses=True, dim=-1):
        """Quantize ``x`` along ``dim``.

        Returns ``(quantized, (vq_loss, commit_loss), indices)``; both losses
        are ``None`` when ``get_losses`` is false.
        """
        if dim != -1:
            x = x.movedim(dim, -1)
        flat = x.contiguous().view(-1, x.size(-1)) if len(x.shape) > 2 else x
        quantized, indices = self.vq(flat, self.codebook.weight.detach())

        if self.ema_loss and self.training:
            self._updateEMA(flat.detach(), indices.detach())

        # pick the graded embeddings after updating the codebook in order to have a more accurate commitment loss
        graded = torch.index_select(self.codebook.weight, dim=0, index=indices)
        vq_loss = commit_loss = None
        if get_losses:
            vq_loss = (graded - flat.detach()).pow(2).mean()
            commit_loss = (flat - graded.detach()).pow(2).mean()

        quantized = quantized.view(x.shape)
        if dim != -1:
            quantized = quantized.movedim(-1, dim)
        return quantized, (vq_loss, commit_loss), indices.view(x.shape[:-1])
|
115 |
+
|
116 |
+
|
117 |
+
class ResBlock(nn.Module):
    """Residual block with a depthwise stage and a channelwise MLP stage.

    Each stage is layer-normalised, modulated by entries of the learnable
    ``gammas`` vector (scale, shift, output gain) and added back to the input.
    ``gammas`` starts at zero, so a freshly built block returns its input.

    Args:
        c: channel count of the input and output.
        c_hidden: hidden width of the channelwise MLP.
    """

    def __init__(self, c, c_hidden):
        super().__init__()
        # depthwise/attention
        self.norm1 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.depthwise = nn.Sequential(
            nn.ReplicationPad2d(1),
            nn.Conv2d(c, c, kernel_size=3, groups=c)
        )

        # channelwise
        self.norm2 = nn.LayerNorm(c, elementwise_affine=False, eps=1e-6)
        self.channelwise = nn.Sequential(
            nn.Linear(c, c_hidden),
            nn.GELU(),
            nn.Linear(c_hidden, c),
        )

        self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True)

        # Init weights: Xavier-uniform weights and zero biases for every
        # Linear/Conv2d inside this block.
        def _basic_init(module):
            if isinstance(module, (nn.Linear, nn.Conv2d)):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

    def _norm(self, x, norm):
        """Apply ``norm`` over the channel axis of a channels-first tensor."""
        return norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

    def forward(self, x):
        """Run both residual stages and return a tensor shaped like ``x``."""
        mods = self.gammas

        x_temp = self._norm(x, self.norm1) * (1 + mods[0]) + mods[1]
        try:
            x = x + self.depthwise(x_temp) * mods[2]
        except RuntimeError:  # operation not implemented for bf16
            # Only runtime failures trigger the fallback, so KeyboardInterrupt
            # and SystemExit are no longer swallowed. Pad in float32, cast
            # back, then run the convolution on its own.
            x_temp = self.depthwise[0](x_temp.float()).to(x.dtype)
            x = x + self.depthwise[1](x_temp) * mods[2]

        x_temp = self._norm(x, self.norm2) * (1 + mods[3]) + mods[4]
        x = x + self.channelwise(x_temp.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * mods[5]

        return x
|
163 |
+
|
164 |
+
|
165 |
+
class StageA(nn.Module):
    """Convolutional autoencoder with a vector-quantized latent.

    Args:
        levels: number of resolution levels in the encoder and decoder.
        bottleneck_blocks: number of ``ResBlock``s at the first decoder level.
        c_hidden: channel count at the deepest level; each shallower level
            halves it.
        c_latent: channel count of the latent.
        codebook_size: number of entries in the quantizer codebook.
        scale_factor: encoded latents are divided by this value and decoder
            inputs are multiplied by it.
    """

    def __init__(self, levels=2, bottleneck_blocks=12, c_hidden=384, c_latent=4, codebook_size=8192,
                 scale_factor=0.43):  # 0.3764
        super().__init__()
        self.c_latent = c_latent
        self.scale_factor = scale_factor
        c_levels = [c_hidden // (2 ** i) for i in reversed(range(levels))]

        # Encoder blocks
        self.in_block = nn.Sequential(
            nn.PixelUnshuffle(2),
            nn.Conv2d(3 * 4, c_levels[0], kernel_size=1)
        )
        down_blocks = []
        for i in range(levels):
            if i > 0:
                down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
            block = ResBlock(c_levels[i], c_levels[i] * 4)
            down_blocks.append(block)
        down_blocks.append(nn.Sequential(
            nn.Conv2d(c_levels[-1], c_latent, kernel_size=1, bias=False),
            nn.BatchNorm2d(c_latent),  # then normalize them to have mean 0 and std 1
        ))
        self.down_blocks = nn.Sequential(*down_blocks)

        self.codebook_size = codebook_size
        self.vquantizer = VectorQuantize(c_latent, k=codebook_size)

        # Decoder blocks
        up_blocks = [nn.Sequential(
            nn.Conv2d(c_latent, c_levels[-1], kernel_size=1)
        )]
        for i in range(levels):
            for _ in range(bottleneck_blocks if i == 0 else 1):
                block = ResBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4)
                up_blocks.append(block)
            if i < levels - 1:
                up_blocks.append(
                    nn.ConvTranspose2d(c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2,
                                       padding=1))
        self.up_blocks = nn.Sequential(*up_blocks)
        self.out_block = nn.Sequential(
            nn.Conv2d(c_levels[0], 3 * 4, kernel_size=1),
            nn.PixelShuffle(2),
        )

    def encode(self, x, quantize=False):
        """Encode ``x`` to the latent space.

        Returns the scaled latent when ``quantize`` is false. Otherwise
        returns ``(quantized, latent, indices, loss)``, where both tensors are
        scaled and ``loss`` is ``vq_loss + 0.25 * commit_loss``.
        """
        x = self.in_block(x)
        x = self.down_blocks(x)
        if quantize:
            qe, (vq_loss, commit_loss), indices = self.vquantizer.forward(x, dim=1)
            return qe / self.scale_factor, x / self.scale_factor, indices, vq_loss + commit_loss * 0.25
        else:
            return x / self.scale_factor

    def decode(self, x):
        """Decode a scaled latent through the decoder and output blocks."""
        x = x * self.scale_factor
        x = self.up_blocks(x)
        x = self.out_block(x)
        return x

    def forward(self, x, quantize=False):
        """Encode then decode ``x``; return ``(reconstruction, vq_loss)``.

        ``vq_loss`` is ``None`` when ``quantize`` is false. ``encode`` returns
        a single tensor in that case, so it must not be unpacked as the
        four-element quantized result.
        """
        if quantize:
            latent, _, _, vq_loss = self.encode(x, quantize=True)
        else:
            latent, vq_loss = self.encode(x, quantize=False), None
        return self.decode(latent), vq_loss
|
231 |
+
|
232 |
+
|
233 |
+
class Discriminator(nn.Module):
    """Strided convolutional discriminator with an optional conditioning vector.

    Args:
        c_in: channel count of the input.
        c_cond: width of the conditioning vector; when positive, the final
            1x1 convolution expects ``c_hidden + c_cond`` input channels.
        c_hidden: maximum channel count of the encoder.
        depth: total number of stride-2 convolutions in the encoder.
    """

    def __init__(self, c_in=3, c_cond=0, c_hidden=512, depth=6):
        super().__init__()
        d = max(depth - 3, 3)

        def width(exponent):
            # Channel count c_hidden / 2**exponent, capped at c_hidden.
            return c_hidden // (2 ** max(exponent, 0))

        layers = [
            nn.utils.spectral_norm(nn.Conv2d(c_in, c_hidden // (2 ** d), kernel_size=3, stride=2, padding=1)),
            nn.LeakyReLU(0.2),
        ]
        for stage in range(depth - 1):
            stage_in = width(d - stage)
            stage_out = width(d - 1 - stage)
            layers.append(nn.utils.spectral_norm(nn.Conv2d(stage_in, stage_out, kernel_size=3, stride=2, padding=1)))
            layers.append(nn.InstanceNorm2d(stage_out))
            layers.append(nn.LeakyReLU(0.2))
        self.encoder = nn.Sequential(*layers)

        head_in = (c_hidden + c_cond) if c_cond > 0 else c_hidden
        self.shuffle = nn.Conv2d(head_in, 1, kernel_size=1)
        self.logits = nn.Sigmoid()

    def forward(self, x, cond=None):
        """Return sigmoid scores over the encoder's output grid.

        When ``cond`` is given it is broadcast over the spatial grid and
        concatenated to the encoded features along the channel axis.
        """
        features = self.encoder(x)
        if cond is not None:
            cond_map = cond.view(cond.size(0), cond.size(1), 1, 1)
            cond_map = cond_map.expand(-1, -1, features.size(-2), features.size(-1))
            features = torch.cat([features, cond_map], dim=1)
        return self.logits(self.shuffle(features))
|
ComfyUI/comfy/ldm/cascade/stage_b.py
ADDED
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import math
|
20 |
+
import numpy as np
|
21 |
+
import torch
|
22 |
+
from torch import nn
|
23 |
+
from .common import AttnBlock, LayerNorm2d_op, ResBlock, FeedForwardBlock, TimestepBlock
|
24 |
+
|
25 |
+
class StageB(nn.Module):
|
26 |
+
def __init__(self, c_in=4, c_out=4, c_r=64, patch_size=2, c_cond=1280, c_hidden=[320, 640, 1280, 1280],
|
27 |
+
nhead=[-1, -1, 20, 20], blocks=[[2, 6, 28, 6], [6, 28, 6, 2]],
|
28 |
+
block_repeat=[[1, 1, 1, 1], [3, 3, 2, 2]], level_config=['CT', 'CT', 'CTA', 'CTA'], c_clip=1280,
|
29 |
+
c_clip_seq=4, c_effnet=16, c_pixels=3, kernel_size=3, dropout=[0, 0, 0.0, 0.0], self_attn=True,
|
30 |
+
t_conds=['sca'], stable_cascade_stage=None, dtype=None, device=None, operations=None):
|
31 |
+
super().__init__()
|
32 |
+
self.dtype = dtype
|
33 |
+
self.c_r = c_r
|
34 |
+
self.t_conds = t_conds
|
35 |
+
self.c_clip_seq = c_clip_seq
|
36 |
+
if not isinstance(dropout, list):
|
37 |
+
dropout = [dropout] * len(c_hidden)
|
38 |
+
if not isinstance(self_attn, list):
|
39 |
+
self_attn = [self_attn] * len(c_hidden)
|
40 |
+
|
41 |
+
# CONDITIONING
|
42 |
+
self.effnet_mapper = nn.Sequential(
|
43 |
+
operations.Conv2d(c_effnet, c_hidden[0] * 4, kernel_size=1, dtype=dtype, device=device),
|
44 |
+
nn.GELU(),
|
45 |
+
operations.Conv2d(c_hidden[0] * 4, c_hidden[0], kernel_size=1, dtype=dtype, device=device),
|
46 |
+
LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
47 |
+
)
|
48 |
+
self.pixels_mapper = nn.Sequential(
|
49 |
+
operations.Conv2d(c_pixels, c_hidden[0] * 4, kernel_size=1, dtype=dtype, device=device),
|
50 |
+
nn.GELU(),
|
51 |
+
operations.Conv2d(c_hidden[0] * 4, c_hidden[0], kernel_size=1, dtype=dtype, device=device),
|
52 |
+
LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
53 |
+
)
|
54 |
+
self.clip_mapper = operations.Linear(c_clip, c_cond * c_clip_seq, dtype=dtype, device=device)
|
55 |
+
self.clip_norm = operations.LayerNorm(c_cond, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
56 |
+
|
57 |
+
self.embedding = nn.Sequential(
|
58 |
+
nn.PixelUnshuffle(patch_size),
|
59 |
+
operations.Conv2d(c_in * (patch_size ** 2), c_hidden[0], kernel_size=1, dtype=dtype, device=device),
|
60 |
+
LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
|
61 |
+
)
|
62 |
+
|
63 |
+
def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True):
|
64 |
+
if block_type == 'C':
|
65 |
+
return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout, dtype=dtype, device=device, operations=operations)
|
66 |
+
elif block_type == 'A':
|
67 |
+
return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout, dtype=dtype, device=device, operations=operations)
|
68 |
+
elif block_type == 'F':
|
69 |
+
return FeedForwardBlock(c_hidden, dropout=dropout, dtype=dtype, device=device, operations=operations)
|
70 |
+
elif block_type == 'T':
|
71 |
+
return TimestepBlock(c_hidden, c_r, conds=t_conds, dtype=dtype, device=device, operations=operations)
|
72 |
+
else:
|
73 |
+
raise Exception(f'Block type {block_type} not supported')
|
74 |
+
|
75 |
+
# BLOCKS
|
76 |
+
# -- down blocks
|
77 |
+
self.down_blocks = nn.ModuleList()
|
78 |
+
self.down_downscalers = nn.ModuleList()
|
79 |
+
self.down_repeat_mappers = nn.ModuleList()
|
80 |
+
for i in range(len(c_hidden)):
|
81 |
+
if i > 0:
|
82 |
+
self.down_downscalers.append(nn.Sequential(
|
83 |
+
LayerNorm2d_op(operations)(c_hidden[i - 1], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device),
|
84 |
+
operations.Conv2d(c_hidden[i - 1], c_hidden[i], kernel_size=2, stride=2, dtype=dtype, device=device),
|
85 |
+
))
|
86 |
+
else:
|
87 |
+
self.down_downscalers.append(nn.Identity())
|
88 |
+
down_block = nn.ModuleList()
|
89 |
+
for _ in range(blocks[0][i]):
|
90 |
+
for block_type in level_config[i]:
|
91 |
+
block = get_block(block_type, c_hidden[i], nhead[i], dropout=dropout[i], self_attn=self_attn[i])
|
92 |
+
down_block.append(block)
|
93 |
+
self.down_blocks.append(down_block)
|
94 |
+
if block_repeat is not None:
|
95 |
+
block_repeat_mappers = nn.ModuleList()
|
96 |
+
for _ in range(block_repeat[0][i] - 1):
|
97 |
+
block_repeat_mappers.append(operations.Conv2d(c_hidden[i], c_hidden[i], kernel_size=1, dtype=dtype, device=device))
|
98 |
+
self.down_repeat_mappers.append(block_repeat_mappers)
|
99 |
+
|
100 |
+
# -- up blocks
|
101 |
+
self.up_blocks = nn.ModuleList()
|
102 |
+
self.up_upscalers = nn.ModuleList()
|
103 |
+
self.up_repeat_mappers = nn.ModuleList()
|
104 |
+
for i in reversed(range(len(c_hidden))):
|
105 |
+
if i > 0:
|
106 |
+
self.up_upscalers.append(nn.Sequential(
|
107 |
+
LayerNorm2d_op(operations)(c_hidden[i], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device),
|
108 |
+
operations.ConvTranspose2d(c_hidden[i], c_hidden[i - 1], kernel_size=2, stride=2, dtype=dtype, device=device),
|
109 |
+
))
|
110 |
+
else:
|
111 |
+
self.up_upscalers.append(nn.Identity())
|
112 |
+
up_block = nn.ModuleList()
|
113 |
+
for j in range(blocks[1][::-1][i]):
|
114 |
+
for k, block_type in enumerate(level_config[i]):
|
115 |
+
c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0
|
116 |
+
block = get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i],
|
117 |
+
self_attn=self_attn[i])
|
118 |
+
up_block.append(block)
|
119 |
+
self.up_blocks.append(up_block)
|
120 |
+
if block_repeat is not None:
|
121 |
+
block_repeat_mappers = nn.ModuleList()
|
122 |
+
for _ in range(block_repeat[1][::-1][i] - 1):
|
123 |
+
block_repeat_mappers.append(operations.Conv2d(c_hidden[i], c_hidden[i], kernel_size=1, dtype=dtype, device=device))
|
124 |
+
self.up_repeat_mappers.append(block_repeat_mappers)
|
125 |
+
|
126 |
+
# OUTPUT
|
127 |
+
self.clf = nn.Sequential(
|
128 |
+
LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device),
|
129 |
+
operations.Conv2d(c_hidden[0], c_out * (patch_size ** 2), kernel_size=1, dtype=dtype, device=device),
|
130 |
+
nn.PixelShuffle(patch_size),
|
131 |
+
)
|
132 |
+
|
133 |
+
# --- WEIGHT INIT ---
|
134 |
+
# self.apply(self._init_weights) # General init
|
135 |
+
# nn.init.normal_(self.clip_mapper.weight, std=0.02) # conditionings
|
136 |
+
# nn.init.normal_(self.effnet_mapper[0].weight, std=0.02) # conditionings
|
137 |
+
# nn.init.normal_(self.effnet_mapper[2].weight, std=0.02) # conditionings
|
138 |
+
# nn.init.normal_(self.pixels_mapper[0].weight, std=0.02) # conditionings
|
139 |
+
# nn.init.normal_(self.pixels_mapper[2].weight, std=0.02) # conditionings
|
140 |
+
# torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02) # inputs
|
141 |
+
# nn.init.constant_(self.clf[1].weight, 0) # outputs
|
142 |
+
#
|
143 |
+
# # blocks
|
144 |
+
# for level_block in self.down_blocks + self.up_blocks:
|
145 |
+
# for block in level_block:
|
146 |
+
# if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock):
|
147 |
+
# block.channelwise[-1].weight.data *= np.sqrt(1 / sum(blocks[0]))
|
148 |
+
# elif isinstance(block, TimestepBlock):
|
149 |
+
# for layer in block.modules():
|
150 |
+
# if isinstance(layer, nn.Linear):
|
151 |
+
# nn.init.constant_(layer.weight, 0)
|
152 |
+
#
|
153 |
+
# def _init_weights(self, m):
|
154 |
+
# if isinstance(m, (nn.Conv2d, nn.Linear)):
|
155 |
+
# torch.nn.init.xavier_uniform_(m.weight)
|
156 |
+
# if m.bias is not None:
|
157 |
+
# nn.init.constant_(m.bias, 0)
|
158 |
+
|
159 |
+
def gen_r_embedding(self, r, max_positions=10000):
    """Build a sinusoidal embedding of width ``self.c_r`` for each entry of ``r``.

    ``r`` is scaled by ``max_positions`` and multiplied with ``self.c_r // 2``
    geometrically spaced frequencies; the sines and cosines of those angles are
    concatenated along dim 1. When ``self.c_r`` is odd, one trailing zero
    column is appended so the result still has ``self.c_r`` columns.

    Args:
        r: 1-D tensor (it is indexed as ``r[:, None]``).
        max_positions: scale applied to ``r`` and base of the frequency range.

    Returns:
        Float tensor of shape ``(len(r), self.c_r)`` on ``r``'s device.
    """
    scaled = r * max_positions
    n_freqs = self.c_r // 2
    # NOTE(review): n_freqs == 1 (c_r of 2 or 3) divides by zero here.
    log_step = math.log(max_positions) / (n_freqs - 1)
    freqs = torch.arange(n_freqs, device=r.device).float().mul(-log_step).exp()
    angles = scaled[:, None] * freqs[None, :]
    table = torch.cat([angles.sin(), angles.cos()], dim=1)
    if self.c_r % 2 == 1:
        # Odd width: sin/cos only fill c_r - 1 columns, pad the last with zero.
        table = nn.functional.pad(table, (0, 1), mode='constant')
    return table
|
169 |
+
|
170 |
+
def gen_c_embeddings(self, clip):
    """Project a CLIP embedding and expand every token into ``c_clip_seq`` tokens.

    A 2-D input gets a singleton token axis inserted at dim 1. The output of
    ``self.clip_mapper`` is viewed as
    ``(batch, tokens * self.c_clip_seq, -1)`` and passed through
    ``self.clip_norm``.
    """
    if clip.dim() == 2:
        clip = clip.unsqueeze(1)
    batch, n_tokens = clip.size(0), clip.size(1)
    projected = self.clip_mapper(clip)
    expanded = projected.view(batch, n_tokens * self.c_clip_seq, -1)
    return self.clip_norm(expanded)
|
176 |
+
|
177 |
+
def _down_encode(self, x, r_embed, clip):
    """Run the down path and return every level's output, last level first.

    For each level the downscaler is applied once, then the level's blocks are
    run ``len(repeat_mappers) + 1`` times with the matching repeat mapper
    applied between consecutive passes. Blocks are called according to their
    type: ``ResBlock`` with ``x`` only, ``AttnBlock`` with ``clip``,
    ``TimestepBlock`` with ``r_embed``, anything else with ``x`` only.
    """
    def is_kind(module, kind):
        # Accept a bare `kind` instance or a wrapper that exposes one as
        # `_fsdp_wrapped_module` (presumably a PyTorch FSDP wrapper).
        if isinstance(module, kind):
            return True
        return hasattr(module, '_fsdp_wrapped_module') and isinstance(module._fsdp_wrapped_module, kind)

    collected = []
    levels = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
    for stage, downscaler, repeat_maps in levels:
        x = downscaler(x)
        n_maps = len(repeat_maps)
        for pass_idx in range(n_maps + 1):
            for block in stage:
                if is_kind(block, ResBlock):
                    x = block(x)
                elif is_kind(block, AttnBlock):
                    x = block(x, clip)
                elif is_kind(block, TimestepBlock):
                    x = block(x, r_embed)
                else:
                    x = block(x)
            if pass_idx < n_maps:
                x = repeat_maps[pass_idx](x)
        collected.append(x)
    # Callers expect the deepest level at index 0.
    collected.reverse()
    return collected
|
202 |
+
|
203 |
+
def _up_decode(self, level_outputs, r_embed, clip):
    """Run the up path starting from ``level_outputs[0]`` and return the result.

    For up level ``i`` the blocks are run ``len(repeat_mappers) + 1`` times with
    a repeat mapper between passes, then the level's upscaler is applied. When
    the first block of a level past the first (``k == 0 and i > 0``) is a
    ``ResBlock`` it receives ``level_outputs[i]`` as skip input, and ``x`` is
    bilinearly resized to the skip's spatial size first if they differ. Other
    ``ResBlock``s get ``None`` as skip.
    """
    def is_kind(module, kind):
        # Accept a bare `kind` instance or a wrapper that exposes one as
        # `_fsdp_wrapped_module` (presumably a PyTorch FSDP wrapper).
        if isinstance(module, kind):
            return True
        return hasattr(module, '_fsdp_wrapped_module') and isinstance(module._fsdp_wrapped_module, kind)

    x = level_outputs[0]
    levels = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
    for i, (stage, upscaler, repeat_maps) in enumerate(levels):
        n_maps = len(repeat_maps)
        for j in range(n_maps + 1):
            for k, block in enumerate(stage):
                if is_kind(block, ResBlock):
                    skip = None
                    if k == 0 and i > 0:
                        skip = level_outputs[i]
                    if skip is not None and x.shape[-2:] != skip.shape[-2:]:
                        x = torch.nn.functional.interpolate(x, skip.shape[-2:], mode='bilinear',
                                                            align_corners=True)
                    x = block(x, skip)
                elif is_kind(block, AttnBlock):
                    x = block(x, clip)
                elif is_kind(block, TimestepBlock):
                    x = block(x, r_embed)
                else:
                    x = block(x)
            if j < n_maps:
                x = repeat_maps[j](x)
        x = upscaler(x)
    return x
|
231 |
+
|
232 |
+
def forward(self, x, r, effnet, clip, pixels=None, **kwargs):
    """Run the full model: conditioning, embedding, down path, up path, head.

    Args:
        x: input tensor passed to ``self.embedding``.
        r: tensor given to ``gen_r_embedding``.
        effnet: conditioning map; bilinearly resized to the embedded ``x``'s
            spatial size, passed through ``self.effnet_mapper`` and added.
        clip: CLIP embedding passed to ``gen_c_embeddings``.
        pixels: optional conditioning image; defaults to zeros of shape
            ``(batch, 3, 8, 8)``. It goes through ``self.pixels_mapper``, is
            resized to the embedded ``x``'s spatial size and added.
        **kwargs: may hold one tensor per name in ``self.t_conds``; a missing
            name is replaced by ``torch.zeros_like(r)``.

    Returns:
        ``self.clf`` applied to the decoded feature map.
    """
    if pixels is None:
        pixels = x.new_zeros(x.size(0), 3, 8, 8)

    # Conditioning: r embedding followed by one embedding per extra condition.
    parts = [self.gen_r_embedding(r).to(dtype=x.dtype)]
    for name in self.t_conds:
        t_cond = kwargs.get(name, torch.zeros_like(r))
        parts.append(self.gen_r_embedding(t_cond).to(dtype=x.dtype))
    r_embed = parts[0]
    for extra in parts[1:]:
        r_embed = torch.cat([r_embed, extra], dim=1)
    clip = self.gen_c_embeddings(clip)

    # Model blocks
    x = self.embedding(x)
    spatial = x.shape[-2:]
    effnet_resized = nn.functional.interpolate(effnet, size=spatial, mode='bilinear', align_corners=True)
    x = x + self.effnet_mapper(effnet_resized)
    pixel_features = self.pixels_mapper(pixels)
    x = x + nn.functional.interpolate(pixel_features, size=x.shape[-2:], mode='bilinear',
                                      align_corners=True)
    level_outputs = self._down_encode(x, r_embed, clip)
    decoded = self._up_decode(level_outputs, r_embed, clip)
    return self.clf(decoded)
|
252 |
+
|
253 |
+
def update_weights_ema(self, src_model, beta=0.999):
    """Blend ``src_model``'s parameters and buffers into this module in place.

    Every parameter, then every buffer, is replaced by
    ``own * beta + source * (1 - beta)``. Tensors are paired positionally via
    ``zip``, so both modules are expected to enumerate them in the same order;
    source tensors are cloned and moved to the destination's device first.
    """
    paired_groups = (
        zip(self.parameters(), src_model.parameters()),
        zip(self.buffers(), src_model.buffers()),
    )
    for pairs in paired_groups:
        for target, source in pairs:
            incoming = source.data.clone().to(target.device)
            target.data = target.data * beta + incoming * (1 - beta)
|
ComfyUI/comfy/ldm/cascade/stage_c.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
|
19 |
+
import torch
|
20 |
+
from torch import nn
|
21 |
+
import numpy as np
|
22 |
+
import math
|
23 |
+
from .common import AttnBlock, LayerNorm2d_op, ResBlock, FeedForwardBlock, TimestepBlock
|
24 |
+
# from .controlnet import ControlNetDeliverer
|
25 |
+
|
26 |
+
class UpDownBlock2d(nn.Module):
    """1x1 convolution combined with an optional bilinear 2x up/down resampling.

    In ``'up'`` mode the resampling runs before the convolution, in ``'down'``
    mode after it. With ``enabled=False`` the resampling is replaced by an
    identity, leaving only the channel-mapping convolution.

    Args:
        c_in: input channels of the convolution.
        c_out: output channels of the convolution.
        mode: ``'up'`` (scale factor 2) or ``'down'`` (scale factor 0.5).
        enabled: whether to resample at all.
        dtype, device: forwarded to the convolution constructor.
        operations: namespace providing ``Conv2d``.
    """

    def __init__(self, c_in, c_out, mode, enabled=True, dtype=None, device=None, operations=None):
        super().__init__()
        assert mode in ['up', 'down']
        upsampling = mode == 'up'
        if enabled:
            resample = nn.Upsample(scale_factor=2 if upsampling else 0.5, mode='bilinear',
                                   align_corners=True)
        else:
            resample = nn.Identity()
        conv = operations.Conv2d(c_in, c_out, kernel_size=1, dtype=dtype, device=device)
        ordered = [resample, conv] if upsampling else [conv, resample]
        self.blocks = nn.ModuleList(ordered)

    def forward(self, x):
        """Apply the two stored stages to ``x`` in order."""
        out = x
        for stage in self.blocks:
            out = stage(out)
        return out
|
39 |
+
|
40 |
+
|
41 |
+
class StageC(nn.Module):
    """Multi-level conditional network built from Res/Attn/FeedForward/Timestep blocks.

    The input is embedded, run through a list of "down" levels, then through
    the matching "up" levels (with skip connections from the down path), and
    finally mapped to ``c_out`` channels by ``self.clf``. Conditioning comes
    from three CLIP inputs (text tokens, pooled text, image) and from
    sinusoidal embeddings of ``r`` plus the extra conditions named in
    ``t_conds``.
    """

    def __init__(self, c_in=16, c_out=16, c_r=64, patch_size=1, c_cond=2048, c_hidden=[2048, 2048], nhead=[32, 32],
                 blocks=[[8, 24], [24, 8]], block_repeat=[[1, 1], [1, 1]], level_config=['CTA', 'CTA'],
                 c_clip_text=1280, c_clip_text_pooled=1280, c_clip_img=768, c_clip_seq=4, kernel_size=3,
                 dropout=[0.0, 0.0], self_attn=True, t_conds=['sca', 'crp'], switch_level=[False], stable_cascade_stage=None,
                 dtype=None, device=None, operations=None):
        """Build all submodules.

        Args:
            c_in, c_out: input / output channels (before/after patching).
            c_r: width of one sinusoidal embedding (see ``gen_r_embedding``).
            patch_size: factor for ``PixelUnshuffle`` on input and
                ``PixelShuffle`` on output.
            c_cond: channel width of the CLIP conditioning sequence.
            c_hidden: hidden channels per level; its length is the number of levels.
            nhead: attention heads per level.
            blocks: ``blocks[0][i]`` is how many times ``level_config[i]`` is
                instantiated at down level ``i``; ``blocks[1]`` is read
                reversed for the up levels.
            block_repeat: same layout as ``blocks``; a value ``n`` creates
                ``n - 1`` 1x1 repeat-mapper convolutions for that level.
            level_config: one string per level; each character selects a block
                type: ``'C'`` ResBlock, ``'A'`` AttnBlock, ``'F'``
                FeedForwardBlock, ``'T'`` TimestepBlock.
            c_clip_text, c_clip_text_pooled, c_clip_img: input widths of the
                three CLIP mappers.
            c_clip_seq: number of tokens each pooled-text / image embedding is
                expanded into.
            kernel_size: forwarded to every ResBlock.
            dropout, self_attn: per-level lists, or scalars broadcast to all levels.
            t_conds: names of the extra conditions looked up in ``forward``'s kwargs.
            switch_level: per-transition flags enabling resampling in the
                up/down scalers.
            stable_cascade_stage: accepted but not used in this class.
            dtype, device, operations: forwarded to the layer constructors;
                ``operations`` must provide ``Linear``, ``LayerNorm`` and ``Conv2d``.

        The list defaults are only read in this method, but ``t_conds`` is
        stored on the instance, so instances built with the default share one
        list object.
        """
        super().__init__()
        self.dtype = dtype
        self.c_r = c_r
        self.t_conds = t_conds
        self.c_clip_seq = c_clip_seq
        # Broadcast scalar settings to one entry per level.
        if not isinstance(dropout, list):
            dropout = [dropout] * len(c_hidden)
        if not isinstance(self_attn, list):
            self_attn = [self_attn] * len(c_hidden)

        # CONDITIONING
        self.clip_txt_mapper = operations.Linear(c_clip_text, c_cond, dtype=dtype, device=device)
        # The pooled-text and image mappers emit c_clip_seq tokens of width c_cond each.
        self.clip_txt_pooled_mapper = operations.Linear(c_clip_text_pooled, c_cond * c_clip_seq, dtype=dtype, device=device)
        self.clip_img_mapper = operations.Linear(c_clip_img, c_cond * c_clip_seq, dtype=dtype, device=device)
        self.clip_norm = operations.LayerNorm(c_cond, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)

        self.embedding = nn.Sequential(
            nn.PixelUnshuffle(patch_size),
            operations.Conv2d(c_in * (patch_size ** 2), c_hidden[0], kernel_size=1, dtype=dtype, device=device),
            LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6)
        )

        def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0, self_attn=True):
            # Factory for one block; closes over kernel_size, c_cond, c_r,
            # t_conds, dtype, device and operations from the enclosing scope.
            if block_type == 'C':
                return ResBlock(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout, dtype=dtype, device=device, operations=operations)
            elif block_type == 'A':
                return AttnBlock(c_hidden, c_cond, nhead, self_attn=self_attn, dropout=dropout, dtype=dtype, device=device, operations=operations)
            elif block_type == 'F':
                return FeedForwardBlock(c_hidden, dropout=dropout, dtype=dtype, device=device, operations=operations)
            elif block_type == 'T':
                return TimestepBlock(c_hidden, c_r, conds=t_conds, dtype=dtype, device=device, operations=operations)
            else:
                raise Exception(f'Block type {block_type} not supported')

        # BLOCKS
        # -- down blocks
        self.down_blocks = nn.ModuleList()
        self.down_downscalers = nn.ModuleList()
        self.down_repeat_mappers = nn.ModuleList()
        for i in range(len(c_hidden)):
            if i > 0:
                self.down_downscalers.append(nn.Sequential(
                    LayerNorm2d_op(operations)(c_hidden[i - 1], elementwise_affine=False, eps=1e-6),
                    UpDownBlock2d(c_hidden[i - 1], c_hidden[i], mode='down', enabled=switch_level[i - 1], dtype=dtype, device=device, operations=operations)
                ))
            else:
                # The first level works at the embedding's resolution and width.
                self.down_downscalers.append(nn.Identity())
            down_block = nn.ModuleList()
            for _ in range(blocks[0][i]):
                for block_type in level_config[i]:
                    block = get_block(block_type, c_hidden[i], nhead[i], dropout=dropout[i], self_attn=self_attn[i])
                    down_block.append(block)
            self.down_blocks.append(down_block)
            # NOTE(review): with block_repeat=None no repeat-mapper list is
            # appended, so the zip() in _down_encode yields no levels and
            # _up_decode then fails on level_outputs[0].
            if block_repeat is not None:
                block_repeat_mappers = nn.ModuleList()
                for _ in range(block_repeat[0][i] - 1):
                    block_repeat_mappers.append(operations.Conv2d(c_hidden[i], c_hidden[i], kernel_size=1, dtype=dtype, device=device))
                self.down_repeat_mappers.append(block_repeat_mappers)

        # -- up blocks
        self.up_blocks = nn.ModuleList()
        self.up_upscalers = nn.ModuleList()
        self.up_repeat_mappers = nn.ModuleList()
        # Up levels are created from the last level back to the first.
        for i in reversed(range(len(c_hidden))):
            if i > 0:
                self.up_upscalers.append(nn.Sequential(
                    LayerNorm2d_op(operations)(c_hidden[i], elementwise_affine=False, eps=1e-6),
                    UpDownBlock2d(c_hidden[i], c_hidden[i - 1], mode='up', enabled=switch_level[i - 1], dtype=dtype, device=device, operations=operations)
                ))
            else:
                self.up_upscalers.append(nn.Identity())
            up_block = nn.ModuleList()
            for j in range(blocks[1][::-1][i]):
                for k, block_type in enumerate(level_config[i]):
                    # Only the very first block of every level except the last
                    # one is built with skip-connection input channels.
                    c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0
                    block = get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i],
                                      self_attn=self_attn[i])
                    up_block.append(block)
            self.up_blocks.append(up_block)
            if block_repeat is not None:
                block_repeat_mappers = nn.ModuleList()
                for _ in range(block_repeat[1][::-1][i] - 1):
                    block_repeat_mappers.append(operations.Conv2d(c_hidden[i], c_hidden[i], kernel_size=1, dtype=dtype, device=device))
                self.up_repeat_mappers.append(block_repeat_mappers)

        # OUTPUT
        self.clf = nn.Sequential(
            LayerNorm2d_op(operations)(c_hidden[0], elementwise_affine=False, eps=1e-6, dtype=dtype, device=device),
            operations.Conv2d(c_hidden[0], c_out * (patch_size ** 2), kernel_size=1, dtype=dtype, device=device),
            nn.PixelShuffle(patch_size),
        )

    # --- WEIGHT INIT ---
    # (disabled: the initialisation code below is kept for reference only)
    #     self.apply(self._init_weights)  # General init
    #     nn.init.normal_(self.clip_txt_mapper.weight, std=0.02)  # conditionings
    #     nn.init.normal_(self.clip_txt_pooled_mapper.weight, std=0.02)  # conditionings
    #     nn.init.normal_(self.clip_img_mapper.weight, std=0.02)  # conditionings
    #     torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02)  # inputs
    #     nn.init.constant_(self.clf[1].weight, 0)  # outputs
    #
    #     # blocks
    #     for level_block in self.down_blocks + self.up_blocks:
    #         for block in level_block:
    #             if isinstance(block, ResBlock) or isinstance(block, FeedForwardBlock):
    #                 block.channelwise[-1].weight.data *= np.sqrt(1 / sum(blocks[0]))
    #             elif isinstance(block, TimestepBlock):
    #                 for layer in block.modules():
    #                     if isinstance(layer, nn.Linear):
    #                         nn.init.constant_(layer.weight, 0)
    #
    # def _init_weights(self, m):
    #     if isinstance(m, (nn.Conv2d, nn.Linear)):
    #         torch.nn.init.xavier_uniform_(m.weight)
    #         if m.bias is not None:
    #             nn.init.constant_(m.bias, 0)

    def gen_r_embedding(self, r, max_positions=10000):
        """Return a ``(len(r), self.c_r)`` sinusoidal embedding of ``r``.

        ``r`` is scaled by ``max_positions``; sines and cosines over
        ``self.c_r // 2`` geometrically spaced frequencies are concatenated,
        and one zero column is appended when ``self.c_r`` is odd.
        """
        r = r * max_positions
        half_dim = self.c_r // 2
        # NOTE(review): half_dim == 1 divides by zero here.
        emb = math.log(max_positions) / (half_dim - 1)
        emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
        emb = r[:, None] * emb[None, :]
        emb = torch.cat([emb.sin(), emb.cos()], dim=1)
        if self.c_r % 2 == 1:  # zero pad
            emb = nn.functional.pad(emb, (0, 1), mode='constant')
        return emb

    def gen_c_embeddings(self, clip_txt, clip_txt_pooled, clip_img):
        """Map the three CLIP inputs to one normalised conditioning sequence.

        Pooled-text and image inputs that are 2-D get a token axis inserted;
        each of their tokens is expanded into ``self.c_clip_seq`` tokens. The
        three token sequences are concatenated along dim 1 (text, pooled text,
        image) and passed through ``self.clip_norm``.
        """
        clip_txt = self.clip_txt_mapper(clip_txt)
        if len(clip_txt_pooled.shape) == 2:
            clip_txt_pooled = clip_txt_pooled.unsqueeze(1)
        if len(clip_img.shape) == 2:
            clip_img = clip_img.unsqueeze(1)
        clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view(clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.c_clip_seq, -1)
        clip_img = self.clip_img_mapper(clip_img).view(clip_img.size(0), clip_img.size(1) * self.c_clip_seq, -1)
        clip = torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1)
        clip = self.clip_norm(clip)
        return clip

    def _down_encode(self, x, r_embed, clip, cnet=None):
        """Run the down path; return each level's output, last level first.

        ``cnet``, when given, is consumed with ``pop()`` once per ResBlock
        call; a non-``None`` entry is resized to ``x`` and added before the
        block runs. The same object is later passed to ``_up_decode``, which
        keeps popping from it.
        """
        level_outputs = []
        block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers)
        for down_block, downscaler, repmap in block_group:
            x = downscaler(x)
            # One pass more than there are repeat mappers; a mapper runs
            # between consecutive passes.
            for i in range(len(repmap) + 1):
                for block in down_block:
                    # Each check also accepts a wrapper exposing the block as
                    # `_fsdp_wrapped_module` (presumably PyTorch FSDP).
                    if isinstance(block, ResBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  ResBlock)):
                        if cnet is not None:
                            next_cnet = cnet.pop()
                            if next_cnet is not None:
                                x = x + nn.functional.interpolate(next_cnet, size=x.shape[-2:], mode='bilinear',
                                                                  align_corners=True).to(x.dtype)
                        x = block(x)
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
                        x = block(x, r_embed)
                    else:
                        x = block(x)
                if i < len(repmap):
                    x = repmap[i](x)
            # Prepend so the deepest level ends up at index 0.
            level_outputs.insert(0, x)
        return level_outputs

    def _up_decode(self, level_outputs, r_embed, clip, cnet=None):
        """Run the up path from ``level_outputs[0]`` and return the result.

        The first block of every up level after the first receives
        ``level_outputs[i]`` as skip input when it is a ResBlock; ``x`` is
        resized to the skip's spatial size if they differ. ``cnet`` entries
        are popped and added exactly as in ``_down_encode``.
        """
        x = level_outputs[0]
        block_group = zip(self.up_blocks, self.up_upscalers, self.up_repeat_mappers)
        for i, (up_block, upscaler, repmap) in enumerate(block_group):
            for j in range(len(repmap) + 1):
                for k, block in enumerate(up_block):
                    if isinstance(block, ResBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  ResBlock)):
                        skip = level_outputs[i] if k == 0 and i > 0 else None
                        if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
                            x = torch.nn.functional.interpolate(x, skip.shape[-2:], mode='bilinear',
                                                                align_corners=True)
                        if cnet is not None:
                            next_cnet = cnet.pop()
                            if next_cnet is not None:
                                x = x + nn.functional.interpolate(next_cnet, size=x.shape[-2:], mode='bilinear',
                                                                  align_corners=True).to(x.dtype)
                        x = block(x, skip)
                    elif isinstance(block, AttnBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  AttnBlock)):
                        x = block(x, clip)
                    elif isinstance(block, TimestepBlock) or (
                            hasattr(block, '_fsdp_wrapped_module') and isinstance(block._fsdp_wrapped_module,
                                                                                  TimestepBlock)):
                        x = block(x, r_embed)
                    else:
                        x = block(x)
                if j < len(repmap):
                    x = repmap[j](x)
            x = upscaler(x)
        return x

    def forward(self, x, r, clip_text, clip_text_pooled, clip_img, control=None, **kwargs):
        """Run the model on ``x`` with the given conditioning.

        Args:
            x: input tensor passed to ``self.embedding``.
            r: tensor given to ``gen_r_embedding``.
            clip_text, clip_text_pooled, clip_img: inputs to ``gen_c_embeddings``.
            control: optional mapping; its ``"input"`` entry is used as the
                control residual container. It is consumed with ``pop()``, so
                the caller's object is emptied as the blocks run (presumably a
                list -- confirm against the caller).
            **kwargs: may hold one tensor per name in ``self.t_conds``;
                missing names default to ``torch.zeros_like(r)``.

        Returns:
            ``self.clf`` applied to the decoded feature map.
        """
        # Process the conditioning embeddings
        r_embed = self.gen_r_embedding(r).to(dtype=x.dtype)
        for c in self.t_conds:
            t_cond = kwargs.get(c, torch.zeros_like(r))
            r_embed = torch.cat([r_embed, self.gen_r_embedding(t_cond).to(dtype=x.dtype)], dim=1)
        clip = self.gen_c_embeddings(clip_text, clip_text_pooled, clip_img)

        if control is not None:
            cnet = control.get("input")
        else:
            cnet = None

        # Model Blocks
        x = self.embedding(x)
        level_outputs = self._down_encode(x, r_embed, clip, cnet)
        x = self._up_decode(level_outputs, r_embed, clip, cnet)
        return self.clf(x)

    def update_weights_ema(self, src_model, beta=0.999):
        """Blend ``src_model``'s parameters and buffers into this module.

        Each tensor becomes ``own * beta + source * (1 - beta)``; tensors are
        paired positionally, so both modules must enumerate them identically.
        """
        for self_params, src_params in zip(self.parameters(), src_model.parameters()):
            self_params.data = self_params.data * beta + src_params.data.clone().to(self_params.device) * (1 - beta)
        for self_buffers, src_buffers in zip(self.buffers(), src_model.buffers()):
            self_buffers.data = self_buffers.data * beta + src_buffers.data.clone().to(self_buffers.device) * (1 - beta)
|
ComfyUI/comfy/ldm/cascade/stage_c_coder.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This file is part of ComfyUI.
|
3 |
+
Copyright (C) 2024 Stability AI
|
4 |
+
|
5 |
+
This program is free software: you can redistribute it and/or modify
|
6 |
+
it under the terms of the GNU General Public License as published by
|
7 |
+
the Free Software Foundation, either version 3 of the License, or
|
8 |
+
(at your option) any later version.
|
9 |
+
|
10 |
+
This program is distributed in the hope that it will be useful,
|
11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
13 |
+
GNU General Public License for more details.
|
14 |
+
|
15 |
+
You should have received a copy of the GNU General Public License
|
16 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
17 |
+
"""
|
18 |
+
import torch
|
19 |
+
import torchvision
|
20 |
+
from torch import nn
|
21 |
+
|
22 |
+
|
23 |
+
# EfficientNet
|
24 |
+
class EfficientNetEncoder(nn.Module):
    """Encode images into ``c_latent``-channel feature maps.

    The feature extractor is the ``features`` part of torchvision's
    ``efficientnet_v2_s`` (built without pretrained weights here and put in
    eval mode); a 1x1 convolution followed by a non-affine batch norm maps its
    1280 output channels to ``c_latent``.
    """

    def __init__(self, c_latent=16):
        super().__init__()
        feature_net = torchvision.models.efficientnet_v2_s().features
        self.backbone = feature_net.eval()
        project = nn.Conv2d(1280, c_latent, kernel_size=1, bias=False)
        # Non-affine batch norm: standardises the latents to mean 0 / std 1.
        standardise = nn.BatchNorm2d(c_latent, affine=False)
        self.mapper = nn.Sequential(project, standardise)
        # Per-channel normalisation constants (they match the commonly used
        # ImageNet statistics); stored as parameters.
        self.mean = nn.Parameter(torch.tensor([0.485, 0.456, 0.406]))
        self.std = nn.Parameter(torch.tensor([0.229, 0.224, 0.225]))

    def forward(self, x):
        """Rescale ``x`` via ``x * 0.5 + 0.5``, normalise per channel, and encode.

        Presumably ``x`` is a 3-channel image batch in ``[-1, 1]`` -- confirm
        against the caller.
        """
        unit_range = x * 0.5 + 0.5
        shift = self.mean.view([3, 1, 1])
        scale = self.std.view([3, 1, 1])
        features = self.backbone((unit_range - shift) / scale)
        return self.mapper(features)
|
40 |
+
|
41 |
+
|
42 |
+
# Fast Decoder for Stage C latents. E.g. 16 x 24 x 24 -> 3 x 192 x 192
|
43 |
+
class Previewer(nn.Module):
    """Small convolutional decoder that upsamples its input 8x spatially.

    The network is a flat ``nn.Sequential`` of conv / GELU / batch-norm
    triples with three stride-2 transposed convolutions, ending in a 1x1
    convolution to ``c_out`` channels. Layer order (and therefore the indices
    inside ``self.blocks``) is fixed.
    """

    def __init__(self, c_in=16, c_hidden=512, c_out=3):
        super().__init__()
        half = c_hidden // 2
        quarter = c_hidden // 4

        def act_norm(channels):
            # Every convolution except the last is followed by GELU + BatchNorm.
            return [nn.GELU(), nn.BatchNorm2d(channels)]

        layers = [nn.Conv2d(c_in, c_hidden, kernel_size=1), *act_norm(c_hidden)]
        layers += [nn.Conv2d(c_hidden, c_hidden, kernel_size=3, padding=1), *act_norm(c_hidden)]

        # first 2x upsampling
        layers += [nn.ConvTranspose2d(c_hidden, half, kernel_size=2, stride=2), *act_norm(half)]
        layers += [nn.Conv2d(half, half, kernel_size=3, padding=1), *act_norm(half)]

        # second 2x upsampling
        layers += [nn.ConvTranspose2d(half, quarter, kernel_size=2, stride=2), *act_norm(quarter)]
        layers += [nn.Conv2d(quarter, quarter, kernel_size=3, padding=1), *act_norm(quarter)]

        # third 2x upsampling (channel count unchanged)
        layers += [nn.ConvTranspose2d(quarter, quarter, kernel_size=2, stride=2), *act_norm(quarter)]
        layers += [nn.Conv2d(quarter, quarter, kernel_size=3, padding=1), *act_norm(quarter)]

        layers.append(nn.Conv2d(quarter, c_out, kernel_size=1))
        self.blocks = nn.Sequential(*layers)

    def forward(self, x):
        """Decode ``x`` and rescale the result via ``(y - 0.5) * 2.0``."""
        decoded = self.blocks(x)
        return (decoded - 0.5) * 2.0
|
84 |
+
|
85 |
+
class StageC_coder(nn.Module):
    """Pair an ``EfficientNetEncoder`` with a ``Previewer`` behind encode/decode.

    Both submodules are built with their default arguments.
    """

    def __init__(self):
        super().__init__()
        self.previewer = Previewer()
        self.encoder = EfficientNetEncoder()

    def encode(self, x):
        """Return ``self.encoder`` applied to ``x``."""
        latent = self.encoder(x)
        return latent

    def decode(self, x):
        """Return ``self.previewer`` applied to ``x``."""
        preview = self.previewer(x)
        return preview
|
ComfyUI/comfy/ldm/models/autoencoder.py
ADDED
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
from contextlib import contextmanager
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
# import pytorch_lightning as pl
import torch.nn.functional as F

import comfy.ops
from comfy.ldm.modules.distributions.distributions import DiagonalGaussianDistribution
from comfy.ldm.modules.ema import LitEma
from comfy.ldm.util import get_obj_from_str
from comfy.ldm.util import instantiate_from_config
|
12 |
+
|
13 |
+
class DiagonalGaussianRegularizer(torch.nn.Module):
    """Turn encoder output into a latent via ``DiagonalGaussianDistribution``.

    Args:
        sample: when true, ``forward`` draws ``posterior.sample()``; otherwise
            it returns ``posterior.mode()``.
    """

    def __init__(self, sample: bool = True):
        super().__init__()
        self.sample = sample

    def get_trainable_parameters(self) -> Any:
        """Yield nothing: this regularizer has no trainable parameters."""
        yield from ()

    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
        """Return ``(latent, log)`` for the distribution parameterised by ``z``.

        ``log["kl_loss"]`` is the posterior's KL term summed over all elements
        and divided by the size of its first dimension.
        """
        posterior = DiagonalGaussianDistribution(z)
        latent = posterior.sample() if self.sample else posterior.mode()
        kl_terms = posterior.kl()
        log = {"kl_loss": torch.sum(kl_terms) / kl_terms.shape[0]}
        return latent, log
|
32 |
+
|
33 |
+
|
34 |
+
class AbstractAutoencoder(torch.nn.Module):
    """
    This is the base class for all autoencoders, including image autoencoders, image autoencoders with discriminators,
    unCLIP models, etc. Hence, it is fairly general, and specific features
    (e.g. discriminator training, encoding, decoding) must be implemented in subclasses.

    Args:
        ema_decay: when not ``None``, a ``LitEma`` copy of the weights is kept
            with this decay and ``use_ema`` is set.
        monitor: stored as ``self.monitor`` only when given.
        input_key: stored as ``self.input_key``.
        **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        ema_decay: Union[None, float] = None,
        monitor: Union[None, str] = None,
        input_key: str = "jpg",
        **kwargs,
    ):
        super().__init__()

        self.input_key = input_key
        self.use_ema = ema_decay is not None
        if monitor is not None:
            self.monitor = monitor

        if self.use_ema:
            self.model_ema = LitEma(self, decay=ema_decay)
            # Logging goes through the stdlib logger; the previously used
            # `logpy` name was never defined in this module.
            logging.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

    def get_input(self, batch) -> Any:
        """Extract the model input from ``batch``; must be overridden."""
        raise NotImplementedError()

    def on_train_batch_end(self, *args, **kwargs):
        """Update the EMA weights after a training batch when EMA is enabled."""
        # for EMA computation
        if self.use_ema:
            self.model_ema(self)

    @contextmanager
    def ema_scope(self, context=None):
        """Temporarily swap in the EMA weights for the duration of the block.

        Without EMA this is a no-op context. With EMA, the current parameters
        are stored, the EMA weights copied in, and the stored parameters are
        restored on exit even if the body raises. ``context`` is only used as
        a prefix for the log messages.
        """
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                logging.info(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    logging.info(f"{context}: Restored training weights")

    def encode(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError("encode()-method of abstract base class called")

    def decode(self, *args, **kwargs) -> torch.Tensor:
        raise NotImplementedError("decode()-method of abstract base class called")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
        """Build the optimizer named by ``cfg["target"]``.

        The target is resolved with ``get_obj_from_str`` and called with
        ``params``, ``lr`` and the optional ``cfg["params"]`` keyword dict.
        """
        logging.info(f"loading >>> {cfg['target']} <<< optimizer from config")
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )

    def configure_optimizers(self) -> Any:
        """Return the optimizer setup; must be overridden."""
        raise NotImplementedError()
|
96 |
+
|
97 |
+
|
98 |
+
class AutoencodingEngine(AbstractAutoencoder):
    """
    Base class for all image autoencoders that we train, like VQGAN or AutoencoderKL
    (we also restore them explicitly as special cases for legacy reasons).
    Regularizations such as KL or VQ are moved to the regularizer class.

    Args:
        encoder_config, decoder_config, regularizer_config: config dicts
            passed to ``instantiate_from_config`` to build the three parts.
        *args, **kwargs: forwarded to ``AbstractAutoencoder``.
    """

    def __init__(
        self,
        *args,
        encoder_config: Dict,
        decoder_config: Dict,
        regularizer_config: Dict,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        self.encoder: torch.nn.Module = instantiate_from_config(encoder_config)
        self.decoder: torch.nn.Module = instantiate_from_config(decoder_config)
        # Annotated as a plain module: the previous annotation named
        # `AbstractRegularizer`, which is not defined or imported here.
        self.regularization: torch.nn.Module = instantiate_from_config(
            regularizer_config
        )

    def get_last_layer(self):
        """Return the decoder's last layer (delegates to the decoder)."""
        return self.decoder.get_last_layer()

    def encode(
        self,
        x: torch.Tensor,
        return_reg_log: bool = False,
        unregularized: bool = False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
        """Encode ``x`` and apply the regularizer.

        Returns:
            ``z`` alone by default; ``(z, reg_log)`` when ``return_reg_log``
            is true. With ``unregularized`` the raw encoder output is returned
            together with an empty dict, regardless of ``return_reg_log``.
        """
        z = self.encoder(x)
        if unregularized:
            return z, dict()
        z, reg_log = self.regularization(z)
        if return_reg_log:
            return z, reg_log
        return z

    def decode(self, z: torch.Tensor, **kwargs) -> torch.Tensor:
        """Decode ``z``; extra keyword arguments go to the decoder."""
        x = self.decoder(z, **kwargs)
        return x

    def forward(
        self, x: torch.Tensor, **additional_decode_kwargs
    ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
        """Encode then decode ``x``; return ``(z, reconstruction, reg_log)``."""
        z, reg_log = self.encode(x, return_reg_log=True)
        dec = self.decode(z, **additional_decode_kwargs)
        return z, dec, reg_log
|
148 |
+
|
149 |
+
|
150 |
+
class AutoencodingEngineLegacy(AutoencodingEngine):
    """Autoencoder configured from a legacy ``ddconfig`` dict.

    The encoder and decoder are the ``Encoder`` / ``Decoder`` classes from
    ``comfy.ldm.modules.diffusionmodules.model`` built from ``ddconfig``, with
    1x1 ``quant_conv`` / ``post_quant_conv`` layers between them and the
    latent space.

    Args:
        embed_dim: channel count of the latent.
        **kwargs: must contain ``ddconfig``; may contain ``max_batch_size``
            (when set, encode/decode process the batch in slices of at most
            that many items). Everything else goes to ``AutoencodingEngine``.
    """

    def __init__(self, embed_dim: int, **kwargs):
        # Plain attribute, set before Module.__init__ as in the original code.
        self.max_batch_size = kwargs.pop("max_batch_size", None)
        ddconfig = kwargs.pop("ddconfig")
        super().__init__(
            encoder_config={
                "target": "comfy.ldm.modules.diffusionmodules.model.Encoder",
                "params": ddconfig,
            },
            decoder_config={
                "target": "comfy.ldm.modules.diffusionmodules.model.Decoder",
                "params": ddconfig,
            },
            **kwargs,
        )
        # `double_z` doubles the channel count on the encoder side.
        self.quant_conv = comfy.ops.disable_weight_init.Conv2d(
            (1 + ddconfig["double_z"]) * ddconfig["z_channels"],
            (1 + ddconfig["double_z"]) * embed_dim,
            1,
        )
        self.post_quant_conv = comfy.ops.disable_weight_init.Conv2d(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim

    def get_autoencoder_params(self) -> list:
        # NOTE(review): no parent class visible in this file defines
        # get_autoencoder_params, so this call likely raises AttributeError.
        params = super().get_autoencoder_params()
        return params

    def _map_in_batches(self, fn, t: torch.Tensor) -> torch.Tensor:
        """Apply ``fn`` to ``t``, sliced along dim 0 when ``max_batch_size`` is set.

        Slices hold at most ``max_batch_size`` items and the results are
        concatenated along dim 0 in order.
        """
        bs = self.max_batch_size
        if bs is None:
            return fn(t)
        # Stepping with range() gives ceil(N / bs) slices without needing
        # `math`, which this module never imported.
        pieces = [fn(t[start:start + bs]) for start in range(0, t.shape[0], bs)]
        return torch.cat(pieces, 0)

    def encode(
        self, x: torch.Tensor, return_reg_log: bool = False
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, dict]]:
        """Encode ``x``, apply ``quant_conv`` and the regularizer.

        Returns ``z``, or ``(z, reg_log)`` when ``return_reg_log`` is true.
        """
        z = self._map_in_batches(lambda part: self.quant_conv(self.encoder(part)), x)

        z, reg_log = self.regularization(z)
        if return_reg_log:
            return z, reg_log
        return z

    def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
        """Apply ``post_quant_conv`` then the decoder to ``z``.

        ``decoder_kwargs`` are forwarded to the decoder for every slice.
        """
        return self._map_in_batches(
            lambda part: self.decoder(self.post_quant_conv(part), **decoder_kwargs), z
        )
|
215 |
+
|
216 |
+
|
217 |
+
class AutoencoderKL(AutoencodingEngineLegacy):
    """``AutoencodingEngineLegacy`` regularized by ``DiagonalGaussianRegularizer``."""

    def __init__(self, **kwargs):
        """Accept the legacy ``lossconfig`` keyword by renaming it to ``loss_config``."""
        if "lossconfig" in kwargs:
            kwargs["loss_config"] = kwargs.pop("lossconfig")

        regularizer_config = {
            "target": "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"
        }
        super().__init__(regularizer_config=regularizer_config, **kwargs)
|
ComfyUI/comfy/ldm/modules/attention.py
ADDED
@@ -0,0 +1,801 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
from torch import nn, einsum
|
5 |
+
from einops import rearrange, repeat
|
6 |
+
from typing import Optional, Any
|
7 |
+
import logging
|
8 |
+
|
9 |
+
from .diffusionmodules.util import checkpoint, AlphaBlender, timestep_embedding
|
10 |
+
from .sub_quadratic_attention import efficient_dot_product_attention
|
11 |
+
|
12 |
+
from comfy import model_management
|
13 |
+
|
14 |
+
if model_management.xformers_enabled():
|
15 |
+
import xformers
|
16 |
+
import xformers.ops
|
17 |
+
|
18 |
+
from comfy.cli_args import args
|
19 |
+
import comfy.ops
|
20 |
+
ops = comfy.ops.disable_weight_init
|
21 |
+
|
22 |
+
# CrossAttn precision handling
# "fp32" (the default) makes the attention functions below upcast before the
# score matmul; --dont-upcast-attention switches that off.
_ATTN_PRECISION = "fp32"
if args.dont_upcast_attention:
    logging.info("disabling upcasting of attention")
    _ATTN_PRECISION = "fp16"
|
28 |
+
|
29 |
+
|
30 |
+
def exists(val):
    """Return True unless ``val`` is ``None``."""
    return not (val is None)
|
32 |
+
|
33 |
+
|
34 |
+
def uniq(arr):
    """Return a keys view of the distinct elements of ``arr`` in first-seen order."""
    return dict.fromkeys(arr, True).keys()
|
36 |
+
|
37 |
+
|
38 |
+
def default(val, d):
    """Return ``val`` unless it is ``None``, in which case return ``d``."""
    return d if val is None else val
|
42 |
+
|
43 |
+
|
44 |
+
def max_neg_value(t):
    """Return the most negative finite value representable in ``t``'s dtype."""
    limits = torch.finfo(t.dtype)
    return -limits.max
|
46 |
+
|
47 |
+
|
48 |
+
def init_(tensor):
    """Fill ``tensor`` in place with uniform values in ``[-1/sqrt(d), 1/sqrt(d))``.

    ``d`` is the size of the last dimension. The same tensor is returned.
    """
    bound = 1 / math.sqrt(tensor.shape[-1])
    tensor.uniform_(-bound, bound)
    return tensor
|
53 |
+
|
54 |
+
|
55 |
+
# feedforward
|
56 |
+
class GEGLU(nn.Module):
    """Gated GELU unit: one linear projection split into a value half and a gate half."""

    def __init__(self, dim_in, dim_out, dtype=None, device=None, operations=ops):
        super().__init__()
        # A single projection produces both halves, hence twice dim_out features.
        self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device)

    def forward(self, x):
        """Return ``value * gelu(gate)`` where both come from ``self.proj(x)``."""
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
|
64 |
+
|
65 |
+
|
66 |
+
class FeedForward(nn.Module):
    """Two-layer feed-forward network with an optional gated (GEGLU) input projection.

    Args:
        dim: input feature size.
        dim_out: output feature size; falls back to ``dim`` when ``None``.
        mult: the hidden size is ``int(dim * mult)``.
        glu: use ``GEGLU`` for the input projection instead of Linear + GELU.
        dropout: dropout probability applied between the two projections.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0., dtype=None, device=None, operations=ops):
        super().__init__()
        inner_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            project_in = GEGLU(dim, inner_dim, dtype=dtype, device=device, operations=operations)
        else:
            project_in = nn.Sequential(
                operations.Linear(dim, inner_dim, dtype=dtype, device=device),
                nn.GELU()
            )

        project_out = operations.Linear(inner_dim, dim_out, dtype=dtype, device=device)
        self.net = nn.Sequential(project_in, nn.Dropout(dropout), project_out)

    def forward(self, x):
        """Run ``x`` through the input projection, dropout and output projection."""
        return self.net(x)
|
84 |
+
|
85 |
+
def Normalize(in_channels, dtype=None, device=None):
    """Return a 32-group affine ``GroupNorm`` over ``in_channels`` with eps 1e-6."""
    return torch.nn.GroupNorm(
        32,
        in_channels,
        eps=1e-6,
        affine=True,
        dtype=dtype,
        device=device,
    )
|
87 |
+
|
88 |
+
def attention_basic(q, k, v, heads, mask=None):
    """Softmax attention computed with two einsum calls.

    ``q``, ``k`` and ``v`` are split into ``heads`` heads along their last
    dimension, attention is evaluated per head and the heads are merged back.

    Args:
        q, k, v: tensors whose last dimension is ``heads * dim_head``; ``q``
            must be 3-D (its shape is unpacked into three values).
        heads: number of attention heads.
        mask: optional mask. A boolean mask selects positions to keep (the
            rest are filled with the most negative finite value); any other
            mask is added to the attention scores.

    Returns:
        Tensor of shape ``(b, tokens, heads * dim_head)``.
    """
    b, _, inner_dim = q.shape
    dim_head = inner_dim // heads
    scale = dim_head ** -0.5

    def split_heads(t):
        # (b, n, heads * dim_head) -> (b * heads, n, dim_head)
        t = t.unsqueeze(3).reshape(b, -1, heads, dim_head)
        t = t.permute(0, 2, 1, 3).reshape(b * heads, -1, dim_head)
        return t.contiguous()

    q, k, v = split_heads(q), split_heads(k), split_heads(v)

    # force cast to fp32 to avoid overflowing
    if _ATTN_PRECISION == "fp32":
        scores = einsum('b i d, b j d -> b i j', q.float(), k.float()) * scale
    else:
        scores = einsum('b i d, b j d -> b i j', q, k) * scale

    del q, k

    if mask is not None:
        if mask.dtype == torch.bool:
            keep = rearrange(mask, 'b ... -> b (...)') #TODO: check if this bool part matches pytorch attention
            fill_value = -torch.finfo(scores.dtype).max
            keep = repeat(keep, 'b j -> (b h) () j', h=heads)
            scores.masked_fill_(~keep, fill_value)
        else:
            # A 2-D mask carries no batch dimension of its own.
            bs = 1 if len(mask.shape) == 2 else mask.shape[0]
            rows, cols = mask.shape[-2], mask.shape[-1]
            bias = mask.reshape(bs, -1, rows, cols).expand(b, heads, -1, -1).reshape(-1, rows, cols)
            scores.add_(bias)

    # attention, what we cannot get enough of
    probs = scores.softmax(dim=-1)

    out = einsum('b i j, b j d -> b i d', probs.to(v.dtype), v)
    # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
    out = out.unsqueeze(0).reshape(b, heads, -1, dim_head)
    out = out.permute(0, 2, 1, 3).reshape(b, -1, heads * dim_head)
    return out
|
136 |
+
|
137 |
+
|
138 |
+
def attention_sub_quad(query, key, value, heads, mask=None):
    """Attention via ``efficient_dot_product_attention`` with memory-based chunking.

    The query chunk size is the largest of 4096/2048/1024/512/256 for which
    the free-memory estimate covers all key tokens; if none qualifies, 512 is
    used and the kv chunk size is left as ``None``.

    Args:
        query, key, value: tensors whose last dimension is
            ``heads * dim_head``; ``query`` must be 3-D.
        heads: number of attention heads.
        mask: optional mask, expanded to one slice per (batch, head) pair and
            passed through to ``efficient_dot_product_attention``.

    Returns:
        Tensor of shape ``(b, tokens, heads * dim_head)`` in the dtype of
        ``query``.
    """
    b, _, dim_head = query.shape
    dim_head //= heads

    query = query.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 1, 3).reshape(b * heads, -1, dim_head)
    value = value.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 1, 3).reshape(b * heads, -1, dim_head)

    # key is laid out transposed: (b * heads, dim_head, k_tokens)
    key = key.unsqueeze(3).reshape(b, -1, heads, dim_head).permute(0, 2, 3, 1).reshape(b * heads, dim_head, -1)

    dtype = query.dtype
    upcast_attention = _ATTN_PRECISION == "fp32" and query.dtype != torch.float32
    if upcast_attention:
        bytes_per_token = torch.finfo(torch.float32).bits // 8
    else:
        bytes_per_token = torch.finfo(query.dtype).bits // 8
    batch_x_heads = query.shape[0]
    k_tokens = key.shape[2]

    # Only the first element of the returned pair is used here.
    mem_free_total, _ = model_management.get_free_memory(query.device, True)

    kv_chunk_size = None
    query_chunk_size = None

    for candidate in [4096, 2048, 1024, 512, 256]:
        count = mem_free_total / (batch_x_heads * bytes_per_token * candidate * 4.0)
        if count >= k_tokens:
            kv_chunk_size = k_tokens
            query_chunk_size = candidate
            break

    if query_chunk_size is None:
        query_chunk_size = 512

    if mask is not None:
        # A 2-D mask carries no batch dimension of its own.
        if len(mask.shape) == 2:
            bs = 1
        else:
            bs = mask.shape[0]
        mask = mask.reshape(bs, -1, mask.shape[-2], mask.shape[-1]).expand(b, heads, -1, -1).reshape(-1, mask.shape[-2], mask.shape[-1])

    hidden_states = efficient_dot_product_attention(
        query,
        key,
        value,
        query_chunk_size=query_chunk_size,
        kv_chunk_size=kv_chunk_size,
        kv_chunk_size_min=None,
        use_checkpoint=False,
        upcast_attention=upcast_attention,
        mask=mask,
    )

    hidden_states = hidden_states.to(dtype)

    # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
    hidden_states = hidden_states.unflatten(0, (-1, heads)).transpose(1,2).flatten(start_dim=2)
    return hidden_states
|
197 |
+
|
198 |
+
def attention_split(q, k, v, heads, mask=None):
    """Attention evaluated in slices along the query dimension to bound memory use.

    The number of slices (``steps``) starts from an estimate based on free
    memory and is doubled after out-of-memory errors, up to 64.

    NOTE: when the query length is not divisible by ``steps`` the whole query
    is processed as a single slice.

    Args:
        q, k, v: tensors whose last dimension is ``heads * dim_head``; ``q``
            must be 3-D.
        heads: number of attention heads.
        mask: optional mask added to the attention scores.

    Returns:
        Tensor of shape ``(b, tokens, heads * dim_head)``.

    Raises:
        RuntimeError: if the initial estimate already needs more than 64 steps.
        model_management.OOM_EXCEPTION: re-raised when retrying cannot help
            (a slice already partially completed, or more than 64 steps needed).
    """
    b, _, dim_head = q.shape
    dim_head //= heads
    scale = dim_head ** -0.5

    q, k, v = map(
        lambda t: t.unsqueeze(3)
        .reshape(b, -1, heads, dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b * heads, -1, dim_head)
        .contiguous(),
        (q, k, v),
    )

    r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)

    mem_free_total = model_management.get_free_memory(q.device)

    if _ATTN_PRECISION == "fp32":
        element_size = 4
    else:
        element_size = q.element_size()

    gb = 1024 ** 3
    tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * element_size
    modifier = 3
    mem_required = tensor_size * modifier
    steps = 1

    if mem_required > mem_free_total:
        # Smallest power of two that brings the estimate under the free memory.
        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))

    if steps > 64:
        max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
        raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
                           f'Need: {mem_required/64/gb:0.1f}GB free, Have:{mem_free_total/gb:0.1f}GB free')

    if mask is not None:
        # A 2-D mask carries no batch dimension of its own.
        if len(mask.shape) == 2:
            bs = 1
        else:
            bs = mask.shape[0]
        # After this reshape the mask is always 3-D.
        mask = mask.reshape(bs, -1, mask.shape[-2], mask.shape[-1]).expand(b, heads, -1, -1).reshape(-1, mask.shape[-2], mask.shape[-1])

    first_op_done = False
    cleared_cache = False
    while True:
        try:
            slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
            for i in range(0, q.shape[1], slice_size):
                end = i + slice_size
                if _ATTN_PRECISION == "fp32":
                    with torch.autocast(enabled=False, device_type = 'cuda'):
                        s1 = einsum('b i d, b j d -> b i j', q[:, i:end].float(), k.float()) * scale
                else:
                    s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k) * scale

                if mask is not None:
                    s1 += mask[:, i:end]

                s2 = s1.softmax(dim=-1).to(v.dtype)
                del s1
                first_op_done = True

                r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
                del s2
            break
        except model_management.OOM_EXCEPTION:
            # Once a slice has produced output, retrying is not attempted.
            if first_op_done:
                raise
            model_management.soft_empty_cache(True)
            if not cleared_cache:
                # First failure: retry once with the same step count after emptying the cache.
                cleared_cache = True
                logging.warning("out of memory error, emptying cache and trying again")
                continue
            steps *= 2
            if steps > 64:
                raise
            logging.warning("out of memory error, increasing steps and trying again {}".format(steps))

    del q, k, v

    # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
    r1 = (
        r1.unsqueeze(0)
        .reshape(b, heads, -1, dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b, -1, heads * dim_head)
    )
    return r1
|
296 |
+
|
297 |
+
# Flags xformers releases for which attention_xformers falls back to
# attention_pytorch when batch * heads exceeds 65535.
BROKEN_XFORMERS = False
try:
    x_vers = xformers.__version__
    #I think 0.0.23 is also broken (q with bs bigger than 65535 gives CUDA error)
    BROKEN_XFORMERS = x_vers.startswith(("0.0.21", "0.0.22", "0.0.23"))
except (NameError, AttributeError):
    # xformers is only imported when it is enabled, so the name may be
    # undefined here; a missing or non-string __version__ is ignored as well.
    pass
|
304 |
+
|
305 |
+
def attention_xformers(q, k, v, heads, mask=None):
    """Attention via ``xformers.ops.memory_efficient_attention``.

    Falls back to ``attention_pytorch`` when ``BROKEN_XFORMERS`` is set and
    ``b * heads`` exceeds 65535.

    Args:
        q, k, v: tensors whose last dimension is ``heads * dim_head``; ``q``
            must be 3-D.
        heads: number of attention heads.
        mask: optional additive bias passed as ``attn_bias``; its last
            dimension is the key length.

    Returns:
        Tensor of shape ``(b, tokens, heads * dim_head)``.
    """
    b, _, dim_head = q.shape
    dim_head //= heads
    if BROKEN_XFORMERS and b * heads > 65535:
        return attention_pytorch(q, k, v, heads, mask)

    q, k, v = map(
        lambda t: t.unsqueeze(3)
        .reshape(b, -1, heads, dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b * heads, -1, dim_head)
        .contiguous(),
        (q, k, v),
    )

    if mask is not None:
        # Copy the mask into a buffer whose last dimension is padded to a
        # multiple of 8, then slice back to the real width (presumably to meet
        # xformers' attn_bias alignment requirement - confirm against its docs).
        # The buffer width comes from the mask's own key length: sizing it from
        # the query length breaks whenever the key sequence is longer.
        k_len = mask.shape[-1]
        pad = 8 - k_len % 8
        mask_out = torch.empty([q.shape[0], q.shape[1], k_len + pad], dtype=q.dtype, device=q.device)
        mask_out[:, :, :k_len] = mask
        mask = mask_out[:, :, :k_len]

    out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=mask)

    # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
    out = (
        out.unsqueeze(0)
        .reshape(b, heads, -1, dim_head)
        .permute(0, 2, 1, 3)
        .reshape(b, -1, heads * dim_head)
    )
    return out
|
336 |
+
|
337 |
+
def attention_pytorch(q, k, v, heads, mask=None):
    """Attention via ``torch.nn.functional.scaled_dot_product_attention``.

    Args:
        q, k, v: tensors whose last dimension is ``heads * dim_head``; ``q``
            must be 3-D.
        heads: number of attention heads.
        mask: optional mask forwarded as ``attn_mask``.

    Returns:
        Tensor of shape ``(b, tokens, heads * dim_head)``.
    """
    b, _, inner_dim = q.shape
    dim_head = inner_dim // heads

    def to_heads(t):
        # (b, n, heads * dim_head) -> (b, heads, n, dim_head)
        return t.view(b, -1, heads, dim_head).transpose(1, 2)

    q, k, v = to_heads(q), to_heads(k), to_heads(v)

    attended = torch.nn.functional.scaled_dot_product_attention(
        q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
    )
    # (b, heads, n, dim_head) -> (b, n, heads * dim_head)
    return attended.transpose(1, 2).reshape(b, -1, heads * dim_head)
|
350 |
+
|
351 |
+
|
352 |
+
# Choose the module-wide attention implementation once at import time:
# xformers first, then PyTorch attention, then the split or sub-quadratic
# fallback depending on --use-split-cross-attention.
optimized_attention = attention_basic

if model_management.xformers_enabled():
    logging.info("Using xformers cross attention")
    optimized_attention = attention_xformers
elif model_management.pytorch_attention_enabled():
    logging.info("Using pytorch cross attention")
    optimized_attention = attention_pytorch
elif args.use_split_cross_attention:
    logging.info("Using split optimization for cross attention")
    optimized_attention = attention_split
else:
    logging.info("Using sub quadratic optimization for cross attention, if you have memory or speed issues try using: --use-split-cross-attention")
    optimized_attention = attention_sub_quad

# The masked variant currently shares the same implementation.
optimized_attention_masked = optimized_attention
|
369 |
+
|
370 |
+
def optimized_attention_for_device(device, mask=False, small_input=False):
    """Pick an attention function for ``device``.

    Args:
        device: target ``torch.device``.
        mask: return the masked variant when true.
        small_input: prefer the implementations intended for small inputs;
            this check takes precedence over the device check.

    Returns:
        One of the attention functions defined in this module.
    """
    if small_input:
        #TODO: need to confirm but this is probably slightly faster for small inputs in all cases
        return attention_pytorch if model_management.pytorch_attention_enabled() else attention_basic

    if device == torch.device("cpu"):
        return attention_sub_quad

    return optimized_attention_masked if mask else optimized_attention
|
384 |
+
|
385 |
+
|
386 |
+
class CrossAttention(nn.Module):
    """Multi-head attention with separate query and key/value projections.

    Args:
        query_dim: feature size of the query input and of the output.
        context_dim: feature size of the key/value input; falls back to
            ``query_dim`` when ``None``.
        heads: number of attention heads.
        dim_head: feature size per head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., dtype=None, device=None, operations=ops):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.heads = heads
        self.dim_head = dim_head

        def projection(in_features):
            return operations.Linear(in_features, inner_dim, bias=False, dtype=dtype, device=device)

        self.to_q = projection(query_dim)
        self.to_k = projection(context_dim)
        self.to_v = projection(context_dim)

        self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), nn.Dropout(dropout))

    def forward(self, x, context=None, value=None, mask=None):
        """Attend from ``x`` to ``context``.

        ``context`` falls back to ``x`` (self-attention) when ``None``; values
        are projected from ``value`` when given, otherwise from ``context``.
        """
        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        if value is None:
            v = self.to_v(context)
        else:
            v = self.to_v(value)
            del value

        if mask is None:
            attended = optimized_attention(q, k, v, self.heads)
        else:
            attended = optimized_attention_masked(q, k, v, self.heads, mask)
        return self.to_out(attended)
|
416 |
+
|
417 |
+
|
418 |
+
class BasicTransformerBlock(nn.Module):
    """Transformer block: optional input feed-forward, attn1, optional attn2, feed-forward.

    Patch callables found in ``transformer_options["patches"]`` and
    ``transformer_options["patches_replace"]`` can modify or replace the
    attention inputs, the attention computation and its outputs.
    """

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True, ff_in=False, inner_dim=None,
                 disable_self_attn=False, disable_temporal_crossattention=False, switch_temporal_ca_to_sa=False, dtype=None, device=None, operations=ops):
        super().__init__()
        tensor_kwargs = {"dtype": dtype, "device": device}

        # An explicit inner_dim implies the input feed-forward.
        self.ff_in = ff_in or inner_dim is not None
        if inner_dim is None:
            inner_dim = dim

        self.is_res = inner_dim == dim

        if self.ff_in:
            self.norm_in = operations.LayerNorm(dim, **tensor_kwargs)
            # Replaces the boolean flag with the module itself.
            self.ff_in = FeedForward(dim, dim_out=inner_dim, dropout=dropout, glu=gated_ff, operations=operations, **tensor_kwargs)

        self.disable_self_attn = disable_self_attn
        # is a self-attention if not self.disable_self_attn
        attn1_context_dim = context_dim if self.disable_self_attn else None
        self.attn1 = CrossAttention(query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                                    context_dim=attn1_context_dim, operations=operations, **tensor_kwargs)
        self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, operations=operations, **tensor_kwargs)

        if disable_temporal_crossattention:
            if switch_temporal_ca_to_sa:
                raise ValueError
            self.attn2 = None
        else:
            context_dim_attn2 = None if switch_temporal_ca_to_sa else context_dim
            # is self-attn if context is none
            self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim_attn2,
                                        heads=n_heads, dim_head=d_head, dropout=dropout, operations=operations, **tensor_kwargs)
            self.norm2 = operations.LayerNorm(inner_dim, **tensor_kwargs)

        self.norm1 = operations.LayerNorm(inner_dim, **tensor_kwargs)
        self.norm3 = operations.LayerNorm(inner_dim, **tensor_kwargs)
        self.checkpoint = checkpoint
        self.n_heads = n_heads
        self.d_head = d_head
        self.switch_temporal_ca_to_sa = switch_temporal_ca_to_sa

    def forward(self, x, context=None, transformer_options={}):
        """Run ``_forward`` through the ``checkpoint`` helper."""
        return checkpoint(self._forward, (x, context, transformer_options), self.parameters(), self.checkpoint)

    def _forward(self, x, context=None, transformer_options={}):
        """Apply the block to ``x``.

        ``transformer_options`` is only read here. Its ``"patches"`` and
        ``"patches_replace"`` entries select the hooks; every other entry,
        plus ``n_heads`` and ``dim_head``, is handed to the hooks as
        ``extra_options``.
        """
        block = transformer_options.get("block", None)
        block_index = transformer_options.get("block_index", 0)
        transformer_patches = transformer_options.get("patches", {})
        transformer_patches_replace = transformer_options.get("patches_replace", {})
        extra_options = {
            key: val
            for key, val in transformer_options.items()
            if key not in ("patches", "patches_replace")
        }
        extra_options["n_heads"] = self.n_heads
        extra_options["dim_head"] = self.d_head

        if self.ff_in:
            x_skip = x
            x = self.ff_in(self.norm_in(x))
            if self.is_res:
                x += x_skip

        # ---- attn1 ----
        n = self.norm1(x)
        context_attn1 = context if self.disable_self_attn else None
        value_attn1 = None

        if "attn1_patch" in transformer_patches:
            if context_attn1 is None:
                context_attn1 = n
            value_attn1 = context_attn1
            for p in transformer_patches["attn1_patch"]:
                n, context_attn1, value_attn1 = p(n, context_attn1, value_attn1, extra_options)

        # Replacement patches are looked up by (block[0], block[1], block_index)
        # first, then by the plain block key.
        transformer_block = (block[0], block[1], block_index) if block is not None else None
        attn1_replace_patch = transformer_patches_replace.get("attn1", {})
        block_attn1 = transformer_block if transformer_block in attn1_replace_patch else block

        if block_attn1 in attn1_replace_patch:
            if context_attn1 is None:
                context_attn1 = n
                value_attn1 = n
            n = self.attn1.to_q(n)
            context_attn1 = self.attn1.to_k(context_attn1)
            value_attn1 = self.attn1.to_v(value_attn1)
            n = attn1_replace_patch[block_attn1](n, context_attn1, value_attn1, extra_options)
            n = self.attn1.to_out(n)
        else:
            n = self.attn1(n, context=context_attn1, value=value_attn1)

        if "attn1_output_patch" in transformer_patches:
            for p in transformer_patches["attn1_output_patch"]:
                n = p(n, extra_options)

        x += n
        if "middle_patch" in transformer_patches:
            for p in transformer_patches["middle_patch"]:
                x = p(x, extra_options)

        # ---- attn2 ----
        if self.attn2 is not None:
            n = self.norm2(x)
            context_attn2 = n if self.switch_temporal_ca_to_sa else context
            value_attn2 = None
            if "attn2_patch" in transformer_patches:
                value_attn2 = context_attn2
                for p in transformer_patches["attn2_patch"]:
                    n, context_attn2, value_attn2 = p(n, context_attn2, value_attn2, extra_options)

            attn2_replace_patch = transformer_patches_replace.get("attn2", {})
            block_attn2 = transformer_block if transformer_block in attn2_replace_patch else block

            if block_attn2 in attn2_replace_patch:
                if value_attn2 is None:
                    value_attn2 = context_attn2
                n = self.attn2.to_q(n)
                context_attn2 = self.attn2.to_k(context_attn2)
                value_attn2 = self.attn2.to_v(value_attn2)
                n = attn2_replace_patch[block_attn2](n, context_attn2, value_attn2, extra_options)
                n = self.attn2.to_out(n)
            else:
                n = self.attn2(n, context=context_attn2, value=value_attn2)

        if "attn2_output_patch" in transformer_patches:
            for p in transformer_patches["attn2_output_patch"]:
                n = p(n, extra_options)

        x += n

        # ---- feed-forward, with a residual only when inner_dim == dim ----
        if self.is_res:
            x_skip = x
        x = self.ff(self.norm3(x))
        if self.is_res:
            x += x_skip

        return x
|
575 |
+
|
576 |
+
|
577 |
+
class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    NEW: use_linear for more efficiency instead of the 1x1 convs

    Args:
        in_channels: channel count of the input and output feature map.
        n_heads, d_head: attention heads and per-head size; the transformer
            width is ``n_heads * d_head``.
        depth: number of ``BasicTransformerBlock`` layers.
        context_dim: context feature size, either one value shared by all
            layers or a list with one entry per layer. ``None`` gives every
            layer a ``None`` context size.
        use_linear: project with ``Linear`` layers on the flattened tokens
            instead of 1x1 convolutions on the feature map.
    """
    def __init__(self, in_channels, n_heads, d_head,
                 depth=1, dropout=0., context_dim=None,
                 disable_self_attn=False, use_linear=False,
                 use_checkpoint=True, dtype=None, device=None, operations=ops):
        super().__init__()
        # Always expand to one entry per layer. Leaving None as-is made
        # context_dim[d] below fail for the default argument.
        if not isinstance(context_dim, list):
            context_dim = [context_dim] * depth
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = operations.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True, dtype=dtype, device=device)
        if not use_linear:
            self.proj_in = operations.Conv2d(in_channels,
                                             inner_dim,
                                             kernel_size=1,
                                             stride=1,
                                             padding=0, dtype=dtype, device=device)
        else:
            self.proj_in = operations.Linear(in_channels, inner_dim, dtype=dtype, device=device)

        self.transformer_blocks = nn.ModuleList(
            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
                                   disable_self_attn=disable_self_attn, checkpoint=use_checkpoint, dtype=dtype, device=device, operations=operations)
                for d in range(depth)]
        )
        if not use_linear:
            self.proj_out = operations.Conv2d(inner_dim,in_channels,
                                              kernel_size=1,
                                              stride=1,
                                              padding=0, dtype=dtype, device=device)
        else:
            # Maps inner_dim back to in_channels, mirroring the conv branch.
            # The arguments were previously swapped, which only worked when
            # both sizes were equal.
            self.proj_out = operations.Linear(inner_dim, in_channels, dtype=dtype, device=device)
        self.use_linear = use_linear

    def forward(self, x, context=None, transformer_options=None):
        """Apply the transformer layers to a ``(b, c, h, w)`` feature map.

        Args:
            x: input feature map; the result has the same shape.
            context: a context shared by all layers, or a list with one
                entry per layer.
            transformer_options: options dict handed to every layer. Its
                ``"block_index"`` entry is overwritten in place with the
                index of the layer being run. ``None`` means a fresh dict.

        Returns:
            The transformed feature map plus the input (residual connection).
        """
        # A fresh dict per call: a shared default would carry "block_index"
        # over between unrelated calls.
        if transformer_options is None:
            transformer_options = {}
        # note: if no context is given, cross-attention defaults to self-attention
        if not isinstance(context, list):
            context = [context] * len(self.transformer_blocks)
        b, c, h, w = x.shape
        x_in = x
        x = self.norm(x)
        if not self.use_linear:
            x = self.proj_in(x)
        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
        if self.use_linear:
            x = self.proj_in(x)
        for i, block in enumerate(self.transformer_blocks):
            transformer_options["block_index"] = i
            x = block(x, context=context[i], transformer_options=transformer_options)
        if self.use_linear:
            x = self.proj_out(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
        if not self.use_linear:
            x = self.proj_out(x)
        return x + x_in
|
640 |
+
|
641 |
+
|
642 |
+
class SpatialVideoTransformer(SpatialTransformer):
|
643 |
+
def __init__(
|
644 |
+
self,
|
645 |
+
in_channels,
|
646 |
+
n_heads,
|
647 |
+
d_head,
|
648 |
+
depth=1,
|
649 |
+
dropout=0.0,
|
650 |
+
use_linear=False,
|
651 |
+
context_dim=None,
|
652 |
+
use_spatial_context=False,
|
653 |
+
timesteps=None,
|
654 |
+
merge_strategy: str = "fixed",
|
655 |
+
merge_factor: float = 0.5,
|
656 |
+
time_context_dim=None,
|
657 |
+
ff_in=False,
|
658 |
+
checkpoint=False,
|
659 |
+
time_depth=1,
|
660 |
+
disable_self_attn=False,
|
661 |
+
disable_temporal_crossattention=False,
|
662 |
+
max_time_embed_period: int = 10000,
|
663 |
+
dtype=None, device=None, operations=ops
|
664 |
+
):
|
665 |
+
super().__init__(
|
666 |
+
in_channels,
|
667 |
+
n_heads,
|
668 |
+
d_head,
|
669 |
+
depth=depth,
|
670 |
+
dropout=dropout,
|
671 |
+
use_checkpoint=checkpoint,
|
672 |
+
context_dim=context_dim,
|
673 |
+
use_linear=use_linear,
|
674 |
+
disable_self_attn=disable_self_attn,
|
675 |
+
dtype=dtype, device=device, operations=operations
|
676 |
+
)
|
677 |
+
self.time_depth = time_depth
|
678 |
+
self.depth = depth
|
679 |
+
self.max_time_embed_period = max_time_embed_period
|
680 |
+
|
681 |
+
time_mix_d_head = d_head
|
682 |
+
n_time_mix_heads = n_heads
|
683 |
+
|
684 |
+
time_mix_inner_dim = int(time_mix_d_head * n_time_mix_heads)
|
685 |
+
|
686 |
+
inner_dim = n_heads * d_head
|
687 |
+
if use_spatial_context:
|
688 |
+
time_context_dim = context_dim
|
689 |
+
|
690 |
+
self.time_stack = nn.ModuleList(
|
691 |
+
[
|
692 |
+
BasicTransformerBlock(
|
693 |
+
inner_dim,
|
694 |
+
n_time_mix_heads,
|
695 |
+
time_mix_d_head,
|
696 |
+
dropout=dropout,
|
697 |
+
context_dim=time_context_dim,
|
698 |
+
# timesteps=timesteps,
|
699 |
+
checkpoint=checkpoint,
|
700 |
+
ff_in=ff_in,
|
701 |
+
inner_dim=time_mix_inner_dim,
|
702 |
+
disable_self_attn=disable_self_attn,
|
703 |
+
disable_temporal_crossattention=disable_temporal_crossattention,
|
704 |
+
dtype=dtype, device=device, operations=operations
|
705 |
+
)
|
706 |
+
for _ in range(self.depth)
|
707 |
+
]
|
708 |
+
)
|
709 |
+
|
710 |
+
assert len(self.time_stack) == len(self.transformer_blocks)
|
711 |
+
|
712 |
+
self.use_spatial_context = use_spatial_context
|
713 |
+
self.in_channels = in_channels
|
714 |
+
|
715 |
+
time_embed_dim = self.in_channels * 4
|
716 |
+
self.time_pos_embed = nn.Sequential(
|
717 |
+
operations.Linear(self.in_channels, time_embed_dim, dtype=dtype, device=device),
|
718 |
+
nn.SiLU(),
|
719 |
+
operations.Linear(time_embed_dim, self.in_channels, dtype=dtype, device=device),
|
720 |
+
)
|
721 |
+
|
722 |
+
self.time_mixer = AlphaBlender(
|
723 |
+
alpha=merge_factor, merge_strategy=merge_strategy
|
724 |
+
)
|
725 |
+
|
726 |
+
def forward(
    self,
    x: torch.Tensor,
    context: Optional[torch.Tensor] = None,
    time_context: Optional[torch.Tensor] = None,
    timesteps: Optional[int] = None,
    image_only_indicator: Optional[torch.Tensor] = None,
    transformer_options=None,
) -> torch.Tensor:
    """Run the spatial blocks interleaved with the temporal mixing blocks.

    Args:
        x: 4-D input unpacked as ``(_, _, h, w)``. The ``"(b t) ..."``
            rearrange patterns below treat the leading axis as
            ``batch * timesteps`` frames.
        context: Optional conditioning forwarded to every spatial block. When
            ``self.use_spatial_context`` is set it must be 3-D and is also
            used as the temporal context if ``time_context`` is not given.
        time_context: Optional conditioning for the temporal blocks.
        timesteps: Number of frames per clip. Typed as optional, but the code
            passes it to ``torch.arange`` and uses it as a divisor, so a real
            integer is needed in practice.
        image_only_indicator: Forwarded unchanged to ``self.time_mixer``.
        transformer_options: Dict forwarded to the spatial blocks. Its
            ``"block_index"`` entry is overwritten for each block. When
            omitted, a fresh dict is created per call so nothing leaks
            between calls through a shared default.

    Returns:
        ``x`` after all blocks and the output projection, plus the residual
        input, in the original ``b c h w`` layout.
    """
    # A `{}` default would be a single dict shared by every call and mutated
    # below; build a private one instead.
    if transformer_options is None:
        transformer_options = {}

    _, _, h, w = x.shape
    x_in = x
    spatial_context = None
    if exists(context):
        spatial_context = context

    if self.use_spatial_context:
        assert (
            context.ndim == 3
        ), f"n dims of spatial context should be 3 but are {context.ndim}"

        if time_context is None:
            time_context = context
        # Keep one context entry per clip (every `timesteps`-th row), then
        # replicate it once per spatial location.
        time_context_first_timestep = time_context[::timesteps]
        time_context = repeat(
            time_context_first_timestep, "b ... -> (b n) ...", n=h * w
        )
    elif time_context is not None and not self.use_spatial_context:
        time_context = repeat(time_context, "b ... -> (b n) ...", n=h * w)
        if time_context.ndim == 2:
            time_context = rearrange(time_context, "b c -> b 1 c")

    x = self.norm(x)
    # proj_in is applied before flattening in the non-linear case and after
    # flattening when use_linear is set.
    if not self.use_linear:
        x = self.proj_in(x)
    x = rearrange(x, "b c h w -> b (h w) c")
    if self.use_linear:
        x = self.proj_in(x)

    # Frame index 0..timesteps-1 for every element of the leading axis,
    # turned into an additive embedding.
    num_frames = torch.arange(timesteps, device=x.device)
    num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
    num_frames = rearrange(num_frames, "b t -> (b t)")
    t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False, max_period=self.max_time_embed_period).to(x.dtype)
    emb = self.time_pos_embed(t_emb)
    emb = emb[:, None, :]

    for it_, (block, mix_block) in enumerate(
        zip(self.transformer_blocks, self.time_stack)
    ):
        transformer_options["block_index"] = it_
        x = block(
            x,
            context=spatial_context,
            transformer_options=transformer_options,
        )

        x_mix = x
        x_mix = x_mix + emb

        B, S, C = x_mix.shape
        # Move the frame axis into the sequence position so the mix block
        # operates across frames at each spatial location.
        x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps)
        x_mix = mix_block(x_mix, context=time_context) #TODO: transformer_options
        x_mix = rearrange(
            x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
        )

        x = self.time_mixer(x_spatial=x, x_temporal=x_mix, image_only_indicator=image_only_indicator)

    if self.use_linear:
        x = self.proj_out(x)
    x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
    if not self.use_linear:
        x = self.proj_out(x)
    out = x + x_in
    return out
|
800 |
+
|
801 |
+
|
ComfyUI/comfy/ldm/modules/diffusionmodules/__init__.py
ADDED
File without changes
|
ComfyUI/comfy/ldm/modules/diffusionmodules/model.py
ADDED
@@ -0,0 +1,651 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# pytorch_diffusion + derived encoder decoder
|
2 |
+
import math
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import numpy as np
|
6 |
+
from einops import rearrange
|
7 |
+
from typing import Optional, Any
|
8 |
+
import logging
|
9 |
+
|
10 |
+
from comfy import model_management
|
11 |
+
import comfy.ops
|
12 |
+
ops = comfy.ops.disable_weight_init
|
13 |
+
|
14 |
+
if model_management.xformers_enabled_vae():
|
15 |
+
import xformers
|
16 |
+
import xformers.ops
|
17 |
+
|
18 |
+
def get_timestep_embedding(timesteps, embedding_dim):
    """
    Build sinusoidal embeddings for a 1-D tensor of timesteps.

    The result has one row per timestep and `embedding_dim` columns: the
    sine terms first, then the cosine terms, then a single zero column when
    `embedding_dim` is odd. Frequencies are spaced geometrically using
    `log(10000) / (half_dim - 1)`.
    """
    assert len(timesteps.shape) == 1

    half_dim = embedding_dim // 2
    log_step = math.log(10000) / (half_dim - 1)
    # Frequencies are computed on the default device and then moved.
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -log_step)
    freqs = freqs.to(device=timesteps.device)

    angles = timesteps.float()[:, None] * freqs[None, :]
    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:
        # odd width: append one zero column on the right
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
|
37 |
+
|
38 |
+
|
39 |
+
def nonlinearity(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
|
42 |
+
|
43 |
+
|
44 |
+
def Normalize(in_channels, num_groups=32):
    """Return an affine GroupNorm over `in_channels` channels (eps=1e-6)."""
    norm_kwargs = dict(
        num_groups=num_groups,
        num_channels=in_channels,
        eps=1e-6,
        affine=True,
    )
    return ops.GroupNorm(**norm_kwargs)
|
46 |
+
|
47 |
+
|
48 |
+
class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 conv.

    Args:
        in_channels: Channel count of the input (and of the optional conv).
        with_conv: When true, a stride-1, padding-1 3x3 convolution is applied
            after the upsampling.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = ops.Conv2d(in_channels,
                                   in_channels,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)

    @staticmethod
    def _upsample_nearest_chunked(x, split=8):
        """Upsample `x` 2x in float32, a slice of channels at a time.

        Used when the direct call fails (the original code notes bf16 as the
        motivating case). Channels are processed in about `split` groups to
        bound the size of the temporary float32 copies.
        """
        b, c, h, w = x.shape
        out = torch.empty((b, c, h*2, w*2), dtype=x.dtype, layout=x.layout, device=x.device)
        # At least one channel per step: with fewer than `split` channels the
        # integer division yields 0, which is not a valid range() step.
        step = max(1, c // split)
        for i in range(0, c, step):
            out[:, i:i+step] = torch.nn.functional.interpolate(
                x[:, i:i+step].to(torch.float32), scale_factor=2.0, mode="nearest"
            ).to(x.dtype)
        return out

    def forward(self, x):
        """Return `x` upsampled by a factor of two in both spatial dims."""
        try:
            x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        except Exception:  # operation not implemented for bf16
            # Still a broad best-effort fallback, but no longer traps
            # KeyboardInterrupt / SystemExit the way a bare `except:` does.
            x = self._upsample_nearest_chunked(x)

        if self.with_conv:
            x = self.conv(x)
        return x
|
75 |
+
|
76 |
+
|
77 |
+
class Downsample(nn.Module):
    """Halve the spatial size with a strided 3x3 conv or 2x2 average pooling.

    Args:
        in_channels: Channel count of the input (and of the optional conv).
        with_conv: Selects the strided convolution; otherwise average pooling.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # torch conv has no asymmetric padding, so the conv itself is
            # unpadded and forward() pads by hand
            self.conv = ops.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=2, padding=0
            )

    def forward(self, x):
        """Return the downsampled tensor."""
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)

        # zero-pad one element at the end of each of the last two dims only
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
|
97 |
+
|
98 |
+
|
99 |
+
class ResnetBlock(nn.Module):
    """Residual block: two (GroupNorm, swish, 3x3 conv) stages plus a skip.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels of the output; defaults to `in_channels`.
        conv_shortcut: When the channel counts differ, use a 3x3 conv on the
            skip path instead of a 1x1 conv.
        dropout: Dropout probability applied before the second conv.
        temb_channels: Width of the timestep embedding. When it is not
            positive, no `temb_proj` layer is created, so `forward` must then
            be called with `temb=None`.
    """

    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.swish = torch.nn.SiLU(inplace=True)
        self.norm1 = Normalize(in_channels)
        self.conv1 = ops.Conv2d(in_channels,
                                out_channels,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        if temb_channels > 0:
            self.temb_proj = ops.Linear(temb_channels,
                                        out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout, inplace=True)
        self.conv2 = ops.Conv2d(out_channels,
                                out_channels,
                                kernel_size=3,
                                stride=1,
                                padding=1)
        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = ops.Conv2d(in_channels,
                                                out_channels,
                                                kernel_size=3,
                                                stride=1,
                                                padding=1)
            else:
                self.nin_shortcut = ops.Conv2d(in_channels,
                                               out_channels,
                                               kernel_size=1,
                                               stride=1,
                                               padding=0)

    def forward(self, x, temb):
        """Return `x` (projected if needed) plus the residual branch.

        `temb`, when given, is activated, projected to `out_channels` and
        added to the feature map, broadcast over the two trailing axes.
        """
        h = x
        h = self.norm1(h)
        # In-place swish is fine here: it overwrites the fresh norm output.
        h = self.swish(h)
        h = self.conv1(h)

        if temb is not None:
            # Use the out-of-place activation for temb. `self.swish` is
            # in-place and would overwrite the caller's tensor, which callers
            # hand to several blocks in turn.
            h = h + self.temb_proj(torch.nn.functional.silu(temb))[:,:,None,None]

        h = self.norm2(h)
        h = self.swish(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)

        return x+h
|
160 |
+
|
161 |
+
def slice_attention(q, k, v):
    """Compute attention in slices along the query axis to limit memory use.

    As used below, `q` is sliced along dim 1 and batch-multiplied with `k`,
    the softmax runs over dim 2 of the scores, and `v` is batch-multiplied
    with the transposed weights. The result has the shape of `k` and lives
    on `q`'s device.

    The initial number of slices is estimated from the free memory reported
    by `model_management`. On an out-of-memory error the cache is emptied and
    the slice count doubled; once it exceeds 128 the error is re-raised.
    """
    r1 = torch.zeros_like(k, device=q.device)
    scale = (int(q.shape[-1])**(-0.5))

    mem_free_total = model_management.get_free_memory(q.device)

    # Estimated size of the full score matrix, with a dtype-dependent margin.
    tensor_size = q.shape[0] * q.shape[1] * k.shape[2] * q.element_size()
    modifier = 3 if q.element_size() == 2 else 2.5
    mem_required = tensor_size * modifier
    steps = 1

    if mem_required > mem_free_total:
        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))

    while True:
        try:
            # If the query length is not divisible by `steps`, fall back to a
            # single full-length slice.
            slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
            for i in range(0, q.shape[1], slice_size):
                end = i + slice_size
                s1 = torch.bmm(q[:, i:end], k) * scale

                s2 = torch.nn.functional.softmax(s1, dim=2).permute(0,2,1)
                del s1

                r1[:, :, i:end] = torch.bmm(v, s2)
                del s2
            break
        except model_management.OOM_EXCEPTION:
            model_management.soft_empty_cache(True)
            steps *= 2
            if steps > 128:
                raise
            logging.warning("out of memory error, increasing steps and trying again {}".format(steps))

    return r1
|
197 |
+
|
198 |
+
def normal_attention(q, k, v):
    """Attention over a `b,c,h,w` feature map via `slice_attention`.

    The spatial dims are flattened, the query is transposed to `b,hw,c`, and
    the result is reshaped back to `b,c,h,w`.
    """
    b, c, h, w = q.shape
    tokens = h * w

    q = q.reshape(b, c, tokens).permute(0, 2, 1)   # b,hw,c
    k = k.reshape(b, c, tokens)                    # b,c,hw
    v = v.reshape(b, c, tokens)

    attended = slice_attention(q, k, v)
    return attended.reshape(b, c, h, w)
|
211 |
+
|
212 |
+
def xformers_attention(q, k, v):
    """Attention over a `B,C,H,W` feature map using xformers.

    Falls back to `slice_attention` if the xformers path raises
    NotImplementedError.
    """
    B, C, H, W = q.shape

    def to_tokens(t):
        # B,C,H,W -> B,(H*W),C as a contiguous tensor
        return t.view(B, C, -1).transpose(1, 2).contiguous()

    q, k, v = to_tokens(q), to_tokens(k), to_tokens(v)

    try:
        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None)
        out = out.transpose(1, 2).reshape(B, C, H, W)
    except NotImplementedError:
        q_tok = q.view(B, -1, C)
        k_t = k.view(B, -1, C).transpose(1, 2)
        v_t = v.view(B, -1, C).transpose(1, 2)
        out = slice_attention(q_tok, k_t, v_t).reshape(B, C, H, W)
    return out
|
226 |
+
|
227 |
+
def pytorch_attention(q, k, v):
    """Attention over a `B,C,H,W` feature map using torch's fused kernel.

    The inputs are viewed as a single head of `H*W` tokens with `C` features.
    On an out-of-memory error the computation is retried with
    `slice_attention`.
    """
    B, C, H, W = q.shape

    def to_single_head(t):
        # B,C,H,W -> B,1,(H*W),C as a contiguous tensor
        return t.view(B, 1, C, -1).transpose(2, 3).contiguous()

    q, k, v = to_single_head(q), to_single_head(k), to_single_head(v)

    try:
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
        out = out.transpose(2, 3).reshape(B, C, H, W)
    except model_management.OOM_EXCEPTION:
        logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
        q_tok = q.view(B, -1, C)
        k_t = k.view(B, -1, C).transpose(1, 2)
        v_t = v.view(B, -1, C).transpose(1, 2)
        out = slice_attention(q_tok, k_t, v_t).reshape(B, C, H, W)
    return out
|
242 |
+
|
243 |
+
|
244 |
+
class AttnBlock(nn.Module):
    """Residual self-attention block over a feature map.

    The input is group-normalised, projected to q/k/v with 1x1 convolutions,
    passed through an attention backend chosen at construction time, projected
    again with a 1x1 convolution and added to the input.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        self.norm = Normalize(in_channels)
        # Four identical 1x1 convolutions, registered in the order
        # q, k, v, proj_out.
        for name in ("q", "k", "v", "proj_out"):
            setattr(self, name, ops.Conv2d(in_channels,
                                           in_channels,
                                           kernel_size=1,
                                           stride=1,
                                           padding=0))

        # Pick the attention implementation once, based on model_management.
        if model_management.xformers_enabled_vae():
            message, attention_fn = "Using xformers attention in VAE", xformers_attention
        elif model_management.pytorch_attention_enabled():
            message, attention_fn = "Using pytorch attention in VAE", pytorch_attention
        else:
            message, attention_fn = "Using split attention in VAE", normal_attention
        logging.info(message)
        self.optimized_attention = attention_fn

    def forward(self, x):
        """Return `x` plus the projected attention output."""
        normed = self.norm(x)
        q = self.q(normed)
        k = self.k(normed)
        v = self.v(normed)

        attended = self.optimized_attention(q, k, v)
        attended = self.proj_out(attended)

        return x+attended
|
293 |
+
|
294 |
+
|
295 |
+
def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
    """Build the attention block for `in_channels` channels.

    `attn_type` and `attn_kwargs` are accepted but not used: an `AttnBlock`
    is always returned.
    """
    block = AttnBlock(in_channels)
    return block
|
297 |
+
|
298 |
+
|
299 |
+
class Model(nn.Module):
    """U-Net built from ResnetBlock / attention stages with skip connections.

    The constructor builds a downsampling path, a middle section
    (resnet, attention, resnet) and an upsampling path whose blocks consume
    the concatenated skip activations. When `use_timestep` is set, a two-layer
    MLP turns the sinusoidal timestep embedding into the `temb` passed to
    every ResnetBlock.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = self.ch*4
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        self.use_timestep = use_timestep
        if self.use_timestep:
            # timestep embedding MLP
            self.temb = nn.Module()
            self.temb.dense = nn.ModuleList([
                ops.Linear(self.ch, self.temb_ch),
                ops.Linear(self.temb_ch, self.temb_ch),
            ])

        def res_block(c_in, c_out):
            # every ResnetBlock in this model shares temb width and dropout
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=self.temb_ch,
                               dropout=dropout)

        # downsampling
        self.conv_in = ops.Conv2d(in_channels,
                                  self.ch,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(res_block(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions-1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = res_block(block_in, block_in)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = res_block(block_in, block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            skip_in = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks+1):
                if i_block == self.num_res_blocks:
                    # the last block of a level takes the skip from the
                    # level's input width
                    skip_in = ch*in_ch_mult[i_level]
                block.append(res_block(block_in+skip_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = ops.Conv2d(block_in,
                                   out_ch,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)

    def forward(self, x, t=None, context=None):
        """Run the U-Net on `x`.

        `context`, if given, is concatenated to `x` along dim 1. `t` is
        required when the model was built with `use_timestep=True`.
        """
        #assert x.shape[2] == x.shape[3] == self.resolution
        if context is not None:
            # assume aligned context, cat along channel axis
            x = torch.cat((x, context), dim=1)

        temb = None
        if self.use_timestep:
            # timestep embedding
            assert t is not None
            temb = get_timestep_embedding(t, self.ch)
            temb = self.temb.dense[0](temb)
            temb = nonlinearity(temb)
            temb = self.temb.dense[1](temb)

        # downsampling: keep every intermediate activation for the skips
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions-1:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling: pop skips in reverse order of creation
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                h = stage.block[i_block](
                    torch.cat([h, hs.pop()], dim=1), temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h

    def get_last_layer(self):
        """Return the weight of the final output convolution."""
        return self.conv_out.weight
|
449 |
+
|
450 |
+
|
451 |
+
class Encoder(nn.Module):
    """Convolutional encoder: downsampling stages, a middle section and a head.

    No timestep embedding is used (`temb_ch` is 0 and `temb` is always None).
    The output convolution produces `2*z_channels` channels when `double_z`
    is set, otherwise `z_channels`. Extra keyword arguments are accepted and
    ignored.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
                 **ignore_kwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        def res_block(c_in, c_out):
            # every ResnetBlock in the encoder shares temb width and dropout
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=self.temb_ch,
                               dropout=dropout)

        # downsampling
        self.conv_in = ops.Conv2d(in_channels,
                                  self.ch,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks):
                block.append(res_block(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(make_attn(block_in, attn_type=attn_type))
            down = nn.Module()
            down.block = block
            down.attn = attn
            if i_level != self.num_resolutions-1:
                down.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(down)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = res_block(block_in, block_in)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = res_block(block_in, block_in)

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = ops.Conv2d(block_in,
                                   2*z_channels if double_z else z_channels,
                                   kernel_size=3,
                                   stride=1,
                                   padding=1)

    def forward(self, x):
        """Encode `x` and return the output of the final convolution."""
        # no timestep embedding in the encoder
        temb = None

        # downsampling
        h = self.conv_in(x)
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](h, temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != self.num_resolutions-1:
                h = stage.downsample(h)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
|
540 |
+
|
541 |
+
|
542 |
+
class Decoder(nn.Module):
    """Convolutional decoder: input conv, middle section, upsampling stages.

    The resnet, attention and output-conv building blocks can be swapped via
    `resnet_op`, `attn_op` and `conv_out_op`. Keyword arguments given to
    `forward` are passed on to those blocks. No timestep embedding is used.
    Extra constructor keyword arguments are accepted and ignored.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
                 conv_out_op=ops.Conv2d,
                 resnet_op=ResnetBlock,
                 attn_op=AttnBlock,
                 **ignorekwargs):
        super().__init__()
        if use_linear_attn:
            # kept from the original; this local is not read anywhere below
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

        def res_block(c_in, c_out):
            # every resnet block in the decoder shares temb width and dropout
            return resnet_op(in_channels=c_in,
                             out_channels=c_out,
                             temb_channels=self.temb_ch,
                             dropout=dropout)

        # compute in_ch_mult, block_in and curr_res at lowest res
        in_ch_mult = (1,)+tuple(ch_mult)
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)
        self.z_shape = (1,z_channels,curr_res,curr_res)
        logging.debug("Working with z of shape {} = {} dimensions.".format(
            self.z_shape, np.prod(self.z_shape)))

        # z to block_in
        self.conv_in = ops.Conv2d(z_channels,
                                  block_in,
                                  kernel_size=3,
                                  stride=1,
                                  padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = res_block(block_in, block_in)
        self.mid.attn_1 = attn_op(block_in)
        self.mid.block_2 = res_block(block_in, block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks+1):
                block.append(res_block(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attn.append(attn_op(block_in))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = conv_out_op(block_in,
                                    out_ch,
                                    kernel_size=3,
                                    stride=1,
                                    padding=1)

    def forward(self, z, **kwargs):
        """Decode `z`.

        Returns the pre-head feature map when `give_pre_end` is set; otherwise
        the output of the final convolution, passed through tanh when
        `tanh_out` is set. The shape of `z` is recorded in `last_z_shape`.
        """
        #assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # no timestep embedding in the decoder
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb, **kwargs)
        h = self.mid.attn_1(h, **kwargs)
        h = self.mid.block_2(h, temb, **kwargs)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                h = stage.block[i_block](h, temb, **kwargs)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h, **kwargs)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h, **kwargs)
        if self.tanh_out:
            h = torch.tanh(h)
        return h
|
ComfyUI/comfy/ldm/modules/diffusionmodules/openaimodel.py
ADDED
@@ -0,0 +1,890 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import abstractmethod
|
2 |
+
|
3 |
+
import torch as th
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from einops import rearrange
|
7 |
+
import logging
|
8 |
+
|
9 |
+
from .util import (
|
10 |
+
checkpoint,
|
11 |
+
avg_pool_nd,
|
12 |
+
zero_module,
|
13 |
+
timestep_embedding,
|
14 |
+
AlphaBlender,
|
15 |
+
)
|
16 |
+
from ..attention import SpatialTransformer, SpatialVideoTransformer, default
|
17 |
+
from comfy.ldm.util import exists
|
18 |
+
import comfy.ops
|
19 |
+
ops = comfy.ops.disable_weight_init
|
20 |
+
|
21 |
+
class TimestepBlock(nn.Module):
    """
    Marker base class for modules whose forward() expects the timestep
    embedding as its second positional argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Transform `x` using the timestep embeddings `emb`.
        Subclasses provide the actual computation.
        """
|
31 |
+
|
32 |
+
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
|
33 |
+
def forward_timestep_embed(ts, x, emb, context=None, transformer_options=None, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
    """
    Run `x` through every layer of `ts`, giving each layer the extra inputs
    its type expects.

    Dispatch is by layer type, tested in this order:
      * VideoResBlock           -> layer(x, emb, num_video_frames, image_only_indicator)
      * TimestepBlock           -> layer(x, emb)
      * SpatialVideoTransformer -> layer(x, context, time_context, num_video_frames,
                                         image_only_indicator, transformer_options)
      * SpatialTransformer      -> layer(x, context, transformer_options)
      * Upsample                -> layer(x, output_shape=output_shape)
      * anything else           -> layer(x)

    :param ts: an iterable of layers (e.g. a TimestepEmbedSequential).
    :param x: the input passed to the first layer.
    :param emb: timestep embeddings forwarded to TimestepBlock layers.
    :param context: conditioning forwarded to the transformer layers.
    :param transformer_options: dict shared with the transformer layers. If it
        contains "transformer_index", that counter is incremented in place
        after each transformer layer. None means "use a fresh empty dict".
    :param output_shape: forwarded to Upsample layers.
    :param time_context: forwarded to SpatialVideoTransformer layers.
    :param num_video_frames: forwarded to the video-aware layers.
    :param image_only_indicator: forwarded to the video-aware layers.
    :return: the output of the last layer, or `x` unchanged if `ts` is empty.
    """
    # A `{}` default would be a single dict shared by every call that omits
    # the argument, and it is handed to the transformer layers; create a new
    # one per call instead.
    if transformer_options is None:
        transformer_options = {}

    for layer in ts:
        # VideoResBlock derives from TimestepBlock (via ResBlock), so it has
        # to be tested before the generic TimestepBlock branch.
        if isinstance(layer, VideoResBlock):
            x = layer(x, emb, num_video_frames, image_only_indicator)
        elif isinstance(layer, TimestepBlock):
            x = layer(x, emb)
        # NOTE(review): the video transformer is tested before the plain one,
        # presumably because it subclasses SpatialTransformer - confirm.
        elif isinstance(layer, SpatialVideoTransformer):
            x = layer(x, context, time_context, num_video_frames, image_only_indicator, transformer_options)
            if "transformer_index" in transformer_options:
                transformer_options["transformer_index"] += 1
        elif isinstance(layer, SpatialTransformer):
            x = layer(x, context, transformer_options)
            if "transformer_index" in transformer_options:
                transformer_options["transformer_index"] += 1
        elif isinstance(layer, Upsample):
            x = layer(x, output_shape=output_shape)
        else:
            x = layer(x)
    return x
|
52 |
+
|
53 |
+
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    An nn.Sequential container whose forward pass hands the timestep
    embeddings (and the other optional inputs) to the child layers that
    accept them.
    """

    def forward(self, *args, **kwargs):
        """
        Delegate to forward_timestep_embed with this container as the layer list.
        """
        result = forward_timestep_embed(self, *args, **kwargs)
        return result
|
61 |
+
|
62 |
+
class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    :param out_channels: channels produced by the convolution; defaults to
                 `channels` when None (or 0).
    :param padding: padding passed to the convolution.
    :param dtype: dtype passed to the convolution.
    :param device: device passed to the convolution.
    :param operations: namespace providing `conv_nd`.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1, dtype=None, device=None, operations=ops):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = operations.conv_nd(dims, self.channels, self.out_channels, 3, padding=padding, dtype=dtype, device=device)

    def forward(self, x, output_shape=None):
        """
        Nearest-neighbour upsample `x`, then apply the convolution if enabled.

        :param x: an [N x C x ...] Tensor whose channel axis equals `channels`.
        :param output_shape: optional full target shape (indexed like
            `x.shape`); when given, its trailing entries replace the default
            "double the size" target for the axes being resized.
        :return: the upsampled (and optionally convolved) Tensor.
        """
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # Axis 2 keeps its size; only the two innermost axes are doubled.
            shape = [x.shape[2], x.shape[3] * 2, x.shape[4] * 2]
            if output_shape is not None:
                shape[1] = output_shape[3]
                shape[2] = output_shape[4]
        else:
            # Double every axis after (N, C). Deriving the list from the
            # tensor's own trailing axes lets 1D inputs work as well as 2D;
            # unconditionally indexing x.shape[3] raised IndexError for 1D.
            shape = [size * 2 for size in x.shape[2:]]
            if output_shape is not None:
                shape = [output_shape[axis + 2] for axis in range(len(shape))]

        x = F.interpolate(x, size=shape, mode="nearest")
        if self.use_conv:
            x = self.conv(x)
        return x
|
97 |
+
|
98 |
+
class Downsample(nn.Module):
    """
    Halves the resolution of its input, either with a strided convolution
    or with average pooling.
    :param channels: channels in the inputs and outputs.
    :param use_conv: when true a stride-2 convolution is used, otherwise
                 average pooling.
    :param dims: 1, 2 or 3 for 1D, 2D or 3D signals. For 3D the stride is
                 (1, 2, 2), so only the inner-two dimensions shrink.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1, dtype=None, device=None, operations=ops):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = (1, 2, 2) if dims == 3 else 2
        if not use_conv:
            # Pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
        else:
            self.op = operations.conv_nd(
                dims,
                self.channels,
                self.out_channels,
                3,
                stride=stride,
                padding=padding,
                dtype=dtype,
                device=device,
            )

    def forward(self, x):
        in_channels = x.shape[1]
        assert in_channels == self.channels
        downsampled = self.op(x)
        return downsampled
|
125 |
+
|
126 |
+
|
127 |
+
class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param use_scale_shift_norm: if True, the embedding projection outputs
        2 * out_channels values that are split into a scale and a shift and
        applied after the output normalization, instead of being added to
        the features.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    :param kernel_size: convolution kernel size, an int or a list of ints;
        the padding is kernel_size // 2 (per entry for a list).
    :param exchange_temb_dims: if True, axes 1 and 2 of the projected
        embedding are swapped before it is added to the features. Forced to
        False when skip_t_emb is set.
    :param skip_t_emb: if True, no embedding projection is created and `emb`
        is ignored in the forward pass.
    :param dtype: dtype passed to the created layers.
    :param device: device passed to the created layers.
    :param operations: namespace providing GroupNorm, Linear and conv_nd.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
        kernel_size=3,
        exchange_temb_dims=False,
        skip_t_emb=False,
        dtype=None,
        device=None,
        operations=ops
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm
        self.exchange_temb_dims = exchange_temb_dims

        # Half the kernel size as padding, per axis when a list is given.
        if isinstance(kernel_size, list):
            padding = [k // 2 for k in kernel_size]
        else:
            padding = kernel_size // 2

        # norm -> activation -> conv; the conv is kept last so _forward can
        # split it off and resample between the activation and the conv.
        self.in_layers = nn.Sequential(
            operations.GroupNorm(32, channels, dtype=dtype, device=device),
            nn.SiLU(),
            operations.conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding, dtype=dtype, device=device),
        )

        self.updown = up or down

        # Separate resamplers for the residual branch (h) and the skip
        # branch (x); both are conv-free (use_conv=False).
        if up:
            self.h_upd = Upsample(channels, False, dims, dtype=dtype, device=device)
            self.x_upd = Upsample(channels, False, dims, dtype=dtype, device=device)
        elif down:
            self.h_upd = Downsample(channels, False, dims, dtype=dtype, device=device)
            self.x_upd = Downsample(channels, False, dims, dtype=dtype, device=device)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.skip_t_emb = skip_t_emb
        if self.skip_t_emb:
            # No embedding is used, so there is nothing to exchange either.
            self.emb_layers = None
            self.exchange_temb_dims = False
        else:
            # Scale-shift conditioning needs two values per output channel.
            self.emb_layers = nn.Sequential(
                nn.SiLU(),
                operations.Linear(
                    emb_channels,
                    2 * self.out_channels if use_scale_shift_norm else self.out_channels, dtype=dtype, device=device
                ),
            )
        self.out_layers = nn.Sequential(
            operations.GroupNorm(32, self.out_channels, dtype=dtype, device=device),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            operations.conv_nd(dims, self.out_channels, self.out_channels, kernel_size, padding=padding, dtype=dtype, device=device)
            ,
        )

        # The skip path only needs a projection when the channel count changes.
        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = operations.conv_nd(
                dims, channels, self.out_channels, kernel_size, padding=padding, dtype=dtype, device=device
            )
        else:
            self.skip_connection = operations.conv_nd(dims, channels, self.out_channels, 1, dtype=dtype, device=device)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        # Delegates to _forward through the checkpoint helper, which receives
        # the use_checkpoint flag.
        return checkpoint(
            self._forward, (x, emb), self.parameters(), self.use_checkpoint
        )


    def _forward(self, x, emb):
        # The actual computation behind forward().
        if self.updown:
            # Resample between the norm/activation and the conv, and resample
            # the skip input x the same way.
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)

        emb_out = None
        if not self.skip_t_emb:
            emb_out = self.emb_layers(emb).type(h.dtype)
            # Append trailing singleton axes until the rank matches h.
            while len(emb_out.shape) < len(h.shape):
                emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            h = out_norm(h)
            if emb_out is not None:
                # Split along dim 1 into scale and shift: h * (1 + scale) + shift.
                scale, shift = th.chunk(emb_out, 2, dim=1)
                h *= (1 + scale)
                h += shift
            h = out_rest(h)
        else:
            if emb_out is not None:
                if self.exchange_temb_dims:
                    # Swap axes 1 and 2 of the embedding before the addition.
                    emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
                h = h + emb_out
            h = self.out_layers(h)
        return self.skip_connection(x) + h
|
265 |
+
|
266 |
+
|
267 |
+
class VideoResBlock(ResBlock):
    """
    A ResBlock followed by a second, 3D ResBlock (`time_stack`) that runs on
    the frames regrouped along their own axis; the two results are combined
    by an AlphaBlender (`time_mixer`).
    """

    def __init__(
        self,
        channels: int,
        emb_channels: int,
        dropout: float,
        video_kernel_size=3,
        merge_strategy: str = "fixed",
        merge_factor: float = 0.5,
        out_channels=None,
        use_conv: bool = False,
        use_scale_shift_norm: bool = False,
        dims: int = 2,
        use_checkpoint: bool = False,
        up: bool = False,
        down: bool = False,
        dtype=None,
        device=None,
        operations=ops
    ):
        # Options for the per-frame (spatial) residual block.
        spatial_kwargs = dict(
            out_channels=out_channels,
            use_conv=use_conv,
            use_scale_shift_norm=use_scale_shift_norm,
            dims=dims,
            use_checkpoint=use_checkpoint,
            up=up,
            down=down,
            dtype=dtype,
            device=device,
            operations=operations,
        )
        super().__init__(channels, emb_channels, dropout, **spatial_kwargs)

        # The temporal block keeps the channel count of the spatial output
        # and never resamples.
        temporal_kwargs = dict(
            dropout=dropout,
            dims=3,
            out_channels=default(out_channels, channels),
            use_scale_shift_norm=False,
            use_conv=False,
            up=False,
            down=False,
            kernel_size=video_kernel_size,
            use_checkpoint=use_checkpoint,
            exchange_temb_dims=True,
            dtype=dtype,
            device=device,
            operations=operations,
        )
        self.time_stack = ResBlock(default(out_channels, channels), emb_channels, **temporal_kwargs)

        self.time_mixer = AlphaBlender(
            alpha=merge_factor,
            merge_strategy=merge_strategy,
            rearrange_pattern="b t -> b 1 t 1 1",
        )

    def forward(
        self,
        x: th.Tensor,
        emb: th.Tensor,
        num_video_frames: int,
        image_only_indicator = None,
    ) -> th.Tensor:
        frames = num_video_frames

        # Per-frame pass through the inherited ResBlock.
        spatial_out = super().forward(x, emb)

        # Regroup the flattened (batch * frames) axis into batch and time.
        x_mix = rearrange(spatial_out, "(b t) c h w -> b c t h w", t=frames)
        temporal_in = rearrange(spatial_out, "(b t) c h w -> b c t h w", t=frames)
        emb_per_frame = rearrange(emb, "(b t) ... -> b t ...", t=frames)

        temporal_out = self.time_stack(temporal_in, emb_per_frame)
        blended = self.time_mixer(
            x_spatial=x_mix,
            x_temporal=temporal_out,
            image_only_indicator=image_only_indicator,
        )

        # Flatten back to the (batch * frames) layout.
        return rearrange(blended, "b c t h w -> (b t) c h w")
|
346 |
+
|
347 |
+
|
348 |
+
class Timestep(nn.Module):
    """
    Module wrapper around timestep_embedding with a fixed embedding size.
    :param dim: the size passed to timestep_embedding on every call.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        embedding = timestep_embedding(t, self.dim)
        return embedding
|
355 |
+
|
356 |
+
def apply_control(h, control, name):
    """
    Add the next pending control residual for `name` to `h`.

    The last entry of `control[name]` is popped (so the list in `control`
    shrinks by one on every call) and, unless it is None, added to `h` in
    place. If the addition fails, a warning is logged and `h` is returned
    as it was.

    :param h: the value to add the control to.
    :param control: a dict mapping names to lists of residuals, or None.
    :param name: the key to look up in `control`.
    :return: `h`, with the control added when one was available.
    """
    if control is not None and name in control and len(control[name]) > 0:
        ctrl = control[name].pop()
        if ctrl is not None:
            try:
                h += ctrl
            except Exception:
                # Best effort: a control that cannot be added is skipped.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit still propagate.
                logging.warning("warning control could not be applied {} {}".format(h.shape, ctrl.shape))
    return h
|
365 |
+
|
366 |
+
class UNetModel(nn.Module):
|
367 |
+
"""
|
368 |
+
The full UNet model with attention and timestep embedding.
|
369 |
+
:param in_channels: channels in the input Tensor.
|
370 |
+
:param model_channels: base channel count for the model.
|
371 |
+
:param out_channels: channels in the output Tensor.
|
372 |
+
:param num_res_blocks: number of residual blocks per downsample.
|
373 |
+
:param dropout: the dropout probability.
|
374 |
+
:param channel_mult: channel multiplier for each level of the UNet.
|
375 |
+
:param conv_resample: if True, use learned convolutions for upsampling and
|
376 |
+
downsampling.
|
377 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
378 |
+
:param num_classes: if specified (as an int), then this model will be
|
379 |
+
class-conditional with `num_classes` classes.
|
380 |
+
:param use_checkpoint: use gradient checkpointing to reduce memory usage.
|
381 |
+
:param num_heads: the number of attention heads in each attention layer.
|
382 |
+
:param num_heads_channels: if specified, ignore num_heads and instead use
|
383 |
+
a fixed channel width per attention head.
|
384 |
+
:param num_heads_upsample: works with num_heads to set a different number
|
385 |
+
of heads for upsampling. Deprecated.
|
386 |
+
:param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
|
387 |
+
:param resblock_updown: use residual blocks for up/downsampling.
|
388 |
+
:param use_new_attention_order: use a different attention pattern for potentially
|
389 |
+
increased efficiency.
|
390 |
+
"""
|
391 |
+
|
392 |
+
def __init__(
|
393 |
+
self,
|
394 |
+
image_size,
|
395 |
+
in_channels,
|
396 |
+
model_channels,
|
397 |
+
out_channels,
|
398 |
+
num_res_blocks,
|
399 |
+
dropout=0,
|
400 |
+
channel_mult=(1, 2, 4, 8),
|
401 |
+
conv_resample=True,
|
402 |
+
dims=2,
|
403 |
+
num_classes=None,
|
404 |
+
use_checkpoint=False,
|
405 |
+
dtype=th.float32,
|
406 |
+
num_heads=-1,
|
407 |
+
num_head_channels=-1,
|
408 |
+
num_heads_upsample=-1,
|
409 |
+
use_scale_shift_norm=False,
|
410 |
+
resblock_updown=False,
|
411 |
+
use_new_attention_order=False,
|
412 |
+
use_spatial_transformer=False, # custom transformer support
|
413 |
+
transformer_depth=1, # custom transformer support
|
414 |
+
context_dim=None, # custom transformer support
|
415 |
+
n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model
|
416 |
+
legacy=True,
|
417 |
+
disable_self_attentions=None,
|
418 |
+
num_attention_blocks=None,
|
419 |
+
disable_middle_self_attn=False,
|
420 |
+
use_linear_in_transformer=False,
|
421 |
+
adm_in_channels=None,
|
422 |
+
transformer_depth_middle=None,
|
423 |
+
transformer_depth_output=None,
|
424 |
+
use_temporal_resblock=False,
|
425 |
+
use_temporal_attention=False,
|
426 |
+
time_context_dim=None,
|
427 |
+
extra_ff_mix_layer=False,
|
428 |
+
use_spatial_context=False,
|
429 |
+
merge_strategy=None,
|
430 |
+
merge_factor=0.0,
|
431 |
+
video_kernel_size=None,
|
432 |
+
disable_temporal_crossattention=False,
|
433 |
+
max_ddpm_temb_period=10000,
|
434 |
+
device=None,
|
435 |
+
operations=ops,
|
436 |
+
):
|
437 |
+
super().__init__()
|
438 |
+
|
439 |
+
if context_dim is not None:
|
440 |
+
assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
441 |
+
# from omegaconf.listconfig import ListConfig
|
442 |
+
# if type(context_dim) == ListConfig:
|
443 |
+
# context_dim = list(context_dim)
|
444 |
+
|
445 |
+
if num_heads_upsample == -1:
|
446 |
+
num_heads_upsample = num_heads
|
447 |
+
|
448 |
+
if num_heads == -1:
|
449 |
+
assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
|
450 |
+
|
451 |
+
if num_head_channels == -1:
|
452 |
+
assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
|
453 |
+
|
454 |
+
self.in_channels = in_channels
|
455 |
+
self.model_channels = model_channels
|
456 |
+
self.out_channels = out_channels
|
457 |
+
|
458 |
+
if isinstance(num_res_blocks, int):
|
459 |
+
self.num_res_blocks = len(channel_mult) * [num_res_blocks]
|
460 |
+
else:
|
461 |
+
if len(num_res_blocks) != len(channel_mult):
|
462 |
+
raise ValueError("provide num_res_blocks either as an int (globally constant) or "
|
463 |
+
"as a list/tuple (per-level) with the same length as channel_mult")
|
464 |
+
self.num_res_blocks = num_res_blocks
|
465 |
+
|
466 |
+
if disable_self_attentions is not None:
|
467 |
+
# should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
|
468 |
+
assert len(disable_self_attentions) == len(channel_mult)
|
469 |
+
if num_attention_blocks is not None:
|
470 |
+
assert len(num_attention_blocks) == len(self.num_res_blocks)
|
471 |
+
|
472 |
+
transformer_depth = transformer_depth[:]
|
473 |
+
transformer_depth_output = transformer_depth_output[:]
|
474 |
+
|
475 |
+
self.dropout = dropout
|
476 |
+
self.channel_mult = channel_mult
|
477 |
+
self.conv_resample = conv_resample
|
478 |
+
self.num_classes = num_classes
|
479 |
+
self.use_checkpoint = use_checkpoint
|
480 |
+
self.dtype = dtype
|
481 |
+
self.num_heads = num_heads
|
482 |
+
self.num_head_channels = num_head_channels
|
483 |
+
self.num_heads_upsample = num_heads_upsample
|
484 |
+
self.use_temporal_resblocks = use_temporal_resblock
|
485 |
+
self.predict_codebook_ids = n_embed is not None
|
486 |
+
|
487 |
+
self.default_num_video_frames = None
|
488 |
+
|
489 |
+
time_embed_dim = model_channels * 4
|
490 |
+
self.time_embed = nn.Sequential(
|
491 |
+
operations.Linear(model_channels, time_embed_dim, dtype=self.dtype, device=device),
|
492 |
+
nn.SiLU(),
|
493 |
+
operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
|
494 |
+
)
|
495 |
+
|
496 |
+
if self.num_classes is not None:
|
497 |
+
if isinstance(self.num_classes, int):
|
498 |
+
self.label_emb = nn.Embedding(num_classes, time_embed_dim, dtype=self.dtype, device=device)
|
499 |
+
elif self.num_classes == "continuous":
|
500 |
+
logging.debug("setting up linear c_adm embedding layer")
|
501 |
+
self.label_emb = nn.Linear(1, time_embed_dim)
|
502 |
+
elif self.num_classes == "sequential":
|
503 |
+
assert adm_in_channels is not None
|
504 |
+
self.label_emb = nn.Sequential(
|
505 |
+
nn.Sequential(
|
506 |
+
operations.Linear(adm_in_channels, time_embed_dim, dtype=self.dtype, device=device),
|
507 |
+
nn.SiLU(),
|
508 |
+
operations.Linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
|
509 |
+
)
|
510 |
+
)
|
511 |
+
else:
|
512 |
+
raise ValueError()
|
513 |
+
|
514 |
+
self.input_blocks = nn.ModuleList(
|
515 |
+
[
|
516 |
+
TimestepEmbedSequential(
|
517 |
+
operations.conv_nd(dims, in_channels, model_channels, 3, padding=1, dtype=self.dtype, device=device)
|
518 |
+
)
|
519 |
+
]
|
520 |
+
)
|
521 |
+
self._feature_size = model_channels
|
522 |
+
input_block_chans = [model_channels]
|
523 |
+
ch = model_channels
|
524 |
+
ds = 1
|
525 |
+
|
526 |
+
def get_attention_layer(
|
527 |
+
ch,
|
528 |
+
num_heads,
|
529 |
+
dim_head,
|
530 |
+
depth=1,
|
531 |
+
context_dim=None,
|
532 |
+
use_checkpoint=False,
|
533 |
+
disable_self_attn=False,
|
534 |
+
):
|
535 |
+
if use_temporal_attention:
|
536 |
+
return SpatialVideoTransformer(
|
537 |
+
ch,
|
538 |
+
num_heads,
|
539 |
+
dim_head,
|
540 |
+
depth=depth,
|
541 |
+
context_dim=context_dim,
|
542 |
+
time_context_dim=time_context_dim,
|
543 |
+
dropout=dropout,
|
544 |
+
ff_in=extra_ff_mix_layer,
|
545 |
+
use_spatial_context=use_spatial_context,
|
546 |
+
merge_strategy=merge_strategy,
|
547 |
+
merge_factor=merge_factor,
|
548 |
+
checkpoint=use_checkpoint,
|
549 |
+
use_linear=use_linear_in_transformer,
|
550 |
+
disable_self_attn=disable_self_attn,
|
551 |
+
disable_temporal_crossattention=disable_temporal_crossattention,
|
552 |
+
max_time_embed_period=max_ddpm_temb_period,
|
553 |
+
dtype=self.dtype, device=device, operations=operations
|
554 |
+
)
|
555 |
+
else:
|
556 |
+
return SpatialTransformer(
|
557 |
+
ch, num_heads, dim_head, depth=depth, context_dim=context_dim,
|
558 |
+
disable_self_attn=disable_self_attn, use_linear=use_linear_in_transformer,
|
559 |
+
use_checkpoint=use_checkpoint, dtype=self.dtype, device=device, operations=operations
|
560 |
+
)
|
561 |
+
|
562 |
+
def get_resblock(
|
563 |
+
merge_factor,
|
564 |
+
merge_strategy,
|
565 |
+
video_kernel_size,
|
566 |
+
ch,
|
567 |
+
time_embed_dim,
|
568 |
+
dropout,
|
569 |
+
out_channels,
|
570 |
+
dims,
|
571 |
+
use_checkpoint,
|
572 |
+
use_scale_shift_norm,
|
573 |
+
down=False,
|
574 |
+
up=False,
|
575 |
+
dtype=None,
|
576 |
+
device=None,
|
577 |
+
operations=ops
|
578 |
+
):
|
579 |
+
if self.use_temporal_resblocks:
|
580 |
+
return VideoResBlock(
|
581 |
+
merge_factor=merge_factor,
|
582 |
+
merge_strategy=merge_strategy,
|
583 |
+
video_kernel_size=video_kernel_size,
|
584 |
+
channels=ch,
|
585 |
+
emb_channels=time_embed_dim,
|
586 |
+
dropout=dropout,
|
587 |
+
out_channels=out_channels,
|
588 |
+
dims=dims,
|
589 |
+
use_checkpoint=use_checkpoint,
|
590 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
591 |
+
down=down,
|
592 |
+
up=up,
|
593 |
+
dtype=dtype,
|
594 |
+
device=device,
|
595 |
+
operations=operations
|
596 |
+
)
|
597 |
+
else:
|
598 |
+
return ResBlock(
|
599 |
+
channels=ch,
|
600 |
+
emb_channels=time_embed_dim,
|
601 |
+
dropout=dropout,
|
602 |
+
out_channels=out_channels,
|
603 |
+
use_checkpoint=use_checkpoint,
|
604 |
+
dims=dims,
|
605 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
606 |
+
down=down,
|
607 |
+
up=up,
|
608 |
+
dtype=dtype,
|
609 |
+
device=device,
|
610 |
+
operations=operations
|
611 |
+
)
|
612 |
+
|
613 |
+
for level, mult in enumerate(channel_mult):
|
614 |
+
for nr in range(self.num_res_blocks[level]):
|
615 |
+
layers = [
|
616 |
+
get_resblock(
|
617 |
+
merge_factor=merge_factor,
|
618 |
+
merge_strategy=merge_strategy,
|
619 |
+
video_kernel_size=video_kernel_size,
|
620 |
+
ch=ch,
|
621 |
+
time_embed_dim=time_embed_dim,
|
622 |
+
dropout=dropout,
|
623 |
+
out_channels=mult * model_channels,
|
624 |
+
dims=dims,
|
625 |
+
use_checkpoint=use_checkpoint,
|
626 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
627 |
+
dtype=self.dtype,
|
628 |
+
device=device,
|
629 |
+
operations=operations,
|
630 |
+
)
|
631 |
+
]
|
632 |
+
ch = mult * model_channels
|
633 |
+
num_transformers = transformer_depth.pop(0)
|
634 |
+
if num_transformers > 0:
|
635 |
+
if num_head_channels == -1:
|
636 |
+
dim_head = ch // num_heads
|
637 |
+
else:
|
638 |
+
num_heads = ch // num_head_channels
|
639 |
+
dim_head = num_head_channels
|
640 |
+
if legacy:
|
641 |
+
#num_heads = 1
|
642 |
+
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
643 |
+
if exists(disable_self_attentions):
|
644 |
+
disabled_sa = disable_self_attentions[level]
|
645 |
+
else:
|
646 |
+
disabled_sa = False
|
647 |
+
|
648 |
+
if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
|
649 |
+
layers.append(get_attention_layer(
|
650 |
+
ch, num_heads, dim_head, depth=num_transformers, context_dim=context_dim,
|
651 |
+
disable_self_attn=disabled_sa, use_checkpoint=use_checkpoint)
|
652 |
+
)
|
653 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
654 |
+
self._feature_size += ch
|
655 |
+
input_block_chans.append(ch)
|
656 |
+
if level != len(channel_mult) - 1:
|
657 |
+
out_ch = ch
|
658 |
+
self.input_blocks.append(
|
659 |
+
TimestepEmbedSequential(
|
660 |
+
get_resblock(
|
661 |
+
merge_factor=merge_factor,
|
662 |
+
merge_strategy=merge_strategy,
|
663 |
+
video_kernel_size=video_kernel_size,
|
664 |
+
ch=ch,
|
665 |
+
time_embed_dim=time_embed_dim,
|
666 |
+
dropout=dropout,
|
667 |
+
out_channels=out_ch,
|
668 |
+
dims=dims,
|
669 |
+
use_checkpoint=use_checkpoint,
|
670 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
671 |
+
down=True,
|
672 |
+
dtype=self.dtype,
|
673 |
+
device=device,
|
674 |
+
operations=operations
|
675 |
+
)
|
676 |
+
if resblock_updown
|
677 |
+
else Downsample(
|
678 |
+
ch, conv_resample, dims=dims, out_channels=out_ch, dtype=self.dtype, device=device, operations=operations
|
679 |
+
)
|
680 |
+
)
|
681 |
+
)
|
682 |
+
ch = out_ch
|
683 |
+
input_block_chans.append(ch)
|
684 |
+
ds *= 2
|
685 |
+
self._feature_size += ch
|
686 |
+
|
687 |
+
if num_head_channels == -1:
|
688 |
+
dim_head = ch // num_heads
|
689 |
+
else:
|
690 |
+
num_heads = ch // num_head_channels
|
691 |
+
dim_head = num_head_channels
|
692 |
+
if legacy:
|
693 |
+
#num_heads = 1
|
694 |
+
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
695 |
+
mid_block = [
|
696 |
+
get_resblock(
|
697 |
+
merge_factor=merge_factor,
|
698 |
+
merge_strategy=merge_strategy,
|
699 |
+
video_kernel_size=video_kernel_size,
|
700 |
+
ch=ch,
|
701 |
+
time_embed_dim=time_embed_dim,
|
702 |
+
dropout=dropout,
|
703 |
+
out_channels=None,
|
704 |
+
dims=dims,
|
705 |
+
use_checkpoint=use_checkpoint,
|
706 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
707 |
+
dtype=self.dtype,
|
708 |
+
device=device,
|
709 |
+
operations=operations
|
710 |
+
)]
|
711 |
+
|
712 |
+
self.middle_block = None
|
713 |
+
if transformer_depth_middle >= -1:
|
714 |
+
if transformer_depth_middle >= 0:
|
715 |
+
mid_block += [get_attention_layer( # always uses a self-attn
|
716 |
+
ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
|
717 |
+
disable_self_attn=disable_middle_self_attn, use_checkpoint=use_checkpoint
|
718 |
+
),
|
719 |
+
get_resblock(
|
720 |
+
merge_factor=merge_factor,
|
721 |
+
merge_strategy=merge_strategy,
|
722 |
+
video_kernel_size=video_kernel_size,
|
723 |
+
ch=ch,
|
724 |
+
time_embed_dim=time_embed_dim,
|
725 |
+
dropout=dropout,
|
726 |
+
out_channels=None,
|
727 |
+
dims=dims,
|
728 |
+
use_checkpoint=use_checkpoint,
|
729 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
730 |
+
dtype=self.dtype,
|
731 |
+
device=device,
|
732 |
+
operations=operations
|
733 |
+
)]
|
734 |
+
self.middle_block = TimestepEmbedSequential(*mid_block)
|
735 |
+
self._feature_size += ch
|
736 |
+
|
737 |
+
self.output_blocks = nn.ModuleList([])
|
738 |
+
for level, mult in list(enumerate(channel_mult))[::-1]:
|
739 |
+
for i in range(self.num_res_blocks[level] + 1):
|
740 |
+
ich = input_block_chans.pop()
|
741 |
+
layers = [
|
742 |
+
get_resblock(
|
743 |
+
merge_factor=merge_factor,
|
744 |
+
merge_strategy=merge_strategy,
|
745 |
+
video_kernel_size=video_kernel_size,
|
746 |
+
ch=ch + ich,
|
747 |
+
time_embed_dim=time_embed_dim,
|
748 |
+
dropout=dropout,
|
749 |
+
out_channels=model_channels * mult,
|
750 |
+
dims=dims,
|
751 |
+
use_checkpoint=use_checkpoint,
|
752 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
753 |
+
dtype=self.dtype,
|
754 |
+
device=device,
|
755 |
+
operations=operations
|
756 |
+
)
|
757 |
+
]
|
758 |
+
ch = model_channels * mult
|
759 |
+
num_transformers = transformer_depth_output.pop()
|
760 |
+
if num_transformers > 0:
|
761 |
+
if num_head_channels == -1:
|
762 |
+
dim_head = ch // num_heads
|
763 |
+
else:
|
764 |
+
num_heads = ch // num_head_channels
|
765 |
+
dim_head = num_head_channels
|
766 |
+
if legacy:
|
767 |
+
#num_heads = 1
|
768 |
+
dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
|
769 |
+
if exists(disable_self_attentions):
|
770 |
+
disabled_sa = disable_self_attentions[level]
|
771 |
+
else:
|
772 |
+
disabled_sa = False
|
773 |
+
|
774 |
+
if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
|
775 |
+
layers.append(
|
776 |
+
get_attention_layer(
|
777 |
+
ch, num_heads, dim_head, depth=num_transformers, context_dim=context_dim,
|
778 |
+
disable_self_attn=disabled_sa, use_checkpoint=use_checkpoint
|
779 |
+
)
|
780 |
+
)
|
781 |
+
if level and i == self.num_res_blocks[level]:
|
782 |
+
out_ch = ch
|
783 |
+
layers.append(
|
784 |
+
get_resblock(
|
785 |
+
merge_factor=merge_factor,
|
786 |
+
merge_strategy=merge_strategy,
|
787 |
+
video_kernel_size=video_kernel_size,
|
788 |
+
ch=ch,
|
789 |
+
time_embed_dim=time_embed_dim,
|
790 |
+
dropout=dropout,
|
791 |
+
out_channels=out_ch,
|
792 |
+
dims=dims,
|
793 |
+
use_checkpoint=use_checkpoint,
|
794 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
795 |
+
up=True,
|
796 |
+
dtype=self.dtype,
|
797 |
+
device=device,
|
798 |
+
operations=operations
|
799 |
+
)
|
800 |
+
if resblock_updown
|
801 |
+
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch, dtype=self.dtype, device=device, operations=operations)
|
802 |
+
)
|
803 |
+
ds //= 2
|
804 |
+
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
805 |
+
self._feature_size += ch
|
806 |
+
|
807 |
+
self.out = nn.Sequential(
|
808 |
+
operations.GroupNorm(32, ch, dtype=self.dtype, device=device),
|
809 |
+
nn.SiLU(),
|
810 |
+
zero_module(operations.conv_nd(dims, model_channels, out_channels, 3, padding=1, dtype=self.dtype, device=device)),
|
811 |
+
)
|
812 |
+
if self.predict_codebook_ids:
|
813 |
+
self.id_predictor = nn.Sequential(
|
814 |
+
operations.GroupNorm(32, ch, dtype=self.dtype, device=device),
|
815 |
+
operations.conv_nd(dims, model_channels, n_embed, 1, dtype=self.dtype, device=device),
|
816 |
+
#nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits
|
817 |
+
)
|
818 |
+
|
819 |
+
def forward(self, x, timesteps=None, context=None, y=None, control=None, transformer_options=None, **kwargs):
    """
    Apply the model to an input batch.

    :param x: an [N x C x ...] Tensor of inputs.
    :param timesteps: a 1-D batch of timesteps.
    :param context: conditioning plugged in via crossattn
    :param y: an [N] Tensor of labels, if class-conditional.
    :param control: optional control data handed to ``apply_control`` for the
        'input', 'middle' and 'output' stages.
    :param transformer_options: optional dict of per-call options. It is
        updated in place with the keys "original_shape", "transformer_index"
        and "block"; patch callables are read from its "patches" entry.
        When omitted, a fresh dict is created for this call.
    :param kwargs: "num_video_frames" (defaults to
        ``self.default_num_video_frames``), "image_only_indicator" and
        "time_context" are read and forwarded to every block.
    :return: an [N x C x ...] Tensor of outputs.
    """
    # The dict is written to below, so a shared `{}` default would leak state
    # from one call into the next. Create a new dict per call instead; a dict
    # supplied by the caller is still mutated in place, exactly as before.
    if transformer_options is None:
        transformer_options = {}

    transformer_options["original_shape"] = list(x.shape)
    transformer_options["transformer_index"] = 0
    transformer_patches = transformer_options.get("patches", {})

    num_video_frames = kwargs.get("num_video_frames", self.default_num_video_frames)
    image_only_indicator = kwargs.get("image_only_indicator", None)
    time_context = kwargs.get("time_context", None)

    assert (y is not None) == (
        self.num_classes is not None
    ), "must specify y if and only if the model is class-conditional"
    hs = []
    t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
    emb = self.time_embed(t_emb)

    if self.num_classes is not None:
        assert y.shape[0] == x.shape[0]
        emb = emb + self.label_emb(y)

    h = x
    for block_index, module in enumerate(self.input_blocks):
        transformer_options["block"] = ("input", block_index)
        h = forward_timestep_embed(module, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
        h = apply_control(h, control, 'input')
        # Patches in this group run before the activation is stored as a skip.
        if "input_block_patch" in transformer_patches:
            patch = transformer_patches["input_block_patch"]
            for p in patch:
                h = p(h, transformer_options)

        hs.append(h)
        # Patches in this group only affect the main path, not the stored skip.
        if "input_block_patch_after_skip" in transformer_patches:
            patch = transformer_patches["input_block_patch_after_skip"]
            for p in patch:
                h = p(h, transformer_options)

    transformer_options["block"] = ("middle", 0)
    if self.middle_block is not None:
        h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
    h = apply_control(h, control, 'middle')

    for block_index, module in enumerate(self.output_blocks):
        transformer_options["block"] = ("output", block_index)
        # Skip activations are consumed in reverse order of how they were stored.
        hsp = hs.pop()
        hsp = apply_control(hsp, control, 'output')

        if "output_block_patch" in transformer_patches:
            patch = transformer_patches["output_block_patch"]
            for p in patch:
                h, hsp = p(h, hsp, transformer_options)

        h = th.cat([h, hsp], dim=1)
        del hsp
        # The shape of the next remaining skip (if any) is passed along as the
        # output-shape argument of forward_timestep_embed.
        if len(hs) > 0:
            output_shape = hs[-1].shape
        else:
            output_shape = None
        h = forward_timestep_embed(module, h, emb, context, transformer_options, output_shape, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
    h = h.type(x.dtype)
    if self.predict_codebook_ids:
        return self.id_predictor(h)
    else:
        return self.out(h)
|
ComfyUI/comfy/ldm/modules/diffusionmodules/upscaling.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import numpy as np
|
4 |
+
from functools import partial
|
5 |
+
|
6 |
+
from .util import extract_into_tensor, make_beta_schedule
|
7 |
+
from comfy.ldm.util import default
|
8 |
+
|
9 |
+
|
10 |
+
class AbstractLowScaleModel(nn.Module):
    """Base class for concatenating a downsampled image to the latent representation.

    When a noise schedule config is given, the diffusion schedule tensors are
    registered as buffers so that ``q_sample`` can noise an input.
    """

    def __init__(self, noise_schedule_config=None):
        super().__init__()
        if noise_schedule_config is None:
            return
        self.register_schedule(**noise_schedule_config)

    def register_schedule(self, beta_schedule="linear", timesteps=1000,
                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
        """Build the beta schedule and register the derived tensors as float32 buffers."""
        betas = make_beta_schedule(
            beta_schedule,
            timesteps,
            linear_start=linear_start,
            linear_end=linear_end,
            cosine_s=cosine_s,
        )
        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        # The schedule is expected to be one-dimensional; unpacking enforces it.
        (schedule_length,) = betas.shape
        self.num_timesteps = int(schedule_length)
        self.linear_start = linear_start
        self.linear_end = linear_end
        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

        def add_buffer(name, values):
            # Every schedule tensor is stored as float32.
            self.register_buffer(name, torch.tensor(values, dtype=torch.float32))

        add_buffer('betas', betas)
        add_buffer('alphas_cumprod', alphas_cumprod)
        add_buffer('alphas_cumprod_prev', alphas_cumprod_prev)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        add_buffer('sqrt_alphas_cumprod', np.sqrt(alphas_cumprod))
        add_buffer('sqrt_one_minus_alphas_cumprod', np.sqrt(1. - alphas_cumprod))
        add_buffer('log_one_minus_alphas_cumprod', np.log(1. - alphas_cumprod))
        add_buffer('sqrt_recip_alphas_cumprod', np.sqrt(1. / alphas_cumprod))
        add_buffer('sqrt_recipm1_alphas_cumprod', np.sqrt(1. / alphas_cumprod - 1))

    def q_sample(self, x_start, t, noise=None, seed=None):
        """Return ``x_start`` noised to timestep(s) ``t``.

        If ``noise`` is not supplied it is drawn here; with a ``seed`` the draw
        goes through ``torch.manual_seed(seed)`` and is then moved to the
        device of ``x_start``.
        """
        if noise is None:
            if seed is None:
                noise = torch.randn_like(x_start)
            else:
                generator = torch.manual_seed(seed)
                noise = torch.randn(
                    x_start.size(),
                    dtype=x_start.dtype,
                    layout=x_start.layout,
                    generator=generator,
                ).to(x_start.device)

        device = x_start.device
        signal_scale = extract_into_tensor(self.sqrt_alphas_cumprod.to(device), t, x_start.shape)
        noise_scale = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod.to(device), t, x_start.shape)
        return signal_scale * x_start + noise_scale * noise

    def forward(self, x):
        """Return the input unchanged together with ``None`` as the noise level."""
        return x, None

    def decode(self, x):
        """Return the input unchanged."""
        return x
|
58 |
+
|
59 |
+
|
60 |
+
class SimpleImageConcat(AbstractLowScaleModel):
    """Low-scale model with no noise level conditioning."""

    def __init__(self):
        super().__init__(noise_schedule_config=None)
        self.max_noise_level = 0

    def forward(self, x):
        """Return ``x`` unchanged plus an all-zero integer noise level per batch element."""
        # fix to constant noise level
        batch_size = x.shape[0]
        noise_level = torch.zeros(batch_size, device=x.device).long()
        return x, noise_level
|
69 |
+
|
70 |
+
|
71 |
+
class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel):
    """Low-scale model that noises its input to a given or randomly drawn noise level."""

    def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False):
        # `to_cuda` is accepted but not used by this class.
        super().__init__(noise_schedule_config=noise_schedule_config)
        self.max_noise_level = max_noise_level

    def forward(self, x, noise_level=None, seed=None):
        """Return ``(noised_x, noise_level)``.

        When ``noise_level`` is omitted, one integer level per batch element is
        drawn from ``[0, max_noise_level)``; otherwise it must be a Tensor.
        """
        if noise_level is not None:
            assert isinstance(noise_level, torch.Tensor)
        else:
            noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
        noised = self.q_sample(x, noise_level, seed=seed)
        return noised, noise_level
|
83 |
+
|
84 |
+
|
85 |
+
|
ComfyUI/comfy/ldm/modules/diffusionmodules/util.py
ADDED
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# adopted from
|
2 |
+
# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
|
3 |
+
# and
|
4 |
+
# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
|
5 |
+
# and
|
6 |
+
# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
|
7 |
+
#
|
8 |
+
# thanks!
|
9 |
+
|
10 |
+
|
11 |
+
import os
|
12 |
+
import math
|
13 |
+
import torch
|
14 |
+
import torch.nn as nn
|
15 |
+
import numpy as np
|
16 |
+
from einops import repeat, rearrange
|
17 |
+
|
18 |
+
from comfy.ldm.util import instantiate_from_config
|
19 |
+
|
20 |
+
class AlphaBlender(nn.Module):
    """Blend a spatial and a temporal tensor as ``alpha * spatial + (1 - alpha) * temporal``.

    How ``alpha`` is obtained depends on ``merge_strategy``:
    "fixed" uses the stored value as is, "learned" applies a sigmoid to a
    trainable parameter, and "learned_with_images" additionally forces alpha
    to 1 wherever ``image_only_indicator`` is true.
    """

    strategies = ["learned", "fixed", "learned_with_images"]

    def __init__(
        self,
        alpha: float,
        merge_strategy: str = "learned_with_images",
        rearrange_pattern: str = "b t -> (b t) 1 1",
    ):
        super().__init__()
        self.merge_strategy = merge_strategy
        self.rearrange_pattern = rearrange_pattern

        assert (
            merge_strategy in self.strategies
        ), f"merge_strategy needs to be in {self.strategies}"

        strategy = self.merge_strategy
        if strategy == "fixed":
            # Constant mix factor: stored as a buffer, not trained.
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif strategy in ("learned", "learned_with_images"):
            # Trainable mix factor.
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def get_alpha(self, image_only_indicator: torch.Tensor, device) -> torch.Tensor:
        """Return the blend weight for the configured merge strategy."""
        strategy = self.merge_strategy

        if strategy == "fixed":
            return self.mix_factor.to(device)

        if strategy == "learned":
            return torch.sigmoid(self.mix_factor.to(device))

        if strategy != "learned_with_images":
            raise NotImplementedError()

        if image_only_indicator is None:
            alpha = rearrange(torch.sigmoid(self.mix_factor.to(device)), "... -> ... 1")
        else:
            indicator_device = image_only_indicator.device
            # Entries flagged as image-only get alpha == 1; the rest use the
            # learned value.
            alpha = torch.where(
                image_only_indicator.bool(),
                torch.ones(1, 1, device=indicator_device),
                rearrange(torch.sigmoid(self.mix_factor.to(indicator_device)), "... -> ... 1"),
            )
        return rearrange(alpha, self.rearrange_pattern)

    def forward(
        self,
        x_spatial,
        x_temporal,
        image_only_indicator=None,
    ) -> torch.Tensor:
        """Return the alpha-weighted mix of ``x_spatial`` and ``x_temporal``."""
        alpha = self.get_alpha(image_only_indicator, x_spatial.device)
        target_dtype = x_spatial.dtype
        spatial_part = alpha.to(target_dtype) * x_spatial
        temporal_part = (1.0 - alpha).to(target_dtype) * x_temporal
        return spatial_part + temporal_part
|
87 |
+
|
88 |
+
|
89 |
+
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Return the beta schedule named by ``schedule`` with ``n_timestep`` entries.

    Supported names: "linear", "cosine", "squaredcos_cap_v2", "sqrt_linear"
    and "sqrt". "squaredcos_cap_v2" returns whatever ``betas_for_alpha_bar``
    produces; the other schedules return float64 torch tensors.

    :raises ValueError: if ``schedule`` is not one of the supported names.
    """
    if schedule == "linear":
        # Interpolate linearly between the square roots, then square.
        root_start = linear_start ** 0.5
        root_end = linear_end ** 0.5
        return torch.linspace(root_start, root_end, n_timestep, dtype=torch.float64) ** 2

    if schedule == "cosine":
        grid = torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        alpha_bar = grid / (1 + cosine_s) * np.pi / 2
        alpha_bar = torch.cos(alpha_bar).pow(2)
        alpha_bar = alpha_bar / alpha_bar[0]
        raw_betas = 1 - alpha_bar[1:] / alpha_bar[:-1]
        return torch.clamp(raw_betas, min=0, max=0.999)

    if schedule == "squaredcos_cap_v2":  # used for karlo prior
        return betas_for_alpha_bar(
            n_timestep,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )

    if schedule == "sqrt_linear":
        return torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)

    if schedule == "sqrt":
        return torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5

    raise ValueError(f"schedule '{schedule}' unknown.")
|
119 |
+
|
120 |
+
|
121 |
+
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
    """Select the DDPM timesteps visited by a DDIM sampler.

    'uniform' takes every ``num_ddpm_timesteps // num_ddim_timesteps``-th
    step; 'quad' spaces the steps quadratically. The selected indices are
    returned shifted by one.

    :raises NotImplementedError: for any other discretization method name.
    """
    if ddim_discr_method == 'uniform':
        stride = num_ddpm_timesteps // num_ddim_timesteps
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, stride)))
    elif ddim_discr_method == 'quad':
        roots = np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)
        ddim_timesteps = (roots ** 2).astype(int)
    else:
        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    steps_out = ddim_timesteps + 1
    if verbose:
        print(f'Selected timesteps for ddim sampler: {steps_out}')
    return steps_out
|
136 |
+
|
137 |
+
|
138 |
+
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    """Return ``(sigmas, alphas, alphas_prev)`` for the chosen DDIM timesteps.

    ``alphas`` are the cumulative alphas at ``ddim_timesteps``; ``alphas_prev``
    is the same sequence shifted by one step, starting from ``alphacums[0]``.
    """
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
    earlier = alphacums[ddim_timesteps[:-1]].tolist()
    alphas_prev = np.asarray([alphacums[0]] + earlier)

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    variance_term = (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
    sigmas = eta * np.sqrt(variance_term)
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev
|
150 |
+
|
151 |
+
|
152 |
+
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha_bar(t) function into a beta schedule.

    alpha_bar defines the cumulative product of (1-beta) over time t in [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable taking t from 0 to 1 and returning the
                      cumulative product of (1-beta) up to that point of the
                      diffusion process.
    :param max_beta: upper bound applied to every beta; values lower than 1
                     prevent singularities.
    :return: a numpy array holding num_diffusion_timesteps betas.
    """
    total = num_diffusion_timesteps
    # For each interval [i/total, (i+1)/total] the beta is one minus the ratio
    # of alpha_bar at the interval's end to alpha_bar at its start, capped at
    # max_beta.
    betas = [
        min(1 - alpha_bar((i + 1) / total) / alpha_bar(i / total), max_beta)
        for i in range(total)
    ]
    return np.array(betas)
|
169 |
+
|
170 |
+
|
171 |
+
def extract_into_tensor(a, t, x_shape):
    """Gather ``a`` at indices ``t`` and reshape to ``(len(t), 1, ..., 1)``.

    The result has as many dimensions as ``x_shape`` so that it broadcasts
    against a tensor of that shape.
    """
    batch, *_ = t.shape
    picked = a.gather(-1, t)
    trailing_ones = (1,) * (len(x_shape) - 1)
    return picked.reshape(batch, *trailing_ones)
|
175 |
+
|
176 |
+
|
177 |
+
def checkpoint(func, inputs, params, flag):
    """
    Run ``func`` on ``inputs``, optionally through ``CheckpointFunction``.

    Checkpointing avoids caching intermediate activations, trading extra
    compute in the backward pass for reduced memory.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        return func(*inputs)
    # Inputs come first, followed by the implicit parameters; the count of
    # inputs tells CheckpointFunction where to split them again.
    packed = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *packed)
|
192 |
+
|
193 |
+
|
194 |
+
class CheckpointFunction(torch.autograd.Function):
    """Custom autograd function that re-runs the wrapped function in backward.

    ``forward`` evaluates ``run_function`` under ``torch.no_grad()``;
    ``backward`` re-evaluates it with gradients enabled and differentiates
    the result with respect to the inputs and the extra parameters.
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # The first `length` entries of args are the inputs passed to
        # run_function; the remainder are extra parameters it depends on.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        # Remember the autocast configuration active during forward so the
        # recomputation in backward runs under the same settings.
        ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(),
                                   "dtype": torch.get_autocast_gpu_dtype(),
                                   "cache_enabled": torch.is_autocast_cache_enabled()}
        with torch.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        # Detached copies that require grad are the leaves for the recomputation.
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with torch.enable_grad(), \
                torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = torch.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        # Drop references held on ctx and the recomputed outputs.
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # The two leading Nones are the gradients for `run_function` and `length`.
        return (None, None) + input_grads
|
227 |
+
|
228 |
+
|
229 |
+
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Build sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoids and simply repeat each
                        timestep value ``dim`` times.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half
    freqs = torch.exp(exponents)
    angles = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # Odd output size: pad with a single zero column.
        zero_column = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_column], dim=-1)
    return embedding
|
250 |
+
|
251 |
+
|
252 |
+
def zero_module(module):
    """
    Set every parameter of ``module`` to zero in place and return the module.
    """
    for param in module.parameters():
        # Operate on the detached view so the in-place write is not tracked.
        param.detach().zero_()
    return module
|
259 |
+
|
260 |
+
|
261 |
+
def scale_module(module, scale):
    """
    Multiply every parameter of ``module`` by ``scale`` in place and return the module.
    """
    for param in module.parameters():
        # Operate on the detached view so the in-place write is not tracked.
        param.detach().mul_(scale)
    return module
|
268 |
+
|
269 |
+
|
270 |
+
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
|
275 |
+
|
276 |
+
|
277 |
+
def avg_pool_nd(dims, *args, **kwargs):
    """
    Build a 1D, 2D, or 3D average pooling module.

    Remaining positional and keyword arguments are forwarded to the pooling
    class. Raises ValueError for any other ``dims``.
    """
    pooling_by_rank = (
        (1, nn.AvgPool1d),
        (2, nn.AvgPool2d),
        (3, nn.AvgPool3d),
    )
    for rank, pooling_cls in pooling_by_rank:
        if dims == rank:
            return pooling_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
|
288 |
+
|
289 |
+
|
290 |
+
class HybridConditioner(nn.Module):
    """Run a concat conditioner and a cross-attention conditioner side by side.

    Both sub-modules are built from their configs via ``instantiate_from_config``.
    """

    def __init__(self, c_concat_config, c_crossattn_config):
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        """Apply each conditioner to its input; each result is wrapped in a one-element list."""
        concat_out = self.concat_conditioner(c_concat)
        crossattn_out = self.crossattn_conditioner(c_crossattn)
        return {'c_concat': [concat_out], 'c_crossattn': [crossattn_out]}
|
301 |
+
|
302 |
+
|
303 |
+
def noise_like(shape, device, repeat=False):
    """Draw standard-normal noise of ``shape`` on ``device``.

    With ``repeat=True`` a single sample of shape ``(1, *shape[1:])`` is drawn
    and tiled along the first dimension, so every batch entry is identical.
    """
    if not repeat:
        return torch.randn(shape, device=device)
    single = torch.randn((1, *shape[1:]), device=device)
    return single.repeat(shape[0], *((1,) * (len(shape) - 1)))
|
ComfyUI/comfy/ldm/modules/distributions/__init__.py
ADDED
File without changes
|
ComfyUI/comfy/ldm/modules/distributions/distributions.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
|
5 |
+
class AbstractDistribution:
    """Interface for distributions; subclasses provide ``sample`` and ``mode``."""

    def sample(self):
        """Draw a sample. Not implemented here."""
        raise NotImplementedError()

    def mode(self):
        """Return the mode. Not implemented here."""
        raise NotImplementedError()
|
11 |
+
|
12 |
+
|
13 |
+
class DiracDistribution(AbstractDistribution):
    """Distribution concentrated on a single stored value."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        """Return the stored value."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
|
22 |
+
|
23 |
+
|
24 |
+
class DiagonalGaussianDistribution(object):
    """Gaussian with diagonal covariance, parameterized by mean and log-variance.

    ``parameters`` is split in two halves along dim 1: the first half is the
    mean, the second the log-variance (clamped to [-30, 20]). In
    deterministic mode the standard deviation and variance are zero.
    """

    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        mean, logvar = torch.chunk(parameters, 2, dim=1)
        self.mean = mean
        self.logvar = torch.clamp(logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            # std and var share one zero tensor.
            zeros = torch.zeros_like(self.mean).to(device=self.parameters.device)
            self.var = self.std = zeros

    def sample(self):
        """Return ``mean + std * noise`` with freshly drawn standard-normal noise."""
        noise = torch.randn(self.mean.shape).to(device=self.parameters.device)
        return self.mean + self.std * noise

    def kl(self, other=None):
        """KL divergence to ``other``, or to a standard normal when ``other`` is None.

        The sum runs over dims 1, 2 and 3. Deterministic distributions
        return a one-element zero tensor.
        """
        if self.deterministic:
            return torch.Tensor([0.])
        if other is None:
            return 0.5 * torch.sum(
                torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
                dim=[1, 2, 3])
        scaled_gap = torch.pow(self.mean - other.mean, 2) / other.var
        variance_ratio = self.var / other.var
        return 0.5 * torch.sum(
            scaled_gap + variance_ratio - 1.0 - self.logvar + other.logvar,
            dim=[1, 2, 3])

    def nll(self, sample, dims=[1,2,3]):
        """Negative log-likelihood of ``sample``, summed over ``dims``.

        Deterministic distributions return a one-element zero tensor.
        """
        if self.deterministic:
            return torch.Tensor([0.])
        logtwopi = np.log(2.0 * np.pi)
        per_element = logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var
        return 0.5 * torch.sum(per_element, dim=dims)

    def mode(self):
        """Return the mean."""
        return self.mean
|
63 |
+
|
64 |
+
|
65 |
+
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    KL divergence between two gaussians given by mean and log-variance.
    Inputs broadcast against each other, so batches can be compared to
    scalars, among other use cases. At least one argument must be a Tensor.
    """
    reference = next(
        (arg for arg in (mean1, logvar1, mean2, logvar2) if isinstance(arg, torch.Tensor)),
        None,
    )
    assert reference is not None, "at least one argument must be a Tensor"

    # torch.exp() needs Tensor inputs, so scalar log-variances are converted
    # to tensors matching the reference tensor.
    if not isinstance(logvar1, torch.Tensor):
        logvar1 = torch.tensor(logvar1).to(reference)
    if not isinstance(logvar2, torch.Tensor):
        logvar2 = torch.tensor(logvar2).to(reference)

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
    )
|
ComfyUI/comfy/ldm/modules/ema.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
|
4 |
+
|
5 |
+
class LitEma(nn.Module):
    """Exponential moving average of a model's trainable parameters.

    A shadow copy of every parameter with ``requires_grad`` is kept as a
    buffer. Buffer names are the parameter names with the dots removed.
    """

    def __init__(self, model, decay=0.9999, use_num_upates=True):
        super().__init__()
        if decay < 0.0 or decay > 1.0:
            raise ValueError('Decay must be between 0 and 1')

        self.m_name2s_name = {}
        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
        # A counter of -1 disables the update-count based decay in forward().
        initial_count = 0 if use_num_upates else -1
        self.register_buffer('num_updates', torch.tensor(initial_count, dtype=torch.int))

        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            # remove as '.'-character is not allowed in buffers
            shadow_name = name.replace('.', '')
            self.m_name2s_name[name] = shadow_name
            self.register_buffer(shadow_name, p.clone().detach().data)

        self.collected_params = []

    def reset_num_updates(self):
        """Re-register the update counter buffer with value 0."""
        del self.num_updates
        self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int))

    def forward(self, model):
        """Move each shadow buffer toward the current value of its model parameter."""
        decay = self.decay

        if self.num_updates >= 0:
            self.num_updates += 1
            # Smaller effective decay during the first updates.
            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))

        one_minus_decay = 1.0 - decay

        with torch.no_grad():
            live_params = dict(model.named_parameters())
            shadow_params = dict(self.named_buffers())

            for key, live in live_params.items():
                if live.requires_grad:
                    sname = self.m_name2s_name[key]
                    shadow_params[sname] = shadow_params[sname].type_as(live)
                    shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - live))
                else:
                    assert key not in self.m_name2s_name

    def copy_to(self, model):
        """Overwrite the model's trainable parameters with the shadow values."""
        live_params = dict(model.named_parameters())
        shadow_params = dict(self.named_buffers())
        for key, live in live_params.items():
            if live.requires_grad:
                live.data.copy_(shadow_params[self.m_name2s_name[key]].data)
            else:
                assert key not in self.m_name2s_name

    def store(self, parameters):
        """
        Keep a copy of the given parameters so they can be restored later.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            temporarily stored.
        """
        self.collected_params = [param.clone() for param in parameters]

    def restore(self, parameters):
        """
        Write the values saved by `store` back into the given parameters.
        Useful to validate the model with EMA parameters without affecting the
        original optimization process: call `store` before `copy_to`, and
        after validation (or model saving) call this to bring back the
        former parameters.
        Args:
          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
            updated with the stored parameters.
        """
        for saved, param in zip(self.collected_params, parameters):
            param.data.copy_(saved.data)
|
ComfyUI/comfy/ldm/modules/encoders/__init__.py
ADDED
File without changes
|
ComfyUI/comfy/ldm/modules/encoders/noise_aug_modules.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ..diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
|
2 |
+
from ..diffusionmodules.openaimodel import Timestep
|
3 |
+
import torch
|
4 |
+
|
5 |
+
class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
    """Noise augmentation applied in a normalized embedding space.

    The input is standardized with stored mean/std statistics, noised via
    ``q_sample``, mapped back to the original statistics, and returned with
    an embedding of the noise level.
    """

    def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
        super().__init__(*args, **kwargs)
        if clip_stats_path is not None:
            # NOTE(review): torch.load unpickles the file; only point this at
            # trusted statistics files.
            clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
        else:
            # No statistics given: zero mean and unit std make scale/unscale no-ops.
            clip_mean = torch.zeros(timestep_dim)
            clip_std = torch.ones(timestep_dim)
        self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
        self.register_buffer("data_std", clip_std[None, :], persistent=False)
        self.time_embed = Timestep(timestep_dim)

    def scale(self, x):
        """Re-normalize ``x`` to centered mean and unit variance using the stored stats."""
        centered = x - self.data_mean.to(x.device)
        return centered * 1. / self.data_std.to(x.device)

    def unscale(self, x):
        """Map ``x`` back to the original data statistics."""
        return (x * self.data_std.to(x.device)) + self.data_mean.to(x.device)

    def forward(self, x, noise_level=None, seed=None):
        """Return ``(noised_x, embedded_noise_level)``.

        When ``noise_level`` is omitted, one integer level per batch element is
        drawn from ``[0, max_noise_level)``; otherwise it must be a Tensor.
        """
        if noise_level is not None:
            assert isinstance(noise_level, torch.Tensor)
        else:
            noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long()
        normalized = self.scale(x)
        noised = self.q_sample(normalized, noise_level, seed=seed)
        z = self.unscale(noised)
        noise_level = self.time_embed(noise_level)
        return z, noise_level
|
ComfyUI/comfy/ldm/modules/sub_quadratic_attention.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# original source:
|
2 |
+
# https://github.com/AminRezaei0x443/memory-efficient-attention/blob/1bc0d9e6ac5f82ea43a375135c4e1d3896ee1694/memory_efficient_attention/attention_torch.py
|
3 |
+
# license:
|
4 |
+
# MIT
|
5 |
+
# credit:
|
6 |
+
# Amin Rezaei (original author)
|
7 |
+
# Alex Birch (optimized algorithm for 3D tensors, at the expense of removing bias, masking and callbacks)
|
8 |
+
# implementation of:
|
9 |
+
# "Self-attention Does Not Need O(n^2) Memory":
|
10 |
+
# https://arxiv.org/abs/2112.05682v2
|
11 |
+
|
12 |
+
from functools import partial
|
13 |
+
import torch
|
14 |
+
from torch import Tensor
|
15 |
+
from torch.utils.checkpoint import checkpoint
|
16 |
+
import math
|
17 |
+
import logging
|
18 |
+
|
19 |
+
try:
|
20 |
+
from typing import Optional, NamedTuple, List, Protocol
|
21 |
+
except ImportError:
|
22 |
+
from typing import Optional, NamedTuple, List
|
23 |
+
from typing_extensions import Protocol
|
24 |
+
|
25 |
+
from torch import Tensor
|
26 |
+
from typing import List
|
27 |
+
|
28 |
+
from comfy import model_management
|
29 |
+
|
30 |
+
def dynamic_slice(
    x: Tensor,
    starts: List[int],
    sizes: List[int],
) -> Tensor:
    """Return the window of ``x`` that begins at ``starts`` and spans ``sizes``.

    Args:
        x: tensor to slice.
        starts: start offset for each leading dimension of ``x``.
        sizes: window length for each of those dimensions.

    Returns:
        ``x[starts[0]:starts[0] + sizes[0], starts[1]:starts[1] + sizes[1], ...]``.
        Standard slice semantics apply, so a window that runs past the end of
        a dimension is truncated rather than raising.
    """
    # Index with a tuple: indexing with a *list* of slices relies on the
    # deprecated "non-tuple sequence" multidimensional indexing behaviour.
    slicing = tuple(slice(start, start + size) for start, size in zip(starts, sizes))
    return x[slicing]
|
37 |
+
|
38 |
+
class AttnChunk(NamedTuple):
    """Un-normalised attention summary for a single key/value chunk."""

    # exp(scores - max_score) multiplied into the value chunk
    exp_values: Tensor
    # row-wise sum of exp(scores - max_score)
    exp_weights_sum: Tensor
    # row-wise maximum score that was subtracted before exponentiation
    max_score: Tensor
|
42 |
+
|
43 |
+
class SummarizeChunk(Protocol):
    """Callable that reduces one key/value chunk to an ``AttnChunk``."""

    @staticmethod
    def __call__(
        query: Tensor,
        key_t: Tensor,
        value: Tensor,
    ) -> AttnChunk:
        """Summarise attention of ``query`` against one key/value chunk."""
        ...
|
50 |
+
|
51 |
+
class ComputeQueryChunkAttn(Protocol):
    """Callable that computes the attention output for one query chunk."""

    @staticmethod
    def __call__(
        query: Tensor,
        key_t: Tensor,
        value: Tensor,
    ) -> Tensor:
        """Return the attention output of ``query`` against ``key_t``/``value``."""
        ...
|
58 |
+
|
59 |
+
def _summarize_chunk(
    query: Tensor,
    key_t: Tensor,
    value: Tensor,
    scale: float,
    upcast_attention: bool,
    mask,
) -> AttnChunk:
    """Summarise attention of ``query`` against a single key/value chunk.

    Args:
        query: query tensor, batch-multiplied with ``key_t``.
        key_t: transposed key chunk.
        value: value chunk.
        scale: factor applied to the query/key product.
        upcast_attention: compute the scores in float32 with autocast disabled.
        mask: optional additive mask applied to the max-shifted scores.

    Returns:
        ``AttnChunk`` holding the exponentiated, max-shifted weights applied to
        ``value``, their row sums, and the row maxima that were subtracted.
    """

    def scaled_scores(q: Tensor, k_t: Tensor) -> Tensor:
        # beta=0 means the uninitialised placeholder contributes nothing.
        placeholder = torch.empty(1, 1, 1, device=q.device, dtype=q.dtype)
        return torch.baddbmm(placeholder, q, k_t, alpha=scale, beta=0)

    if upcast_attention:
        with torch.autocast(enabled=False, device_type = 'cuda'):
            attn_weights = scaled_scores(query.float(), key_t.float())
    else:
        attn_weights = scaled_scores(query, key_t)

    # The maximum is taken before the mask is added, and kept out of autograd.
    row_max = torch.max(attn_weights, -1, keepdim=True)[0].detach()
    attn_weights -= row_max
    if mask is not None:
        attn_weights += mask
    torch.exp(attn_weights, out=attn_weights)

    exp_weights = attn_weights.to(value.dtype)
    weighted_values = torch.bmm(exp_weights, value)
    return AttnChunk(weighted_values, exp_weights.sum(dim=-1), row_max.squeeze(-1))
|
96 |
+
|
97 |
+
def _query_chunk_attention(
    query: Tensor,
    key_t: Tensor,
    value: Tensor,
    summarize_chunk: SummarizeChunk,
    kv_chunk_size: int,
    mask,
) -> Tensor:
    """Attend one query chunk to all keys/values, one key/value chunk at a time.

    Args:
        query: query chunk.
        key_t: transposed keys; its last dimension is the key-token axis.
        value: values; its middle dimension is the key-token axis.
        summarize_chunk: callable producing an ``AttnChunk`` for one
            key/value chunk; it is called with ``mask`` as a keyword argument.
        kv_chunk_size: number of key tokens per chunk.
        mask: optional mask whose last dimension is sliced per key chunk.

    Returns:
        The attention output, obtained by rescaling every chunk summary to a
        common maximum and dividing the summed values by the summed weights.
    """
    batch_x_heads, k_channels_per_head, k_tokens = key_t.shape
    _, _, v_channels_per_head = value.shape

    def chunk_scanner(chunk_idx: int, mask) -> AttnChunk:
        key_chunk = dynamic_slice(
            key_t,
            (0, 0, chunk_idx),
            (batch_x_heads, k_channels_per_head, kv_chunk_size)
        )
        value_chunk = dynamic_slice(
            value,
            (0, chunk_idx, 0),
            (batch_x_heads, kv_chunk_size, v_channels_per_head)
        )
        if mask is not None:
            mask = mask[:,:,chunk_idx:chunk_idx + kv_chunk_size]

        return summarize_chunk(query, key_chunk, value_chunk, mask=mask)

    # Plain ints from range() serve as slice offsets; iterating a torch.arange
    # tensor would hand 0-d tensors to chunk_scanner and allocate needlessly.
    chunks: List[AttnChunk] = [
        chunk_scanner(chunk_idx, mask) for chunk_idx in range(0, k_tokens, kv_chunk_size)
    ]
    acc_chunk = AttnChunk(*map(torch.stack, zip(*chunks)))
    chunk_values, chunk_weights, chunk_max = acc_chunk

    # Each chunk was shifted by its own maximum; rescale all of them to the
    # maximum over chunks so the partial sums can be combined.
    global_max, _ = torch.max(chunk_max, 0, keepdim=True)
    max_diffs = torch.exp(chunk_max - global_max)
    chunk_values *= torch.unsqueeze(max_diffs, -1)
    chunk_weights *= max_diffs

    all_values = chunk_values.sum(dim=0)
    all_weights = torch.unsqueeze(chunk_weights, -1).sum(dim=0)
    return all_values / all_weights
|
138 |
+
|
139 |
+
# TODO: refactor CrossAttention#get_attention_scores to share code with this
|
140 |
+
def _get_attention_scores_no_kv_chunking(
|
141 |
+
query: Tensor,
|
142 |
+
key_t: Tensor,
|
143 |
+
value: Tensor,
|
144 |
+
scale: float,
|
145 |
+
upcast_attention: bool,
|
146 |
+
mask,
|
147 |
+
) -> Tensor:
|
148 |
+
if upcast_attention:
|
149 |
+
with torch.autocast(enabled=False, device_type = 'cuda'):
|
150 |
+
query = query.float()
|
151 |
+
key_t = key_t.float()
|
152 |
+
attn_scores = torch.baddbmm(
|
153 |
+
torch.empty(1, 1, 1, device=query.device, dtype=query.dtype),
|
154 |
+
query,
|
155 |
+
key_t,
|
156 |
+
alpha=scale,
|
157 |
+
beta=0,
|
158 |
+
)
|
159 |
+
else:
|
160 |
+
attn_scores = torch.baddbmm(
|
161 |
+
torch.empty(1, 1, 1, device=query.device, dtype=query.dtype),
|
162 |
+
query,
|
163 |
+
key_t,
|
164 |
+
alpha=scale,
|
165 |
+
beta=0,
|
166 |
+
)
|
167 |
+
|
168 |
+
if mask is not None:
|
169 |
+
attn_scores += mask
|
170 |
+
try:
|
171 |
+
attn_probs = attn_scores.softmax(dim=-1)
|
172 |
+
del attn_scores
|
173 |
+
except model_management.OOM_EXCEPTION:
|
174 |
+
logging.warning("ran out of memory while running softmax in _get_attention_scores_no_kv_chunking, trying slower in place softmax instead")
|
175 |
+
attn_scores -= attn_scores.max(dim=-1, keepdim=True).values
|
176 |
+
torch.exp(attn_scores, out=attn_scores)
|
177 |
+
summed = torch.sum(attn_scores, dim=-1, keepdim=True)
|
178 |
+
attn_scores /= summed
|
179 |
+
attn_probs = attn_scores
|
180 |
+
|
181 |
+
hidden_states_slice = torch.bmm(attn_probs.to(value.dtype), value)
|
182 |
+
return hidden_states_slice
|
183 |
+
|
184 |
+
class ScannedChunk(NamedTuple):
    """Pairs a chunk offset with the attention summary computed for it."""

    # offset of the chunk
    chunk_idx: int
    # summary produced for that chunk
    attn_chunk: AttnChunk
|
187 |
+
|
188 |
+
def efficient_dot_product_attention(
    query: Tensor,
    key_t: Tensor,
    value: Tensor,
    query_chunk_size=1024,
    kv_chunk_size: Optional[int] = None,
    kv_chunk_size_min: Optional[int] = None,
    use_checkpoint=True,
    upcast_attention=False,
    mask = None,
):
    """Computes efficient dot-product attention given query, transposed key, and value.
      This is efficient version of attention presented in
      https://arxiv.org/abs/2112.05682v2 which comes with O(sqrt(n)) memory requirements.
      Args:
        query: queries for calculating attention with shape of
          `[batch * num_heads, tokens, channels_per_head]`.
        key_t: keys for calculating attention with shape of
          `[batch * num_heads, channels_per_head, tokens]`.
        value: values to be used in attention with shape of
          `[batch * num_heads, tokens, channels_per_head]`.
        query_chunk_size: int: query chunks size
        kv_chunk_size: Optional[int]: key/value chunks size. if None: defaults to sqrt(key_tokens).
          Always capped at key_tokens.
        kv_chunk_size_min: Optional[int]: key/value minimum chunk size. Applied as a lower bound to the
          resolved chunk size (whether it was given or derived from sqrt(key_tokens)), to ensure our
          chunk sizes don't get too small (smaller chunks = more chunks = less concurrent work done).
        use_checkpoint: bool: whether to use checkpointing (recommended True for training, False for inference)
        upcast_attention: bool: compute the attention scores in float32 with autocast disabled.
        mask: optional additive attention mask. A 2-D mask gets a leading dimension of size 1; the mask
          is then sliced along dim 1 per query chunk and along dim 2 per key/value chunk.
      Returns:
        Output of shape `[batch * num_heads, query_tokens, channels_per_head]`.
      """
    batch_x_heads, q_tokens, q_channels_per_head = query.shape
    _, _, k_tokens = key_t.shape
    scale = q_channels_per_head ** -0.5

    kv_chunk_size = min(kv_chunk_size or int(math.sqrt(k_tokens)), k_tokens)
    if kv_chunk_size_min is not None:
        kv_chunk_size = max(kv_chunk_size, kv_chunk_size_min)

    if mask is not None and len(mask.shape) == 2:
        mask = mask.unsqueeze(0)

    def get_query_chunk(chunk_idx: int) -> Tensor:
        return dynamic_slice(
            query,
            (0, chunk_idx, 0),
            (batch_x_heads, min(query_chunk_size, q_tokens), q_channels_per_head)
        )

    def get_mask_chunk(chunk_idx: int) -> Tensor:
        if mask is None:
            return None
        chunk = min(query_chunk_size, q_tokens)
        return mask[:,chunk_idx:chunk_idx + chunk]

    summarize_chunk: SummarizeChunk = partial(_summarize_chunk, scale=scale, upcast_attention=upcast_attention)
    if use_checkpoint:
        # _query_chunk_attention passes `mask=` by keyword. Reentrant
        # checkpointing rejects keyword arguments for the wrapped function,
        # so request the non-reentrant implementation explicitly.
        summarize_chunk = partial(checkpoint, summarize_chunk, use_reentrant=False)

    if k_tokens <= kv_chunk_size:
        # fast-path for when there's just 1 key-value chunk per query chunk (this is just sliced attention btw)
        compute_query_chunk_attn: ComputeQueryChunkAttn = partial(
            _get_attention_scores_no_kv_chunking,
            scale=scale,
            upcast_attention=upcast_attention
        )
    else:
        compute_query_chunk_attn: ComputeQueryChunkAttn = partial(
            _query_chunk_attention,
            kv_chunk_size=kv_chunk_size,
            summarize_chunk=summarize_chunk,
        )

    if q_tokens <= query_chunk_size:
        # fast-path for when there's just 1 query chunk
        return compute_query_chunk_attn(
            query=query,
            key_t=key_t,
            value=value,
            mask=mask,
        )

    # TODO: maybe we should use torch.empty_like(query) to allocate storage in-advance,
    # and pass slices to be mutated, instead of torch.cat()ing the returned slices
    res = torch.cat([
        compute_query_chunk_attn(
            query=get_query_chunk(i * query_chunk_size),
            key_t=key_t,
            value=value,
            mask=get_mask_chunk(i * query_chunk_size)
        ) for i in range(math.ceil(q_tokens / query_chunk_size))
    ], dim=1)
    return res
|
ComfyUI/comfy/ldm/modules/temporal_ae.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import functools
|
2 |
+
from typing import Callable, Iterable, Union
|
3 |
+
|
4 |
+
import torch
|
5 |
+
from einops import rearrange, repeat
|
6 |
+
|
7 |
+
import comfy.ops
|
8 |
+
ops = comfy.ops.disable_weight_init
|
9 |
+
|
10 |
+
from .diffusionmodules.model import (
|
11 |
+
AttnBlock,
|
12 |
+
Decoder,
|
13 |
+
ResnetBlock,
|
14 |
+
)
|
15 |
+
from .diffusionmodules.openaimodel import ResBlock, timestep_embedding
|
16 |
+
from .attention import BasicTransformerBlock
|
17 |
+
|
18 |
+
def partialclass(cls, *args, **kwargs):
    """Return a subclass of ``cls`` whose ``__init__`` has arguments pre-bound.

    ``args`` and ``kwargs`` are bound to ``cls.__init__`` with
    ``functools.partialmethod``; whatever the caller passes when instantiating
    the returned class is appended after them.
    """
    prebound_init = functools.partialmethod(cls.__init__, *args, **kwargs)

    class NewCls(cls):
        __init__ = prebound_init

    return NewCls
|
23 |
+
|
24 |
+
|
25 |
+
class VideoResBlock(ResnetBlock):
    """``ResnetBlock`` followed by a 3-D ``ResBlock`` over the frame axis.

    The per-frame output and the output of the 3-D block are blended with a
    mix factor that is either a fixed buffer (``merge_strategy="fixed"``) or a
    learned parameter passed through a sigmoid (``"learned"``).
    """

    def __init__(
        self,
        out_channels,
        *args,
        dropout=0.0,
        video_kernel_size=3,
        alpha=0.0,
        merge_strategy="learned",
        **kwargs,
    ):
        super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
        if video_kernel_size is None:
            video_kernel_size = [3, 1, 1]

        self.time_stack = ResBlock(
            channels=out_channels,
            emb_channels=0,
            dropout=dropout,
            dims=3,
            use_scale_shift_norm=False,
            use_conv=False,
            up=False,
            down=False,
            kernel_size=video_kernel_size,
            use_checkpoint=False,
            skip_t_emb=True,
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            # constant blend weight, stored with the module but not trained
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            # trainable blend weight (squashed by a sigmoid in get_alpha)
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def get_alpha(self, bs):
        """Return the blend weight; ``bs`` is accepted but not used."""
        if self.merge_strategy == "fixed":
            return self.mix_factor
        if self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        raise NotImplementedError()

    def forward(self, x, temb, skip_video=False, timesteps=None):
        """Run the 2-D block and, unless ``skip_video``, blend in the 3-D block.

        ``timesteps`` is the number of frames folded into the leading
        dimension of ``x``; it defaults to that whole dimension.
        """
        b, _, _, _ = x.shape
        if timesteps is None:
            timesteps = b

        x = super().forward(x, temb)
        if skip_video:
            return x

        # unfold the frame axis: (b t) c h w -> b c t h w
        x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
        x = self.time_stack(x_mix, temb)

        alpha = self.get_alpha(bs=b // timesteps).to(x.device)
        x = alpha * x + (1.0 - alpha) * x_mix

        return rearrange(x, "b c t h w -> (b t) c h w")
|
90 |
+
|
91 |
+
|
92 |
+
class AE3DConv(ops.Conv2d):
    """2-D convolution followed by a 3-D convolution across the frame axis."""

    def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)

        # padding of kernel // 2, per dimension when a sequence is given
        padding = (
            [int(k // 2) for k in video_kernel_size]
            if isinstance(video_kernel_size, Iterable)
            else int(video_kernel_size // 2)
        )

        self.time_mix_conv = ops.Conv3d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=video_kernel_size,
            padding=padding,
        )

    def forward(self, input, timesteps=None, skip_video=False):
        """Apply the 2-D conv and, unless ``skip_video``, the temporal conv.

        ``timesteps`` is the number of frames folded into the leading
        dimension of ``input``; it defaults to that whole dimension.
        """
        frames = input.shape[0] if timesteps is None else timesteps
        x = super().forward(input)
        if skip_video:
            return x
        x = rearrange(x, "(b t) c h w -> b c t h w", t=frames)
        x = self.time_mix_conv(x)
        return rearrange(x, "b c t h w -> (b t) c h w")
|
116 |
+
|
117 |
+
|
118 |
+
class AttnVideoBlock(AttnBlock):
    """``AttnBlock`` whose attention output is blended with a transformer block
    that also receives a per-frame index embedding."""

    def __init__(
        self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
    ):
        super().__init__(in_channels)
        # no context, single headed, as in base class
        self.time_mix_block = BasicTransformerBlock(
            dim=in_channels,
            n_heads=1,
            d_head=in_channels,
            checkpoint=False,
            ff_in=True,
        )

        hidden_dim = self.in_channels * 4
        self.video_time_embed = torch.nn.Sequential(
            ops.Linear(self.in_channels, hidden_dim),
            torch.nn.SiLU(),
            ops.Linear(hidden_dim, self.in_channels),
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def forward(self, x, timesteps=None, skip_time_block=False):
        """Apply attention and, unless ``skip_time_block``, the blended time block.

        ``timesteps`` is the number of frames folded into the leading
        dimension of ``x``; it defaults to that whole dimension.
        """
        if skip_time_block:
            return super().forward(x)

        if timesteps is None:
            timesteps = x.shape[0]

        residual = x
        x = self.attention(x)
        h, w = x.shape[2:]
        x = rearrange(x, "b c h w -> b (h w) c")

        # frame index 0..timesteps-1, tiled once per group of frames
        frame_ids = torch.arange(timesteps, device=x.device)
        frame_ids = repeat(frame_ids, "t -> b t", b=x.shape[0] // timesteps)
        frame_ids = rearrange(frame_ids, "b t -> (b t)")
        t_emb = timestep_embedding(frame_ids, self.in_channels, repeat_only=False)
        emb = self.video_time_embed(t_emb)[:, None, :]  # b, 1, n_channels
        x_mix = x + emb

        alpha = self.get_alpha().to(x.device)
        x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
        x = alpha * x + (1.0 - alpha) * x_mix  # alpha merge

        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
        x = self.proj_out(x)

        return residual + x

    def get_alpha(
        self,
    ):
        """Return the blend weight: raw when fixed, sigmoid-squashed when learned."""
        if self.merge_strategy == "fixed":
            return self.mix_factor
        if self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
|
188 |
+
|
189 |
+
|
190 |
+
|
191 |
+
def make_time_attn(
    in_channels,
    attn_type="vanilla",
    attn_kwargs=None,
    alpha: float = 0,
    merge_strategy: str = "learned",
):
    """Return an ``AttnVideoBlock`` subclass with its constructor arguments bound.

    ``attn_type`` and ``attn_kwargs`` are accepted but not used. The result
    is a class (built by ``partialclass``), not an instance.
    """
    video_attn_cls = partialclass(
        AttnVideoBlock, in_channels, alpha=alpha, merge_strategy=merge_strategy
    )
    return video_attn_cls
|
201 |
+
|
202 |
+
|
203 |
+
class Conv2DWrapper(torch.nn.Conv2d):
    """``Conv2d`` whose ``forward`` accepts and ignores extra keyword arguments."""

    def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
        # kwargs are swallowed so this layer can be called like the
        # video-aware layers without acting on their extra arguments.
        result = super().forward(input)
        return result
|
206 |
+
|
207 |
+
|
208 |
+
class VideoDecoder(Decoder):
    """``Decoder`` that swaps in video-aware conv/attention/resnet layer factories.

    The factories are handed to ``Decoder.__init__`` through the
    ``conv_out_op``, ``attn_op`` and ``resnet_op`` keyword arguments,
    depending on ``time_mode``.
    """

    available_time_modes = ["all", "conv-only", "attn-only"]

    def __init__(
        self,
        *args,
        video_kernel_size: Union[int, list] = 3,
        alpha: float = 0.0,
        merge_strategy: str = "learned",
        time_mode: str = "conv-only",
        **kwargs,
    ):
        self.video_kernel_size = video_kernel_size
        self.alpha = alpha
        self.merge_strategy = merge_strategy
        self.time_mode = time_mode
        assert (
            self.time_mode in self.available_time_modes
        ), f"time_mode parameter has to be in {self.available_time_modes}"

        if self.time_mode != "attn-only":
            kwargs["conv_out_op"] = partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
        if self.time_mode not in ["conv-only", "only-last-conv"]:
            # make_time_attn is a plain function, so its keyword arguments are
            # bound with functools.partial; partialclass() would try to
            # subclass the function and raise TypeError.
            # NOTE(review): make_time_attn returns a class, not an instance --
            # confirm that this matches how Decoder consumes attn_op.
            kwargs["attn_op"] = functools.partial(make_time_attn, alpha=self.alpha, merge_strategy=self.merge_strategy)
        if self.time_mode not in ["attn-only", "only-last-conv"]:
            kwargs["resnet_op"] = partialclass(VideoResBlock, video_kernel_size=self.video_kernel_size, alpha=self.alpha, merge_strategy=self.merge_strategy)

        super().__init__(*args, **kwargs)

    def get_last_layer(self, skip_time_mix=False, **kwargs):
        """Return the weight of the final conv layer.

        Returns the temporal (3-D) conv weight by default, or the 2-D conv
        weight when ``skip_time_mix`` is true.

        Raises:
            NotImplementedError: when ``time_mode`` is ``"attn-only"``.
        """
        if self.time_mode == "attn-only":
            raise NotImplementedError("TODO")
        else:
            return (
                self.conv_out.time_mix_conv.weight
                if not skip_time_mix
                else self.conv_out.weight
            )
|
ComfyUI/comfy/ldm/util.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from torch import optim
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from inspect import isfunction
|
8 |
+
from PIL import Image, ImageDraw, ImageFont
|
9 |
+
|
10 |
+
|
11 |
+
def log_txt_as_img(wh, xc, size=10):
    """Render each caption in ``xc`` as black text on a white image.

    Args:
        wh: a tuple of (width, height) for every rendered image.
        xc: a list of captions to plot.
        size: font size passed to ``ImageFont.truetype``.

    Returns:
        A tensor stacking one channels-first array per caption, with pixel
        values rescaled via ``/ 127.5 - 1.0``.
    """
    # Loop-invariant: load the font and compute the wrap width once.
    font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
    # characters per line scale with image width; at least 1 so that very
    # narrow images do not produce a zero step for range() below
    nc = max(1, int(40 * (wh[0] / 256)))

    txts = []
    for caption in xc:
        txt = Image.new("RGB", wh, color="white")
        draw = ImageDraw.Draw(txt)
        lines = "\n".join(caption[start:start + nc] for start in range(0, len(caption), nc))

        try:
            draw.text((0, 0), lines, fill="black", font=font)
        except UnicodeEncodeError:
            # best effort: leave the image blank when the text cannot be drawn
            print("Cant encode string for logging. Skipping.")

        # HWC uint8 -> CHW, rescaled by / 127.5 - 1.0
        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
        txts.append(txt)
    txts = np.stack(txts)
    txts = torch.tensor(txts)
    return txts
|
33 |
+
|
34 |
+
|
35 |
+
def ismap(x):
    """Return True for a 4-D tensor whose second dimension is larger than 3."""
    return isinstance(x, torch.Tensor) and len(x.shape) == 4 and x.shape[1] > 3
|
39 |
+
|
40 |
+
|
41 |
+
def isimage(x):
    """Return True for a 4-D tensor whose second dimension is 3 or 1."""
    if isinstance(x, torch.Tensor) and len(x.shape) == 4:
        channels = x.shape[1]
        return channels == 3 or channels == 1
    return False
|
45 |
+
|
46 |
+
|
47 |
+
def exists(x):
    """Return True when ``x`` is anything other than ``None``."""
    if x is None:
        return False
    return True
|
49 |
+
|
50 |
+
|
51 |
+
def default(val, d):
    """Return ``val`` unless it is ``None``; otherwise fall back to ``d``.

    When ``d`` is a plain Python function it is called and its result is
    returned; any other ``d`` is returned as-is.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
|
55 |
+
|
56 |
+
|
57 |
+
def mean_flat(tensor):
    """
    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
    Average over every dimension except the first (batch) one.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
|
63 |
+
|
64 |
+
|
65 |
+
def count_params(model, verbose=False):
    """Return the total element count over ``model.parameters()``.

    When ``verbose`` is set, also print the count in millions together with
    the model's class name.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params
|
70 |
+
|
71 |
+
|
72 |
+
def instantiate_from_config(config):
    """Instantiate the object named by ``config["target"]``.

    The target is resolved with ``get_obj_from_str`` and called with
    ``config["params"]`` as keyword arguments (none when the key is absent).
    The marker strings ``'__is_first_stage__'`` and ``"__is_unconditional__"``
    yield ``None``.

    Raises:
        KeyError: when ``config`` has no ``target`` entry and is not one of
            the marker strings.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        return factory(**config.get("params", dict()))
    if config == '__is_first_stage__' or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
|
80 |
+
|
81 |
+
|
82 |
+
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to the object.

    The text before the last dot is imported as a module and the text after
    it is looked up as an attribute. With ``reload`` set, the module is
    reloaded before the lookup.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    target_module = importlib.import_module(module_path, package=None)
    return getattr(target_module, attr_name)
|
88 |
+
|
89 |
+
|
90 |
+
class AdamWwithEMAandWings(optim.Optimizer):
    # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
    def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8,  # TODO: check hyperparameters before using
                 weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999,   # ema decay to match previous code
                 ema_power=1., param_names=()):
        """AdamW that saves EMA versions of the parameters.

        Raises:
            ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative,
                a beta is outside ``[0, 1)``, or ``ema_decay`` is outside
                ``[0, 1]``.
        """
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= ema_decay <= 1.0:
            raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
                        ema_power=ema_power, param_names=param_names)
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        # groups restored from a state that has no 'amsgrad' entry get False
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.
        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            ema_params_with_grad = []
            max_exp_avg_sqs = []
            state_steps = []
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']
            ema_decay = group['ema_decay']
            ema_power = group['ema_power']

            for p in group['params']:
                if p.grad is None:
                    continue
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')
                grads.append(p.grad)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of parameter values
                    state['param_exp_avg'] = p.detach().float().clone()

                exp_avgs.append(state['exp_avg'])
                exp_avg_sqs.append(state['exp_avg_sq'])
                ema_params_with_grad.append(state['param_exp_avg'])

                if amsgrad:
                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])

                # update the steps for each param group update
                state['step'] += 1
                # record the step after step update
                state_steps.append(state['step'])

            if not params_with_grad:
                # Nothing to update in this group. Without this guard the EMA
                # schedule below would read a step counter that is either
                # undefined or left over from a previous group.
                continue

            # NOTE(review): optim._functional.adamw is a private torch API;
            # confirm this call matches the signature of the installed torch.
            optim._functional.adamw(params_with_grad,
                    grads,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                    amsgrad=amsgrad,
                    beta1=beta1,
                    beta2=beta2,
                    lr=group['lr'],
                    weight_decay=group['weight_decay'],
                    eps=group['eps'],
                    maximize=False)

            # EMA warm-up: the decay ramps as 1 - step ** -ema_power, capped at
            # ema_decay; the step of the last updated parameter is used.
            cur_ema_decay = min(ema_decay, 1 - state_steps[-1] ** -ema_power)
            for param, ema_param in zip(params_with_grad, ema_params_with_grad):
                ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)

        return loss
|
ComfyUI/comfy/lora.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import comfy.utils
|
2 |
+
import logging
|
3 |
+
|
4 |
+
# Maps each dotted CLIP sub-module path to the same path with "." replaced by "_"
# (presumably the spelling used inside LoRA key names -- confirm against callers).
LORA_CLIP_MAP = {
    module_path: module_path.replace(".", "_")
    for module_path in (
        "mlp.fc1",
        "mlp.fc2",
        "self_attn.k_proj",
        "self_attn.q_proj",
        "self_attn.v_proj",
        "self_attn.out_proj",
    )
}
|
12 |
+
|
13 |
+
|
14 |
+
def load_lora(lora, to_load):
    """Turn a LoRA-style state dict into a dict of weight patches.

    Args:
        lora: mapping from key name to tensor.
        to_load: mapping from a key prefix in ``lora`` to the model weight key
            the resulting patch applies to.

    Returns:
        Dict mapping model keys to ``(kind, payload)`` tuples, where ``kind``
        is one of "lora", "loha", "lokr", "glora" or "diff". Several formats
        matching the same prefix overwrite each other in that order. Keys of
        ``lora`` that were never consumed are reported with a warning.
    """
    patch_dict = {}
    loaded_keys = set()

    def take_if_present(name):
        # Return lora[name] and mark it consumed, or None when the key is absent.
        if name in lora.keys():
            loaded_keys.add(name)
            return lora[name]
        return None

    for x in to_load:
        target = to_load[x]

        alpha = None
        alpha_name = f"{x}.alpha"
        if alpha_name in lora.keys():
            alpha = lora[alpha_name].item()
            loaded_keys.add(alpha_name)

        # (up, down, mid) key names for the supported plain-LoRA layouts,
        # tried in order: regular, diffusers, transformers.
        lora_layouts = (
            (f"{x}.lora_up.weight", f"{x}.lora_down.weight", f"{x}.lora_mid.weight"),
            (f"{x}_lora.up.weight", f"{x}_lora.down.weight", None),
            (f"{x}.lora_linear_layer.up.weight", f"{x}.lora_linear_layer.down.weight", None),
        )
        for up_name, down_name, mid_name in lora_layouts:
            if up_name not in lora.keys():
                continue
            mid = None
            if mid_name is not None and mid_name in lora.keys():
                mid = lora[mid_name]
                loaded_keys.add(mid_name)
            patch_dict[target] = ("lora", (lora[up_name], lora[down_name], alpha, mid))
            loaded_keys.add(up_name)
            loaded_keys.add(down_name)
            break

        ######## loha
        hada_w1_a_name = f"{x}.hada_w1_a"
        hada_w1_b_name = f"{x}.hada_w1_b"
        hada_w2_a_name = f"{x}.hada_w2_a"
        hada_w2_b_name = f"{x}.hada_w2_b"
        hada_t1_name = f"{x}.hada_t1"
        hada_t2_name = f"{x}.hada_t2"
        if hada_w1_a_name in lora.keys():
            hada_t1 = None
            hada_t2 = None
            if hada_t1_name in lora.keys():
                # t2 is read without its own check once t1 is present
                hada_t1 = lora[hada_t1_name]
                hada_t2 = lora[hada_t2_name]
                loaded_keys.update((hada_t1_name, hada_t2_name))

            patch_dict[target] = ("loha", (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha, lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2))
            loaded_keys.update((hada_w1_a_name, hada_w1_b_name, hada_w2_a_name, hada_w2_b_name))

        ######## lokr
        lokr_w1 = take_if_present(f"{x}.lokr_w1")
        lokr_w2 = take_if_present(f"{x}.lokr_w2")
        lokr_w1_a = take_if_present(f"{x}.lokr_w1_a")
        lokr_w1_b = take_if_present(f"{x}.lokr_w1_b")
        lokr_w2_a = take_if_present(f"{x}.lokr_w2_a")
        lokr_w2_b = take_if_present(f"{x}.lokr_w2_b")
        lokr_t2 = take_if_present(f"{x}.lokr_t2")

        has_lokr = not (lokr_w1 is None and lokr_w2 is None and lokr_w1_a is None and lokr_w2_a is None)
        if has_lokr:
            patch_dict[target] = ("lokr", (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2))

        #glora
        a1_name = f"{x}.a1.weight"
        a2_name = f"{x}.a2.weight"
        b1_name = f"{x}.b1.weight"
        b2_name = f"{x}.b2.weight"
        if a1_name in lora:
            patch_dict[target] = ("glora", (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha))
            loaded_keys.update((a1_name, a2_name, b1_name, b2_name))

        # norm weight / bias diffs
        w_norm_name = f"{x}.w_norm"
        b_norm_name = f"{x}.b_norm"
        w_norm = lora.get(w_norm_name, None)
        b_norm = lora.get(b_norm_name, None)

        if w_norm is not None:
            loaded_keys.add(w_norm_name)
            patch_dict[target] = ("diff", (w_norm,))
            if b_norm is not None:
                # b_norm is only consumed when w_norm is present; the bias key
                # is the target with its trailing ".weight" swapped for ".bias"
                loaded_keys.add(b_norm_name)
                patch_dict["{}.bias".format(target[:-len(".weight")])] = ("diff", (b_norm,))

        # full weight / bias diffs
        diff_name = f"{x}.diff"
        diff_weight = lora.get(diff_name, None)
        if diff_weight is not None:
            patch_dict[target] = ("diff", (diff_weight,))
            loaded_keys.add(diff_name)

        diff_bias_name = f"{x}.diff_b"
        diff_bias = lora.get(diff_bias_name, None)
        if diff_bias is not None:
            patch_dict["{}.bias".format(target[:-len(".weight")])] = ("diff", (diff_bias,))
            loaded_keys.add(diff_bias_name)

    for name in lora.keys():
        if name not in loaded_keys:
            logging.warning("lora key not loaded: {}".format(name))
    return patch_dict
|
162 |
+
|
163 |
+
def model_lora_keys_clip(model, key_map=None):
    """Map text-encoder LoRA key names onto the model's state-dict weight keys.

    For each of the first 32 encoder layers and each entry of
    ``LORA_CLIP_MAP``, the ``clip_h``, ``clip_l`` and ``clip_g`` weight keys
    are probed in ``model.state_dict()``; every key that exists is registered
    under the LoRA naming schemes handled below.

    Args:
        model: object exposing ``state_dict()``.
        key_map: optional dict to extend in place. A new dict is created when
            omitted.

    Returns:
        The dict mapping LoRA key name -> state-dict weight key.
    """
    # A fresh dict per call: the former ``key_map={}`` default was one dict
    # shared by every call that omitted the argument.
    if key_map is None:
        key_map = {}

    sdk = model.state_dict().keys()

    text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
    clip_l_present = False
    for b in range(32): #TODO: clean up
        for c in LORA_CLIP_MAP:
            lora_name = LORA_CLIP_MAP[c]

            k = "clip_h.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
            if k in sdk:
                key_map[text_model_lora_key.format(b, lora_name)] = k
                key_map["lora_te1_text_model_encoder_layers_{}_{}".format(b, lora_name)] = k
                key_map["text_encoder.text_model.encoder.layers.{}.{}".format(b, c)] = k #diffusers lora

            k = "clip_l.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
            if k in sdk:
                key_map[text_model_lora_key.format(b, lora_name)] = k
                key_map["lora_te1_text_model_encoder_layers_{}_{}".format(b, lora_name)] = k #SDXL base
                # Remembered so that clip_g below is treated as the second encoder.
                clip_l_present = True
                key_map["text_encoder.text_model.encoder.layers.{}.{}".format(b, c)] = k #diffusers lora

            k = "clip_g.transformer.text_model.encoder.layers.{}.{}.weight".format(b, c)
            if k in sdk:
                if clip_l_present:
                    key_map["lora_te2_text_model_encoder_layers_{}_{}".format(b, lora_name)] = k #SDXL base
                    key_map["text_encoder_2.text_model.encoder.layers.{}.{}".format(b, c)] = k #diffusers lora
                else:
                    key_map["lora_te_text_model_encoder_layers_{}_{}".format(b, lora_name)] = k #TODO: test if this is correct for SDXL-Refiner
                    key_map["text_encoder.text_model.encoder.layers.{}.{}".format(b, c)] = k #diffusers lora
                    key_map["lora_prior_te_text_model_encoder_layers_{}_{}".format(b, lora_name)] = k #cascade lora: TODO put lora key prefix in the model config

    k = "clip_g.transformer.text_projection.weight"
    if k in sdk:
        key_map["lora_prior_te_text_projection"] = k #cascade lora?
        #TODO: check if other lora have the text_projection too
        #      ("text_encoder.text_projection" / "lora_te_text_projection")

    return key_map
|
212 |
+
|
213 |
+
def model_lora_keys_unet(model, key_map=None):
    """Map UNet LoRA key names onto the model's state-dict weight keys.

    Registers three families of names:
      * ``lora_unet_*`` / ``lora_prior_unet_*`` derived from every
        ``diffusion_model.*.weight`` key in ``model.state_dict()``;
      * ``lora_unet_*`` derived from the diffusers-style key names returned by
        ``comfy.utils.unet_to_diffusers``;
      * diffusers ``processor`` style names, with and without a ``unet.``
        prefix.

    Args:
        model: object exposing ``state_dict()`` and
            ``model_config.unet_config``.
        key_map: optional dict to extend in place. A new dict is created when
            omitted.

    Returns:
        The dict mapping LoRA key name -> state-dict weight key.
    """
    # A fresh dict per call: the former ``key_map={}`` default was one dict
    # shared by every call that omitted the argument.
    if key_map is None:
        key_map = {}

    unet_prefix = "diffusion_model."
    weight_suffix = ".weight"

    sdk = model.state_dict().keys()

    for k in sdk:
        if k.startswith(unet_prefix) and k.endswith(weight_suffix):
            key_lora = k[len(unet_prefix):-len(weight_suffix)].replace(".", "_")
            key_map["lora_unet_{}".format(key_lora)] = k
            key_map["lora_prior_unet_{}".format(key_lora)] = k #cascade lora: TODO put lora key prefix in the model config

    diffusers_keys = comfy.utils.unet_to_diffusers(model.model_config.unet_config)
    diffusers_lora_prefix = ["", "unet."]
    for k in diffusers_keys:
        if not k.endswith(weight_suffix):
            continue

        unet_key = "diffusion_model.{}".format(diffusers_keys[k])
        base_name = k[:-len(weight_suffix)]
        key_map["lora_unet_{}".format(base_name.replace(".", "_"))] = unet_key

        processor_name = base_name.replace(".to_", ".processor.to_")
        for p in diffusers_lora_prefix:
            diffusers_lora_key = "{}{}".format(p, processor_name)
            if diffusers_lora_key.endswith(".to_out.0"):
                # Drop the trailing ".0" so the key ends in ".to_out".
                diffusers_lora_key = diffusers_lora_key[:-2]
            key_map[diffusers_lora_key] = unet_key
    return key_map
|
ComfyUI/comfy/model_base.py
ADDED
@@ -0,0 +1,492 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import logging
|
3 |
+
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
|
4 |
+
from comfy.ldm.cascade.stage_c import StageC
|
5 |
+
from comfy.ldm.cascade.stage_b import StageB
|
6 |
+
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
|
7 |
+
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
|
8 |
+
import comfy.model_management
|
9 |
+
import comfy.conds
|
10 |
+
import comfy.ops
|
11 |
+
from enum import Enum
|
12 |
+
from . import utils
|
13 |
+
|
14 |
+
class ModelType(Enum):
    """Identifiers for the model kinds that ``model_sampling`` dispatches on."""

    # Each member's pairing of sampling schedule and prediction class is
    # chosen in model_sampling(); the integer values are only identifiers.
    EPS = 1
    V_PREDICTION = 2
    V_PREDICTION_EDM = 3
    STABLE_CASCADE = 4
    EDM = 5
|
20 |
+
|
21 |
+
|
22 |
+
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling
|
23 |
+
|
24 |
+
|
25 |
+
def model_sampling(model_config, model_type):
    """Build the sampling object for ``model_type``.

    A class combining a sampling-schedule base and a prediction base is
    created on the fly and instantiated with ``model_config``.

    Args:
        model_config: passed unchanged to the combined class constructor.
        model_type: a ``ModelType`` member.

    Returns:
        An instance of the dynamically created ``ModelSampling`` class.

    Raises:
        ValueError: if ``model_type`` is not one of the handled members
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # model type -> (sampling schedule class, prediction class)
    combinations = {
        ModelType.EPS: (ModelSamplingDiscrete, EPS),
        ModelType.V_PREDICTION: (ModelSamplingDiscrete, V_PREDICTION),
        ModelType.V_PREDICTION_EDM: (ModelSamplingContinuousEDM, V_PREDICTION),
        ModelType.STABLE_CASCADE: (StableCascadeSampling, EPS),
        ModelType.EDM: (ModelSamplingContinuousEDM, EDM),
    }

    try:
        s, c = combinations[model_type]
    except KeyError:
        raise ValueError("unsupported model_type: {}".format(model_type)) from None

    class ModelSampling(s, c):
        pass

    return ModelSampling(model_config)
|
46 |
+
|
47 |
+
|
48 |
+
class BaseModel(torch.nn.Module):
    """Wrapper that owns a diffusion network plus its sampling/latent helpers.

    Subclasses override ``encode_adm`` and/or ``extra_conds`` to supply
    model-specific conditioning.
    """

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_model=UNetModel):
        """Create the diffusion network (unless disabled) and sampling helper.

        Args:
            model_config: provides ``unet_config``, ``latent_format`` and
                ``manual_cast_dtype``.
            model_type: ``ModelType`` member forwarded to ``model_sampling``.
            device: forwarded to the ``unet_model`` constructor.
            unet_model: class instantiated with ``**unet_config``.
        """
        super().__init__()

        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
        self.model_config = model_config
        self.manual_cast_dtype = model_config.manual_cast_dtype

        if not unet_config.get("disable_unet_model_creation", False):
            if self.manual_cast_dtype is not None:
                operations = comfy.ops.manual_cast
            else:
                operations = comfy.ops.disable_weight_init
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0
        self.inpaint_model = False
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        """Run the diffusion network and return the denoised prediction.

        ``x`` is converted with ``model_sampling.calculate_input``, optionally
        concatenated with ``c_concat`` along dim 1, and cast to the compute
        dtype (``manual_cast_dtype`` when set, otherwise the network dtype).
        Extra keyword conditioning that has a ``dtype`` other than int/long
        is cast to the same dtype.

        NOTE: ``c_crossattn`` is dereferenced unconditionally, so callers must
        supply it. ``transformer_options`` is only forwarded here, never
        mutated.
        """
        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
        if c_concat is not None:
            xc = torch.cat([xc] + [c_concat], dim=1)

        context = c_crossattn
        dtype = self.get_dtype()

        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

        xc = xc.to(dtype)
        t = self.model_sampling.timestep(t).float()
        context = context.to(dtype)
        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
            if hasattr(extra, "dtype"):
                # Integer tensors are left untouched; everything else follows
                # the compute dtype.
                if extra.dtype != torch.int and extra.dtype != torch.long:
                    extra = extra.to(dtype)
            extra_conds[o] = extra

        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)

    def get_dtype(self):
        """Return the dtype reported by the diffusion network."""
        return self.diffusion_model.dtype

    def is_adm(self):
        """Return True when the UNet config declared ``adm_in_channels`` > 0."""
        return self.adm_channels > 0

    def encode_adm(self, **kwargs):
        """Return the ADM conditioning tensor; the base model has none."""
        return None

    def extra_conds(self, **kwargs):
        """Assemble the conditioning dict passed on to sampling.

        Keys that may be produced: ``c_concat`` (inpaint inputs or
        ``noise_concat``), ``y`` (from ``encode_adm``), ``c_crossattn`` and
        ``crossattn_controlnet``.
        """
        out = {}
        if self.inpaint_model:
            concat_keys = ("mask", "masked_image")
            cond_concat = []
            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
            concat_latent_image = kwargs.get("concat_latent_image", None)
            if concat_latent_image is None:
                concat_latent_image = kwargs.get("latent_image", None)
            else:
                concat_latent_image = self.process_latent_in(concat_latent_image)

            noise = kwargs.get("noise", None)
            device = kwargs["device"]

            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

            # Only preprocess a mask that was actually supplied; without this
            # guard the no-mask fallback in the loop below can never run.
            if denoise_mask is not None:
                if len(denoise_mask.shape) == len(noise.shape):
                    denoise_mask = denoise_mask[:,:1]

                denoise_mask = denoise_mask.reshape((-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1]))
                if denoise_mask.shape[-2:] != noise.shape[-2:]:
                    denoise_mask = utils.common_upscale(denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                denoise_mask = utils.resize_to_batch_size(denoise_mask.round(), noise.shape[0])

            def blank_inpaint_image_like(latent_image):
                blank_image = torch.ones_like(latent_image)
                # these are the values for "zero" in pixel space translated to latent space
                blank_image[:,0] *= 0.8223
                blank_image[:,1] *= -0.6876
                blank_image[:,2] *= 0.6364
                blank_image[:,3] *= 0.1380
                return blank_image

            for ck in concat_keys:
                if denoise_mask is not None:
                    if ck == "mask":
                        cond_concat.append(denoise_mask.to(device))
                    elif ck == "masked_image":
                        cond_concat.append(concat_latent_image.to(device)) #NOTE: the latent_image should be masked by the mask in pixel space
                else:
                    if ck == "mask":
                        cond_concat.append(torch.ones_like(noise)[:,:1])
                    elif ck == "masked_image":
                        cond_concat.append(blank_inpaint_image_like(noise))
            data = torch.cat(cond_concat, dim=1)
            out['c_concat'] = comfy.conds.CONDNoiseShape(data)

        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

        cross_attn_cnet = kwargs.get("cross_attn_controlnet", None)
        if cross_attn_cnet is not None:
            out['crossattn_controlnet'] = comfy.conds.CONDCrossAttn(cross_attn_cnet)

        c_concat = kwargs.get("noise_concat", None)
        if c_concat is not None:
            # Wrap the supplied tensor itself. The previous code wrapped
            # ``data``, which is undefined for non-inpaint models.
            out['c_concat'] = comfy.conds.CONDNoiseShape(c_concat)

        return out

    def load_model_weights(self, sd, unet_prefix=""):
        """Load the entries of ``sd`` that start with ``unet_prefix``.

        Matching entries are popped from ``sd`` (the caller's dict is
        mutated), the prefix is stripped, and the result is loaded
        non-strictly; missing and unexpected keys are logged as warnings.

        Returns:
            ``self``.
        """
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        to_load = self.model_config.process_unet_state_dict(to_load)
        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
            logging.warning("unet missing: {}".format(m))

        if len(u) > 0:
            logging.warning("unet unexpected: {}".format(u))
        del to_load
        return self

    def process_latent_in(self, latent):
        """Apply ``latent_format.process_in`` to ``latent``."""
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        """Apply ``latent_format.process_out`` to ``latent``."""
        return self.latent_format.process_out(latent)

    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        """Return one merged state dict of the UNet plus optional extras.

        Each supplied extra state dict is first passed through the matching
        ``model_config.process_*_state_dict_for_saving`` hook. When the
        network dtype is float16 the extras are converted to float16 too.
        For ``V_PREDICTION`` models an empty ``v_pred`` tensor is added.
        """
        extra_sds = []
        if clip_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_state_dict_for_saving(clip_state_dict))
        if vae_state_dict is not None:
            extra_sds.append(self.model_config.process_vae_state_dict_for_saving(vae_state_dict))
        if clip_vision_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

        unet_state_dict = self.diffusion_model.state_dict()
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        if self.get_dtype() == torch.float16:
            extra_sds = map(lambda sd: utils.convert_sd_to(sd, torch.float16), extra_sds)

        if self.model_type == ModelType.V_PREDICTION:
            unet_state_dict["v_pred"] = torch.tensor([])

        for sd in extra_sds:
            unet_state_dict.update(sd)

        return unet_state_dict

    def set_inpaint(self):
        """Switch ``extra_conds`` to produce inpainting ``c_concat`` inputs."""
        self.inpaint_model = True

    def memory_required(self, input_shape):
        """Estimate the memory needed for an input of ``input_shape``.

        Uses elements 0, 2 and 3 of ``input_shape`` (presumably batch,
        height and width - TODO confirm). The formula depends on whether
        xformers or PyTorch flash attention is enabled.
        """
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
            dtype = self.get_dtype()
            if self.manual_cast_dtype is not None:
                dtype = self.manual_cast_dtype
            #TODO: this needs to be tweaked
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
        else:
            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
|
238 |
+
|
239 |
+
|
240 |
+
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
    """Build the ADM tensor from one or more unCLIP image conditionings.

    Every image embedding of every conditioning entry is noise-augmented at
    the entry's ``noise_augmentation`` level and scaled by its ``strength``.
    With more than one embedding the results are summed and the leading
    ``noise_augmentor.time_embed.dim`` columns are augmented once more at
    ``noise_augment_merge`` (this second pass does not receive ``seed``).

    Args:
        unclip_conditioning: iterable of dicts with keys
            ``"clip_vision_output"`` (object with ``image_embeds``),
            ``"strength"`` and ``"noise_augmentation"``.
        device: device the embeddings and noise levels are placed on.
        noise_augmentor: callable ``(x, noise_level=..., seed=...)`` returning
            ``(augmented, noise_level_embedding)``; must expose
            ``max_noise_level`` and ``time_embed.dim``.
        noise_augment_merge: augmentation level used for the merge pass.
        seed: forwarded to the per-embedding augmentation calls.

    Returns:
        The concatenated ``(augmented, noise_level_embedding)`` tensor.

    Raises:
        ValueError: if no image embedding was supplied (previously an
            ``UnboundLocalError``).
    """
    max_level = noise_augmentor.max_noise_level - 1
    adm_inputs = []
    for unclip_cond in unclip_conditioning:
        weight = unclip_cond["strength"]
        noise_augment = unclip_cond["noise_augmentation"]
        for adm_cond in unclip_cond["clip_vision_output"].image_embeds:
            noise_level = round(max_level * noise_augment)
            c_adm, noise_level_emb = noise_augmentor(adm_cond.to(device), noise_level=torch.tensor([noise_level], device=device), seed=seed)
            adm_inputs.append(torch.cat((c_adm, noise_level_emb), 1) * weight)

    if not adm_inputs:
        raise ValueError("unclip_conditioning contains no image embeddings")

    if len(adm_inputs) == 1:
        return adm_inputs[0]

    # Several embeddings: sum them, then re-augment the embedding part only.
    adm_out = torch.stack(adm_inputs).sum(0)
    noise_level = round(max_level * noise_augment_merge)
    c_adm, noise_level_emb = noise_augmentor(adm_out[:, :noise_augmentor.time_embed.dim], noise_level=torch.tensor([noise_level], device=device))
    return torch.cat((c_adm, noise_level_emb), 1)
|
263 |
+
|
264 |
+
class SD21UNCLIP(BaseModel):
    """BaseModel variant whose ADM input comes from unCLIP image conditioning."""

    def __init__(self, model_config, noise_aug_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
        # noise_aug_config is expanded as keyword arguments for the augmentor.
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

    def encode_adm(self, **kwargs):
        """Return the unCLIP ADM tensor, or zeros when no conditioning is given."""
        conditioning = kwargs.get("unclip_conditioning", None)
        # "device" is mandatory even when the zero fallback is returned.
        target_device = kwargs["device"]
        if conditioning is not None:
            merge_level = kwargs.get("unclip_noise_augment_merge", 0.05)
            shifted_seed = kwargs.get("seed", 0) - 10
            return unclip_adm(conditioning, target_device, self.noise_augmentor, merge_level, shifted_seed)
        return torch.zeros((1, self.adm_channels))
|
276 |
+
|
277 |
+
def sdxl_pooled(args, noise_augmentor):
    """Return the pooled embedding used for SDXL ADM conditioning.

    Without ``"unclip_conditioning"`` in ``args`` the stored
    ``"pooled_output"`` is returned unchanged; otherwise the first 1280
    columns of the ``unclip_adm`` result are used.
    """
    if "unclip_conditioning" not in args:
        return args["pooled_output"]

    shifted_seed = args.get("seed", 0) - 10
    adm = unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=shifted_seed)
    return adm[:,:1280]
|
282 |
+
|
283 |
+
class SDXLRefiner(BaseModel):
    """BaseModel variant conditioning on size, crop and aesthetic score."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        augmentor_config = {
            "noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            "timestep_dim": 1280,
        }
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**augmentor_config)

    def encode_adm(self, **kwargs):
        """Concatenate the pooled embedding with the embedded scalar conditions."""
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)

        height = kwargs.get("height", 768)
        width = kwargs.get("width", 768)
        crop_h = kwargs.get("crop_h", 0)
        crop_w = kwargs.get("crop_w", 0)

        # Negative prompts default to a lower aesthetic score.
        default_score = 2.5 if kwargs.get("prompt_type", "") == "negative" else 6
        aesthetic_score = kwargs.get("aesthetic_score", default_score)

        scalars = (height, width, crop_h, crop_w, aesthetic_score)
        embedded = [self.embedder(torch.Tensor([value])) for value in scalars]
        flat = torch.flatten(torch.cat(embedded)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
|
309 |
+
|
310 |
+
class SDXL(BaseModel):
    """BaseModel variant conditioning on size, crop and target size."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        augmentor_config = {
            "noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            "timestep_dim": 1280,
        }
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**augmentor_config)

    def encode_adm(self, **kwargs):
        """Concatenate the pooled embedding with the embedded scalar conditions."""
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)

        height = kwargs.get("height", 768)
        width = kwargs.get("width", 768)
        crop_h = kwargs.get("crop_h", 0)
        crop_w = kwargs.get("crop_w", 0)
        # The target size falls back to the source size.
        target_height = kwargs.get("target_height", height)
        target_width = kwargs.get("target_width", width)

        scalars = (height, width, crop_h, crop_w, target_height, target_width)
        embedded = [self.embedder(torch.Tensor([value])) for value in scalars]
        flat = torch.flatten(torch.cat(embedded)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
|
334 |
+
|
335 |
+
class SVD_img2vid(BaseModel):
    """BaseModel variant conditioned on fps, motion bucket and augmentation."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        """Embed (fps - 1, motion bucket id, augmentation level) into one row."""
        scalars = (
            kwargs.get("fps", 6) - 1,
            kwargs.get("motion_bucket_id", 127),
            kwargs.get("augmentation_level", 0),
        )
        embedded = [self.embedder(torch.Tensor([value])) for value in scalars]
        return torch.flatten(torch.cat(embedded)).unsqueeze(dim=0)

    def extra_conds(self, **kwargs):
        """Assemble ``y``, ``c_concat``, cross-attention and frame-count conds."""
        conds = {}
        adm = self.encode_adm(**kwargs)
        if adm is not None:
            conds['y'] = comfy.conds.CONDRegular(adm)

        concat_latent = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]  # required key; the value itself is unused here

        # No conditioning image: concatenate zeros shaped like the noise.
        if concat_latent is None:
            concat_latent = torch.zeros_like(noise)

        if concat_latent.shape[1:] != noise.shape[1:]:
            concat_latent = utils.common_upscale(concat_latent, noise.shape[-1], noise.shape[-2], "bilinear", "center")

        concat_latent = utils.resize_to_batch_size(concat_latent, noise.shape[0])
        conds['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)

        text_context = kwargs.get("cross_attn", None)
        if text_context is not None:
            conds['c_crossattn'] = comfy.conds.CONDCrossAttn(text_context)

        if "time_conditioning" in kwargs:
            conds["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"])

        conds['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
        return conds
|
382 |
+
|
383 |
+
class Stable_Zero123(BaseModel):
    """BaseModel variant with a linear projection for cross-attention input."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
        super().__init__(model_config, model_type, device=device)
        # Layer dimensions are read off the supplied weight matrix.
        out_features, in_features = cc_projection_weight.shape[0], cc_projection_weight.shape[1]
        self.cc_projection = comfy.ops.manual_cast.Linear(in_features, out_features, dtype=self.get_dtype(), device=device)
        self.cc_projection.weight.copy_(cc_projection_weight)
        self.cc_projection.bias.copy_(cc_projection_bias)

    def extra_conds(self, **kwargs):
        """Assemble ``c_concat`` and (optionally projected) ``c_crossattn``."""
        conds = {}

        concat_latent = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)

        # No conditioning image: concatenate zeros shaped like the noise.
        if concat_latent is None:
            concat_latent = torch.zeros_like(noise)

        if concat_latent.shape[1:] != noise.shape[1:]:
            concat_latent = utils.common_upscale(concat_latent, noise.shape[-1], noise.shape[-2], "bilinear", "center")

        concat_latent = utils.resize_to_batch_size(concat_latent, noise.shape[0])
        conds['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)

        context = kwargs.get("cross_attn", None)
        if context is None:
            return conds

        # Project only when the last dimension is not already 768.
        if context.shape[-1] != 768:
            context = self.cc_projection(context)
        conds['c_crossattn'] = comfy.conds.CONDCrossAttn(context)
        return conds
|
412 |
+
|
413 |
+
class SD_X4Upscaler(BaseModel):
    """BaseModel variant that concatenates a (noise-augmented) image."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
        schedule = {"linear_start": 0.0001, "linear_end": 0.02}
        self.noise_augmentor = ImageConcatWithNoiseAugmentation(noise_schedule_config=schedule, max_noise_level=350)

    def extra_conds(self, **kwargs):
        """Assemble ``c_concat`` (the image) and ``y`` (the noise level)."""
        conds = {}

        image = kwargs.get("concat_image", None)
        noise = kwargs.get("noise", None)
        noise_augment = kwargs.get("noise_augmentation", 0.0)
        device = kwargs["device"]
        seed = kwargs["seed"] - 10

        level_index = round((self.noise_augmentor.max_noise_level) * noise_augment)

        # No image supplied: use zeros with the first three noise channels.
        if image is None:
            image = torch.zeros_like(noise)[:,:3]

        if image.shape[1:] != noise.shape[1:]:
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        noise_level = torch.tensor([level_index], device=device)
        if noise_augment > 0:
            image, noise_level = self.noise_augmentor(image.to(device), noise_level=noise_level, seed=seed)

        image = utils.resize_to_batch_size(image, noise.shape[0])

        conds['c_concat'] = comfy.conds.CONDNoiseShape(image)
        conds['y'] = comfy.conds.CONDRegular(noise_level)
        return conds
|
444 |
+
|
445 |
+
class StableCascade_C(BaseModel):
    """BaseModel variant built on the ``StageC`` network."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageC)
        # The network is put in eval mode with gradients disabled.
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        """Assemble pooled-text, image-embedding, ``sca``/``crp`` and text conds."""
        conds = {}
        pooled = kwargs["pooled_output"]
        if pooled is not None:
            conds['clip_text_pooled'] = comfy.conds.CONDRegular(pooled)

        if "unclip_conditioning" not in kwargs:
            # No image conditioning: a single all-zero embedding.
            clip_img = torch.zeros((1, 1, 768))
        else:
            weighted = [
                entry["clip_vision_output"].image_embeds.unsqueeze(0) * entry["strength"]
                for entry in kwargs["unclip_conditioning"]
            ]
            clip_img = torch.cat(weighted, dim=1)
        conds["clip_img"] = comfy.conds.CONDRegular(clip_img)
        conds["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        conds["crp"] = comfy.conds.CONDRegular(torch.zeros((1,)))

        text_context = kwargs.get("cross_attn", None)
        if text_context is not None:
            conds['clip_text'] = comfy.conds.CONDCrossAttn(text_context)
        return conds
|
472 |
+
|
473 |
+
|
474 |
+
class StableCascade_B(BaseModel):
    """BaseModel variant built on the ``StageB`` network."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageB)
        # The network is put in eval mode with gradients disabled.
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        """Assemble the pooled-text (``clip``), ``effnet`` prior and ``sca`` conds."""
        conds = {}
        noise = kwargs.get("noise", None)

        pooled = kwargs["pooled_output"]
        if pooled is not None:
            conds['clip'] = comfy.conds.CONDRegular(pooled)

        #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched
        # The zero prior is always built, even when a prior is supplied.
        prior_height = (noise.shape[2] * 4) // 42
        prior_width = (noise.shape[3] * 4) // 42
        zero_prior = torch.zeros((1, 16, prior_height, prior_width), dtype=noise.dtype, layout=noise.layout, device=noise.device)
        prior = kwargs.get("stable_cascade_prior", zero_prior)

        conds["effnet"] = comfy.conds.CONDRegular(prior)
        conds["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        return conds
|
ComfyUI/comfy/model_detection.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import comfy.supported_models
|
2 |
+
import comfy.supported_models_base
|
3 |
+
import logging
|
4 |
+
|
5 |
+
def count_blocks(state_dict_keys, prefix_string):
    """Count consecutive block indices, starting at 0, present in the keys.

    ``prefix_string`` is a format string with one placeholder for the block
    index. Counting stops at the first index for which no key starts with
    the formatted prefix.
    """
    index = 0
    while any(key.startswith(prefix_string.format(index)) for key in state_dict_keys):
        index += 1
    return index
|
17 |
+
|
18 |
+
def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
    """Inspect the transformer stored at sub-module ``1`` of a UNet block.

    Returns ``None`` when no key starts with
    ``prefix + "1.transformer_blocks."``. Otherwise returns the tuple
    ``(depth, context_dim, use_linear_in_transformer, time_stack)`` where:

    * ``depth`` is the number of consecutive transformer blocks,
    * ``context_dim`` is ``shape[1]`` of the first block's ``attn2.to_k`` weight,
    * ``use_linear_in_transformer`` is True when ``proj_in.weight`` is 2-D,
    * ``time_stack`` is True when a ``time_stack`` or ``time_mix_blocks``
      attention weight exists under the same prefix.
    """
    blocks_prefix = prefix + "1.transformer_blocks."
    if not any(key.startswith(blocks_prefix) for key in state_dict_keys):
        return None

    depth = count_blocks(state_dict_keys, blocks_prefix + '{}')
    context_dim = state_dict['{}0.attn2.to_k.weight'.format(blocks_prefix)].shape[1]
    proj_in_shape = state_dict['{}1.proj_in.weight'.format(prefix)].shape
    use_linear_in_transformer = len(proj_in_shape) == 2
    time_stack = (
        '{}1.time_stack.0.attn1.to_q.weight'.format(prefix) in state_dict
        or '{}1.time_mix_blocks.0.attn1.to_q.weight'.format(prefix) in state_dict
    )
    return depth, context_dim, use_linear_in_transformer, time_stack
|
31 |
+
|
32 |
+
def detect_unet_config(state_dict, key_prefix):
    """Infer a UNet configuration dict from checkpoint keys and tensor shapes.

    Only keys starting with ``key_prefix`` are consulted. A checkpoint that
    contains ``clf.1.weight`` is treated as Stable Cascade and gets a small
    dedicated config (stage ``'c'`` or ``'b'``, with size-dependent fields).
    Any other checkpoint is walked input block by input block to derive
    channel counts, residual block counts and transformer depths.

    Returns the assembled config dict.
    """
    keys = list(state_dict.keys())

    if '{}clf.1.weight'.format(key_prefix) in keys:  # stable cascade
        cascade_config = {}
        text_mapper_key = '{}clip_txt_mapper.weight'.format(key_prefix)
        if text_mapper_key in keys:
            cascade_config['stable_cascade_stage'] = 'c'
            mapper_weight = state_dict[text_mapper_key]
            if mapper_weight.shape[0] == 1536:  # stage c lite
                cascade_config['c_cond'] = 1536
                cascade_config['c_hidden'] = [1536, 1536]
                cascade_config['nhead'] = [24, 24]
                cascade_config['blocks'] = [[4, 12], [12, 4]]
            elif mapper_weight.shape[0] == 2048:  # stage c full
                cascade_config['c_cond'] = 2048
        elif '{}clip_mapper.weight'.format(key_prefix) in keys:
            cascade_config['stable_cascade_stage'] = 'b'
            channelwise = state_dict['{}down_blocks.1.0.channelwise.0.weight'.format(key_prefix)]
            if channelwise.shape[-1] == 640:
                cascade_config['c_hidden'] = [320, 640, 1280, 1280]
                cascade_config['nhead'] = [-1, -1, 20, 20]
                cascade_config['blocks'] = [[2, 6, 28, 6], [6, 28, 6, 2]]
                cascade_config['block_repeat'] = [[1, 1, 1, 1], [3, 3, 2, 2]]
            elif channelwise.shape[-1] == 576:  # stage b lite
                cascade_config['c_hidden'] = [320, 576, 1152, 1152]
                cascade_config['nhead'] = [-1, 9, 18, 18]
                cascade_config['blocks'] = [[2, 4, 14, 4], [4, 14, 4, 2]]
                cascade_config['block_repeat'] = [[1, 1, 1, 1], [2, 2, 2, 2]]

        return cascade_config

    unet_config = {
        "use_checkpoint": False,
        "image_size": 32,
        "use_spatial_transformer": True,
        "legacy": False
    }

    # A label embedding means the model takes an extra conditioning vector.
    label_key = '{}label_emb.0.0.weight'.format(key_prefix)
    if label_key in keys:
        unet_config["num_classes"] = "sequential"
        unet_config["adm_in_channels"] = state_dict[label_key].shape[1]
    else:
        unet_config["adm_in_channels"] = None

    first_conv_shape = state_dict['{}input_blocks.0.0.weight'.format(key_prefix)].shape
    model_channels = first_conv_shape[0]
    in_channels = first_conv_shape[1]

    out_key = '{}out.2.weight'.format(key_prefix)
    out_channels = state_dict[out_key].shape[0] if out_key in state_dict else 4

    num_res_blocks = []
    channel_mult = []
    transformer_depth = []
    transformer_depth_output = []
    context_dim = None
    use_linear_in_transformer = False
    video_model = False

    # Running totals for the resolution level currently being walked.
    level_res_blocks = 0
    level_channel_mult = 0

    def output_depth(block_prefix):
        # Transformer depth of an output block, 0 when it has no transformer.
        found = calculate_transformer_depth(block_prefix, keys, state_dict)
        return 0 if found is None else found[0]

    input_block_count = count_blocks(keys, '{}input_blocks'.format(key_prefix) + '.{}.')
    for index in range(input_block_count):
        in_prefix = '{}input_blocks.{}.'.format(key_prefix, index)
        # Output blocks are matched to input blocks in reverse order.
        out_prefix = '{}output_blocks.{}.'.format(key_prefix, input_block_count - index - 1)

        in_keys = [key for key in keys if key.startswith(in_prefix)]
        if not in_keys:
            break
        out_keys = [key for key in keys if key.startswith(out_prefix)]

        if "{}0.op.weight".format(in_prefix) in in_keys:  # new layer
            # A downsample op closes the current level and starts a new one.
            num_res_blocks.append(level_res_blocks)
            channel_mult.append(level_channel_mult)
            level_res_blocks = 0
            level_channel_mult = 0
            transformer_depth_output.append(output_depth(out_prefix))
            continue

        if "{}0.in_layers.0.weight".format(in_prefix) in in_keys:
            level_res_blocks += 1
            out_layer = state_dict["{}0.out_layers.3.weight".format(in_prefix)]
            level_channel_mult = out_layer.shape[0] // model_channels

            found = calculate_transformer_depth(in_prefix, keys, state_dict)
            if found is None:
                transformer_depth.append(0)
            else:
                transformer_depth.append(found[0])
                # Only the first transformer seen decides these settings.
                if context_dim is None:
                    context_dim = found[1]
                    use_linear_in_transformer = found[2]
                    video_model = found[3]

        if "{}0.in_layers.0.weight".format(out_prefix) in out_keys:
            transformer_depth_output.append(output_depth(out_prefix))

    # Flush the last level, which is not followed by a downsample op.
    num_res_blocks.append(level_res_blocks)
    channel_mult.append(level_channel_mult)

    if "{}middle_block.1.proj_in.weight".format(key_prefix) in keys:
        transformer_depth_middle = count_blocks(
            keys, '{}middle_block.1.transformer_blocks.'.format(key_prefix) + '{}')
    elif "{}middle_block.0.in_layers.0.weight".format(key_prefix) in keys:
        # Middle block has residual layers but no transformer projection.
        transformer_depth_middle = -1
    else:
        # Neither middle block key is present.
        transformer_depth_middle = -2

    unet_config["in_channels"] = in_channels
    unet_config["out_channels"] = out_channels
    unet_config["model_channels"] = model_channels
    unet_config["num_res_blocks"] = num_res_blocks
    unet_config["transformer_depth"] = transformer_depth
    unet_config["transformer_depth_output"] = transformer_depth_output
    unet_config["channel_mult"] = channel_mult
    unet_config["transformer_depth_middle"] = transformer_depth_middle
    unet_config['use_linear_in_transformer'] = use_linear_in_transformer
    unet_config["context_dim"] = context_dim

    if video_model:
        unet_config["extra_ff_mix_layer"] = True
        unet_config["use_spatial_context"] = True
        unet_config["merge_strategy"] = "learned_with_images"
        unet_config["merge_factor"] = 0.0
        unet_config["video_kernel_size"] = [3, 1, 1]
        unet_config["use_temporal_resblock"] = True
        unet_config["use_temporal_attention"] = True
    else:
        unet_config["use_temporal_resblock"] = False
        unet_config["use_temporal_attention"] = False

    return unet_config
|
184 |
+
|
185 |
+
def model_config_from_unet_config(unet_config):
    """Instantiate the first supported model config matching ``unet_config``.

    Candidates are tried in the order of ``comfy.supported_models.models``.
    Logs an error and returns ``None`` when none of them matches.
    """
    matched = next(
        (candidate for candidate in comfy.supported_models.models
         if candidate.matches(unet_config)),
        None,
    )
    if matched is not None:
        return matched(unet_config)

    logging.error("no match {}".format(unet_config))
    return None
|
192 |
+
|
193 |
+
def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=False):
    """Detect the UNet config of ``state_dict`` and resolve it to a model config.

    When no supported model matches, returns ``None`` unless
    ``use_base_if_no_match`` is set, in which case a
    ``comfy.supported_models_base.BASE`` built from the detected config is
    returned instead.
    """
    detected = detect_unet_config(state_dict, unet_key_prefix)
    matched = model_config_from_unet_config(detected)
    if matched is not None or not use_base_if_no_match:
        return matched
    return comfy.supported_models_base.BASE(detected)
|
200 |
+
|
201 |
+
def convert_config(unet_config):
    """Return a copy of ``unet_config`` with per-block list fields expanded.

    * An integer ``num_res_blocks`` becomes a list with one entry per
      ``channel_mult`` level.
    * When ``attention_resolutions`` is present it is removed, and
      ``transformer_depth`` / ``transformer_depth_output`` are rebuilt as
      flat per-block lists. A level whose downsample factor (1, 2, 4, ...)
      is not listed in ``attention_resolutions`` gets depth 0.
      ``transformer_depth_middle`` defaults to the last level's depth.

    The input dict itself is not modified.
    """
    converted = unet_config.copy()
    res_blocks = converted.get("num_res_blocks", None)
    channel_mult = converted.get("channel_mult", None)

    if isinstance(res_blocks, int):
        res_blocks = [res_blocks] * len(channel_mult)

    if "attention_resolutions" in converted:
        attention_resolutions = converted.pop("attention_resolutions")
        depth_per_level = converted.get("transformer_depth", None)
        depth_middle = converted.get("transformer_depth_middle", None)

        if isinstance(depth_per_level, int):
            depth_per_level = [depth_per_level] * len(channel_mult)
        if depth_middle is None:
            depth_middle = depth_per_level[-1]

        depth_in = []
        depth_out = []
        downsample = 1
        for level, res in enumerate(res_blocks):
            depth = depth_per_level[level] if downsample in attention_resolutions else 0
            depth_in.extend([depth] * res)
            # The output side has one more block per level than the input side.
            depth_out.extend([depth] * (res + 1))
            downsample *= 2

        converted["transformer_depth"] = depth_in
        converted["transformer_depth_output"] = depth_out
        converted["transformer_depth_middle"] = depth_middle

    converted["num_res_blocks"] = res_blocks
    return converted
|
238 |
+
|
239 |
+
|
240 |
+
def unet_config_from_diffusers_unet(state_dict, dtype=None):
    """Map a diffusers-format UNet state dict to a known UNet config.

    A small fingerprint is derived from the state dict (per-block transformer
    depth, context dimension, model/input channels and the class/add
    embedding width) and compared with a table of known configurations.

    Returns ``convert_config()`` of the first table entry that agrees with
    every fingerprint field, or ``None`` when nothing matches. ``dtype`` is
    stored verbatim under the ``'dtype'`` key of the returned config.
    """
    match = {}
    transformer_depth = []

    for block in range(count_blocks(state_dict, "down_blocks.{}")):
        attn_blocks = count_blocks(state_dict, "down_blocks.{}.attentions.".format(block) + '{}')
        res_blocks = count_blocks(state_dict, "down_blocks.{}.resnets.".format(block) + '{}')
        for attn in range(attn_blocks):
            attn_prefix = "down_blocks.{}.attentions.{}.transformer_blocks.".format(block, attn)
            transformer_count = count_blocks(state_dict, attn_prefix + '{}')
            transformer_depth.append(transformer_count)
            if transformer_count > 0:
                match["context_dim"] = state_dict[attn_prefix + "0.attn2.to_k.weight"].shape[1]

        # A down block without attention contributes depth 0 per resnet.
        if attn_blocks == 0:
            transformer_depth.extend([0] * res_blocks)

    match["transformer_depth"] = transformer_depth

    conv_in_shape = state_dict["conv_in.weight"].shape
    match["model_channels"] = conv_in_shape[0]
    match["in_channels"] = conv_in_shape[1]
    match["adm_in_channels"] = None
    if "class_embedding.linear_1.weight" in state_dict:
        match["adm_in_channels"] = state_dict["class_embedding.linear_1.weight"].shape[1]
    elif "add_embedding.linear_1.weight" in state_dict:
        match["adm_in_channels"] = state_dict["add_embedding.linear_1.weight"].shape[1]

    SDXL = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
            'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
            'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
            'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10],
            'use_temporal_attention': False, 'use_temporal_resblock': False}

    SDXL_refiner = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                    'num_classes': 'sequential', 'adm_in_channels': 2560, 'dtype': dtype, 'in_channels': 4, 'model_channels': 384,
                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [0, 0, 4, 4, 4, 4, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 4,
                    'use_linear_in_transformer': True, 'context_dim': 1280, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0],
                    'use_temporal_attention': False, 'use_temporal_resblock': False}

    SD21 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
            'adm_in_channels': None, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2],
            'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True,
            'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
            'use_temporal_attention': False, 'use_temporal_resblock': False}

    SD21_uncliph = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                    'num_classes': 'sequential', 'adm_in_channels': 2048, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
                    'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                    'use_temporal_attention': False, 'use_temporal_resblock': False}

    SD21_unclipl = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                    'num_classes': 'sequential', 'adm_in_channels': 1536, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                    'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
                    'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
                    'use_temporal_attention': False, 'use_temporal_resblock': False}

    SD15 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
            'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
            'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
            'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
            'use_temporal_attention': False, 'use_temporal_resblock': False}

    SDXL_mid_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                     'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                     'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 1,
                     'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 1, 1, 1],
                     'use_temporal_attention': False, 'use_temporal_resblock': False}

    SDXL_small_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                       'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                       'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 0, 0], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 0,
                       'use_linear_in_transformer': True, 'num_head_channels': 64, 'context_dim': 1, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 0, 0, 0],
                       'use_temporal_attention': False, 'use_temporal_resblock': False}

    SDXL_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                              'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 9, 'model_channels': 320,
                              'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
                              'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10],
                              'use_temporal_attention': False, 'use_temporal_resblock': False}

    SSD_1B = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
              'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
              'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 4, 4], 'transformer_depth_output': [0, 0, 0, 1, 1, 2, 10, 4, 4],
              'channel_mult': [1, 2, 4], 'transformer_depth_middle': -1, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64,
              'use_temporal_attention': False, 'use_temporal_resblock': False}

    Segmind_Vega = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                    'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                    'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 1, 1, 2, 2], 'transformer_depth_output': [0, 0, 0, 1, 1, 1, 2, 2, 2],
                    'channel_mult': [1, 2, 4], 'transformer_depth_middle': -1, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64,
                    'use_temporal_attention': False, 'use_temporal_resblock': False}

    KOALA_700M = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                  'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                  'num_res_blocks': [1, 1, 1], 'transformer_depth': [0, 2, 5], 'transformer_depth_output': [0, 0, 2, 2, 5, 5],
                  'channel_mult': [1, 2, 4], 'transformer_depth_middle': -2, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64,
                  'use_temporal_attention': False, 'use_temporal_resblock': False}

    KOALA_1B = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
                'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
                'num_res_blocks': [1, 1, 1], 'transformer_depth': [0, 2, 6], 'transformer_depth_output': [0, 0, 2, 2, 6, 6],
                'channel_mult': [1, 2, 4], 'transformer_depth_middle': 6, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64,
                'use_temporal_attention': False, 'use_temporal_resblock': False}

    supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B, Segmind_Vega, KOALA_700M, KOALA_1B]

    # First table entry that agrees on every fingerprint field wins.
    for candidate in supported_models:
        if all(match[field] == candidate[field] for field in match):
            return convert_config(candidate)
    return None
|
359 |
+
|
360 |
+
def model_config_from_diffusers_unet(state_dict):
    """Resolve a diffusers-format UNet state dict to a supported model config.

    Returns ``None`` when the state dict matches no known UNet layout;
    otherwise returns whatever ``model_config_from_unet_config`` yields for
    the detected config.
    """
    unet_config = unet_config_from_diffusers_unet(state_dict)
    if unet_config is None:
        return None
    return model_config_from_unet_config(unet_config)
|
ComfyUI/comfy/model_management.py
ADDED
@@ -0,0 +1,832 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import psutil
|
2 |
+
import logging
|
3 |
+
from enum import Enum
|
4 |
+
from comfy.cli_args import args
|
5 |
+
import comfy.utils
|
6 |
+
import torch
|
7 |
+
import sys
|
8 |
+
|
9 |
+
class VRAMState(Enum):
    """Video-memory strategies the model manager can run in."""

    # No vram present: no need to move models to vram
    DISABLED = 0
    # Very low vram: enable all the options to save vram
    NO_VRAM = 1
    LOW_VRAM = 2
    NORMAL_VRAM = 3
    HIGH_VRAM = 4
    # No dedicated vram: memory shared between CPU and GPU but models still
    # need to be moved between both.
    SHARED = 5
|
16 |
+
|
17 |
+
class CPUState(Enum):
    """Compute backend family selected at start-up."""

    GPU = 0
    CPU = 1
    MPS = 2
|
21 |
+
|
22 |
+
# Determine VRAM State
# Module-level defaults; the code below and later in this module refines them
# from the command-line arguments and the detected hardware.
vram_state = VRAMState.NORMAL_VRAM
set_vram_to = VRAMState.NORMAL_VRAM
cpu_state = CPUState.GPU

total_vram = 0

lowvram_available = True
xpu_available = False

if args.deterministic:
    logging.info("Using deterministic algorithms for pytorch")
    torch.use_deterministic_algorithms(True, warn_only=True)

directml_enabled = False
if args.directml is not None:
    import torch_directml
    directml_enabled = True
    device_index = args.directml
    # A negative index selects the default DirectML device.
    if device_index < 0:
        directml_device = torch_directml.device()
    else:
        directml_device = torch_directml.device(device_index)
    logging.info("Using directml with device: {}".format(torch_directml.device_name(device_index)))
    # torch_directml.disable_tiled_resources(True)
    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.

# Intel XPU support is optional; any failure to import or probe it simply
# leaves xpu_available False. Catch Exception (not a bare except) so that
# KeyboardInterrupt/SystemExit are not swallowed.
try:
    import intel_extension_for_pytorch as ipex
    if torch.xpu.is_available():
        xpu_available = True
except Exception:
    pass

# MPS probing is best-effort as well.
try:
    if torch.backends.mps.is_available():
        cpu_state = CPUState.MPS
        import torch.mps
except Exception:
    pass

# An explicit --cpu request overrides whatever was detected above.
if args.cpu:
    cpu_state = CPUState.CPU
|
65 |
+
|
66 |
+
def is_intel_xpu():
    """Return True when in GPU mode and the ``xpu_available`` flag is set."""
    return bool(cpu_state == CPUState.GPU and xpu_available)
|
73 |
+
|
74 |
+
def get_torch_device():
    """Return the device models should be loaded on.

    Checked in this order: the DirectML device when DirectML is enabled,
    ``mps``, ``cpu``, Intel ``xpu``, and finally the current CUDA device.
    """
    if directml_enabled:
        return directml_device
    if cpu_state == CPUState.MPS:
        return torch.device("mps")
    if cpu_state == CPUState.CPU:
        return torch.device("cpu")
    if is_intel_xpu():
        return torch.device("xpu")
    return torch.device(torch.cuda.current_device())
|
89 |
+
|
90 |
+
def get_total_memory(dev=None, torch_total_too=False):
    """Return a memory total, in bytes, for ``dev``.

    ``dev`` defaults to ``get_torch_device()``. For ``cpu`` and ``mps``
    devices the value is the system's total virtual memory as reported by
    psutil. With DirectML a fixed 1 GiB placeholder is used. Otherwise the
    XPU or CUDA runtime is queried.

    When ``torch_total_too`` is true a ``(mem_total, mem_total_torch)`` tuple
    is returned, where the second item is the allocator's currently reserved
    bytes on XPU/CUDA and equal to ``mem_total`` in the other cases.
    """
    if dev is None:
        dev = get_torch_device()

    uses_system_ram = hasattr(dev, 'type') and dev.type in ('cpu', 'mps')
    if uses_system_ram:
        mem_total = psutil.virtual_memory().total
        mem_total_torch = mem_total
    elif directml_enabled:
        mem_total = 1024 * 1024 * 1024 #TODO
        mem_total_torch = mem_total
    elif is_intel_xpu():
        xpu_stats = torch.xpu.memory_stats(dev)
        mem_total_torch = xpu_stats['reserved_bytes.all.current']
        mem_total = torch.xpu.get_device_properties(dev).total_memory
    else:
        cuda_stats = torch.cuda.memory_stats(dev)
        mem_total_torch = cuda_stats['reserved_bytes.all.current']
        # mem_get_info yields a pair; only its second item is used here.
        _, mem_total = torch.cuda.mem_get_info(dev)

    if torch_total_too:
        return (mem_total, mem_total_torch)
    return mem_total
|
118 |
+
|
119 |
+
# Memory totals in MB, logged once at import time.
total_vram = get_total_memory(get_torch_device()) / (1024 * 1024)
total_ram = psutil.virtual_memory().total / (1024 * 1024)
logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))
if not args.normalvram and not args.cpu:
    if lowvram_available and total_vram <= 4096:
        logging.warning("Trying to enable lowvram mode because your GPU seems to have 4GB or less. If you don't want this use: --normalvram")
        set_vram_to = VRAMState.LOW_VRAM

# Fall back to the generic Exception type when this torch build does not
# expose torch.cuda.OutOfMemoryError.
try:
    OOM_EXCEPTION = torch.cuda.OutOfMemoryError
except Exception:
    OOM_EXCEPTION = Exception

XFORMERS_VERSION = ""
XFORMERS_ENABLED_VAE = True
if args.disable_xformers:
    XFORMERS_IS_AVAILABLE = False
else:
    # xformers is optional: every probe below is best-effort. Exception is
    # caught instead of using a bare except so that KeyboardInterrupt and
    # SystemExit still propagate.
    try:
        import xformers
        import xformers.ops
        XFORMERS_IS_AVAILABLE = True
        try:
            XFORMERS_IS_AVAILABLE = xformers._has_cpp_library
        except Exception:
            pass
        try:
            XFORMERS_VERSION = xformers.version.__version__
            logging.info("xformers version: {}".format(XFORMERS_VERSION))
            if XFORMERS_VERSION.startswith("0.0.18"):
                logging.warning("\nWARNING: This version of xformers has a major bug where you will get black images when generating high resolution images.")
                logging.warning("Please downgrade or upgrade xformers to a different version.\n")
                XFORMERS_ENABLED_VAE = False
        except Exception:
            pass
    except Exception:
        XFORMERS_IS_AVAILABLE = False
|
156 |
+
|
157 |
+
def is_nvidia():
    """Return True when in GPU mode and ``torch.version.cuda`` is set."""
    return bool(cpu_state == CPUState.GPU and torch.version.cuda)
|
163 |
+
|
164 |
+
ENABLE_PYTORCH_ATTENTION = False
if args.use_pytorch_cross_attention:
    ENABLE_PYTORCH_ATTENTION = True
    XFORMERS_IS_AVAILABLE = False

VAE_DTYPE = torch.float32

# Best-effort hardware probing: a failure leaves the defaults above in place.
# Exception is caught rather than using a bare except so that
# KeyboardInterrupt/SystemExit are not swallowed.
try:
    if is_nvidia():
        torch_version = torch.version.__version__
        # Parse the whole major component, not only the first character, so
        # that two-digit major versions compare correctly.
        if int(torch_version.split(".")[0]) >= 2:
            if not ENABLE_PYTORCH_ATTENTION and not args.use_split_cross_attention and not args.use_quad_cross_attention:
                ENABLE_PYTORCH_ATTENTION = True
            if torch.cuda.is_bf16_supported() and torch.cuda.get_device_properties(torch.cuda.current_device()).major >= 8:
                VAE_DTYPE = torch.bfloat16
    if is_intel_xpu():
        if not args.use_split_cross_attention and not args.use_quad_cross_attention:
            ENABLE_PYTORCH_ATTENTION = True
except Exception:
    pass

if is_intel_xpu():
    VAE_DTYPE = torch.bfloat16

if args.cpu_vae:
    VAE_DTYPE = torch.float32

# Explicit dtype flags are applied last, so they override the choices above.
if args.fp16_vae:
    VAE_DTYPE = torch.float16
elif args.bf16_vae:
    VAE_DTYPE = torch.bfloat16
elif args.fp32_vae:
    VAE_DTYPE = torch.float32


if ENABLE_PYTORCH_ATTENTION:
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_flash_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(True)

if args.lowvram:
    set_vram_to = VRAMState.LOW_VRAM
    lowvram_available = True
elif args.novram:
    set_vram_to = VRAMState.NO_VRAM
elif args.highvram or args.gpu_only:
    vram_state = VRAMState.HIGH_VRAM

FORCE_FP32 = False
FORCE_FP16 = False
if args.force_fp32:
    logging.info("Forcing FP32, if this improves things please report it.")
    FORCE_FP32 = True

if args.force_fp16:
    logging.info("Forcing FP16.")
    FORCE_FP16 = True

# A requested low/no-vram mode only takes effect when lowvram is available.
if lowvram_available:
    if set_vram_to in (VRAMState.LOW_VRAM, VRAMState.NO_VRAM):
        vram_state = set_vram_to


if cpu_state != CPUState.GPU:
    vram_state = VRAMState.DISABLED

# MPS is not CPUState.GPU, so this overrides the DISABLED set just above.
if cpu_state == CPUState.MPS:
    vram_state = VRAMState.SHARED

logging.info(f"Set vram state to: {vram_state.name}")

DISABLE_SMART_MEMORY = args.disable_smart_memory

if DISABLE_SMART_MEMORY:
    logging.info("Disabling smart memory management")
|
239 |
+
|
240 |
+
def get_torch_device_name(device):
    """Return a human-readable description of ``device`` for logging.

    For objects exposing a ``type`` attribute: CUDA devices are described with
    their device name and allocator backend, any other type is reported as the
    bare type string. Objects without ``type`` are described through the XPU
    API when running on Intel XPU, otherwise through the CUDA API.
    """
    if hasattr(device, 'type'):
        if device.type == "cuda":
            try:
                allocator_backend = torch.cuda.get_allocator_backend()
            except Exception:
                # Best effort only: fall back to an empty backend name if the
                # query is unavailable or fails.
                allocator_backend = ""
            return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend)
        else:
            return "{}".format(device.type)
    elif is_intel_xpu():
        return "{} {}".format(device, torch.xpu.get_device_name(device))
    else:
        return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))
|
254 |
+
|
255 |
+
# Log the selected device; failure to resolve one is reported but not fatal.
try:
    logging.info("Device: {}".format(get_torch_device_name(get_torch_device())))
except Exception:
    logging.warning("Could not pick default device.")

logging.info("VAE dtype: {}".format(VAE_DTYPE))

# LoadedModel entries currently tracked by this module. load_models_gpu()
# inserts at index 0; free_memory() scans from the end of the list.
current_loaded_models = []
|
263 |
+
|
264 |
+
def module_size(module):
    """Return the combined size, in bytes, of every tensor in ``module.state_dict()``.

    Each entry contributes ``nelement() * element_size()``; a module with an
    empty state dict yields 0.
    """
    return sum(t.nelement() * t.element_size() for t in module.state_dict().values())
|
271 |
+
|
272 |
+
class LoadedModel:
    """Bookkeeping wrapper around a model object tracked in ``current_loaded_models``.

    The wrapped ``model`` is expected to provide ``load_device``,
    ``offload_device``, ``current_device``, ``model_size()``, ``model_dtype()``,
    ``model_patches_to()``, ``patch_model()``, ``patch_model_lowvram()`` and
    ``unpatch_model()``; those are the only members used here.
    """

    def __init__(self, model):
        self.model = model
        # Device the model should be loaded onto.
        self.device = model.load_device

    def model_memory(self):
        """Return the size reported by the wrapped model's ``model_size()``."""
        return self.model.model_size()

    def model_memory_required(self, device):
        """Return the memory needed to place the model on ``device``.

        Zero when the model already sits on ``device``, otherwise its full size.
        """
        if device == self.model.current_device:
            return 0
        return self.model_memory()

    def model_load(self, lowvram_model_memory=0):
        """Patch the model onto ``self.device`` and return the resulting module.

        A positive ``lowvram_model_memory`` selects the low-VRAM patching path
        and is forwarded to it. If patching fails, the model is unpatched and
        unloaded before the original exception propagates.
        """
        patch_model_to = self.device

        self.model.model_patches_to(self.device)
        self.model.model_patches_to(self.model.model_dtype())

        try:
            if lowvram_model_memory > 0:
                self.real_model = self.model.patch_model_lowvram(device_to=patch_model_to, lowvram_model_memory=lowvram_model_memory)
            else:
                self.real_model = self.model.patch_model(device_to=patch_model_to)
        except Exception:
            self.model.unpatch_model(self.model.offload_device)
            self.model_unload()
            # Bare raise keeps the original traceback intact.
            raise

        if is_intel_xpu() and not args.disable_ipex_optimize:
            self.real_model = torch.xpu.optimize(self.real_model.eval(), inplace=True, auto_kernel_selection=True, graph_mode=True)

        return self.real_model

    def model_unload(self):
        """Unpatch the model and move it and its patches to the offload device."""
        self.model.unpatch_model(self.model.offload_device)
        self.model.model_patches_to(self.model.offload_device)

    def __eq__(self, other):
        # Two wrappers are equal when they wrap the very same model object.
        # Foreign types compare unequal instead of raising AttributeError.
        if not isinstance(other, LoadedModel):
            return NotImplemented
        return self.model is other.model
|
313 |
+
|
314 |
+
def minimum_inference_memory():
    """Return the baseline amount of memory, in bytes, kept free for inference (1 GiB)."""
    return (1024 * 1024 * 1024)
|
316 |
+
|
317 |
+
def unload_model_clones(model):
    """Unload every tracked model for which ``model.is_clone(...)`` is true.

    Matching entries are removed from ``current_loaded_models`` and have
    ``model_unload()`` called on them.
    """
    to_unload = []
    for i, loaded in enumerate(current_loaded_models):
        if model.is_clone(loaded.model):
            to_unload.append(i)

    # Pop from the highest index down so the remaining indices stay valid.
    for i in reversed(to_unload):
        logging.debug("unload clone {}".format(i))
        current_loaded_models.pop(i).model_unload()
|
326 |
+
|
327 |
+
def free_memory(memory_required, device, keep_loaded=[]):
    """Unload tracked models from ``device`` until ``memory_required`` bytes are free.

    ``current_loaded_models`` is scanned from its last entry to its first.
    Entries on ``device`` that are not in ``keep_loaded`` are unloaded. Unless
    smart memory is disabled, scanning stops as soon as enough memory is free;
    with it disabled every eligible model is unloaded.

    ``keep_loaded`` is only read, never mutated, so the shared default list is
    harmless here.
    """
    unloaded_model = False
    for i in range(len(current_loaded_models) - 1, -1, -1):
        if not DISABLE_SMART_MEMORY and get_free_memory(device) > memory_required:
            break
        shift_model = current_loaded_models[i]
        if shift_model.device == device and shift_model not in keep_loaded:
            m = current_loaded_models.pop(i)
            m.model_unload()
            del m
            unloaded_model = True

    if unloaded_model:
        soft_empty_cache()
    elif vram_state != VRAMState.HIGH_VRAM:
        # Nothing was unloaded: still drop the allocator cache when it holds
        # more than a quarter of the total free memory.
        mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
        if mem_free_torch > mem_free_total * 0.25:
            soft_empty_cache()
|
348 |
+
|
349 |
+
def load_models_gpu(models, memory_required=0):
    """Make sure every model in ``models`` is loaded, freeing memory as needed.

    Models already tracked in ``current_loaded_models`` are moved to the front
    of that list. New models have their clones unloaded, memory is freed on
    their target devices, and each is then loaded (in low-VRAM mode when the
    model does not fit) and inserted at the front of the list.

    ``memory_required`` is the extra memory the caller wants left free; at
    least ``minimum_inference_memory()`` is always reserved.
    """
    inference_memory = minimum_inference_memory()
    extra_mem = max(inference_memory, memory_required)

    models_to_load = []
    models_already_loaded = []
    for x in models:
        loaded_model = LoadedModel(x)

        if loaded_model in current_loaded_models:
            # Already loaded: move the tracked entry to the front of the list.
            index = current_loaded_models.index(loaded_model)
            current_loaded_models.insert(0, current_loaded_models.pop(index))
            models_already_loaded.append(loaded_model)
        else:
            if hasattr(x, "model"):
                logging.info(f"Requested to load {x.model.__class__.__name__}")
            models_to_load.append(loaded_model)

    if len(models_to_load) == 0:
        # Nothing new to load; only make sure the requested headroom exists on
        # every non-CPU device that holds one of the requested models.
        devs = set(map(lambda a: a.device, models_already_loaded))
        for d in devs:
            if d != torch.device("cpu"):
                free_memory(extra_mem, d, models_already_loaded)
        return

    logging.info(f"Loading {len(models_to_load)} new model{'s' if len(models_to_load) > 1 else ''}")

    # Memory needed per target device, summed over all models to load.
    total_memory_required = {}
    for loaded_model in models_to_load:
        unload_model_clones(loaded_model.model)
        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)

    for device in total_memory_required:
        if device != torch.device("cpu"):
            free_memory(total_memory_required[device] * 1.3 + extra_mem, device, models_already_loaded)

    for loaded_model in models_to_load:
        model = loaded_model.model
        torch_dev = model.load_device
        if is_device_cpu(torch_dev):
            vram_set_state = VRAMState.DISABLED
        else:
            vram_set_state = vram_state
        lowvram_model_memory = 0
        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM):
            model_size = loaded_model.model_memory_required(torch_dev)
            current_free_mem = get_free_memory(torch_dev)
            lowvram_model_memory = int(max(64 * (1024 * 1024), (current_free_mem - 1024 * (1024 * 1024)) / 1.3))
            if model_size > (current_free_mem - inference_memory):  # only switch to lowvram if really necessary
                vram_set_state = VRAMState.LOW_VRAM
            else:
                lowvram_model_memory = 0

        if vram_set_state == VRAMState.NO_VRAM:
            lowvram_model_memory = 64 * 1024 * 1024

        loaded_model.model_load(lowvram_model_memory)
        current_loaded_models.insert(0, loaded_model)
    return
|
410 |
+
|
411 |
+
|
412 |
+
def load_model_gpu(model):
    """Single-model convenience wrapper around ``load_models_gpu``."""
    return load_models_gpu([model])
|
414 |
+
|
415 |
+
def cleanup_models():
    """Unload tracked models whose wrapped object has a reference count of 2 or less.

    ``sys.getrefcount`` counts its own argument as one reference.
    NOTE(review): presumably a count of 2 means only the ``LoadedModel``
    wrapper still holds the object -- confirm against callers.
    """
    to_delete = []
    for i in range(len(current_loaded_models)):
        if sys.getrefcount(current_loaded_models[i].model) <= 2:
            to_delete.append(i)

    # Pop from the highest index down so the remaining indices stay valid.
    for i in reversed(to_delete):
        x = current_loaded_models.pop(i)
        x.model_unload()
        del x
|
425 |
+
|
426 |
+
def dtype_size(dtype):
    """Return the size in bytes of one element of ``dtype``.

    float16/bfloat16 map to 2 and float32 to 4. Any other dtype uses its
    ``itemsize`` attribute, falling back to 4 when that attribute is missing
    (old PyTorch versions do not have it).
    """
    if dtype == torch.float16 or dtype == torch.bfloat16:
        return 2
    if dtype == torch.float32:
        return 4
    try:
        return dtype.itemsize
    except AttributeError:  # Old pytorch doesn't have .itemsize
        return 4
|
438 |
+
|
439 |
+
def unet_offload_device():
    """Return the UNet offload device: the torch device in HIGH_VRAM mode, else the CPU."""
    if vram_state == VRAMState.HIGH_VRAM:
        return get_torch_device()
    return torch.device("cpu")
|
444 |
+
|
445 |
+
def unet_inital_load_device(parameters, dtype):
    """Choose the device a UNet is first loaded onto.

    ``parameters`` is the parameter count and ``dtype`` the weight dtype; their
    product (via ``dtype_size``) is the estimated model size in bytes.

    Returns the torch device in HIGH_VRAM mode, the CPU when smart memory is
    disabled, and otherwise the torch device only when it has more free memory
    than the CPU and the model fits into it.
    """
    torch_dev = get_torch_device()
    if vram_state == VRAMState.HIGH_VRAM:
        return torch_dev

    cpu_dev = torch.device("cpu")
    if DISABLE_SMART_MEMORY:
        return cpu_dev

    model_size = dtype_size(dtype) * parameters

    mem_dev = get_free_memory(torch_dev)
    mem_cpu = get_free_memory(cpu_dev)
    if mem_dev > mem_cpu and model_size < mem_dev:
        return torch_dev
    return cpu_dev
|
462 |
+
|
463 |
+
def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
    """Pick the dtype used to store UNet weights.

    Explicit command-line overrides (bf16, fp16, fp8 e4m3fn, fp8 e5m2) win, in
    that order. Otherwise fp16 and then bf16 are chosen when the matching
    ``should_use_*`` check passes and the dtype is listed in
    ``supported_dtypes``; float32 is the fallback.

    ``supported_dtypes`` is only read, so the shared default list is safe.
    """
    if args.bf16_unet:
        return torch.bfloat16
    if args.fp16_unet:
        return torch.float16
    if args.fp8_e4m3fn_unet:
        return torch.float8_e4m3fn
    if args.fp8_e5m2_unet:
        return torch.float8_e5m2
    if should_use_fp16(device=device, model_params=model_params, manual_cast=True):
        if torch.float16 in supported_dtypes:
            return torch.float16
    if should_use_bf16(device, model_params=model_params, manual_cast=True):
        if torch.bfloat16 in supported_dtypes:
            return torch.bfloat16
    return torch.float32
|
479 |
+
|
480 |
+
# None means no manual cast
|
481 |
+
def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
    """Return the dtype to cast to for computation, or ``None`` for no manual cast.

    No cast is needed when the weights are float32, or when they are
    fp16/bf16 and ``inference_device`` supports that dtype. Otherwise the
    first of fp16, bf16 that is both supported by the device and listed in
    ``supported_dtypes`` is returned, with float32 as the fallback.
    """
    if weight_dtype == torch.float32:
        return None

    fp16_supported = should_use_fp16(inference_device, prioritize_performance=False)
    if fp16_supported and weight_dtype == torch.float16:
        return None

    bf16_supported = should_use_bf16(inference_device)
    if bf16_supported and weight_dtype == torch.bfloat16:
        return None

    if fp16_supported and torch.float16 in supported_dtypes:
        return torch.float16
    if bf16_supported and torch.bfloat16 in supported_dtypes:
        return torch.bfloat16
    return torch.float32
|
500 |
+
|
501 |
+
def text_encoder_offload_device():
    """Return the text-encoder offload device: the torch device with ``args.gpu_only``, else the CPU."""
    if args.gpu_only:
        return get_torch_device()
    return torch.device("cpu")
|
506 |
+
|
507 |
+
def text_encoder_device():
    """Return the device the text encoder runs on.

    With ``args.gpu_only`` this is always the torch device. In HIGH_VRAM or
    NORMAL_VRAM mode it is the torch device only when not on Intel XPU and
    ``should_use_fp16(prioritize_performance=False)`` is true. In every other
    case the CPU is used.
    """
    if args.gpu_only:
        return get_torch_device()

    if vram_state == VRAMState.HIGH_VRAM or vram_state == VRAMState.NORMAL_VRAM:
        if is_intel_xpu():
            return torch.device("cpu")
        if should_use_fp16(prioritize_performance=False):
            return get_torch_device()

    return torch.device("cpu")
|
519 |
+
|
520 |
+
def text_encoder_dtype(device=None):
    """Return the dtype used for text-encoder weights.

    Command-line overrides are honoured in the order fp8 e4m3fn, fp8 e5m2,
    fp16, fp32. Without an override the result is float16 for every device;
    ``device`` is accepted but does not currently affect the outcome.
    """
    if args.fp8_e4m3fn_text_enc:
        return torch.float8_e4m3fn
    elif args.fp8_e5m2_text_enc:
        return torch.float8_e5m2
    elif args.fp16_text_enc:
        return torch.float16
    elif args.fp32_text_enc:
        return torch.float32

    return torch.float16
|
534 |
+
|
535 |
+
|
536 |
+
def intermediate_device():
    """Return the device for intermediate results: the torch device with ``args.gpu_only``, else the CPU."""
    if args.gpu_only:
        return get_torch_device()
    return torch.device("cpu")
|
541 |
+
|
542 |
+
def vae_device():
    """Return the device the VAE runs on: the CPU with ``args.cpu_vae``, else the torch device."""
    if args.cpu_vae:
        return torch.device("cpu")
    return get_torch_device()
|
546 |
+
|
547 |
+
def vae_offload_device():
    """Return the VAE offload device: the torch device with ``args.gpu_only``, else the CPU."""
    if args.gpu_only:
        return get_torch_device()
    return torch.device("cpu")
|
552 |
+
|
553 |
+
def vae_dtype():
    """Return the module-level ``VAE_DTYPE`` selected at import time."""
    return VAE_DTYPE
|
556 |
+
|
557 |
+
def get_autocast_device(dev):
    """Return ``dev.type`` when ``dev`` has a ``type`` attribute, otherwise ``"cuda"``."""
    if hasattr(dev, 'type'):
        return dev.type
    return "cuda"
|
561 |
+
|
562 |
+
def supports_dtype(device, dtype):  # TODO
    """Report whether ``device`` is treated as supporting ``dtype``.

    float32 is accepted everywhere. On the CPU nothing else is accepted; on
    any other device float16 and bfloat16 are accepted as well.
    """
    if dtype == torch.float32:
        return True
    if is_device_cpu(device):
        return False
    if dtype == torch.float16:
        return True
    if dtype == torch.bfloat16:
        return True
    return False
|
572 |
+
|
573 |
+
def device_supports_non_blocking(device):
    """Return whether non-blocking transfers may be used with ``device`` (false only for MPS)."""
    if is_device_mps(device):
        return False  # pytorch bug? mps doesn't support non blocking
    return True
|
577 |
+
|
578 |
+
def cast_to_device(tensor, device, dtype, copy=False):
    """Move ``tensor`` to ``device`` and convert it to ``dtype``.

    When the tensor's current dtype is considered castable on the target
    (float32/float16 always; bfloat16 on CUDA devices or Intel XPU), it is
    moved first and converted afterwards. Otherwise device and dtype are
    changed in a single ``Tensor.to`` call.

    ``copy`` is forwarded to ``Tensor.to``; non-blocking transfers are used
    whenever ``device_supports_non_blocking(device)`` allows it.
    """
    device_supports_cast = False
    if tensor.dtype == torch.float32 or tensor.dtype == torch.float16:
        device_supports_cast = True
    elif tensor.dtype == torch.bfloat16:
        if hasattr(device, 'type') and device.type.startswith("cuda"):
            device_supports_cast = True
        elif is_intel_xpu():
            device_supports_cast = True

    non_blocking = device_supports_non_blocking(device)

    if not device_supports_cast:
        return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)

    if copy:
        if tensor.device == device:
            # Already on the target device: a single copying dtype conversion suffices.
            return tensor.to(dtype, copy=copy, non_blocking=non_blocking)
        return tensor.to(device, copy=copy, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
    return tensor.to(device, non_blocking=non_blocking).to(dtype, non_blocking=non_blocking)
|
599 |
+
|
600 |
+
def xformers_enabled():
    """Return whether xformers should be used.

    Always false when not running in GPU mode, on Intel XPU, or with DirectML;
    otherwise the value of ``XFORMERS_IS_AVAILABLE``.
    """
    if cpu_state != CPUState.GPU:
        return False
    if is_intel_xpu():
        return False
    if directml_enabled:
        return False
    return XFORMERS_IS_AVAILABLE
|
610 |
+
|
611 |
+
|
612 |
+
def xformers_enabled_vae():
    """Return whether xformers should be used for the VAE.

    False whenever ``xformers_enabled()`` is false, otherwise the value of
    ``XFORMERS_ENABLED_VAE``.
    """
    if not xformers_enabled():
        return False

    return XFORMERS_ENABLED_VAE
|
618 |
+
|
619 |
+
def pytorch_attention_enabled():
    """Return the module-level ``ENABLE_PYTORCH_ATTENTION`` flag."""
    return ENABLE_PYTORCH_ATTENTION
|
622 |
+
|
623 |
+
def pytorch_attention_flash_attention():
    """Return True when PyTorch attention is enabled and the device is Nvidia."""
    if ENABLE_PYTORCH_ATTENTION:
        # TODO: more reliable way of checking for flash attention?
        if is_nvidia():  # pytorch flash attention only works on Nvidia
            return True
    return False
|
630 |
+
|
631 |
+
def get_free_memory(dev=None, torch_free_too=False):
    """Return the free memory on ``dev`` in bytes.

    ``dev`` defaults to ``get_torch_device()``. With ``torch_free_too`` the
    result is a ``(mem_free_total, mem_free_torch)`` tuple, where
    ``mem_free_torch`` is memory reserved by the torch allocator but not
    currently active; otherwise only ``mem_free_total`` is returned.

    Sources per backend:
      * cpu / mps: ``psutil.virtual_memory().available`` for both values.
      * DirectML: a fixed 1 GiB placeholder for both values.
      * Intel XPU: ``torch.xpu`` memory stats and device properties.
      * otherwise: ``torch.cuda`` memory stats plus ``mem_get_info``.
    """
    if dev is None:
        dev = get_torch_device()

    if hasattr(dev, 'type') and (dev.type == 'cpu' or dev.type == 'mps'):
        mem_free_total = psutil.virtual_memory().available
        mem_free_torch = mem_free_total
    else:
        if directml_enabled:
            mem_free_total = 1024 * 1024 * 1024  # TODO
            mem_free_torch = mem_free_total
        elif is_intel_xpu():
            stats = torch.xpu.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
            mem_allocated = stats['allocated_bytes.all.current']
            mem_reserved = stats['reserved_bytes.all.current']
            mem_free_torch = mem_reserved - mem_active
            mem_free_total = torch.xpu.get_device_properties(dev).total_memory - mem_allocated
        else:
            stats = torch.cuda.memory_stats(dev)
            mem_active = stats['active_bytes.all.current']
            mem_reserved = stats['reserved_bytes.all.current']
            mem_free_cuda, _ = torch.cuda.mem_get_info(dev)
            mem_free_torch = mem_reserved - mem_active
            mem_free_total = mem_free_cuda + mem_free_torch

    if torch_free_too:
        return (mem_free_total, mem_free_torch)
    else:
        return mem_free_total
|
662 |
+
|
663 |
+
def cpu_mode():
    """Return True when the module-level ``cpu_state`` is ``CPUState.CPU``."""
    return cpu_state == CPUState.CPU
|
666 |
+
|
667 |
+
def mps_mode():
    """Return True when the module-level ``cpu_state`` is ``CPUState.MPS``."""
    return cpu_state == CPUState.MPS
|
670 |
+
|
671 |
+
def is_device_type(device, type):
    """Return True when ``device`` has a ``type`` attribute equal to ``type``.

    Objects without a ``type`` attribute (for example ``None`` or a plain
    string) always yield False.
    """
    return bool(hasattr(device, 'type') and device.type == type)
|
676 |
+
|
677 |
+
def is_device_cpu(device):
    """Return True when ``device`` is of type ``'cpu'``."""
    return is_device_type(device, 'cpu')
|
679 |
+
|
680 |
+
def is_device_mps(device):
    """Return True when ``device`` is of type ``'mps'``."""
    return is_device_type(device, 'mps')
|
682 |
+
|
683 |
+
def is_device_cuda(device):
    """Return True when ``device`` is of type ``'cuda'``."""
    return is_device_type(device, 'cuda')
|
685 |
+
|
686 |
+
def should_use_fp16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
    """Decide whether fp16 should be used.

    Args:
        device: optional target device; CPU devices never use fp16 and MPS
            devices always do (unless the CPU check already returned).
        model_params: parameter count, used to estimate whether an fp32 copy
            (4 bytes per parameter) would fit in free memory.
        prioritize_performance: when False, fp16 is accepted on cards where it
            works (or with manual casting) without checking the memory fit.
        manual_cast: treat fp16 as usable through manual casting even when the
            card is not in the known-working list.

    The checks run in a fixed order: device type, the FORCE_FP16 / FORCE_FP32
    overrides, DirectML, MPS/CPU mode, Intel XPU, ROCm, and finally the CUDA
    compute capability and card name.
    """
    if device is not None:
        if is_device_cpu(device):
            return False

    if FORCE_FP16:
        return True

    if device is not None:
        if is_device_mps(device):
            return True

    if FORCE_FP32:
        return False

    if directml_enabled:
        return False

    if mps_mode():
        return True

    if cpu_mode():
        return False

    if is_intel_xpu():
        return True

    if torch.version.hip:
        return True

    # NOTE(review): this queries the default "cuda" device rather than the
    # ``device`` argument (should_use_bf16 passes ``device``) -- confirm intent.
    props = torch.cuda.get_device_properties("cuda")
    if props.major >= 8:
        return True

    if props.major < 6:
        return False

    # FP16 is confirmed working on a 1080 (GP104) but it's a bit slower than FP32 so it should only be enabled
    # when the model doesn't actually fit on the card
    # TODO: actually test if GP106 and others have the same type of behavior
    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
    card_name = props.name.lower()
    fp16_works = any(x in card_name for x in nvidia_10_series)

    if fp16_works or manual_cast:
        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
        if (not prioritize_performance) or model_params * 4 > free_model_memory:
            return True

    if props.major < 7:
        return False

    # FP16 is just broken on these cards
    nvidia_16_series = ["1660", "1650", "1630", "T500", "T550", "T600", "MX550", "MX450", "CMP 30HX", "T2000", "T1000", "T1200"]
    for x in nvidia_16_series:
        if x in props.name:
            return False

    return True
|
749 |
+
|
750 |
+
def should_use_bf16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
    """Decide whether bf16 should be used.

    Args:
        device: optional target device; CPU and MPS devices never use bf16.
            Defaults to ``torch.device("cuda")`` for the capability query.
        model_params: parameter count, used to estimate whether an fp32 copy
            (4 bytes per parameter) would fit in free memory.
        prioritize_performance: when False, bf16 is accepted wherever it works
            (or with manual casting) without checking the memory fit.
        manual_cast: treat bf16 as usable through manual casting even when
            ``torch.cuda.is_bf16_supported()`` is false.
    """
    if device is not None:
        if is_device_cpu(device):  # TODO ? bf16 works on CPU but is extremely slow
            return False

    if device is not None:  # TODO not sure about mps bf16 support
        if is_device_mps(device):
            return False

    if FORCE_FP32:
        return False

    if directml_enabled:
        return False

    if cpu_mode() or mps_mode():
        return False

    if is_intel_xpu():
        return True

    if device is None:
        device = torch.device("cuda")

    props = torch.cuda.get_device_properties(device)
    if props.major >= 8:
        return True

    bf16_works = torch.cuda.is_bf16_supported()

    if bf16_works or manual_cast:
        free_model_memory = (get_free_memory() * 0.9 - minimum_inference_memory())
        if (not prioritize_performance) or model_params * 4 > free_model_memory:
            return True

    return False
|
786 |
+
|
787 |
+
def soft_empty_cache(force=False):
    """Release cached allocator memory on the active backend.

    MPS and Intel XPU always empty their cache. With CUDA available, the cache
    is emptied only on Nvidia or when ``force`` is true.
    """
    if cpu_state == CPUState.MPS:
        torch.mps.empty_cache()
    elif is_intel_xpu():
        torch.xpu.empty_cache()
    elif torch.cuda.is_available():
        if force or is_nvidia():  # This seems to make things worse on ROCm so I only do it for cuda
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
|
797 |
+
|
798 |
+
def unload_all_models():
    """Unload tracked models on the torch device by requesting an unreachable amount of free memory."""
    free_memory(1e30, get_torch_device())
|
800 |
+
|
801 |
+
|
802 |
+
def resolve_lowvram_weight(weight, model, key):  # TODO: remove
    """Return ``weight`` unchanged; ``model`` and ``key`` are ignored."""
    return weight
|
804 |
+
|
805 |
+
#TODO: might be cleaner to put this somewhere else
|
806 |
+
import threading
|
807 |
+
|
808 |
+
class InterruptProcessingException(Exception):
    """Raised by ``throw_exception_if_processing_interrupted`` when an interrupt was requested."""
    pass
|
810 |
+
|
811 |
+
# Guards every read and write of ``interrupt_processing``.
interrupt_processing_mutex = threading.RLock()

# True while an interrupt has been requested and not yet consumed.
interrupt_processing = False
|
814 |
+
def interrupt_current_processing(value=True):
    """Set the interrupt flag to ``value`` (default True) while holding the mutex."""
    global interrupt_processing
    with interrupt_processing_mutex:
        interrupt_processing = value
|
819 |
+
|
820 |
+
def processing_interrupted():
    """Return the current interrupt flag without clearing it."""
    with interrupt_processing_mutex:
        return interrupt_processing
|
825 |
+
|
826 |
+
def throw_exception_if_processing_interrupted():
    """Raise ``InterruptProcessingException`` if an interrupt is pending.

    The flag is cleared before raising, so each request triggers at most one
    exception.
    """
    global interrupt_processing
    with interrupt_processing_mutex:
        if interrupt_processing:
            interrupt_processing = False
            raise InterruptProcessingException()
|