skytnt committed on
Commit 12bcf5e · 1 Parent(s): 041cb52

Create new file

Files changed (1)
pipeline.py +934 -0
pipeline.py ADDED
@@ -0,0 +1,934 @@
import inspect
import re
from typing import Callable, List, Optional, Union

import PIL
import numpy as np
import torch
from transformers import CLIPFeatureExtractor, CLIPTokenizer

from diffusers.onnx_utils import OnnxRuntimeModel
from diffusers.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from diffusers.utils import logging
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

re_attention = re.compile(r"""
\\\(|
\\\)|
\\\[|
\\]|
\\\\|
\\|
\(|
\[|
:([+-]?[.\d]+)\)|
\)|
]|
[^\\()\[\]:]+|
:
""", re.X)


def parse_prompt_attention(text):
    r"""
    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
    Accepted tokens are:
      (abc) - increases attention to abc by a multiplier of 1.1
      (abc:3.12) - increases attention to abc by a multiplier of 3.12
      [abc] - decreases attention to abc by a multiplier of 1.1
      \( - literal character '('
      \[ - literal character '['
      \) - literal character ')'
      \] - literal character ']'
      \\ - literal character '\'
      anything else - just text
    >>> parse_prompt_attention('normal text')
    [['normal text', 1.0]]
    >>> parse_prompt_attention('an (important) word')
    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
    >>> parse_prompt_attention('(unbalanced')
    [['unbalanced', 1.1]]
    >>> parse_prompt_attention('\(literal\]')
    [['(literal]', 1.0]]
    >>> parse_prompt_attention('(unnecessary)(parens)')
    [['unnecessaryparens', 1.1]]
    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
    [['a ', 1.0],
     ['house', 1.5730000000000004],
     [' ', 1.1],
     ['on', 1.0],
     [' a ', 1.1],
     ['hill', 0.55],
     [', sun, ', 1.1],
     ['sky', 1.4641000000000006],
     ['.', 1.1]]
    """

    res = []
    round_brackets = []
    square_brackets = []

    round_bracket_multiplier = 1.1
    square_bracket_multiplier = 1 / 1.1

    def multiply_range(start_position, multiplier):
        for p in range(start_position, len(res)):
            res[p][1] *= multiplier

    for m in re_attention.finditer(text):
        text = m.group(0)
        weight = m.group(1)

        if text.startswith('\\'):
            res.append([text[1:], 1.0])
        elif text == '(':
            round_brackets.append(len(res))
        elif text == '[':
            square_brackets.append(len(res))
        elif weight is not None and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), float(weight))
        elif text == ')' and len(round_brackets) > 0:
            multiply_range(round_brackets.pop(), round_bracket_multiplier)
        elif text == ']' and len(square_brackets) > 0:
            multiply_range(square_brackets.pop(), square_bracket_multiplier)
        else:
            res.append([text, 1.0])

    # apply the multipliers of any brackets left unbalanced at the end of the prompt
    for pos in round_brackets:
        multiply_range(pos, round_bracket_multiplier)

    for pos in square_brackets:
        multiply_range(pos, square_bracket_multiplier)

    if len(res) == 0:
        res = [["", 1.0]]

    # merge runs of identical weights
    i = 0
    while i + 1 < len(res):
        if res[i][1] == res[i + 1][1]:
            res[i][0] += res[i + 1][0]
            res.pop(i + 1)
        else:
            i += 1

    return res


def get_prompts_with_weights(
    pipe,
    prompt: List[str],
    max_length: int
):
    r"""
    Tokenize a list of prompts and return their tokens together with the weight of each token.

    No padding, starting or ending token is included.
    """
    tokens = []
    weights = []
    for text in prompt:
        texts_and_weights = parse_prompt_attention(text)
        text_token = []
        text_weight = []
        for word, weight in texts_and_weights:
            # tokenize and discard the starting and the ending token
            token = pipe.tokenizer(word, return_tensors="np").input_ids[0, 1:-1]
            text_token += list(token)

            # copy the weight by length of token
            text_weight += [weight] * len(token)

            # stop if the text is too long (longer than truncation limit)
            if len(text_token) > max_length:
                break

        # truncate
        if len(text_token) > max_length:
            text_token = text_token[:max_length]
            text_weight = text_weight[:max_length]

        tokens.append(text_token)
        weights.append(text_weight)
    return tokens, weights


def pad_tokens_and_weights(tokens, weights, max_length, bos, eos,
                           no_boseos_middle=True,
                           chunk_length=77):
    r"""
    Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
    """
    max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
    weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
    for i in range(len(tokens)):
        tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
        if no_boseos_middle:
            weights[i] = [1.] + weights[i] + [1.] * (max_length - 1 - len(weights[i]))
        else:
            w = []
            if len(weights[i]) == 0:
                w = [1.] * weights_length
            else:
                for j in range((len(weights[i]) - 1) // chunk_length + 1):
                    w.append(1.)  # weight for starting token in this chunk
                    w += weights[i][j * chunk_length: min(len(weights[i]), (j + 1) * chunk_length)]
                    w.append(1.)  # weight for ending token in this chunk
                w += [1.] * (weights_length - len(w))
            weights[i] = w[:]

    return tokens, weights
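
# A worked example of the padding math above (illustrative numbers only,
# assuming CLIP's chunk_length of 77): for max_embeddings_multiples = 2,
# max_length = (77 - 2) * 2 + 2 = 152, so a prompt of 80 content tokens
# becomes [bos] + 80 tokens + 71 * [eos] (152 entries in total), and with
# no_boseos_middle=True its weights are padded the same way with 1.0.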


def get_unweighted_text_embeddings(
    pipe,
    text_input: np.ndarray,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True
):
    """
    When the length of tokens is a multiple of the capacity of the text encoder,
    it should be split into chunks and sent to the text encoder individually.
    """
    max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
    if max_embeddings_multiples > 1:
        text_embeddings = []
        for i in range(max_embeddings_multiples):
            # extract the i-th chunk
            text_input_chunk = text_input[:, i * (chunk_length - 2):(i + 1) * (chunk_length - 2) + 2].copy()

            # cover the head and the tail by the starting and the ending tokens
            text_input_chunk[:, 0] = text_input[0, 0]
            text_input_chunk[:, -1] = text_input[0, -1]

            text_embedding = pipe.text_encoder(input_ids=text_input_chunk)[0]

            if no_boseos_middle:
                if i == 0:
                    # discard the ending token
                    text_embedding = text_embedding[:, :-1]
                elif i == max_embeddings_multiples - 1:
                    # discard the starting token
                    text_embedding = text_embedding[:, 1:]
                else:
                    # discard both starting and ending tokens
                    text_embedding = text_embedding[:, 1:-1]

            text_embeddings.append(text_embedding)
        text_embeddings = np.concatenate(text_embeddings, axis=1)
    else:
        text_embeddings = pipe.text_encoder(input_ids=text_input)[0]
    return text_embeddings
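
# A shape sketch of the chunking above (illustrative, assuming a CLIP text
# encoder with model_max_length=77 and hidden size 768): a padded input of
# shape (1, 152) gives max_embeddings_multiples = (152 - 2) // (77 - 2) = 2,
# so the encoder runs twice on (1, 77) chunks. With no_boseos_middle=True the
# first chunk drops its ending token and the last drops its starting token,
# giving (1, 76, 768) each, which concatenate back to (1, 152, 768).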


def get_weighted_text_embeddings(
    pipe,
    prompt: Union[str, List[str]],
    uncond_prompt: Optional[Union[str, List[str]]] = None,
    max_embeddings_multiples: Optional[int] = 4,
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
    **kwargs
):
    r"""
    Prompts can be assigned local weights using brackets. For example, the
    prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
    and the embedding tokens corresponding to those words get multiplied by the constant 1.1.

    Also, to regularize the embedding, the weighted embedding is rescaled to preserve the original mean.

    Args:
        pipe (`DiffusionPipeline`):
            Pipe to provide access to the tokenizer and the text encoder.
        prompt (`str` or `List[str]`):
            The prompt or prompts to guide the image generation.
        uncond_prompt (`str` or `List[str]`):
            The unconditional prompt or prompts to guide the image generation. If an unconditional prompt
            is provided, the embeddings of prompt and uncond_prompt are returned together.
        max_embeddings_multiples (`int`, *optional*, defaults to `4`):
            The max multiple length of prompt embeddings compared to the max output length of text encoder.
        no_boseos_middle (`bool`, *optional*, defaults to `False`):
            If the length of text tokens is a multiple of the capacity of the text encoder, whether to keep
            the starting and ending tokens in each of the middle chunks.
        skip_parsing (`bool`, *optional*, defaults to `False`):
            Skip the parsing of brackets.
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When parsing is skipped, weighting is skipped as well.
    """
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]

    if not skip_parsing:
        prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
        if uncond_prompt is not None:
            if isinstance(uncond_prompt, str):
                uncond_prompt = [uncond_prompt]
            uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
    else:
        prompt_tokens = [token[1:-1] for token in
                         pipe.tokenizer(prompt, max_length=max_length, truncation=True, return_tensors="np").input_ids]
        prompt_weights = [[1.] * len(token) for token in prompt_tokens]
        if uncond_prompt is not None:
            if isinstance(uncond_prompt, str):
                uncond_prompt = [uncond_prompt]
            uncond_tokens = [token[1:-1] for token in
                             pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True,
                                            return_tensors="np").input_ids]
            uncond_weights = [[1.] * len(token) for token in uncond_tokens]

    # round up the longest length of tokens to a multiple of (model_max_length - 2)
    max_length = max([len(token) for token in prompt_tokens])
    if uncond_prompt is not None:
        max_length = max(max_length, max([len(token) for token in uncond_tokens]))

    max_embeddings_multiples = min(max_embeddings_multiples,
                                   (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1)
    max_embeddings_multiples = max(1, max_embeddings_multiples)
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2

    # pad the length of tokens and weights
    bos = pipe.tokenizer.bos_token_id
    eos = pipe.tokenizer.eos_token_id
    prompt_tokens, prompt_weights = pad_tokens_and_weights(prompt_tokens, prompt_weights, max_length, bos, eos,
                                                           no_boseos_middle=no_boseos_middle,
                                                           chunk_length=pipe.tokenizer.model_max_length)
    prompt_tokens = np.array(prompt_tokens, dtype=np.int32)
    if uncond_prompt is not None:
        uncond_tokens, uncond_weights = pad_tokens_and_weights(uncond_tokens, uncond_weights, max_length, bos, eos,
                                                               no_boseos_middle=no_boseos_middle,
                                                               chunk_length=pipe.tokenizer.model_max_length)
        uncond_tokens = np.array(uncond_tokens, dtype=np.int32)

    # get the embeddings
    text_embeddings = get_unweighted_text_embeddings(pipe, prompt_tokens, pipe.tokenizer.model_max_length,
                                                     no_boseos_middle=no_boseos_middle)
    prompt_weights = np.array(prompt_weights, dtype=text_embeddings.dtype)
    if uncond_prompt is not None:
        uncond_embeddings = get_unweighted_text_embeddings(pipe, uncond_tokens, pipe.tokenizer.model_max_length,
                                                           no_boseos_middle=no_boseos_middle)
        uncond_weights = np.array(uncond_weights, dtype=uncond_embeddings.dtype)

    # assign weights to the prompts and normalize in the sense of mean
    # TODO: should we normalize by chunk or as a whole (current implementation)?
    if (not skip_parsing) and (not skip_weighting):
        previous_mean = text_embeddings.mean(axis=(-2, -1))
        text_embeddings *= prompt_weights[:, :, None]
        text_embeddings *= (previous_mean / text_embeddings.mean(axis=(-2, -1)))[:, None, None]
        if uncond_prompt is not None:
            previous_mean = uncond_embeddings.mean(axis=(-2, -1))
            uncond_embeddings *= uncond_weights[:, :, None]
            uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=(-2, -1)))[:, None, None]

    # For classifier-free guidance we need two forward passes, so the conditional and
    # unconditional embeddings are returned separately; the caller concatenates them
    # into a single batch to avoid running the UNet twice per step.
    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings

    return text_embeddings
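
# A minimal usage sketch of the function above (illustrative only; `pipe` is
# assumed to be an instance of the pipeline class defined below):
#
#   cond, uncond = get_weighted_text_embeddings(
#       pipe,
#       prompt="a (white:1.4) cat on a [cluttered] desk",
#       uncond_prompt="lowres, blurry",
#       max_embeddings_multiples=3,
#   )
#   # cond and uncond are np.ndarrays of shape (batch, max_length, hidden_size)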


def preprocess_image(image):
    w, h = image.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to an integer multiple of 32
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    image = image[None].transpose(0, 3, 1, 2)  # (H, W, C) -> (1, C, H, W)
    return 2.0 * image - 1.0  # rescale from [0, 1] to [-1, 1]


def preprocess_mask(mask):
    mask = mask.convert("L")
    w, h = mask.size
    w, h = map(lambda x: x - x % 32, (w, h))  # resize to an integer multiple of 32
    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
    mask = np.array(mask).astype(np.float32) / 255.0
    mask = np.tile(mask, (4, 1, 1))  # broadcast the mask over the 4 latent channels
    mask = mask[None]  # add a batch dimension -> (1, 4, H/8, W/8)
    mask = 1 - mask  # repaint white, keep black
    return mask
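
# Shape sketch for the two helpers above (illustrative): a 512x512 RGB PIL
# image becomes a (1, 3, 512, 512) float32 array in [-1, 1]; a 512x512 PIL
# mask becomes a (1, 4, 64, 64) float32 array that is 1.0 where the mask is
# black (keep) and 0.0 where it is white (repaint).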


class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion (ONNX Runtime) without a token length limit,
    with support for parsing weights in the prompt.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
    """

    def __init__(
        self,
        vae_encoder: OnnxRuntimeModel,
        vae_decoder: OnnxRuntimeModel,
        text_encoder: OnnxRuntimeModel,
        tokenizer: CLIPTokenizer,
        unet: OnnxRuntimeModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: OnnxRuntimeModel,
        feature_extractor: CLIPFeatureExtractor,
    ):
        super().__init__()
        self.register_modules(
            vae_encoder=vae_encoder,
            vae_decoder=vae_decoder,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        init_image: Union[np.ndarray, PIL.Image.Image] = None,
        mask_image: Union[np.ndarray, PIL.Image.Image] = None,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        strength: float = 0.8,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[np.random.RandomState] = None,
        latents: Optional[np.ndarray] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            init_image (`np.ndarray` or `PIL.Image.Image`, *optional*):
                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                process.
            mask_image (`np.ndarray` or `PIL.Image.Image`, *optional*):
                `Image`, or ndarray representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's an ndarray, it
                should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`np.random.RandomState`, *optional*):
                A numpy RandomState to make generation deterministic.
            latents (`np.ndarray`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                array will be generated by sampling using the supplied random `generator`.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
            element is a list of `bool`s denoting whether the corresponding generated image likely represents
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """

        if isinstance(prompt, str):
            batch_size = 1
            prompt = [prompt]
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if strength < 0 or strength > 1:
            raise ValueError(f"The value of `strength` should be in [0.0, 1.0] but is {strength}")

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if (callback_steps is None) or (
                callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        # get prompt text embeddings

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        # get unconditional embeddings for classifier free guidance
        if negative_prompt is None:
            negative_prompt = [""] * batch_size
        elif isinstance(negative_prompt, str):
            negative_prompt = [negative_prompt] * batch_size
        if batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )

        if generator is None:
            generator = np.random

        # `get_weighted_text_embeddings` returns a single array when no unconditional
        # prompt is passed, so only unpack two values under classifier-free guidance
        if do_classifier_free_guidance:
            text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
                pipe=self,
                prompt=prompt,
                uncond_prompt=negative_prompt,
                max_embeddings_multiples=max_embeddings_multiples,
                **kwargs
            )
        else:
            text_embeddings = get_weighted_text_embeddings(
                pipe=self,
                prompt=prompt,
                uncond_prompt=None,
                max_embeddings_multiples=max_embeddings_multiples,
                **kwargs
            )

        text_embeddings = text_embeddings.repeat(num_images_per_prompt, 0)
        if do_classifier_free_guidance:
            uncond_embeddings = uncond_embeddings.repeat(num_images_per_prompt, 0)
            text_embeddings = np.concatenate([uncond_embeddings, text_embeddings])

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        latents_dtype = text_embeddings.dtype
        init_latents_orig = None
        mask = None
        noise = None

        if init_image is None:
            latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8)

            if latents is None:
                latents = generator.randn(*latents_shape).astype(latents_dtype)
            elif latents.shape != latents_shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")

            timesteps = self.scheduler.timesteps.to(self.device)

            # scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma
        else:
            if isinstance(init_image, PIL.Image.Image):
                init_image = preprocess_image(init_image)
            # encode the init image into latents and scale the latents
            init_image = init_image.astype(latents_dtype)
            init_latents = self.vae_encoder(sample=init_image)[0]
            init_latents = 0.18215 * init_latents
            init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
            init_latents_orig = init_latents

            # preprocess mask
            if mask_image is not None:
                if isinstance(mask_image, PIL.Image.Image):
                    mask_image = preprocess_mask(mask_image)
                mask_image = mask_image.astype(latents_dtype)
                mask = np.concatenate([mask_image] * batch_size * num_images_per_prompt)

                # check sizes
                if not mask.shape == init_latents.shape:
                    raise ValueError(
                        f"The mask and init_image should be the same size, but got mask of shape {mask.shape}"
                        f" and init_latents of shape {init_latents.shape}"
                    )

            # get the original timestep using init_timestep
            offset = self.scheduler.config.get("steps_offset", 0)
            init_timestep = int(num_inference_steps * strength) + offset
            init_timestep = min(init_timestep, num_inference_steps)

            timesteps = self.scheduler.timesteps[-init_timestep]
            timesteps = torch.tensor([timesteps] * batch_size * num_images_per_prompt)

            # add noise to latents using the timesteps
            noise = generator.randn(*init_latents.shape).astype(latents_dtype)
            latents = self.scheduler.add_noise(torch.from_numpy(init_latents), torch.from_numpy(noise),
                                               timesteps).numpy()

            t_start = max(num_inference_steps - init_timestep + offset, 0)
            timesteps = self.scheduler.timesteps[t_start:]

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.progress_bar(timesteps)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(
                sample=latent_model_input, timestep=np.array([t]), encoder_hidden_states=text_embeddings
            )
            noise_pred = noise_pred[0]

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            # (the schedulers operate on torch tensors, so convert to and from numpy)
            latents = self.scheduler.step(
                torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs
            ).prev_sample.numpy()

            if mask is not None:
                # masking: keep the unmasked region at the correct noise level for this step
                init_latents_proper = self.scheduler.add_noise(torch.from_numpy(init_latents_orig),
                                                               torch.from_numpy(noise), torch.tensor([t])).numpy()
                latents = (init_latents_proper * mask) + (latents * (1 - mask))

            # call the callback, if provided
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

        latents = 1 / 0.18215 * latents
        # image = self.vae_decoder(latent_sample=latents)[0]
        # it seems like there is a problem with running the VAE on a whole batch in half
        # precision, so decode the latents one image at a time
        image = []
        for i in range(latents.shape[0]):
            image.append(self.vae_decoder(latent_sample=latents[i:i + 1])[0])
        image = np.concatenate(image)

        image = np.clip(image / 2 + 0.5, 0, 1)
        image = image.transpose((0, 2, 3, 1))

        if self.safety_checker is not None:
            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="np")
            image, has_nsfw_concept = self.safety_checker(clip_input=safety_checker_input.pixel_values, images=image)
        else:
            has_nsfw_concept = None

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def text2img(
        self,
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[np.random.RandomState] = None,
        latents: Optional[np.ndarray] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function for text-to-image generation.
        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`np.random.RandomState`, *optional*):
                A numpy RandomState to make generation deterministic.
            latents (`np.ndarray`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                array will be generated by sampling using the supplied random `generator`.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
            element is a list of `bool`s denoting whether the corresponding generated image likely represents
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs
        )

    def img2img(
        self,
        init_image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[np.random.RandomState] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function for image-to-image generation.
        Args:
            init_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                process.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                noise will be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. This parameter will be modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`np.random.RandomState`, *optional*):
                A numpy RandomState to make generation deterministic.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
            element is a list of `bool`s denoting whether the corresponding generated image likely represents
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            init_image=init_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs
        )

    def inpaint(
        self,
        init_image: Union[np.ndarray, PIL.Image.Image],
        mask_image: Union[np.ndarray, PIL.Image.Image],
        prompt: Union[str, List[str]],
        negative_prompt: Optional[Union[str, List[str]]] = None,
        strength: float = 0.8,
        num_inference_steps: Optional[int] = 50,
        guidance_scale: Optional[float] = 7.5,
        num_images_per_prompt: Optional[int] = 1,
        eta: Optional[float] = 0.0,
        generator: Optional[np.random.RandomState] = None,
        max_embeddings_multiples: Optional[int] = 3,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
        callback_steps: Optional[int] = 1,
        **kwargs,
    ):
        r"""
        Function for inpainting.
        Args:
            init_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or ndarray representing an image batch, that will be used as the starting point for the
                process. This is the image whose masked region will be inpainted.
            mask_image (`np.ndarray` or `PIL.Image.Image`):
                `Image`, or ndarray representing an image batch, to mask `init_image`. White pixels in the mask will be
                replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                PIL image, it will be converted to a single channel (luminance) before use. If it's an ndarray, it
                should contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            strength (`float`, *optional*, defaults to 0.8):
                Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                is 1, the denoising process will be run on the masked area for the full number of iterations specified
                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
                noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
                the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`np.random.RandomState`, *optional*):
                A numpy RandomState to make generation deterministic.
            max_embeddings_multiples (`int`, *optional*, defaults to `3`):
                The max multiple length of prompt embeddings compared to the max output length of text encoder.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
            element is a list of `bool`s denoting whether the corresponding generated image likely represents
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """
        return self.__call__(
            prompt=prompt,
            negative_prompt=negative_prompt,
            init_image=init_image,
            mask_image=mask_image,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            max_embeddings_multiples=max_embeddings_multiples,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs
        )
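
Below is a minimal usage sketch (not part of the commit). It assumes this file is saved as pipeline.py next to your script, that a local ONNX export of Stable Diffusion exists at the placeholder path shown, and that the installed diffusers version forwards `provider=` to the ONNX components the way the stock OnnxStableDiffusionPipeline does:

from pipeline import OnnxStableDiffusionLongPromptWeightingPipeline

# placeholder path: point this at your own ONNX export of Stable Diffusion
pipe = OnnxStableDiffusionLongPromptWeightingPipeline.from_pretrained(
    "./stable-diffusion-onnx",
    provider="CPUExecutionProvider",
)

# weighted long prompt: (word:1.2) boosts attention, [word] reduces it
result = pipe.text2img(
    "a photo of a (very beautiful:1.2) garden at sunset, [blurry]",
    negative_prompt="lowres, bad anatomy",
    num_inference_steps=25,
)
result.images[0].save("output.png")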