radames commited on
Commit
248bc06
·
1 Parent(s): f308f82
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +127 -0
  3. requirements.txt +8 -0
  4. safety_checker.py +137 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from diffusers import StableDiffusionXLPipeline
4
+ from diffusers.schedulers import TCDScheduler
5
+ import spaces
6
+ from PIL import Image
7
+
8
# Toggle for the post-generation NSFW filter.
SAFETY_CHECKER = True

# Constants
# Hub id of the SDXL base pipeline the LoRA weights are applied to.
base = "stabilityai/stable-diffusion-xl-base-1.0"
# NOTE(review): `repo` is not referenced anywhere else in this file.
repo = "ByteDance/SDXL-Lightning"

# UI label -> [LoRA weight file, number of inference steps, guidance scale]
checkpoints = {
    "2-Step": ["pcm_sdxl_smallcfg_2step_converted.safetensors", 2, 0.0],
    "4-Step": ["pcm_sdxl_smallcfg_4step_converted.safetensors", 4, 0.0],
    "8-Step": ["pcm_sdxl_smallcfg_8step_converted.safetensors", 8, 0.0],
    "16-Step": ["pcm_sdxl_smallcfg_16step_converted.safetensors", 16, 0.0],
    "Normal CFG 4-Step": [
        "pcm_sdxl_normalcfg_4step_converted.safetensors",
        4,
        7.5,
    ],
    "Normal CFG 8-Step": [
        "pcm_sdxl_normalcfg_8step_converted.safetensors",
        8,
        7.5,
    ],
    "Normal CFG 16-Step": [
        "pcm_sdxl_normalcfg_16step_converted.safetensors",
        16,
        7.5,
    ],
    "LCM-Like LoRA": ["pcm_sdxl_lcmlike_lora_converted.safetensors", 16, 0.0],
}


# Cache marker written by generate_image to remember what is currently loaded.
loaded = None
27
# Ensure model and scheduler are initialized in GPU-enabled function
# The half-precision SDXL base pipeline is only built when CUDA is visible;
# without CUDA the name `pipe` is never bound.
if torch.cuda.is_available():
    pipe = StableDiffusionXLPipeline.from_pretrained(
        base,
        torch_dtype=torch.float16,
        variant="fp16",
    ).to("cuda")
32
+
33
if SAFETY_CHECKER:
    # Imported lazily so the extra model is only pulled in when filtering is on.
    from safety_checker import StableDiffusionSafetyChecker
    from transformers import CLIPFeatureExtractor

    safety_checker = StableDiffusionSafetyChecker.from_pretrained(
        "CompVis/stable-diffusion-safety-checker"
    ).to("cuda")
    feature_extractor = CLIPFeatureExtractor.from_pretrained(
        "openai/clip-vit-base-patch32"
    )

    def check_nsfw_images(
        images: list[Image.Image],
    ) -> tuple[list[Image.Image], list[bool]]:
        """Run the safety checker over a batch of generated images.

        Args:
            images: PIL images to classify.

        Returns:
            The same ``images`` list, unchanged, together with one flag per
            image as produced by ``safety_checker``.
        """
        # Moving the whole feature batch to CUDA already moves pixel_values,
        # so no second device transfer is needed below.
        safety_checker_input = feature_extractor(images, return_tensors="pt").to(
            "cuda"
        )
        # Pass the batch itself rather than wrapping it in another list.
        has_nsfw_concepts = safety_checker(
            images=images, clip_input=safety_checker_input.pixel_values
        )

        return images, has_nsfw_concepts
53
+
54
+
55
+ # Function
56
@spaces.GPU(enable_queue=True)
def generate_image(prompt, ckpt):
    """Generate one image for ``prompt`` using the preset named ``ckpt``.

    Args:
        prompt: Text prompt forwarded to the diffusion pipeline.
        ckpt: Key into ``checkpoints`` selecting the LoRA weight file, the
            number of inference steps and the guidance scale.

    Returns:
        The first generated PIL image, or a blank 512x512 RGB image when the
        safety checker is enabled and flags any generated image.
    """
    global loaded
    print(prompt, ckpt)

    checkpoint, num_inference_steps, guidance_scale = checkpoints[ckpt]

    # Cache on the weight file, not the step count: several presets share the
    # same number of steps but use different LoRA files, so comparing step
    # counts would keep stale weights when switching between them.
    if loaded != checkpoint:
        if loaded is not None:
            # Drop the previous LoRA so adapters do not pile up on the pipe.
            pipe.unload_lora_weights()
            # Forget the cache first; if loading below fails, the next call
            # must not assume the old weights are still present.
            loaded = None

        pipe.scheduler = TCDScheduler(
            num_train_timesteps=1000,
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            timestep_spacing="trailing",
        )
        pipe.load_lora_weights(
            "wangfuyun/PCM_Weights", weight_name=checkpoint, subfolder="sdxl"
        )

        loaded = checkpoint

    results = pipe(
        prompt, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale
    )

    if SAFETY_CHECKER:
        images, has_nsfw_concepts = check_nsfw_images(results.images)
        if any(has_nsfw_concepts):
            gr.Warning("NSFW content detected.")
            return Image.new("RGB", (512, 512))
        return images[0]
    return results.images[0]
90
+
91
+
92
+ # Gradio Interface
93
+
94
# Cap the page width so the layout stays compact on wide screens.
css = """
.gradio-container {
  max-width: 60rem !important;
}
"""
with gr.Blocks(css=css) as demo:
    # Page header
    gr.HTML("<h1><center>SDXL-Lightning ⚡</center></h1>")
    gr.HTML(
        "<p><center>Lightning-fast text-to-image generation</center></p><p><center><a href='https://huggingface.co/ByteDance/SDXL-Lightning'>https://huggingface.co/ByteDance/SDXL-Lightning</a></center></p>"
    )

    # Input row: prompt box, preset selector and submit button
    with gr.Group():
        with gr.Row():
            prompt = gr.Textbox(label="Enter your prompt (English)", scale=8)
            ckpt = gr.Dropdown(
                label="Select inference steps",
                choices=list(checkpoints.keys()),
                value="4-Step",
                interactive=True,
            )
            submit = gr.Button(scale=1, variant="primary")
    img = gr.Image(label="SDXL-Lightning Generated Image")

    # Pressing Enter in the prompt box and clicking the button both run the
    # same generation call with the same inputs and output.
    for bind_event in (prompt.submit, submit.click):
        bind_event(
            fn=generate_image,
            inputs=[prompt, ckpt],
            outputs=img,
        )

demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ diffusers==0.28.0
2
+ datasets
3
+ transformers
4
+ accelerate
5
+ peft
6
+ xformers
7
+ gradio==4.32.2
8
+ spaces
safety_checker.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel
19
+
20
+
21
def cosine_distance(image_embeds, text_embeds):
    """Return the pairwise cosine similarity between two sets of row vectors.

    Both inputs are L2-normalised along dim 1 and then multiplied, so entry
    ``[i, j]`` of the result compares row ``i`` of ``image_embeds`` with row
    ``j`` of ``text_embeds``.
    """
    unit_image = nn.functional.normalize(image_embeds, p=2.0, dim=1)
    unit_text = nn.functional.normalize(text_embeds, p=2.0, dim=1)
    similarity = torch.mm(unit_image, unit_text.t())
    return similarity
25
+
26
+
27
class StableDiffusionSafetyChecker(PreTrainedModel):
    """Flag images whose CLIP embedding lies too close to stored concepts.

    An image is encoded by a CLIP vision model, linearly projected, and
    compared by cosine similarity against 17 concept embeddings and 3
    "special care" embeddings. Each embedding has its own threshold, held in
    the matching ``*_weights`` parameter. All four parameter tensors are
    created as ones here; presumably the real values come from the checkpoint
    loaded through ``from_pretrained``.
    """

    config_class = CLIPConfig

    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPConfig):
        super().__init__(config)

        self.vision_model = CLIPVisionModel(config.vision_config)
        self.visual_projection = nn.Linear(
            config.vision_config.hidden_size, config.projection_dim, bias=False
        )

        # Frozen reference embeddings (never trained in this module).
        self.concept_embeds = nn.Parameter(
            torch.ones(17, config.projection_dim), requires_grad=False
        )
        self.special_care_embeds = nn.Parameter(
            torch.ones(3, config.projection_dim), requires_grad=False
        )

        # Per-embedding thresholds subtracted from the cosine similarity.
        self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False)
        self.special_care_embeds_weights = nn.Parameter(
            torch.ones(3), requires_grad=False
        )

    @torch.no_grad()
    def forward(self, clip_input, images):
        """Return one NSFW flag per item in the batch.

        Args:
            clip_input: Pixel tensor passed to the CLIP vision model.
            images: Not used by this method; kept so existing callers that
                pass it keep working.

        Returns:
            A list of booleans, ``True`` where at least one concept score is
            above zero for that batch item.
        """
        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
        image_embeds = self.visual_projection(pooled_output)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        special_cos_dist = (
            cosine_distance(image_embeds, self.special_care_embeds)
            .cpu()
            .float()
            .numpy()
        )
        cos_dist = (
            cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy()
        )

        # Thresholds do not depend on the image, so copy them to the host once
        # instead of calling .item() for every concept of every image.
        special_thresholds = self.special_care_embeds_weights.cpu().float().tolist()
        concept_thresholds = self.concept_embeds_weights.cpu().float().tolist()

        result = []
        for special_row, concept_row in zip(special_cos_dist, cos_dist):
            result_img = {
                "special_scores": {},
                "special_care": [],
                "concept_scores": {},
                "bad_concepts": [],
            }

            # increase this value to create a stronger `nsfw` filter
            # at the cost of increasing the possibility of filtering benign images
            adjustment = 0.0

            for concept_idx, (concept_cos, concept_threshold) in enumerate(
                zip(special_row, special_thresholds)
            ):
                score = round(concept_cos - concept_threshold + adjustment, 3)
                result_img["special_scores"][concept_idx] = score
                if score > 0:
                    # Stored as an ordered (index, score) pair; a set literal
                    # would lose the order and merge equal values.
                    result_img["special_care"].append((concept_idx, score))
                    # A special-care hit tightens every later comparison for
                    # this image, including the remaining special concepts.
                    adjustment = 0.01

            for concept_idx, (concept_cos, concept_threshold) in enumerate(
                zip(concept_row, concept_thresholds)
            ):
                score = round(concept_cos - concept_threshold + adjustment, 3)
                result_img["concept_scores"][concept_idx] = score
                if score > 0:
                    result_img["bad_concepts"].append(concept_idx)

            result.append(result_img)

        has_nsfw_concepts = [bool(res["bad_concepts"]) for res in result]

        return has_nsfw_concepts

    @torch.no_grad()
    def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor):
        """Tensor-only variant of the check that also blanks flagged images.

        Args:
            clip_input: Pixel tensor passed to the CLIP vision model.
            images: Image tensor indexed along its first dimension by the
                per-item flags; flagged entries are overwritten in place.

        Returns:
            ``(images, has_nsfw_concepts)`` where ``images`` is the same
            tensor object that was passed in and ``has_nsfw_concepts`` is a
            boolean tensor with one entry per batch item.
        """
        pooled_output = self.vision_model(clip_input)[1]  # pooled_output
        image_embeds = self.visual_projection(pooled_output)

        special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds)
        cos_dist = cosine_distance(image_embeds, self.concept_embeds)

        # increase this value to create a stronger `nsfw` filter
        # at the cost of increasing the possibility of filtering benign images
        adjustment = 0.0

        special_scores = (
            special_cos_dist - self.special_care_embeds_weights + adjustment
        )
        # special_scores = special_scores.round(decimals=3)
        special_care = torch.any(special_scores > 0, dim=1)
        # Items with any special-care hit get +0.01 on every concept score.
        special_adjustment = special_care * 0.01
        special_adjustment = special_adjustment.unsqueeze(1).expand(
            -1, cos_dist.shape[1]
        )

        concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment
        # concept_scores = concept_scores.round(decimals=3)
        has_nsfw_concepts = torch.any(concept_scores > 0, dim=1)

        # Mutates the caller's tensor: flagged entries are zeroed in place.
        images[has_nsfw_concepts] = 0.0  # black image

        return images, has_nsfw_concepts