Dreamshaper_v7-Openvino / convert_to_openvino.py

Upload 24 files

3de498f about 1 year ago

5.46 kB

	from typing import Dict, Optional, Tuple, OrderedDict
	from transformers import CLIPTextConfig
	from diffusers import UNet2DConditionModel

	import torch

	from optimum.exporters.onnx.model_configs import VisionOnnxConfig, NormalizedConfig, DummyVisionInputGenerator, DummyTimestepInputGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqDecoderTextInputGenerator
	from optimum.exporters.openvino import main_export
	from optimum.utils.input_generators import DummyInputGenerator, DEFAULT_DUMMY_SHAPES
	from optimum.utils.normalized_config import NormalizedTextConfig

	# IMPORTANT: You need to specify some scheduler in downloaded model cache folder to avoid errors

	class CustomDummyTimestepInputGenerator(DummyInputGenerator):
	"""
	Generates dummy time step inputs.
	"""

	SUPPORTED_INPUT_NAMES = (
	"timestep",
	"timestep_cond",
	"text_embeds",
	"time_ids",
	)

	def __init__(
	self,
	task: str,
	normalized_config: NormalizedConfig,
	batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
	time_cond_proj_dim: int = 256,
	random_batch_size_range: Optional[Tuple[int, int]] = None,
	**kwargs,
	):
	self.task = task
	self.vocab_size = normalized_config.vocab_size
	self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim
	self.time_ids = 5 if normalized_config.requires_aesthetics_score else 6
	if random_batch_size_range:
	low, high = random_batch_size_range
	self.batch_size = random.randint(low, high)
	else:
	self.batch_size = batch_size
	self.time_cond_proj_dim = normalized_config.get("time_cond_proj_dim", time_cond_proj_dim)

	def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
	shape = [self.batch_size]

	if input_name == "timestep":
	return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=int_dtype)

	if input_name == "timestep_cond":
	shape.append(self.time_cond_proj_dim)
	return self.random_float_tensor(shape, min_value=-1.0, max_value=1.0, framework=framework, dtype=float_dtype)


	shape.append(self.text_encoder_projection_dim if input_name == "text_embeds" else self.time_ids)
	return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype)

	class LCMUNetOnnxConfig(VisionOnnxConfig):
	ATOL_FOR_VALIDATION = 1e-3
	# The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu
	# operator support, available since opset 14
	DEFAULT_ONNX_OPSET = 14

	NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
	image_size="sample_size",
	num_channels="in_channels",
	hidden_size="cross_attention_dim",
	vocab_size="norm_num_groups",
	allow_new=True,
	)

	DUMMY_INPUT_GENERATOR_CLASSES = (
	DummyVisionInputGenerator,
	CustomDummyTimestepInputGenerator,
	DummySeq2SeqDecoderTextInputGenerator,
	)

	@property
	def inputs(self) -> Dict[str, Dict[int, str]]:
	common_inputs = OrderedDict({
	"sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
	"timestep": {0: "steps"},
	"encoder_hidden_states": {0: "batch_size", 1: "sequence_length"},
	"timestep_cond": {0: "batch_size"},
	})

	# TODO : add text_image, image and image_embeds
	if getattr(self._normalized_config, "addition_embed_type", None) == "text_time":
	common_inputs["text_embeds"] = {0: "batch_size"}
	common_inputs["time_ids"] = {0: "batch_size"}

	return common_inputs

	@property
	def outputs(self) -> Dict[str, Dict[int, str]]:
	return {
	"out_sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
	}

	@property
	def torch_to_onnx_output_map(self) -> Dict[str, str]:
	return {
	"sample": "out_sample",
	}

	def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
	dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
	dummy_inputs["encoder_hidden_states"] = dummy_inputs["encoder_hidden_states"][0]

	if getattr(self._normalized_config, "addition_embed_type", None) == "text_time":
	dummy_inputs["added_cond_kwargs"] = {
	"text_embeds": dummy_inputs.pop("text_embeds"),
	"time_ids": dummy_inputs.pop("time_ids"),
	}

	return dummy_inputs

	def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]:
	return self.inputs # Breaks order if timestep_cond involved ( so just copy original one )

	model_id = "SimianLuo/LCM_Dreamshaper_v7"

	text_encoder_config = CLIPTextConfig.from_pretrained(model_id, subfolder = "text_encoder")
	unet_config = UNet2DConditionModel.from_pretrained(model_id, subfolder = "unet").config

	unet_config.text_encoder_projection_dim = text_encoder_config.projection_dim
	unet_config.requires_aesthetics_score = False

	custom_onnx_configs = {
	"unet": LCMUNetOnnxConfig(config = unet_config, task = "semantic-segmentation")
	}

	main_export(model_name_or_path = model_id, output = "./", task = "stable-diffusion", fp16 = False, int8 = False, custom_onnx_configs = custom_onnx_configs)