File size: 3,687 Bytes
1822fe2 bfffca7 1822fe2 f2191a0 ca410d5 1822fe2 de1c669 1822fe2 ca410d5 1822fe2 e662c48 1822fe2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
from diffusers import DiffusionPipeline
import torch
import numpy as np
import importlib.util
import sys
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import os
from .vae import AutoencoderKL
from .mar import mar_base, mar_large, mar_huge
# Inherits from DiffusionPipeline for Hugging Face (HF) compatibility.
class MARModel(DiffusionPipeline):
    """Pipeline wrapper that samples images from a MAR model and its VAE.

    The class subclasses ``DiffusionPipeline``. All of the work happens in
    ``__call__``: it builds the requested MAR architecture, downloads the
    model and VAE checkpoints, samples tokens and decodes them with the VAE.
    """

    def __init__(self):
        super().__init__()

    @torch.no_grad()
    def __call__(self, *args, **kwargs):
        """Download the model and VAE weights, then sample and decode images.

        Positional arguments are accepted but ignored. Every option is read
        from ``kwargs``; the defaults below apply when a key is absent.

        Keyword Args:
            model_type: ``"mar_base"`` (default), ``"mar_large"`` or
                ``"mar_huge"``.
            buffer_size: Forwarded to the model factory (default 64).
            diffloss_d: Forwarded to the model factory (default 3).
            diffloss_w: Forwarded to the model factory (default 1024).
            num_sampling_steps: Forwarded to the model factory after being
                converted with ``str`` (default 100).
            repo_id: Hub repository holding both checkpoints
                (default ``"jadechoghari/mar"``).
            model_filename: Model checkpoint file in the repository
                (default ``"checkpoint-last.pth"``).
            vae_filename: VAE checkpoint file in the repository
                (default ``"kl16.ckpt"``).
            seed: Seed given to ``torch.manual_seed`` and ``np.random.seed``
                (default 0).
            num_ar_steps: Passed to ``sample_tokens`` as ``num_iter``
                (default 64).
            cfg_scale: Passed to ``sample_tokens`` as ``cfg`` (default 4).
            cfg_schedule: Passed to ``sample_tokens`` (default ``"constant"``).
            temperature: Passed to ``sample_tokens`` (default 1.0).
            class_labels: Sequence of integer class ids; its length is used
                as the batch size (default: eight preset ids).

        Returns:
            The result of ``vae.decode`` applied to the sampled tokens.

        Raises:
            ValueError: If ``model_type`` is not one of the supported names.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Resolve the architecture first so that an unsupported name fails
        # immediately, before any checkpoint is downloaded, and can never
        # fall through to a model left on ``self`` by an earlier call.
        model_factories = {
            "mar_base": mar_base,
            "mar_large": mar_large,
            "mar_huge": mar_huge,
        }
        model_type = kwargs.get("model_type", "mar_base")
        if model_type not in model_factories:
            supported = ", ".join(sorted(model_factories))
            raise ValueError(
                f"Unknown model_type {model_type!r}; expected one of: {supported}"
            )

        # init the mar model architecture
        self.model = model_factories[model_type](
            buffer_size=kwargs.get("buffer_size", 64),
            diffloss_d=kwargs.get("diffloss_d", 3),
            diffloss_w=kwargs.get("diffloss_w", 1024),
            # The factories receive the step count as a string.
            num_sampling_steps=str(kwargs.get("num_sampling_steps", 100)),
        ).to(device)

        repo_id = kwargs.get("repo_id", "jadechoghari/mar")

        # download and load the model weights
        model_checkpoint_path = hf_hub_download(
            repo_id=repo_id,
            filename=kwargs.get("model_filename", "checkpoint-last.pth"),
        )
        # NOTE(security): torch.load unpickles the file, so only point
        # ``repo_id`` / ``model_filename`` at checkpoints you trust.
        checkpoint = torch.load(model_checkpoint_path, map_location=device)
        # Only the "model_ema" entry of the checkpoint is used.
        # strict=False tolerates key mismatches between checkpoint and model.
        self.model.load_state_dict(checkpoint["model_ema"], strict=False)
        self.model.eval()

        # download and load the vae
        vae_checkpoint_path = hf_hub_download(
            repo_id=repo_id,
            filename=kwargs.get("vae_filename", "kl16.ckpt"),
        )
        vae = AutoencoderKL(
            embed_dim=16,
            ch_mult=(1, 1, 2, 2, 4),
            ckpt_path=vae_checkpoint_path,
        )
        vae = vae.to(device).eval()

        # set up user-specified or default values for generation
        seed = kwargs.get("seed", 0)
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Convert the labels exactly once, directly to int64 on the target
        # device, and reuse that tensor for both the batch size and the call.
        class_labels = torch.as_tensor(
            kwargs.get("class_labels", [207, 360, 388, 113, 355, 980, 323, 979]),
            dtype=torch.long,
            device=device,
        )

        # generate the tokens and images
        # Autocast is enabled only when running on CUDA, which avoids the
        # "CUDA is not available" warning on CPU-only hosts.
        with torch.cuda.amp.autocast(enabled=device.type == "cuda"):
            sampled_tokens = self.model.sample_tokens(
                bsz=len(class_labels),
                num_iter=kwargs.get("num_ar_steps", 64),
                cfg=kwargs.get("cfg_scale", 4),
                cfg_schedule=kwargs.get("cfg_schedule", "constant"),
                labels=class_labels,
                temperature=kwargs.get("temperature", 1.0),
                progress=True,
            )
            # NOTE(review): 0.2325 is presumably the latent scaling factor
            # matching the VAE checkpoint - confirm against the training code.
            sampled_images = vae.decode(sampled_tokens / 0.2325)

        return sampled_images
|