File size: 3,687 Bytes
1822fe2
 
 
bfffca7
 
1822fe2
 
 
f2191a0
ca410d5
1822fe2
 
 
 
 
 
 
 
de1c669
1822fe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca410d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1822fe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e662c48
1822fe2
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from diffusers import DiffusionPipeline
import torch
import numpy as np
import importlib.util
import sys
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import os
from .vae import AutoencoderKL
from .mar import mar_base, mar_large, mar_huge

# inheriting from DiffusionPipeline for HF
class MARModel(DiffusionPipeline):
    """Pipeline wrapper that builds a MAR model and a VAE, then samples images.

    All configuration is passed as keyword arguments to ``__call__``; the
    constructor takes no arguments.
    """

    def __init__(self):
        super().__init__()

    @torch.no_grad()
    def __call__(self, *args, **kwargs):
        """
        Download the MAR and VAE checkpoints, sample tokens and decode them.

        Positional arguments are accepted but ignored. Recognised keyword
        arguments (with their defaults):

        * ``model_type`` ("mar_base"): one of "mar_base", "mar_large",
          "mar_huge".
        * ``buffer_size`` (64), ``diffloss_d`` (3), ``diffloss_w`` (1024),
          ``num_sampling_steps`` (100): forwarded to the model builder
          (``num_sampling_steps`` is passed as a string).
        * ``repo_id`` ("jadechoghari/mar"): Hub repository holding both
          checkpoints.
        * ``model_filename`` ("checkpoint-last.pth"): MAR checkpoint file.
          A name ending in ".safetensors" is read with safetensors; anything
          else is read with ``torch.load`` and its "model_ema" entry is used.
        * ``vae_filename`` ("kl16.ckpt"): VAE checkpoint file.
        * ``seed`` (0): seeds both torch and numpy global RNGs.
        * ``num_ar_steps`` (64), ``cfg_scale`` (4), ``cfg_schedule``
          ("constant"), ``temperature`` (1.0): forwarded to
          ``sample_tokens``.
        * ``class_labels`` (eight default ids): sequence of integer class
          ids; one image is sampled per label.

        Returns:
            Whatever ``vae.decode`` returns for the sampled tokens.

        Raises:
            ValueError: if ``model_type`` is not one of the supported names.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # init the mar model architecture
        buffer_size = kwargs.get("buffer_size", 64)
        diffloss_d = kwargs.get("diffloss_d", 3)
        diffloss_w = kwargs.get("diffloss_w", 1024)
        num_sampling_steps = kwargs.get("num_sampling_steps", 100)
        model_type = kwargs.get("model_type", "mar_base")

        builders = {
            "mar_base": mar_base,
            "mar_large": mar_large,
            "mar_huge": mar_huge,
        }
        if model_type not in builders:
            # Fail early: otherwise self.model is never assigned and the
            # error surfaces later as an unrelated attribute failure.
            raise ValueError(
                f"Unknown model_type {model_type!r}; "
                f"expected one of {sorted(builders)}"
            )

        self.model = builders[model_type](
            buffer_size=buffer_size,
            diffloss_d=diffloss_d,
            diffloss_w=diffloss_w,
            num_sampling_steps=str(num_sampling_steps),
        ).to(device)

        # download and load the model weights (.safetensors or .pth)
        repo_id = kwargs.get("repo_id", "jadechoghari/mar")
        model_filename = kwargs.get("model_filename", "checkpoint-last.pth")
        model_checkpoint_path = hf_hub_download(
            repo_id=repo_id,
            filename=model_filename,
        )

        if str(model_filename).endswith(".safetensors"):
            # NOTE(review): assumes the safetensors file stores the state
            # dict directly (no "model_ema" nesting) -- confirm for your repo.
            state_dict = load_file(model_checkpoint_path)
        else:
            # NOTE: torch.load unpickles the file, which can execute
            # arbitrary code; only point repo_id at repositories you trust.
            state_dict = torch.load(
                model_checkpoint_path, map_location=device
            )["model_ema"]

        # strict=False: mismatched/missing keys are tolerated silently.
        self.model.load_state_dict(state_dict, strict=False)
        self.model.eval()

        # download and load the vae
        vae_checkpoint_path = hf_hub_download(
            repo_id=repo_id,
            filename=kwargs.get("vae_filename", "kl16.ckpt"),
        )

        vae = AutoencoderKL(
            embed_dim=16,
            ch_mult=(1, 1, 2, 2, 4),
            ckpt_path=vae_checkpoint_path,
        )
        vae = vae.to(device).eval()

        # set up user-specified or default values for generation
        seed = kwargs.get("seed", 0)
        torch.manual_seed(seed)
        np.random.seed(seed)

        num_ar_steps = kwargs.get("num_ar_steps", 64)
        cfg_scale = kwargs.get("cfg_scale", 4)
        cfg_schedule = kwargs.get("cfg_schedule", "constant")
        temperature = kwargs.get("temperature", 1.0)
        class_labels = kwargs.get(
            "class_labels", [207, 360, 388, 113, 355, 980, 323, 979]
        )
        # Build the integer label tensor once, directly as int64 on the
        # target device (no float round-trip through the legacy constructor).
        class_labels = torch.as_tensor(
            class_labels, dtype=torch.long, device=device
        )

        # generate the tokens and images
        # Mixed precision only on CUDA; on CPU the context is a no-op.
        with torch.autocast(
            device_type=device.type, enabled=device.type == "cuda"
        ):
            sampled_tokens = self.model.sample_tokens(
                bsz=len(class_labels), num_iter=num_ar_steps,
                cfg=cfg_scale, cfg_schedule=cfg_schedule,
                labels=class_labels,
                temperature=temperature, progress=True
            )

            # NOTE(review): 0.2325 is presumably the latent scaling factor
            # the VAE was trained with -- confirm against the VAE config.
            sampled_images = vae.decode(sampled_tokens / 0.2325)

        return sampled_images