Sapir committed
Commit e46ff5e
1 Parent(s): d699d2b

Examples: update and fix scripts.

scripts/to_safetensors.py CHANGED
@@ -60,7 +60,7 @@ def load_vae_config(vae_path: Path) -> str:
     return str(config_path)
 
 
-def main(unet_path: str, vae_path: str, t5_path: str, out_path: str, mode: str,
+def main(unet_path: str, vae_path: str, out_path: str, mode: str,
          unet_config_path: str = None, scheduler_config_path: str = None) -> None:
     unet = convert_unet(torch.load(unet_path, weights_only=True), add_prefix=(mode == 'single'))
 
@@ -98,7 +98,6 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--unet_path', '-u', type=str, default='unet/ema-002.pt')
     parser.add_argument('--vae_path', '-v', type=str, default='vae/')
-    parser.add_argument('--t5_path', '-t', type=str, default='t5/PixArt-XL-2-1024-MS/')
     parser.add_argument('--out_path', '-o', type=str, default='xora.safetensors')
     parser.add_argument('--mode', '-m', type=str, choices=['single', 'separate'], default='single',
                         help="Choose 'single' for the original behavior, 'separate' to save unet and vae separately.")
xora/examples/image_to_video.py CHANGED
@@ -5,94 +5,107 @@ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
+from transformers import T5EncoderModel, T5Tokenizer
 import safetensors.torch
 import json
+import argparse
 
-# Paths for the separate mode directories
-separate_dir = Path("/opt/models/xora-img2video")
-unet_dir = separate_dir / 'unet'
-vae_dir = separate_dir / 'vae'
-scheduler_dir = separate_dir / 'scheduler'
-
-# Load VAE from separate mode
-vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
-vae_config_path = vae_dir / "config.json"
-with open(vae_config_path, 'r') as f:
-    vae_config = json.load(f)
-vae = CausalVideoAutoencoder.from_config(vae_config)
-vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae.load_state_dict(
-    state_dict=vae_state_dict,
-)
-vae = vae.cuda().to(torch.bfloat16)
-
-# Load UNet (Transformer) from separate mode
-unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
-unet_config_path = unet_dir / "config.json"
-transformer_config = Transformer3DModel.load_config(unet_config_path)
-transformer = Transformer3DModel.from_config(transformer_config)
-unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
-transformer.load_state_dict(unet_state_dict, strict=True)
-transformer = transformer.cuda()
-unet = transformer
-
-# Load Scheduler from separate mode
-scheduler_config_path = scheduler_dir / "scheduler_config.json"
-scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
-scheduler = RectifiedFlowScheduler.from_config(scheduler_config)
-
-# Patchifier (remains the same)
-patchifier = SymmetricPatchifier(patch_size=1)
-
-# Use submodels for the pipeline
-submodel_dict = {
-    "unet": unet,
-    "transformer": transformer,
-    "patchifier": patchifier,
-    "text_encoder": None,
-    "scheduler": scheduler,
-    "vae": vae,
-}
-
-model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
-pipeline = VideoPixArtAlphaPipeline.from_pretrained(model_name_or_path,
-                                                    safety_checker=None,
-                                                    revision=None,
-                                                    torch_dtype=torch.float32,  # dtype adjusted
-                                                    **submodel_dict,
-                                                    ).to("cuda")
-
-num_inference_steps = 20
-num_images_per_prompt = 2
-guidance_scale = 3
-height = 512
-width = 768
-num_frames = 57
-frame_rate = 25
-
-# Assuming sample is a dict loaded from a .pt file
-sample = torch.load("/opt/sample.pt")
-for key, item in sample.items():
-    if item is not None:
-        sample[key] = item.cuda()
-
-media_items = torch.load("/opt/sample_media.pt")
-
-# Generate images (video frames)
-images = pipeline(
-    num_inference_steps=num_inference_steps,
-    num_images_per_prompt=num_images_per_prompt,
-    guidance_scale=guidance_scale,
-    generator=None,
-    output_type="pt",
-    callback_on_step_end=None,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    frame_rate=frame_rate,
-    **sample,
-    is_video=True,
-    vae_per_channel_normalize=True,
-).images
-
-print("Generated video frames.")
+def load_vae(vae_dir):
+    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
+    vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, 'r') as f:
+        vae_config = json.load(f)
+    vae = CausalVideoAutoencoder.from_config(vae_config)
+    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+    vae.load_state_dict(vae_state_dict)
+    return vae.cuda().to(torch.bfloat16)
+
+def load_unet(unet_dir):
+    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
+    unet_config_path = unet_dir / "config.json"
+    transformer_config = Transformer3DModel.load_config(unet_config_path)
+    transformer = Transformer3DModel.from_config(transformer_config)
+    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
+    transformer.load_state_dict(unet_state_dict, strict=True)
+    return transformer.cuda()
+
+def load_scheduler(scheduler_dir):
+    scheduler_config_path = scheduler_dir / "scheduler_config.json"
+    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
+    return RectifiedFlowScheduler.from_config(scheduler_config)
+
+def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Load models from separate directories')
+    parser.add_argument('--separate_dir', type=str, required=True, help='Path to the directory containing unet, vae, and scheduler subdirectories')
+    args = parser.parse_args()
+
+    # Paths for the separate mode directories
+    separate_dir = Path(args.separate_dir)
+    unet_dir = separate_dir / 'unet'
+    vae_dir = separate_dir / 'vae'
+    scheduler_dir = separate_dir / 'scheduler'
+
+    # Load models
+    vae = load_vae(vae_dir)
+    unet = load_unet(unet_dir)
+    scheduler = load_scheduler(scheduler_dir)
+
+    # Patchifier (remains the same)
+    patchifier = SymmetricPatchifier(patch_size=1)
+
+    # text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to("cuda")
+    # tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
+
+    # Use submodels for the pipeline
+    submodel_dict = {
+        "transformer": unet,  # using unet for transformer
+        "patchifier": patchifier,
+        "text_encoder": None,
+        "tokenizer": None,
+        "scheduler": scheduler,
+        "vae": vae,
+    }
+
+    model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
+    pipeline = VideoPixArtAlphaPipeline(
+        **submodel_dict
+    ).to("cuda")
+
+    num_inference_steps = 20
+    num_images_per_prompt = 1
+    guidance_scale = 3
+    height = 512
+    width = 768
+    num_frames = 57
+    frame_rate = 25
+
+    # Sample input stays the same
+    sample = torch.load("/opt/sample_media.pt")
+    for key, item in sample.items():
+        if item is not None:
+            sample[key] = item.cuda()
+
+    # media_items = torch.load("/opt/sample_media.pt")
+
+    # Generate images (video frames)
+    images = pipeline(
+        num_inference_steps=num_inference_steps,
+        num_images_per_prompt=num_images_per_prompt,
+        guidance_scale=guidance_scale,
+        generator=None,
+        output_type="pt",
+        callback_on_step_end=None,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        frame_rate=frame_rate,
+        **sample,
+        is_video=True,
+        vae_per_channel_normalize=True,
+    ).images
+
+    print("Generated video frames.")
+
+if __name__ == "__main__":
+    main()
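
Both example scripts now share the same loader structure: the module-level setup becomes `load_vae`, `load_unet`, and `load_scheduler`, keyed to the directory layout that `to_safetensors.py` writes in 'separate' mode. A small reuse sketch, assuming the examples are importable as modules and the target directory contains the `unet/`, `vae/`, and `scheduler/` subdirectories expected above (the path is an illustrative placeholder):

from pathlib import Path

from xora.examples.image_to_video import load_vae, load_unet, load_scheduler

root = Path("/opt/models/xora-img2video")  # placeholder; use your own --separate_dir
vae = load_vae(root / "vae")               # CausalVideoAutoencoder, bfloat16, on CUDA
transformer = load_unet(root / "unet")     # Transformer3DModel, on CUDA
scheduler = load_scheduler(root / "scheduler")

Since `main()` now sits behind an `if __name__ == "__main__"` guard, importing the module for these helpers does not trigger argument parsing.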
xora/examples/text_to_video.py CHANGED
@@ -5,93 +5,104 @@ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
-from transformers import T5EncoderModel
+from transformers import T5EncoderModel, T5Tokenizer
 import safetensors.torch
 import json
+import argparse
 
-# Paths for the separate mode directories
-separate_dir = Path("/opt/models/xora-img2video")
-unet_dir = separate_dir / 'unet'
-vae_dir = separate_dir / 'vae'
-scheduler_dir = separate_dir / 'scheduler'
-
-# Load VAE from separate mode
-vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
-vae_config_path = vae_dir / "config.json"
-with open(vae_config_path, 'r') as f:
-    vae_config = json.load(f)
-vae = CausalVideoAutoencoder.from_config(vae_config)
-vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae.load_state_dict(
-    state_dict=vae_state_dict,
-)
-vae = vae.cuda().to(torch.bfloat16)
-
-# Load UNet (Transformer) from separate mode
-unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
-unet_config_path = unet_dir / "config.json"
-transformer_config = Transformer3DModel.load_config(unet_config_path)
-transformer = Transformer3DModel.from_config(transformer_config)
-unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
-transformer.load_state_dict(unet_state_dict, strict=True)
-transformer = transformer.cuda()
-unet = transformer
-
-# Load Scheduler from separate mode
-scheduler_config_path = scheduler_dir / "scheduler_config.json"
-scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
-scheduler = RectifiedFlowScheduler.from_config(scheduler_config)
-
-# Patchifier (remains the same)
-patchifier = SymmetricPatchifier(patch_size=1)
-
-# Use submodels for the pipeline
-submodel_dict = {
-    "unet": unet,
-    "transformer": transformer,
-    "patchifier": patchifier,
-    "scheduler": scheduler,
-    "vae": vae,
-}
-model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
-pipeline = VideoPixArtAlphaPipeline.from_pretrained(model_name_or_path,
-                                                    safety_checker=None,
-                                                    revision=None,
-                                                    torch_dtype=torch.float32,
-                                                    **submodel_dict,
-                                                    ).to("cuda")
-
-# Sample input
-num_inference_steps = 20
-num_images_per_prompt = 2
-guidance_scale = 3
-height = 512
-width = 768
-num_frames = 57
-frame_rate = 25
-sample = {
-    "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
-              "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
-    'prompt_attention_mask': None,  # Adjust attention masks as needed
-    'negative_prompt': "Ugly deformed",
-    'negative_prompt_attention_mask': None
-}
-
-# Generate images (video frames)
-images = pipeline(
-    num_inference_steps=num_inference_steps,
-    num_images_per_prompt=num_images_per_prompt,
-    guidance_scale=guidance_scale,
-    generator=None,
-    output_type="pt",
-    callback_on_step_end=None,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    frame_rate=frame_rate,
-    **sample,
-    is_video=True,
-    vae_per_channel_normalize=True,
-).images
-
-print("Generated images (video frames).")
+def load_vae(vae_dir):
+    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
+    vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, 'r') as f:
+        vae_config = json.load(f)
+    vae = CausalVideoAutoencoder.from_config(vae_config)
+    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+    vae.load_state_dict(vae_state_dict)
+    return vae.cuda().to(torch.bfloat16)
+
+def load_unet(unet_dir):
+    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
+    unet_config_path = unet_dir / "config.json"
+    transformer_config = Transformer3DModel.load_config(unet_config_path)
+    transformer = Transformer3DModel.from_config(transformer_config)
+    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
+    transformer.load_state_dict(unet_state_dict, strict=True)
+    return transformer.cuda()
+
+def load_scheduler(scheduler_dir):
+    scheduler_config_path = scheduler_dir / "scheduler_config.json"
+    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
+    return RectifiedFlowScheduler.from_config(scheduler_config)
+
+def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Load models from separate directories')
+    parser.add_argument('--separate_dir', type=str, required=True, help='Path to the directory containing unet, vae, and scheduler subdirectories')
+    args = parser.parse_args()
+
+    # Paths for the separate mode directories
+    separate_dir = Path(args.separate_dir)
+    unet_dir = separate_dir / 'unet'
+    vae_dir = separate_dir / 'vae'
+    scheduler_dir = separate_dir / 'scheduler'
+
+    # Load models
+    vae = load_vae(vae_dir)
+    unet = load_unet(unet_dir)
+    scheduler = load_scheduler(scheduler_dir)
+
+    # Patchifier (remains the same)
+    patchifier = SymmetricPatchifier(patch_size=1)
+
+    text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to("cuda")
+    tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
+
+    # Use submodels for the pipeline
+    submodel_dict = {
+        "transformer": unet,  # using unet for transformer
+        "patchifier": patchifier,
+        "scheduler": scheduler,
+        "text_encoder": text_encoder,
+        "tokenizer": tokenizer,
+        "vae": vae,
+    }
+
+    pipeline = VideoPixArtAlphaPipeline(**submodel_dict).to("cuda")
+
+    # Sample input
+    num_inference_steps = 20
+    num_images_per_prompt = 2
+    guidance_scale = 3
+    height = 512
+    width = 768
+    num_frames = 57
+    frame_rate = 25
+    sample = {
+        "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
+                  "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
+        'prompt_attention_mask': None,  # Adjust attention masks as needed
+        'negative_prompt': "Ugly deformed",
+        'negative_prompt_attention_mask': None
+    }
+
+    # Generate images (video frames)
+    images = pipeline(
+        num_inference_steps=num_inference_steps,
+        num_images_per_prompt=num_images_per_prompt,
+        guidance_scale=guidance_scale,
+        generator=None,
+        output_type="pt",
+        callback_on_step_end=None,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        frame_rate=frame_rate,
+        **sample,
+        is_video=True,
+        vae_per_channel_normalize=True,
+    ).images
+
+    print("Generated images (video frames).")
+
+if __name__ == "__main__":
+    main()
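
With `output_type="pt"` the pipeline returns the frames as a tensor that the script only announces. A post-processing sketch for writing the result to disk, under the unverified assumption that `images` is a float tensor in [0, 1] shaped (batch, channels, frames, height, width); check VideoPixArtAlphaPipeline's actual output layout before relying on this:

import imageio  # pip install imageio imageio-ffmpeg

video = images[0].permute(1, 2, 3, 0)                    # assumed -> (frames, H, W, channels)
frames = (video.clamp(0, 1) * 255).byte().cpu().numpy()  # uint8 frames for the writer
imageio.mimwrite("output.mp4", list(frames), fps=frame_rate)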