jbilcke-hf HF staff committed on
Commit
606d9c1
1 Parent(s): 366a4b9

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +37 -40
handler.py CHANGED
@@ -15,52 +15,45 @@ def get_default_args():
15
  # Model configuration
16
  parser.add_argument("--model", type=str, default="HYVideo-T/2")
17
  parser.add_argument("--model-resolution", type=str, default="720p", choices=["540p", "720p"])
18
- parser.add_argument("--latent-channels", type=int, default=4)
19
  parser.add_argument("--precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
20
- parser.add_argument("--rope-theta", type=float, default=10000)
21
 
22
  # VAE settings
23
  parser.add_argument("--vae", type=str, default="884-16c-hy")
24
- parser.add_argument("--vae-precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
25
- parser.add_argument("--vae-tiling", action="store_true")
26
 
27
  # Text encoder settings
28
- parser.add_argument("--text-encoder", type=str, default="clipL", choices=["clipL", "llm"])
29
- parser.add_argument("--text-encoder-precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
30
- parser.add_argument("--text-states-dim", type=int, default=1024)
31
- parser.add_argument("--text-len", type=int, default=77)
32
- parser.add_argument("--tokenizer", type=str, default="clipL", choices=["clipL", "llm"])
33
 
34
  # Prompt template settings
35
- parser.add_argument("--prompt-template", type=str, default="dit-llm-encode",
36
- choices=["dit-llm-encode", "dit-llm-encode-video"])
37
- parser.add_argument("--prompt-template-video", type=str, default="dit-llm-encode",
38
- choices=["dit-llm-encode", "dit-llm-encode-video"])
39
 
40
  # Additional text encoder settings
41
- parser.add_argument("--hidden-state-skip-layer", type=int, default=0)
42
  parser.add_argument("--apply-final-norm", action="store_true")
43
- parser.add_argument("--text-encoder-2", type=str, default="clipL", choices=["clipL", "llm"])
44
- parser.add_argument("--text-encoder-precision-2", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
45
- parser.add_argument("--text-states-dim-2", type=int, default=1024)
46
- parser.add_argument("--tokenizer-2", type=str, default="clipL", choices=["clipL", "llm"])
47
- parser.add_argument("--text-len-2", type=int, default=77)
48
 
49
  # Inference settings
50
- parser.add_argument("--denoise-type", type=str, default="v-prediction")
51
  parser.add_argument("--flow-shift", type=float, default=7.0)
52
- parser.add_argument("--flow-reverse", action="store_true")
53
  parser.add_argument("--flow-solver", type=str, default="euler")
54
  parser.add_argument("--use-linear-quadratic-schedule", action="store_true")
55
- parser.add_argument("--linear-schedule-end", type=float, default=0.0)
56
-
57
- # Model paths and weights
58
- parser.add_argument("--model-base", type=str, default=None)
59
- parser.add_argument("--dit-weight", type=str, default=None)
60
- parser.add_argument("--load-key", type=str, default=None)
61
 
62
  # Hardware settings
63
- parser.add_argument("--use-cpu-offload", action="store_true")
64
  parser.add_argument("--batch-size", type=int, default=1)
65
  parser.add_argument("--infer-steps", type=int, default=50)
66
  parser.add_argument("--disable-autocast", action="store_true")
@@ -72,22 +65,23 @@ def get_default_args():
72
 
73
  # Generation settings
74
  parser.add_argument("--num-videos", type=int, default=1)
75
- parser.add_argument("--video-size", nargs="+", type=int, default=None)
76
  parser.add_argument("--video-length", type=int, default=129)
77
  parser.add_argument("--prompt", type=str, default=None)
78
- parser.add_argument("--seed-type", type=str, default="random", choices=["file", "random", "fixed", "auto"])
79
- parser.add_argument("--seed", type=int, default=-1)
80
  parser.add_argument("--neg-prompt", type=str, default="")
81
  parser.add_argument("--cfg-scale", type=float, default=1.0)
82
  parser.add_argument("--embedded-cfg-scale", type=float, default=6.0)
83
  parser.add_argument("--reproduce", action="store_true")
84
 
85
- # Additional degrees
86
- parser.add_argument("--ulysses-degree", type=float, default=1.0)
87
- parser.add_argument("--ring-degree", type=float, default=1.0)
88
 
89
  # Parse with empty args list to avoid reading sys.argv
90
  args = parser.parse_args([])
 
91
  return args
92
 
93
  class EndpointHandler:
@@ -95,7 +89,10 @@ class EndpointHandler:
95
  """Initialize the handler with model path and default config."""
96
  # Use default args instead of parsing from command line
97
  self.args = get_default_args()
98
- self.args.model_base = path # Use the provided model path
 
 
 
99
 
100
  # Initialize model
101
  models_root_path = Path(path)
@@ -110,12 +107,12 @@ class EndpointHandler:
110
  Args:
111
  data: Dictionary containing:
112
  - inputs (str): The prompt text
113
- - resolution (str, optional): Video resolution like "1280x720"
114
  - video_length (int, optional): Number of frames
115
  - num_inference_steps (int, optional): Number of inference steps
116
  - seed (int, optional): Random seed (-1 for random)
117
  - guidance_scale (float, optional): Guidance scale value
118
- - flow_shift (float, optional): Flow shift value
119
  - embedded_guidance_scale (float, optional): Embedded guidance scale
120
 
121
  Returns:
@@ -126,7 +123,7 @@ class EndpointHandler:
126
  if prompt is None:
127
  raise ValueError("No prompt provided in the 'inputs' field")
128
 
129
- # Parse resolution
130
  resolution = data.pop("resolution", "1280x720")
131
  width, height = map(int, resolution.split("x"))
132
 
@@ -144,7 +141,7 @@ class EndpointHandler:
144
  prompt=prompt,
145
  height=height,
146
  width=width,
147
- video_length=video_length,
148
  seed=seed,
149
  negative_prompt="",
150
  infer_steps=num_inference_steps,
@@ -176,4 +173,4 @@ class EndpointHandler:
176
  "video_base64": video_base64,
177
  "seed": outputs['seeds'][0],
178
  "prompt": outputs['prompts'][0]
179
- }
 
15
  # Model configuration
16
  parser.add_argument("--model", type=str, default="HYVideo-T/2")
17
  parser.add_argument("--model-resolution", type=str, default="720p", choices=["540p", "720p"])
18
+ parser.add_argument("--latent-channels", type=int, default=16) # Changed from 4 to match VAE
19
  parser.add_argument("--precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
20
+ parser.add_argument("--rope-theta", type=int, default=256) # Changed to match original config
21
 
22
  # VAE settings
23
  parser.add_argument("--vae", type=str, default="884-16c-hy")
24
+ parser.add_argument("--vae-precision", type=str, default="fp16") # Changed to fp16 to match docs
25
+ parser.add_argument("--vae-tiling", action="store_true", default=True) # Set default to True
26
 
27
  # Text encoder settings
28
+ parser.add_argument("--text-encoder", type=str, default="llm") # Changed to llm
29
+ parser.add_argument("--text-encoder-precision", type=str, default="fp16") # Changed to fp16
30
+ parser.add_argument("--text-states-dim", type=int, default=4096) # Updated to match docs
31
+ parser.add_argument("--text-len", type=int, default=256) # Updated to match docs
32
+ parser.add_argument("--tokenizer", type=str, default="llm") # Changed to llm
33
 
34
  # Prompt template settings
35
+ parser.add_argument("--prompt-template", type=str, default="dit-llm-encode")
36
+ parser.add_argument("--prompt-template-video", type=str, default="dit-llm-encode-video")
 
 
37
 
38
  # Additional text encoder settings
39
+ parser.add_argument("--hidden-state-skip-layer", type=int, default=2) # Updated to match docs
40
  parser.add_argument("--apply-final-norm", action="store_true")
41
+ parser.add_argument("--text-encoder-2", type=str, default="clipL")
42
+ parser.add_argument("--text-encoder-precision-2", type=str, default="fp16") # Changed to fp16
43
+ parser.add_argument("--text-states-dim-2", type=int, default=768) # Updated to match docs
44
+ parser.add_argument("--tokenizer-2", type=str, default="clipL")
45
+ parser.add_argument("--text-len-2", type=int, default=77) # Updated to match docs
46
 
47
  # Inference settings
48
+ parser.add_argument("--denoise-type", type=str, default="flow") # Changed to flow
49
  parser.add_argument("--flow-shift", type=float, default=7.0)
50
+ parser.add_argument("--flow-reverse", action="store_true", default=False)
51
  parser.add_argument("--flow-solver", type=str, default="euler")
52
  parser.add_argument("--use-linear-quadratic-schedule", action="store_true")
53
+ parser.add_argument("--linear-schedule-end", type=int, default=25) # Updated to match docs
 
 
 
 
 
54
 
55
  # Hardware settings
56
+ parser.add_argument("--use-cpu-offload", action="store_true", default=False)
57
  parser.add_argument("--batch-size", type=int, default=1)
58
  parser.add_argument("--infer-steps", type=int, default=50)
59
  parser.add_argument("--disable-autocast", action="store_true")
 
65
 
66
  # Generation settings
67
  parser.add_argument("--num-videos", type=int, default=1)
68
+ parser.add_argument("--video-size", nargs="+", type=int, default=[720, 1280])
69
  parser.add_argument("--video-length", type=int, default=129)
70
  parser.add_argument("--prompt", type=str, default=None)
71
+ parser.add_argument("--seed-type", type=str, default="auto", choices=["file", "random", "fixed", "auto"])
72
+ parser.add_argument("--seed", type=int, default=None)
73
  parser.add_argument("--neg-prompt", type=str, default="")
74
  parser.add_argument("--cfg-scale", type=float, default=1.0)
75
  parser.add_argument("--embedded-cfg-scale", type=float, default=6.0)
76
  parser.add_argument("--reproduce", action="store_true")
77
 
78
+ # Parallel settings
79
+ parser.add_argument("--ulysses-degree", type=int, default=1)
80
+ parser.add_argument("--ring-degree", type=int, default=1)
81
 
82
  # Parse with empty args list to avoid reading sys.argv
83
  args = parser.parse_args([])
84
+
85
  return args
86
 
87
  class EndpointHandler:
 
89
  """Initialize the handler with model path and default config."""
90
  # Use default args instead of parsing from command line
91
  self.args = get_default_args()
92
+
93
+ # Set up model paths
94
+ self.args.model_base = path
95
+ self.args.dit_weight = str(Path(path) / "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt")
96
 
97
  # Initialize model
98
  models_root_path = Path(path)
 
107
  Args:
108
  data: Dictionary containing:
109
  - inputs (str): The prompt text
110
+ - resolution (str, optional): Video resolution like "1280x720"
111
  - video_length (int, optional): Number of frames
112
  - num_inference_steps (int, optional): Number of inference steps
113
  - seed (int, optional): Random seed (-1 for random)
114
  - guidance_scale (float, optional): Guidance scale value
115
+ - flow_shift (float, optional): Flow shift value
116
  - embedded_guidance_scale (float, optional): Embedded guidance scale
117
 
118
  Returns:
 
123
  if prompt is None:
124
  raise ValueError("No prompt provided in the 'inputs' field")
125
 
126
+ # Parse resolution
127
  resolution = data.pop("resolution", "1280x720")
128
  width, height = map(int, resolution.split("x"))
129
 
 
141
  prompt=prompt,
142
  height=height,
143
  width=width,
144
+ video_length=video_length,
145
  seed=seed,
146
  negative_prompt="",
147
  infer_steps=num_inference_steps,
 
173
  "video_base64": video_base64,
174
  "seed": outputs['seeds'][0],
175
  "prompt": outputs['prompts'][0]
176
+ }