Commit
•
606d9c1
1
Parent(s):
366a4b9
Update handler.py
Browse files- handler.py +37 -40
handler.py
CHANGED
@@ -15,52 +15,45 @@ def get_default_args():
|
|
15 |
# Model configuration
|
16 |
parser.add_argument("--model", type=str, default="HYVideo-T/2")
|
17 |
parser.add_argument("--model-resolution", type=str, default="720p", choices=["540p", "720p"])
|
18 |
-
parser.add_argument("--latent-channels", type=int, default=4
|
19 |
parser.add_argument("--precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
|
20 |
-
parser.add_argument("--rope-theta", type=
|
21 |
|
22 |
# VAE settings
|
23 |
parser.add_argument("--vae", type=str, default="884-16c-hy")
|
24 |
-
parser.add_argument("--vae-precision", type=str, default="
|
25 |
-
parser.add_argument("--vae-tiling", action="store_true")
|
26 |
|
27 |
# Text encoder settings
|
28 |
-
parser.add_argument("--text-encoder", type=str, default="
|
29 |
-
parser.add_argument("--text-encoder-precision", type=str, default="
|
30 |
-
parser.add_argument("--text-states-dim", type=int, default=
|
31 |
-
parser.add_argument("--text-len", type=int, default=
|
32 |
-
parser.add_argument("--tokenizer", type=str, default="
|
33 |
|
34 |
# Prompt template settings
|
35 |
-
parser.add_argument("--prompt-template", type=str, default="dit-llm-encode"
|
36 |
-
|
37 |
-
parser.add_argument("--prompt-template-video", type=str, default="dit-llm-encode",
|
38 |
-
choices=["dit-llm-encode", "dit-llm-encode-video"])
|
39 |
|
40 |
# Additional text encoder settings
|
41 |
-
parser.add_argument("--hidden-state-skip-layer", type=int, default=
|
42 |
parser.add_argument("--apply-final-norm", action="store_true")
|
43 |
-
parser.add_argument("--text-encoder-2", type=str, default="clipL"
|
44 |
-
parser.add_argument("--text-encoder-precision-2", type=str, default="
|
45 |
-
parser.add_argument("--text-states-dim-2", type=int, default=
|
46 |
-
parser.add_argument("--tokenizer-2", type=str, default="clipL"
|
47 |
-
parser.add_argument("--text-len-2", type=int, default=77)
|
48 |
|
49 |
# Inference settings
|
50 |
-
parser.add_argument("--denoise-type", type=str, default="
|
51 |
parser.add_argument("--flow-shift", type=float, default=7.0)
|
52 |
-
parser.add_argument("--flow-reverse", action="store_true")
|
53 |
parser.add_argument("--flow-solver", type=str, default="euler")
|
54 |
parser.add_argument("--use-linear-quadratic-schedule", action="store_true")
|
55 |
-
parser.add_argument("--linear-schedule-end", type=
|
56 |
-
|
57 |
-
# Model paths and weights
|
58 |
-
parser.add_argument("--model-base", type=str, default=None)
|
59 |
-
parser.add_argument("--dit-weight", type=str, default=None)
|
60 |
-
parser.add_argument("--load-key", type=str, default=None)
|
61 |
|
62 |
# Hardware settings
|
63 |
-
parser.add_argument("--use-cpu-offload", action="store_true")
|
64 |
parser.add_argument("--batch-size", type=int, default=1)
|
65 |
parser.add_argument("--infer-steps", type=int, default=50)
|
66 |
parser.add_argument("--disable-autocast", action="store_true")
|
@@ -72,22 +65,23 @@ def get_default_args():
|
|
72 |
|
73 |
# Generation settings
|
74 |
parser.add_argument("--num-videos", type=int, default=1)
|
75 |
-
parser.add_argument("--video-size", nargs="+", type=int, default=
|
76 |
parser.add_argument("--video-length", type=int, default=129)
|
77 |
parser.add_argument("--prompt", type=str, default=None)
|
78 |
-
parser.add_argument("--seed-type", type=str, default="
|
79 |
-
parser.add_argument("--seed", type=int, default
|
80 |
parser.add_argument("--neg-prompt", type=str, default="")
|
81 |
parser.add_argument("--cfg-scale", type=float, default=1.0)
|
82 |
parser.add_argument("--embedded-cfg-scale", type=float, default=6.0)
|
83 |
parser.add_argument("--reproduce", action="store_true")
|
84 |
|
85 |
-
#
|
86 |
-
parser.add_argument("--ulysses-degree", type=
|
87 |
-
parser.add_argument("--ring-degree", type=
|
88 |
|
89 |
# Parse with empty args list to avoid reading sys.argv
|
90 |
args = parser.parse_args([])
|
|
|
91 |
return args
|
92 |
|
93 |
class EndpointHandler:
|
@@ -95,7 +89,10 @@ class EndpointHandler:
|
|
95 |
"""Initialize the handler with model path and default config."""
|
96 |
# Use default args instead of parsing from command line
|
97 |
self.args = get_default_args()
|
98 |
-
|
|
|
|
|
|
|
99 |
|
100 |
# Initialize model
|
101 |
models_root_path = Path(path)
|
@@ -110,12 +107,12 @@ class EndpointHandler:
|
|
110 |
Args:
|
111 |
data: Dictionary containing:
|
112 |
- inputs (str): The prompt text
|
113 |
-
- resolution (str, optional): Video resolution like "1280x720"
|
114 |
- video_length (int, optional): Number of frames
|
115 |
- num_inference_steps (int, optional): Number of inference steps
|
116 |
- seed (int, optional): Random seed (-1 for random)
|
117 |
- guidance_scale (float, optional): Guidance scale value
|
118 |
-
- flow_shift (float, optional): Flow shift value
|
119 |
- embedded_guidance_scale (float, optional): Embedded guidance scale
|
120 |
|
121 |
Returns:
|
@@ -126,7 +123,7 @@ class EndpointHandler:
|
|
126 |
if prompt is None:
|
127 |
raise ValueError("No prompt provided in the 'inputs' field")
|
128 |
|
129 |
-
# Parse resolution
|
130 |
resolution = data.pop("resolution", "1280x720")
|
131 |
width, height = map(int, resolution.split("x"))
|
132 |
|
@@ -144,7 +141,7 @@ class EndpointHandler:
|
|
144 |
prompt=prompt,
|
145 |
height=height,
|
146 |
width=width,
|
147 |
-
video_length=video_length,
|
148 |
seed=seed,
|
149 |
negative_prompt="",
|
150 |
infer_steps=num_inference_steps,
|
@@ -176,4 +173,4 @@ class EndpointHandler:
|
|
176 |
"video_base64": video_base64,
|
177 |
"seed": outputs['seeds'][0],
|
178 |
"prompt": outputs['prompts'][0]
|
179 |
-
}
|
|
|
15 |
# Model configuration
|
16 |
parser.add_argument("--model", type=str, default="HYVideo-T/2")
|
17 |
parser.add_argument("--model-resolution", type=str, default="720p", choices=["540p", "720p"])
|
18 |
+
parser.add_argument("--latent-channels", type=int, default=16) # Changed from 4 to match VAE
|
19 |
parser.add_argument("--precision", type=str, default="bf16", choices=["bf16", "fp32", "fp16"])
|
20 |
+
parser.add_argument("--rope-theta", type=int, default=256) # Changed to match original config
|
21 |
|
22 |
# VAE settings
|
23 |
parser.add_argument("--vae", type=str, default="884-16c-hy")
|
24 |
+
parser.add_argument("--vae-precision", type=str, default="fp16") # Changed to fp16 to match docs
|
25 |
+
parser.add_argument("--vae-tiling", action="store_true", default=True) # Set default to True
|
26 |
|
27 |
# Text encoder settings
|
28 |
+
parser.add_argument("--text-encoder", type=str, default="llm") # Changed to llm
|
29 |
+
parser.add_argument("--text-encoder-precision", type=str, default="fp16") # Changed to fp16
|
30 |
+
parser.add_argument("--text-states-dim", type=int, default=4096) # Updated to match docs
|
31 |
+
parser.add_argument("--text-len", type=int, default=256) # Updated to match docs
|
32 |
+
parser.add_argument("--tokenizer", type=str, default="llm") # Changed to llm
|
33 |
|
34 |
# Prompt template settings
|
35 |
+
parser.add_argument("--prompt-template", type=str, default="dit-llm-encode")
|
36 |
+
parser.add_argument("--prompt-template-video", type=str, default="dit-llm-encode-video")
|
|
|
|
|
37 |
|
38 |
# Additional text encoder settings
|
39 |
+
parser.add_argument("--hidden-state-skip-layer", type=int, default=2) # Updated to match docs
|
40 |
parser.add_argument("--apply-final-norm", action="store_true")
|
41 |
+
parser.add_argument("--text-encoder-2", type=str, default="clipL")
|
42 |
+
parser.add_argument("--text-encoder-precision-2", type=str, default="fp16") # Changed to fp16
|
43 |
+
parser.add_argument("--text-states-dim-2", type=int, default=768) # Updated to match docs
|
44 |
+
parser.add_argument("--tokenizer-2", type=str, default="clipL")
|
45 |
+
parser.add_argument("--text-len-2", type=int, default=77) # Updated to match docs
|
46 |
|
47 |
# Inference settings
|
48 |
+
parser.add_argument("--denoise-type", type=str, default="flow") # Changed to flow
|
49 |
parser.add_argument("--flow-shift", type=float, default=7.0)
|
50 |
+
parser.add_argument("--flow-reverse", action="store_true", default=False)
|
51 |
parser.add_argument("--flow-solver", type=str, default="euler")
|
52 |
parser.add_argument("--use-linear-quadratic-schedule", action="store_true")
|
53 |
+
parser.add_argument("--linear-schedule-end", type=int, default=25) # Updated to match docs
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
# Hardware settings
|
56 |
+
parser.add_argument("--use-cpu-offload", action="store_true", default=False)
|
57 |
parser.add_argument("--batch-size", type=int, default=1)
|
58 |
parser.add_argument("--infer-steps", type=int, default=50)
|
59 |
parser.add_argument("--disable-autocast", action="store_true")
|
|
|
65 |
|
66 |
# Generation settings
|
67 |
parser.add_argument("--num-videos", type=int, default=1)
|
68 |
+
parser.add_argument("--video-size", nargs="+", type=int, default=[720, 1280])
|
69 |
parser.add_argument("--video-length", type=int, default=129)
|
70 |
parser.add_argument("--prompt", type=str, default=None)
|
71 |
+
parser.add_argument("--seed-type", type=str, default="auto", choices=["file", "random", "fixed", "auto"])
|
72 |
+
parser.add_argument("--seed", type=int, default=None)
|
73 |
parser.add_argument("--neg-prompt", type=str, default="")
|
74 |
parser.add_argument("--cfg-scale", type=float, default=1.0)
|
75 |
parser.add_argument("--embedded-cfg-scale", type=float, default=6.0)
|
76 |
parser.add_argument("--reproduce", action="store_true")
|
77 |
|
78 |
+
# Parallel settings
|
79 |
+
parser.add_argument("--ulysses-degree", type=int, default=1)
|
80 |
+
parser.add_argument("--ring-degree", type=int, default=1)
|
81 |
|
82 |
# Parse with empty args list to avoid reading sys.argv
|
83 |
args = parser.parse_args([])
|
84 |
+
|
85 |
return args
|
86 |
|
87 |
class EndpointHandler:
|
|
|
89 |
"""Initialize the handler with model path and default config."""
|
90 |
# Use default args instead of parsing from command line
|
91 |
self.args = get_default_args()
|
92 |
+
|
93 |
+
# Set up model paths
|
94 |
+
self.args.model_base = path
|
95 |
+
self.args.dit_weight = str(Path(path) / "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt")
|
96 |
|
97 |
# Initialize model
|
98 |
models_root_path = Path(path)
|
|
|
107 |
Args:
|
108 |
data: Dictionary containing:
|
109 |
- inputs (str): The prompt text
|
110 |
+
- resolution (str, optional): Video resolution like "1280x720"
|
111 |
- video_length (int, optional): Number of frames
|
112 |
- num_inference_steps (int, optional): Number of inference steps
|
113 |
- seed (int, optional): Random seed (-1 for random)
|
114 |
- guidance_scale (float, optional): Guidance scale value
|
115 |
+
- flow_shift (float, optional): Flow shift value
|
116 |
- embedded_guidance_scale (float, optional): Embedded guidance scale
|
117 |
|
118 |
Returns:
|
|
|
123 |
if prompt is None:
|
124 |
raise ValueError("No prompt provided in the 'inputs' field")
|
125 |
|
126 |
+
# Parse resolution
|
127 |
resolution = data.pop("resolution", "1280x720")
|
128 |
width, height = map(int, resolution.split("x"))
|
129 |
|
|
|
141 |
prompt=prompt,
|
142 |
height=height,
|
143 |
width=width,
|
144 |
+
video_length=video_length,
|
145 |
seed=seed,
|
146 |
negative_prompt="",
|
147 |
infer_steps=num_inference_steps,
|
|
|
173 |
"video_base64": video_base64,
|
174 |
"seed": outputs['seeds'][0],
|
175 |
"prompt": outputs['prompts'][0]
|
176 |
+
}
|