|
from typing import Dict, Any |
|
import os |
|
from pathlib import Path |
|
import time |
|
from datetime import datetime |
|
import argparse |
|
from hyvideo.utils.file_utils import save_videos_grid |
|
from hyvideo.inference import HunyuanVideoSampler |
|
from hyvideo.constants import NEGATIVE_PROMPT |
|
|
|
def get_default_args():
    """Build the default argument namespace without reading ``sys.argv``.

    Every option is registered on a throw-away ``argparse`` parser and the
    parser is then run on an empty argument list, so the returned namespace
    holds only the declared defaults regardless of how the hosting process
    was launched.

    Returns:
        argparse.Namespace: One attribute per option below (dashes in the
        flag name become underscores), populated with its default value.
    """
    precision_choices = ["bf16", "fp32", "fp16"]
    encoder_choices = ["clipL", "llm"]
    template_choices = ["dit-llm-encode", "dit-llm-encode-video"]
    switch = {"action": "store_true"}  # boolean flag, defaults to False

    # (flag, add_argument keyword options); the registration order is kept
    # stable because it determines the attribute order of the namespace.
    option_specs = (
        # Model
        ("--model", {"type": str, "default": "HYVideo-T/2"}),
        ("--model-resolution", {"type": str, "default": "720p", "choices": ["540p", "720p"]}),
        ("--latent-channels", {"type": int, "default": 4}),
        ("--precision", {"type": str, "default": "bf16", "choices": precision_choices}),
        ("--rope-theta", {"type": float, "default": 10000}),
        # VAE
        ("--vae", {"type": str, "default": "884-16c-hy"}),
        ("--vae-precision", {"type": str, "default": "bf16", "choices": precision_choices}),
        ("--vae-tiling", switch),
        # Primary text encoder / tokenizer
        ("--text-encoder", {"type": str, "default": "clipL", "choices": encoder_choices}),
        ("--text-encoder-precision", {"type": str, "default": "bf16", "choices": precision_choices}),
        ("--text-states-dim", {"type": int, "default": 1024}),
        ("--text-len", {"type": int, "default": 77}),
        ("--tokenizer", {"type": str, "default": "clipL", "choices": encoder_choices}),
        # Prompt templates
        ("--prompt-template", {"type": str, "default": "dit-llm-encode", "choices": template_choices}),
        ("--prompt-template-video", {"type": str, "default": "dit-llm-encode", "choices": template_choices}),
        # Secondary text encoder / tokenizer
        ("--hidden-state-skip-layer", {"type": int, "default": 0}),
        ("--apply-final-norm", switch),
        ("--text-encoder-2", {"type": str, "default": "clipL", "choices": encoder_choices}),
        ("--text-encoder-precision-2", {"type": str, "default": "bf16", "choices": precision_choices}),
        ("--text-states-dim-2", {"type": int, "default": 1024}),
        ("--tokenizer-2", {"type": str, "default": "clipL", "choices": encoder_choices}),
        ("--text-len-2", {"type": int, "default": 77}),
        # Denoising schedule
        ("--denoise-type", {"type": str, "default": "v-prediction"}),
        ("--flow-shift", {"type": float, "default": 7.0}),
        ("--flow-reverse", switch),
        ("--flow-solver", {"type": str, "default": "euler"}),
        ("--use-linear-quadratic-schedule", switch),
        ("--linear-schedule-end", {"type": float, "default": 0.0}),
        # Weights / checkpoint location
        ("--model-base", {"type": str, "default": None}),
        ("--dit-weight", {"type": str, "default": None}),
        ("--load-key", {"type": str, "default": None}),
        # Inference runtime
        ("--use-cpu-offload", switch),
        ("--batch-size", {"type": int, "default": 1}),
        ("--infer-steps", {"type": int, "default": 50}),
        ("--disable-autocast", switch),
        # Output location
        ("--save-path", {"type": str, "default": "outputs"}),
        ("--save-path-suffix", {"type": str, "default": ""}),
        ("--name-suffix", {"type": str, "default": ""}),
        # Generation request defaults
        ("--num-videos", {"type": int, "default": 1}),
        ("--video-size", {"nargs": "+", "type": int, "default": None}),
        ("--video-length", {"type": int, "default": 129}),
        ("--prompt", {"type": str, "default": None}),
        ("--seed-type", {"type": str, "default": "random", "choices": ["file", "random", "fixed", "auto"]}),
        ("--seed", {"type": int, "default": -1}),
        ("--neg-prompt", {"type": str, "default": ""}),
        ("--cfg-scale", {"type": float, "default": 1.0}),
        ("--embedded-cfg-scale", {"type": float, "default": 6.0}),
        ("--reproduce", switch),
        # Parallelism degrees
        ("--ulysses-degree", {"type": float, "default": 1.0}),
        ("--ring-degree", {"type": float, "default": 1.0}),
    )

    default_parser = argparse.ArgumentParser()
    for flag_name, options in option_specs:
        default_parser.add_argument(flag_name, **options)

    # An explicit empty argv keeps the real command line out of the result.
    return default_parser.parse_args([])
|
|
|
class EndpointHandler:
    """Request handler that generates a video from a text prompt.

    The model is loaded once in ``__init__``; each ``__call__`` runs one
    generation and returns the encoded MP4 as a base64 string.
    """

    # Frame rate used when the generated frames are written to MP4.
    _OUTPUT_FPS = 24

    def __init__(self, path: str = ""):
        """Initialize the handler with the model path and default config.

        Args:
            path: Root directory of the model files. It is stored as
                ``args.model_base`` and handed to
                ``HunyuanVideoSampler.from_pretrained``.

        Raises:
            ValueError: If ``path`` does not exist.
        """
        self.args = get_default_args()
        self.args.model_base = path

        models_root_path = Path(path)
        if not models_root_path.exists():
            raise ValueError(f"`models_root` not exists: {models_root_path}")

        self.model = HunyuanVideoSampler.from_pretrained(models_root_path, args=self.args)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single generation request.

        The request dictionary is only read, never modified.

        Args:
            data: Dictionary containing:
                - inputs (str): The prompt text (required).
                - resolution (str, optional): ``"WIDTHxHEIGHT"``, default
                  ``"1280x720"``.
                - video_length (int, optional): Number of frames, default 129.
                - num_inference_steps (int, optional): Default 50.
                - seed (int, optional): ``-1`` or ``None`` lets the model
                  choose a seed; default ``-1``.
                - guidance_scale (float, optional): Default 1.0.
                - flow_shift (float, optional): Default 7.0.
                - embedded_guidance_scale (float, optional): Default 6.0.

        Returns:
            Dictionary with ``video_base64`` (base64-encoded MP4), ``seed``
            and ``prompt``; the latter two are taken from the first entry of
            the model output's ``seeds`` and ``prompts``.

        Raises:
            ValueError: If ``inputs`` is missing or ``resolution`` is not a
                valid ``"WIDTHxHEIGHT"`` string.
        """
        prompt = data.get("inputs")
        if prompt is None:
            raise ValueError("No prompt provided in the 'inputs' field")

        # All request parsing happens before the (expensive) model call so
        # that malformed requests fail fast.
        width, height = self._parse_resolution(data.get("resolution", "1280x720"))
        video_length = int(data.get("video_length", 129))
        seed = self._parse_seed(data.get("seed", -1))
        num_inference_steps = int(data.get("num_inference_steps", 50))
        guidance_scale = float(data.get("guidance_scale", 1.0))
        flow_shift = float(data.get("flow_shift", 7.0))
        embedded_guidance_scale = float(data.get("embedded_guidance_scale", 6.0))

        outputs = self.model.predict(
            prompt=prompt,
            height=height,
            width=width,
            video_length=video_length,
            seed=seed,
            negative_prompt="",
            infer_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            num_videos_per_prompt=1,
            flow_shift=flow_shift,
            batch_size=1,
            embedded_guidance_scale=embedded_guidance_scale,
        )

        # Only one video is requested; re-add the leading dimension that
        # indexing removed before handing the sample to the video writer.
        sample = outputs["samples"][0].unsqueeze(0)

        return {
            "video_base64": self._encode_video(sample),
            "seed": outputs["seeds"][0],
            "prompt": outputs["prompts"][0],
        }

    @staticmethod
    def _parse_resolution(resolution) -> tuple:
        """Parse a ``"WIDTHxHEIGHT"`` string into a ``(width, height)`` pair."""
        parts = str(resolution).strip().lower().split("x")
        try:
            # Unpacking also raises ValueError when there are not exactly
            # two components, so one handler covers every malformed shape.
            width, height = (int(part) for part in parts)
        except ValueError:
            raise ValueError(
                f"Invalid resolution {resolution!r}; expected 'WIDTHxHEIGHT', e.g. '1280x720'"
            ) from None
        if width <= 0 or height <= 0:
            raise ValueError(
                f"Invalid resolution {resolution!r}; width and height must be positive"
            )
        return width, height

    @staticmethod
    def _parse_seed(seed):
        """Normalize the requested seed; ``None`` means the model picks one."""
        if seed is None:
            return None
        # Convert before comparing so that "-1" and -1.0 are also treated as
        # the "random seed" sentinel.
        seed = int(seed)
        return None if seed == -1 else seed

    def _encode_video(self, sample) -> str:
        """Write ``sample`` to a temporary MP4 and return it base64-encoded."""
        import base64
        import tempfile

        # A private per-request directory avoids collisions between
        # concurrent requests and is removed even if saving/reading fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            video_path = os.path.join(tmp_dir, "video.mp4")
            save_videos_grid(sample, video_path, fps=self._OUTPUT_FPS)
            with open(video_path, "rb") as video_file:
                video_bytes = video_file.read()

        return base64.b64encode(video_bytes).decode()
|
|