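"""Inference endpoint handler that wraps HunyuanVideoSampler for text-to-video
generation and returns the rendered clip as a base64-encoded MP4."""
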
from typing import Dict, Any
import os
from pathlib import Path
from datetime import datetime
import base64

from hyvideo.utils.file_utils import save_videos_grid
from hyvideo.config import parse_args
from hyvideo.inference import HunyuanVideoSampler


class EndpointHandler:
    def __init__(self, path: str = ""):
        """Initialize the handler with the model path.

        Args:
            path: Path to the model weights directory
        """
        # parse_args() loads the repository's default CLI configuration;
        # only the model path is supplied explicitly here.
        self.args = parse_args()
        models_root_path = Path(path)
        if not models_root_path.exists():
            raise ValueError(f"`models_root` does not exist: {models_root_path}")

        self.model = HunyuanVideoSampler.from_pretrained(models_root_path, args=self.args)

        # Generation defaults; any of these can be overridden per request.
        self.default_params = {
            "num_inference_steps": 50,
            "guidance_scale": 1.0,
            "flow_shift": 7.0,
            "embedded_guidance_scale": 6.0,
            "video_length": 129,  # frames; about 5 s at 24 fps
            "resolution": "1280x720"
        }

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process the input data and generate a video.

        Args:
            data: Dictionary containing the input parameters
                Required:
                - inputs (str): The prompt text
                Optional:
                - resolution (str): Video resolution such as "1280x720"
                - video_length (int): Number of frames
                - seed (int): Random seed (-1 for random)
                - num_inference_steps (int): Number of inference steps
                - guidance_scale (float): Guidance scale value
                - flow_shift (float): Flow shift value
                - embedded_guidance_scale (float): Embedded guidance scale value

        Returns:
            Dictionary containing the base64-encoded video, the seed used,
            and the prompt.
        """
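        # Illustrative request payload (example values, not endpoint defaults):
        # {
        #     "inputs": "A cat walks on the grass, realistic style.",
        #     "resolution": "1280x720",
        #     "video_length": 129,
        #     "seed": 42
        # }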
        prompt = data.pop("inputs", None)
        if prompt is None:
            raise ValueError("No prompt provided in the 'inputs' field")

        # Caller-supplied values override the defaults set in __init__.
        resolution = data.pop("resolution", self.default_params["resolution"])
        video_length = int(data.pop("video_length", self.default_params["video_length"]))
        seed = int(data.pop("seed", -1))
        num_inference_steps = int(data.pop("num_inference_steps", self.default_params["num_inference_steps"]))
        guidance_scale = float(data.pop("guidance_scale", self.default_params["guidance_scale"]))
        flow_shift = float(data.pop("flow_shift", self.default_params["flow_shift"]))
        embedded_guidance_scale = float(data.pop("embedded_guidance_scale", self.default_params["embedded_guidance_scale"]))

        try:
            width, height = map(int, resolution.split("x"))
        except ValueError:
            raise ValueError(f"Invalid resolution {resolution!r}; expected '<width>x<height>', e.g. '1280x720'")

        # A seed of -1 requests a random seed from the sampler.
        seed = None if seed == -1 else seed

        outputs = self.model.predict(
            prompt=prompt,
            height=height,
            width=width,
            video_length=video_length,
            seed=seed,
            negative_prompt="",
            infer_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            num_videos_per_prompt=1,
            flow_shift=flow_shift,
            batch_size=1,
            embedded_guidance_scale=embedded_guidance_scale
        )

        # predict() returns a batch of samples; keep the first and restore the
        # batch dimension that save_videos_grid expects.
        samples = outputs['samples']
        sample = samples[0].unsqueeze(0)

        temp_dir = "/tmp/video_output"
        os.makedirs(temp_dir, exist_ok=True)

        # Timestamped filename; hyphens instead of ":" for filesystem portability.
        time_flag = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        video_path = f"{temp_dir}/{time_flag}_seed{outputs['seeds'][0]}.mp4"
        save_videos_grid(sample, video_path, fps=24)

        # Return the clip base64-encoded so the response stays JSON-serializable,
        # then remove the temporary file.
        with open(video_path, "rb") as f:
            video_bytes = f.read()
        video_base64 = base64.b64encode(video_bytes).decode()

        os.remove(video_path)

        return {
            "video_base64": video_base64,
            "seed": outputs['seeds'][0],
            "prompt": outputs['prompts'][0]
        }
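

# --- Optional local smoke test (illustrative sketch, not part of the endpoint) ---
# The weights path and request values below are assumptions for demonstration;
# adjust them to match your deployment.
if __name__ == "__main__":
    handler = EndpointHandler(path="/repository")  # hypothetical weights directory
    result = handler({
        "inputs": "A cat walks on the grass, realistic style.",
        "resolution": "1280x720",
        "video_length": 129,
        "seed": 42,
    })
    # Decode the base64 payload back into a playable MP4 file.
    with open("sample_output.mp4", "wb") as out:
        out.write(base64.b64decode(result["video_base64"]))
    print(f"seed used: {result['seed']}")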