ValueError: Image features and image tokens do not match: tokens: 1, features 576

#2 opened by DsnTgr

This error occurs when running the demo; the full traceback is below.
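For context, the demo cell is roughly the standard HunyuanVideo I2V example from the diffusers docs; a sketch is below, assuming the hunyuanvideo-community/HunyuanVideo-I2V checkpoint (the loading code is not shown in the traceback, so the actual cell may differ in those details):

import torch
from diffusers import HunyuanVideoImageToVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video, load_image

# Assumed checkpoint; only the generation lines appear in the traceback below.
model_id = "hunyuanvideo-community/HunyuanVideo-I2V"
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    model_id, transformer=transformer, torch_dtype=torch.float16
)
pipe.vae.enable_tiling()
pipe.to("cuda")

prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
image = load_image("./a girl.png")

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)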

ValueError                                Traceback (most recent call last)
Cell In[1], line 29
     24 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
     25 image = load_image(
     26     "./a girl.png"
     27 )
---> 29 output = pipe(image=image, prompt=prompt).frames[0]
     30 export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:751, in HunyuanVideoImageToVideoPipeline.__call__(self, image, prompt, prompt_2, negative_prompt, negative_prompt_2, height, width, num_frames, num_inference_steps, sigmas, true_cfg_scale, guidance_scale, num_videos_per_prompt, generator, latents, prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, prompt_template, max_sequence_length)
    749 # 4. Encode input prompt
    750 transformer_dtype = self.transformer.dtype
--> 751 prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
    752     image=image,
    753     prompt=prompt,
    754     prompt_2=prompt_2,
    755     prompt_template=prompt_template,
    756     num_videos_per_prompt=num_videos_per_prompt,
    757     prompt_embeds=prompt_embeds,
    758     pooled_prompt_embeds=pooled_prompt_embeds,
    759     prompt_attention_mask=prompt_attention_mask,
    760     device=device,
    761     max_sequence_length=max_sequence_length,
    762 )
    763 prompt_embeds = prompt_embeds.to(transformer_dtype)
    764 prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:404, in HunyuanVideoImageToVideoPipeline.encode_prompt(self, image, prompt, prompt_2, prompt_template, num_videos_per_prompt, prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, device, dtype, max_sequence_length)
    389 def encode_prompt(
    390     self,
    391     image: torch.Tensor,
   (...)
    401     max_sequence_length: int = 256,
    402 ):
    403     if prompt_embeds is None:
--> 404         prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
    405             image,
    406             prompt,
    407             prompt_template,
    408             num_videos_per_prompt,
    409             device=device,
    410             dtype=dtype,
    411             max_sequence_length=max_sequence_length,
    412         )
    414     if pooled_prompt_embeds is None:
    415         if prompt_2 is None:

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:277, in HunyuanVideoImageToVideoPipeline._get_llama_prompt_embeds(self, image, prompt, prompt_template, num_videos_per_prompt, device, dtype, max_sequence_length, num_hidden_layers_to_skip, image_embed_interleave)
    273 prompt_attention_mask = text_inputs.attention_mask.to(device=device)
    275 image_embeds = self.image_processor(image, return_tensors="pt").pixel_values.to(device)
--> 277 prompt_embeds = self.text_encoder(
    278     input_ids=text_input_ids,
    279     attention_mask=prompt_attention_mask,
    280     pixel_values=image_embeds,
    281     output_hidden_states=True,
    282 ).hidden_states[-(num_hidden_layers_to_skip + 1)]
    283 prompt_embeds = prompt_embeds.to(dtype=dtype)
    285 image_emb_len = prompt_template.get("image_emb_len", 576)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
   1737     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1738 else:
-> 1739     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
   1745 # If we don't have any hooks, we want to skip the rest of the logic in
   1746 # this function, and just call forward.
   1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1748         or _global_backward_pre_hooks or _global_backward_hooks
   1749         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750     return forward_call(*args, **kwargs)
   1752 result = None
   1753 called_always_called_hooks = set()

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    168         output = module._old_forward(*args, **kwargs)
    169 else:
--> 170     output = module._old_forward(*args, **kwargs)
    171 return module._hf_hook.post_forward(module, output)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/transformers/utils/deprecation.py:172, in deprecate_kwarg.<locals>.wrapper.<locals>.wrapped_func(*args, **kwargs)
    168 elif minimum_action in (Action.NOTIFY, Action.NOTIFY_ALWAYS) and not is_torchdynamo_compiling():
    169     # DeprecationWarning is ignored by default, so we use FutureWarning instead
    170     warnings.warn(message, FutureWarning, stacklevel=2)
--> 172 return func(*args, **kwargs)

File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:426, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, image_sizes, **lm_kwargs)
    424     n_image_tokens = (input_ids == self.config.image_token_index).sum()
    425     n_image_features = image_features.shape[0] * image_features.shape[1]
--> 426     raise ValueError(
    427         f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
    428     )
    429 image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
    430 inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

ValueError: Image features and image tokens do not match: tokens: 1, features 576

Env:
diffusers==0.33.0.dev0
transformers==4.50.0.dev0

HunyuanVideo Community org

This was the environment the model was tested with: https://github.com/huggingface/diffusers/pull/10983#issuecomment-2706597418

For other versions, we'll have to introduce version guards and try to fix the incompatibility.
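For reference, a version guard here would presumably look something like the sketch below (illustrative only, not the actual diffusers change), branching on the installed transformers release before deciding how to prepare the image placeholder tokens:

# Illustrative sketch of a version guard; the real fix and cutoff may differ.
from packaging import version
import transformers

def needs_legacy_image_token_handling() -> bool:
    # Hypothetical cutoff; the exact boundary would come from the tested
    # environment linked above.
    return version.parse(transformers.__version__) < version.parse("4.50.0")

In the meantime, matching the environment from the linked PR comment (or pinning transformers to a release the pipeline was tested against) is likely the quicker workaround.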
