diff --git "a/modeling_hunyuan_image_3.py" "b/modeling_hunyuan_image_3.py" new file mode 100644--- /dev/null +++ "b/modeling_hunyuan_image_3.py" @@ -0,0 +1,3403 @@ +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import math +import random +import re +import time +import warnings +from dataclasses import dataclass +from typing import List, Union, Optional, Dict, Any, Tuple, Callable, TYPE_CHECKING +from datetime import datetime + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from PIL import Image +from einops import rearrange +from torch import Tensor +from torch import nn +from torch.cuda import nvtx + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, StaticCache +from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList +from transformers.generation.stopping_criteria import StoppingCriteriaList +from transformers.generation.streamers import TextStreamer +from transformers.generation.utils import GenerationMixin, GenerationConfig, ALL_CACHE_NAMES +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + logging, +) + +try: + import flashinfer +except Exception as e: + flashinfer = None + +#from .autoencoder_kl_3d import AutoencoderKLConv3D +from .autoencoder_kl_3d import AutoencoderKLConv3D_Dist, AutoencoderKLConv3D +from .configuration_hunyuan_image_3 import HunyuanImage3Config +from .hunyuan_image_3_pipeline import HunyuanImage3Text2ImagePipeline, FlowMatchDiscreteScheduler +from .image_processor import HunyuanImage3ImageProcessor +from .siglip2 import Siglip2VisionTransformer, LightProjector +from .tokenization_hunyuan_image_3 import HunyuanImage3TokenizerFast, ImageInfo, ImageTensor, CondImage +from .system_prompt import get_system_prompt + +from .cache_utils import TaylorCacheContainer, CacheWithFreqsContainer + +if TYPE_CHECKING: + from transformers.generation.streamers import BaseStreamer + +logger = logging.get_logger(__name__) + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func + +# Type aliases +BatchRaggedImages = Union[torch.Tensor, List[Union[torch.Tensor, List[torch.Tensor]]]] +BatchRaggedTensor = Union[torch.Tensor, List[torch.Tensor]] +InputImage = Optional[Union[Image.Image, str, bytes]] + + +def get_device(tensor: BatchRaggedImages): + if isinstance(tensor, torch.Tensor): + return tensor.device + elif isinstance(tensor, list): + return get_device(tensor[0]) + else: + raise ValueError(f"Unsupported type for get_device: {type(tensor)}") + + +_CONFIG_FOR_DOC = "HunyuanImage3Config" + +Hunyuan_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`HunyuanImage3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +# ======================================================= +# Helper Functions +# ======================================================= + +def default(val, d): + return val if val is not None else d + + +def to_device(data, device): + if device is None: + return data + if isinstance(data, torch.Tensor): + return data.to(device) + elif isinstance(data, list): + return [to_device(x, device) for x in data] + else: + return data + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def real_batched_index_select(t, dim, idx): + """ index_select for batched index and batched t """ + assert t.ndim >= 2 and idx.ndim >= 2, f"{t.ndim=} {idx.ndim=}" + assert len(t) == len(idx), f"{len(t)=} != {len(idx)=}" + return torch.stack([torch.index_select(t[i], dim - 1, idx[i]) for i in range(len(t))]) + + +# ======================================================= +# Module Functions +# ======================================================= + +def timestep_embedding(t, dim, max_period=10000): + """ + Create sinusoidal timestep embeddings. + + Args: + t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional. + dim (int): the dimension of the output. + max_period (int): controls the minimum frequency of the embeddings. + + Returns: + embedding (torch.Tensor): An (N, D) Tensor of positional embeddings. + + .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py + """ + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half + ).to(device=t.device) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1 + ) + return embedding + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. 
+ """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def normalization(channels, **kwargs): + """ + Make a standard normalization layer. + + :param channels: number of input channels. + :return: a nn.Module for normalization. + """ + return nn.GroupNorm(32, channels, **kwargs) + + +def topkgating( + logits: Tensor, + topk: int, + group_limited_greedy: bool = False, + n_group: int = None, + topk_group: int = None, + norm_topk_prob: bool = True, + routed_scaling_factor: float = 1.0, + capacity_factor: float = 1.0, + drop_tokens: bool = False, +): + logits = logits.float() + gates = F.softmax(logits, dim=1) + + if group_limited_greedy: + group_shape = list(gates.shape[:-1]) + [n_group, gates.shape[-1] // n_group] + group_scores = ( + gates.reshape(group_shape).max(dim=-1).values + ) # [n, n_group] + group_idx = torch.topk( + group_scores, topk_group, dim=-1, sorted=False + )[ + 1 + ] # [n, top_k_group] + group_mask = torch.zeros_like(group_scores) # [n, n_group] + group_mask.scatter_(1, group_idx, 1) # [n, n_group] + score_mask = ( + group_mask.unsqueeze(-1) + .expand( + group_shape + ) + .reshape(list(gates.shape)) + ) # [n, e] + gates = gates.masked_fill(~score_mask.bool(), 0.0) + + num_experts = int(gates.shape[1]) + # Top-k router probability and corresponding expert indices for each token. + # Shape: [tokens_per_group, num_selected_experts]. + expert_gate, expert_index = torch.topk(gates, topk) + expert_mask = F.one_hot(expert_index, num_experts) + # For a given token, determine if it was routed to a given expert. + # Shape: [tokens_per_group, num_experts] + expert_mask_aux = expert_mask.max(dim=-2)[0] + tokens_per_group_and_expert = torch.mean(expert_mask_aux.float(), dim=-2) + router_prob_per_group_and_expert = torch.mean(gates.float(), dim=-2) + l_aux = num_experts ** 2 * torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) + + if drop_tokens: + expert_capacity = int(max(topk, topk * gates.shape[0] // gates.shape[1]) * capacity_factor) + else: + expert_index_flat = expert_index.flatten() + tokens_per_expert = torch.bincount(expert_index_flat, minlength=num_experts) + expert_capacity = torch.max(tokens_per_expert).item() + + if norm_topk_prob and topk > 1: + gates_s = torch.clamp( + torch.matmul(expert_mask.float(), gates.unsqueeze(-1)).sum(dim=1), min=torch.finfo(gates.dtype).eps + ) + router_probs = gates / gates_s + else: + router_probs = gates * routed_scaling_factor + # Make num_selected_experts the leading axis to ensure that top-1 choices + # have priority over top-2 choices, which have priority over top-3 choices, + # etc. + expert_index = torch.transpose(expert_index, 0, 1) + # Shape: [num_selected_experts * tokens_per_group] + expert_index = expert_index.reshape(-1) + + # Create mask out of indices. + # Shape: [tokens_per_group * num_selected_experts, num_experts]. + expert_mask = F.one_hot(expert_index, num_experts).to(torch.int32) + exp_counts = torch.sum(expert_mask, dim=0).detach() + + # Experts have a fixed capacity that we cannot exceed. 
A token's priority + # within the expert's buffer is given by the masked, cumulative capacity of + # its target expert. + # Shape: [tokens_per_group * num_selected_experts, num_experts]. + token_priority = torch.cumsum(expert_mask, dim=0) * expert_mask - 1 + # Shape: [num_selected_experts, tokens_per_group, num_experts]. + token_priority = token_priority.reshape((topk, -1, num_experts)) + # Shape: [tokens_per_group, num_selected_experts, num_experts]. + token_priority = torch.transpose(token_priority, 0, 1) + # For each token, across all selected experts, select the only non-negative + # (unmasked) priority. Now, for group G routing to expert E, token T has + # non-negative priority (i.e. token_priority[G,T,E] >= 0) if and only if E + # is its targeted expert. + # Shape: [tokens_per_group, num_experts]. + token_priority = torch.max(token_priority, dim=1)[0] + + # Token T can only be routed to expert E if its priority is positive and + # less than the expert capacity. One-hot matrix will ignore indices outside + # the range [0, expert_capacity). + # Shape: [tokens_per_group, num_experts, expert_capacity]. + valid_mask = torch.logical_and(token_priority >= 0, token_priority < expert_capacity) + token_priority = torch.masked_fill(token_priority, ~valid_mask, 0) + dispatch_mask = F.one_hot(token_priority, expert_capacity).to(torch.bool) + valid_mask = valid_mask.unsqueeze(-1).expand(-1, -1, expert_capacity) + dispatch_mask = torch.masked_fill(dispatch_mask, ~valid_mask, 0) + + # The combine array will be used for combining expert outputs, scaled by the + # router probabilities. Shape: [num_groups, tokens_per_group, num_experts, + # expert_capacity]. + combine_weights = torch.einsum("...te,...tec->...tec", router_probs, dispatch_mask) + exp_counts_capacity = torch.sum(dispatch_mask) + exp_capacity_rate = exp_counts_capacity / (logits.shape[0] * topk) + + return [l_aux, exp_capacity_rate], combine_weights, dispatch_mask, exp_counts + + +# ======================================================= +# Multi-Dimensional RoPE +# ======================================================= + +def _to_tuple(x, dim=2): + if isinstance(x, int): + return (x,) * dim + elif len(x) == dim: + return x + else: + raise ValueError(f"Expected length {dim} or int, but got {x}") + + +def get_meshgrid_nd(start, *args, dim=2, device="cpu"): + """ + Get n-D meshgrid with start, stop and num. + + Args: + start (int or tuple): If len(args) == 0, start is num; If len(args) == 1, start is start, args[0] is stop, + step is 1; If len(args) == 2, start is start, args[0] is stop, args[1] is num. For n-dim, start/stop/num + should be int or n-tuple. If n-tuple is provided, the meshgrid will be stacked following the dim order in + n-tuples. + *args: See above. + dim (int): Dimension of the meshgrid. Defaults to 2. + + Returns: + grid (np.ndarray): [dim, ...] 
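+
+    Example (illustrative; note the grid is returned as a stacked torch.Tensor):
+
+        grid = get_meshgrid_nd((2, 3))   # start acts as the grid size -> ranges [0, 2) x [0, 3)
+        grid.shape                       # torch.Size([2, 2, 3]), i.e. [dim, H, W]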
+ """ + if len(args) == 0: + # start is grid_size + num = _to_tuple(start, dim=dim) + start = (0,) * dim + stop = num + elif len(args) == 1: + # start is start, args[0] is stop, step is 1 + start = _to_tuple(start, dim=dim) + stop = _to_tuple(args[0], dim=dim) + num = [stop[i] - start[i] for i in range(dim)] + # assert num are all integers + num_int = [int(x) for x in num] + assert (torch.tensor(num) == torch.tensor(num_int)).all(), f"num should be int, but got {num}" + num = num_int + elif len(args) == 2: + # start is start, args[0] is stop, args[1] is num + start = _to_tuple(start, dim=dim) # Left-Top eg: 12,0 + stop = _to_tuple(args[0], dim=dim) # Right-Bottom eg: 20,32 + num = _to_tuple(args[1], dim=dim) # Target Size eg: 32,124 + else: + raise ValueError(f"len(args) should be 0, 1 or 2, but got {len(args)}") + + # PyTorch implement of np.linspace(start[i], stop[i], num[i], endpoint=False) + axis_grid = [] + for i in range(dim): + a, b, n = start[i], stop[i], num[i] + g = torch.linspace(a, b, n + 1, dtype=torch.float32, device=device)[:n] + axis_grid.append(g) + grid = torch.meshgrid(*axis_grid, indexing="ij") # dim x [H, W] + grid = torch.stack(grid, dim=0) # [dim, H, W] + + return grid + + +def build_2d_rope( + seq_len: int, n_elem: int, image_infos: Optional[List[Tuple[slice, Tuple[int, int]]]] = None, + device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0, + return_all_pos: bool = False, +): + """ + Reference: https://kexue.fm/archives/10352 + + Start from 1, we have + beta_y = L + (wh - h)/2 + beta_x = L + (wh - w)/2 + + Returns + ------- + cos: torch.Tensor with shape of [seq_len, n_elem] + sin: torch.Tensor with shape of [seq_len, n_elem] + """ + assert n_elem % 4 == 0, f"n_elem must be divisible by 4, but got {n_elem}." + + # theta + if base_rescale_factor != 1.0: + base *= base_rescale_factor ** (n_elem / (n_elem - 2)) + theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem)) + theta = theta.reshape(1, n_elem // 4, 2) # [1, half_d, 2] + + # position indices + if image_infos is None: + image_infos = [] + + image_infos_list = [image_infos] + sample_seq_lens = [seq_len] + + # Prepare position indices for each sample + x_sections = [] + y_sections = [] + for sample_id, sample_image_infos in enumerate(image_infos_list): + last_pos = 0 + for sec_slice, (h, w) in sample_image_infos: + L = sec_slice.start # start from 0, so image_slice.start is just L + # previous text + if last_pos < L: + y_sections.append(torch.arange(last_pos, L, device=device)) + x_sections.append(torch.arange(last_pos, L, device=device)) + elif h is None: + # Interleave data has overlapped positions for tokens. + y_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device)) + x_sections.append(torch.arange(sec_slice.start, sec_slice.stop, device=device)) + continue + else: + # Interleave data has overlapped positions for noised image and the successive clean image, + # leading to last_pos (= last text end L + noise w * h) > L (last text end L). 
+ pass + # current image + beta_y = L + (w * h - h) / 2 + beta_x = L + (w * h - w) / 2 + grid = get_meshgrid_nd((beta_y, beta_x), (beta_y + h, beta_x + w), device=device) # [2, h, w] + grid = grid.reshape(2, -1) # (y, x) + y_sections.append(grid[0]) + x_sections.append(grid[1]) + # step + last_pos = L + w * h + # final text + y_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device)) + x_sections.append(torch.arange(last_pos, sample_seq_lens[sample_id], device=device)) + + x_pos = torch.cat(x_sections).long() + y_pos = torch.cat(y_sections).long() + # If there are overlap positions, we need to remove them. + x_pos = x_pos[:seq_len] + y_pos = y_pos[:seq_len] + all_pos = torch.stack((y_pos, x_pos), dim=1).unsqueeze(1).to(device) # [seq_len, 1, 2] + + # calc rope + idx_theta = (all_pos * theta).reshape(all_pos.shape[0], n_elem // 2).repeat(1, 2) + + cos = torch.cos(idx_theta) + sin = torch.sin(idx_theta) + + if return_all_pos: + return cos, sin, all_pos + + return cos, sin + + +def build_batch_2d_rope( + seq_len: int, n_elem: int, image_infos: Optional[List[List[Tuple[slice, Tuple[int, int]]]]] = None, + device: Optional[torch.device] = None, base: int = 10000, base_rescale_factor: float = 1.0, + return_all_pos: bool = False, +): + cos_list, sin_list, all_pos_list = [], [], [] + if image_infos is None: + image_infos = [None] + for i, image_info in enumerate(image_infos): + res = build_2d_rope( + seq_len, n_elem, image_infos=image_info, device=device, + base=base, base_rescale_factor=base_rescale_factor, + return_all_pos=return_all_pos, + ) + if return_all_pos: + cos, sin, all_pos = res + else: + cos, sin = res + all_pos = None + cos_list.append(cos) + sin_list.append(sin) + all_pos_list.append(all_pos) + + stacked_cos = torch.stack(cos_list, dim=0) + stacked_sin = torch.stack(sin_list, dim=0) + + if return_all_pos: + return stacked_cos, stacked_sin, all_pos_list + + return stacked_cos, stacked_sin + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass shifted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
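+
+    Example (shape sketch only; the tensor names are placeholders):
+
+        # q, k:     [batch, num_heads, seq_len, head_dim]
+        # cos, sin: [batch, seq_len, head_dim]
+        q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 broadcasts over heads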
+ """ + if position_ids is not None: + cos = cos[position_ids] + sin = sin[position_ids] + + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# ======================================================= +# Modules for Image Generation +# ======================================================= + +class TimestepEmbedder(nn.Module): + """ + Embeds scalar timesteps into vector representations. + """ + def __init__(self, + hidden_size, + act_layer=nn.GELU, + frequency_embedding_size=256, + max_period=10000, + out_size=None, + dtype=None, + device=None + ): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period + if out_size is None: + out_size = hidden_size + + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, hidden_size, bias=True, **factory_kwargs), + act_layer(), + nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs), + ) + nn.init.normal_(self.mlp[0].weight, std=0.02) + nn.init.normal_(self.mlp[2].weight, std=0.02) + + def forward(self, t): + t_freq = timestep_embedding(t, self.frequency_embedding_size, self.max_period).type(self.mlp[0].weight.dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1, **factory_kwargs) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, device=None, dtype=None): + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=1, **factory_kwargs + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(nn.Module): + """ + A residual block that can optionally change the number of channels. 
+ + :param in_channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + in_channels, + emb_channels, + out_channels=None, + dropout=0.0, + use_conv=False, + dims=2, + up=False, + down=False, + device=None, + dtype=None, + ): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + self.in_channels = in_channels + self.dropout = dropout + self.out_channels = out_channels or self.in_channels + self.use_conv = use_conv + + self.in_layers = nn.Sequential( + normalization(self.in_channels, **factory_kwargs), + nn.SiLU(), + conv_nd(dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(self.in_channels, False, dims, **factory_kwargs) + self.x_upd = Upsample(self.in_channels, False, dims, **factory_kwargs) + elif down: + self.h_upd = Downsample(self.in_channels, False, dims, **factory_kwargs) + self.x_upd = Downsample(self.in_channels, False, dims, **factory_kwargs) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(emb_channels, 2 * self.out_channels, **factory_kwargs) + ) + + self.out_layers = nn.Sequential( + normalization(self.out_channels, **factory_kwargs), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1, **factory_kwargs) + ), + ) + + if self.out_channels == self.in_channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, self.in_channels, self.out_channels, 3, padding=1, **factory_kwargs + ) + else: + self.skip_connection = conv_nd(dims, self.in_channels, self.out_channels, 1, **factory_kwargs) + + def forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + + emb_out = self.emb_layers(emb) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + + # Adaptive Group Normalization + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = torch.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1. 
+ scale) + shift + h = out_rest(h) + + return self.skip_connection(x) + h + + +class UNetDown(nn.Module): + """ + patch_size: one of [1, 2 ,4 ,8] + in_channels: vae latent dim + hidden_channels: hidden dim for reducing parameters + out_channels: transformer model dim + """ + def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels, + dropout=0.0, device=None, dtype=None): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + + self.patch_size = patch_size + assert self.patch_size in [1, 2, 4, 8] + + self.model = nn.ModuleList( + [conv_nd( + 2, + in_channels=in_channels, + out_channels=hidden_channels, + kernel_size=3, + padding=1, + **factory_kwargs + )] + ) + + if self.patch_size == 1: + self.model.append(ResBlock( + in_channels=hidden_channels, + emb_channels=emb_channels, + out_channels=out_channels, + dropout=dropout, + **factory_kwargs + )) + else: + for i in range(self.patch_size // 2): + self.model.append(ResBlock( + in_channels=hidden_channels, + emb_channels=emb_channels, + out_channels=hidden_channels if (i + 1) * 2 != self.patch_size else out_channels, + dropout=dropout, + down=True, + **factory_kwargs + )) + + def forward(self, x, t): + assert x.shape[2] % self.patch_size == 0 and x.shape[3] % self.patch_size == 0 + for module in self.model: + if isinstance(module, ResBlock): + x = module(x, t) + else: + x = module(x) + _, _, token_h, token_w = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + return x, token_h, token_w + + +class UNetUp(nn.Module): + """ + patch_size: one of [1, 2 ,4 ,8] + in_channels: transformer model dim + hidden_channels: hidden dim for reducing parameters + out_channels: vae latent dim + """ + def __init__(self, patch_size, in_channels, emb_channels, hidden_channels, out_channels, + dropout=0.0, device=None, dtype=None, out_norm=False): + factory_kwargs = {'dtype': dtype, 'device': device} + super().__init__() + + self.patch_size = patch_size + assert self.patch_size in [1, 2, 4, 8] + + self.model = nn.ModuleList() + + if self.patch_size == 1: + self.model.append(ResBlock( + in_channels=in_channels, + emb_channels=emb_channels, + out_channels=hidden_channels, + dropout=dropout, + **factory_kwargs + )) + else: + for i in range(self.patch_size // 2): + self.model.append(ResBlock( + in_channels=in_channels if i == 0 else hidden_channels, + emb_channels=emb_channels, + out_channels=hidden_channels, + dropout=dropout, + up=True, + **factory_kwargs + )) + + if out_norm: + self.model.append(nn.Sequential( + normalization(hidden_channels, **factory_kwargs), + nn.SiLU(), + conv_nd( + 2, + in_channels=hidden_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + **factory_kwargs + ), + )) + else: + self.model.append(conv_nd( + 2, + in_channels=hidden_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + **factory_kwargs + )) + + # batch_size, seq_len, model_dim + def forward(self, x, t, token_h, token_w): + x = rearrange(x, 'b (h w) c -> b c h w', h=token_h, w=token_w) + for module in self.model: + if isinstance(module, ResBlock): + x = module(x, t) + else: + x = module(x) + return x + + +# ======================================================= +# Modules for Transformer Backbone +# ======================================================= + +@dataclass +class CausalMMOutputWithPast(CausalLMOutputWithPast): + diffusion_prediction: Optional[torch.Tensor] = None + + +class HunyuanStaticCache(StaticCache): + """ + A custom static cache for multi-modal models that supports dynamic 
extension of the cache + and inplace updates of the cache. + + This cache supports batch cache_position updates. + """ + def __init__(self, *args, **kwargs): + self.dynamic = kwargs.pop("dynamic", False) + super().__init__(*args, **kwargs) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + It is VERY important to index using a tensor, otherwise you introduce a copy to the device. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input + to know how where to write in the cache. + + Return: + A tuple containing the updated key and value states. + """ + cache_position = cache_kwargs.get("cache_position") + if self.layers[layer_idx].keys is None: + self.layers[layer_idx].lazy_initialization(key_states) + k_out = self.layers[layer_idx].keys + v_out = self.layers[layer_idx].values + + if cache_position is None: + k_out.copy_(key_states) + v_out.copy_(value_states) + else: + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. + if cache_position.dim() == 1: + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + + if self.dynamic: + end = cache_position[-1].item() + 1 + k_out = k_out[:, :, :end] + v_out = v_out[:, :, :end] + else: + assert cache_position.dim() == 2, f"multiple batch dims not yet {cache_position.shape=}" + batch_size, idx_size = cache_position.shape + assert batch_size == k_out.size(0) + assert batch_size == v_out.size(0) + assert batch_size == key_states.size(0) + assert batch_size == value_states.size(0) + for i in range(batch_size): + unbatched_dim = 1 + k_out[i].index_copy_(unbatched_dim, cache_position[i], key_states[i]) + v_out[i].index_copy_(unbatched_dim, cache_position[i], value_states[i]) + + if self.dynamic: + assert len(cache_position) == 1 + end = cache_position[0, -1].item() + 1 + k_out = k_out[:, :, :end] + v_out = v_out[:, :, :end] + + return k_out, v_out + + +class CachedRoPE(object): + """ A 2D RoPE is determined by rope_image_info and seq_len. """ + + def __init__(self, config): + self.config = config + self.cos_cache = None + self.sin_cache = None + self.seq_len = None + self.rope_image_info = None + + def __call__(self, seq_len, device, rope_image_info=None, position_ids=None): + """ Get cached RoPE for given seq_len and rope_image_info. + If cache miss, compute and cache it. + + Args: + seq_len (int): The sequence length. + device (torch.device): The device to store the RoPE. + rope_image_info (list): The rope image info. list of lists of (slice, (height, width)) tuples. + position_ids (torch.Tensor): The input positions. + + Returns: + The RoPE cos and sin tensors. 
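+
+            Example (illustrative; `rope` is an instance of this class, `device` a placeholder):
+
+                # one 32x32 latent image starting at token offset 10, batch of one sample
+                cos, sin = rope(seq_len=1100, device=device,
+                                rope_image_info=[[(slice(10, 10 + 32 * 32), (32, 32))]])
+                # cos, sin: [1, 1100, attention_head_dim]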
+ """ + if (self.seq_len != seq_len) or (rope_image_info is not None and self.rope_image_info != rope_image_info): + # Cache miss, compute RoPE + if self.config.rope_type in ["2d", "default"]: + self.cos_cache, self.sin_cache = build_batch_2d_rope( + image_infos=rope_image_info, + seq_len=seq_len, + n_elem=self.config.attention_head_dim, + device=device, + base=self.config.rope_theta, + ) + else: + raise NotImplementedError(f"rope_type `{self.config.rope_type}` not supported") + else: + # hit cache + pass + + if position_ids is None: + # Typically for training + cos, sin = self.cos_cache, self.sin_cache + else: + # Typically for inference + assert position_ids.dim() == 2, f"{position_ids.shape=}" + head_size = self.cos_cache.size(-1) + cos = torch.gather(self.cos_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size)) + sin = torch.gather(self.sin_cache, dim=1, index=position_ids.unsqueeze(-1).expand(-1, -1, head_size)) + + return cos, sin + + +class HunyuanRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6, cast_weight_fp32=False): + """ + HunyuanRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + self.cast_weight_fp32 = cast_weight_fp32 + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + if self.cast_weight_fp32: + return (self.weight.float() * hidden_states).to(input_dtype) + else: + return self.weight * hidden_states.to(input_dtype) + + +class HunyuanMLP(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx=None, is_shared_mlp=False, is_moe=False): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.hidden_act = config.hidden_act + + self.intermediate_size = config.intermediate_size + if is_shared_mlp or is_moe: + # 如果是 moe 的话,优先用 moe_intermediate_size + if config.moe_intermediate_size is not None: + self.intermediate_size = config.moe_intermediate_size \ + if isinstance(config.moe_intermediate_size, int) else config.moe_intermediate_size[layer_idx] + + if is_shared_mlp: + num_shared_expert = config.num_shared_expert \ + if isinstance(config.num_shared_expert, int) else config.num_shared_expert[layer_idx] + self.intermediate_size *= num_shared_expert + + self.act_fn = ACT2FN[config.hidden_act] + if self.hidden_act == "silu": + self.intermediate_size *= 2 # SwiGLU + self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size // 2, self.hidden_size, bias=config.mlp_bias) + elif self.hidden_act == "gelu": + self.gate_and_up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + else: + assert False, "other hidden_act are not supported" + + def forward(self, x): + if self.hidden_act == "silu": + gate_and_up_proj = self.gate_and_up_proj(x) + x1, x2 = gate_and_up_proj.chunk(2, dim=2) + down_proj = self.down_proj(x1 * self.act_fn(x2)) + return down_proj + elif self.hidden_act == "gelu": + intermediate = self.gate_and_up_proj(x) + intermediate = self.act_fn(intermediate) + output = self.down_proj(intermediate) + return output + else: + assert False, "other hidden_act 
are not supported" + + +class HunyuanTopKGate(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.moe_topk = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx] + self.drop_tokens = config.moe_drop_tokens + self.min_capacity = 8 + self.random_routing_dropped_token = config.moe_random_routing_dropped_token + num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32) + + # DeepSeek gating args + self.routed_scaling_factor = config.routed_scaling_factor + self.n_group = config.n_group + self.topk_group = config.topk_group + self.norm_topk_prob = config.norm_topk_prob + self.group_limited_greedy = config.group_limited_greedy + + def forward(self, hidden_states, topk_impl='default'): + bsz, seq_len, hidden_size = hidden_states.shape + hidden_states = hidden_states.reshape(-1, hidden_size) + if self.wg.weight.dtype == torch.float32: + hidden_states = hidden_states.float() + logits = self.wg(hidden_states) + if topk_impl == 'default': + gate_output = topkgating(logits, self.moe_topk, group_limited_greedy=self.group_limited_greedy, + n_group=self.n_group, topk_group=self.topk_group, + norm_topk_prob=self.norm_topk_prob, + routed_scaling_factor=self.routed_scaling_factor, + capacity_factor=self.config.capacity_factor, + drop_tokens=self.drop_tokens) + elif topk_impl == 'easy': + gate_output = self.easy_topk(logits, self.moe_topk) + else: + raise ValueError(f"Unsupported topk_impl: {topk_impl}") + + return gate_output + + @staticmethod + def easy_topk(logits, moe_topk): + gates = F.softmax(logits, dim=1) + topk_weight_1, expert_index = torch.topk(gates, moe_topk) + weight_sums = topk_weight_1.sum(dim=1, keepdim=True) + weight_sums = torch.clamp(weight_sums, min=1e-8) + topk_weight = topk_weight_1 / weight_sums + + return topk_weight, expert_index + + +class HunyuanMoE(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.moe_topk = config.moe_topk + self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx] + if config.use_mixed_mlp_moe: + self.shared_mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=True) + self.gate = HunyuanTopKGate(config, layer_idx=layer_idx) + self.experts = nn.ModuleList( + [HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=True) for _ in range(self.num_experts)] + ) + + self._moe_impl = config.moe_impl + # For FlashInfer + self.moe_weight = None + self.moe_weight_2 = None + self._weights_initialized = False + + @property + def moe_impl(self): + return self._moe_impl + + @moe_impl.setter + def moe_impl(self, value): + self._moe_impl = value + if self._moe_impl == "flashinfer": + assert flashinfer is not None, "When using fused_moe, flashinfer must be installed." 
+ + def forward(self, hidden_states): + torch.cuda.set_device(hidden_states.device.index) + bsz, seq_len, hidden_size = hidden_states.shape + + if self.config.use_mixed_mlp_moe: + hidden_states_mlp = self.shared_mlp(hidden_states) + + reshaped_input = hidden_states.reshape(-1, hidden_size) # [bsz*seq_len, hidden_size] + + with nvtx.range("MoE"): + if self._moe_impl == "flashinfer": + # Get expert weights + if not self._weights_initialized: + self._initialize_weights_on_device(hidden_states.device) + topk_weight, topk_index = self.gate(hidden_states, topk_impl='easy') + + combined_output = torch.zeros_like(reshaped_input) + _ = flashinfer.fused_moe.cutlass_fused_moe( # noqa + reshaped_input.contiguous(), + topk_index.to(torch.int).contiguous(), + topk_weight.to(torch.float).contiguous(), + self.moe_weight, + self.moe_weight_2, + torch.bfloat16, + output=combined_output, + quant_scales=None, + ) + else: + # Original implementation - fallback for compatibility + l_moe, combine_weights, dispatch_mask, exp_counts = self.gate(hidden_states, topk_impl='default') + dispatched_input = torch.einsum("sec,sm->ecm", dispatch_mask.type_as(hidden_states), reshaped_input) + chunks = dispatched_input.chunk(self.num_experts, dim=0) + expert_outputs = [] + for chunk, expert in zip(chunks, self.experts): + expert_outputs.append(expert(chunk)) + + expert_output = torch.cat(expert_outputs, dim=0) + combined_output = torch.einsum("sec,ecm->sm", combine_weights.type_as(hidden_states), expert_output) + + combined_output = combined_output.reshape(bsz, seq_len, hidden_size) + + if self.config.use_mixed_mlp_moe: + output = hidden_states_mlp + combined_output # noqa + else: + output = combined_output + + return output + + def _initialize_weights_on_device(self, device): + expert_weights_gate_up = [] + expert_weights_down = [] + + for expert in self.experts: + expert.to(device) + expert_weights_gate_up.append(expert.gate_and_up_proj.weight.to(device)) + expert_weights_down.append(expert.down_proj.weight.to(device)) + + self.moe_weight = torch.stack(expert_weights_gate_up).contiguous() + self.moe_weight_2 = torch.stack(expert_weights_down).contiguous() + # empty the expert weights + for expert in self.experts: + expert.gate_and_up_proj.weight.data = torch.empty(0, device=device) + if expert.gate_and_up_proj.bias is not None: + expert.gate_and_up_proj.bias.data = torch.empty(0, device=device) + expert.down_proj.weight.data = torch.empty(0, device=device) + if expert.down_proj.bias is not None: + expert.down_proj.bias.data = torch.empty(0, device=device) + + self._weights_initialized = True + + +class HunyuanImage3SDPAAttention(nn.Module): + """PyTorch SDPA attention implementation using torch.nn.functional.scaled_dot_product_attention""" + + def __init__(self, config: HunyuanImage3Config, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.attention_type = 'self' + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + # self.head_dim = self.hidden_size // self.num_heads + self.head_dim: int = config.attention_head_dim + self.num_key_value_heads = config.num_key_value_heads if config.num_key_value_heads else self.num_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.use_qk_norm = config.use_qk_norm + self.use_rotary_pos_emb = 
config.use_rotary_pos_emb + self.hidden_size_q = self.head_dim * self.num_heads + self.hidden_size_kv = self.head_dim * self.num_key_value_heads + + # define layers + self.qkv_proj = nn.Linear( + self.hidden_size, + self.hidden_size_q + 2 * self.hidden_size_kv, + bias=config.attention_bias + ) + self.o_proj = nn.Linear(self.hidden_size_q, self.hidden_size, bias=config.attention_bias) + + if self.use_qk_norm: + self.query_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.key_layernorm = HunyuanRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + if self.use_rotary_pos_emb: + self._init_rope() + + def _init_rope(self): + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "custom": + # Using custom rotary embedding + self.rotary_emb = None + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.reshape(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: Optional[bool] = False, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]: + if output_attentions: + raise NotImplementedError( + 'HunyuanImage3Model is using HunyuanImage3SDPAAttention,' + 'but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`.' + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.qkv_proj(hidden_states) + qkv_states = qkv_states.reshape(bsz, q_len, self.num_key_value_heads, self.num_key_value_groups + 2, + self.head_dim) + query_states, key_states, value_states = torch.split(qkv_states, [self.num_key_value_groups, 1, 1], dim=3) + + query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if self.use_rotary_pos_emb: + cos, sin = custom_pos_emb + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if self.use_qk_norm: + query_states = self.query_layernorm(query_states) + key_states = self.key_layernorm(key_states) + + query_states = query_states.to(value_states.dtype) + key_states = key_states.to(value_states.dtype) + + if past_key_value is not None: + cache_kwargs = {"cache_position": position_ids} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + query_states = query_states.to(key_states.dtype) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with + # custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
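+        # For reference, the SDPA call below is equivalent to this eager-math sketch (dropout_p=0.0):
+        #   scores = query_states @ key_states.transpose(-2, -1) / math.sqrt(self.head_dim)
+        #   scores = scores + attention_mask   # additive float mask (or masked_fill for a boolean mask)
+        #   attn_output = scores.softmax(dim=-1) @ value_states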
+ if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=0.0 + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +Hunyuan_ATTENTION_CLASSES = { + "eager": HunyuanImage3SDPAAttention, + "sdpa": HunyuanImage3SDPAAttention, +} + + +class HunyuanImage3DecoderLayer(nn.Module): + def __init__(self, config: HunyuanImage3Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.layer_idx = layer_idx + + attn_impl = config._attn_implementation # noqa + if attn_impl in Hunyuan_ATTENTION_CLASSES: + self.self_attn = Hunyuan_ATTENTION_CLASSES[attn_impl](config=config, layer_idx=layer_idx) + else: + raise ValueError(f"Unsupported attention implementation: {attn_impl}") + + if ((isinstance(config.num_experts, int) and config.num_experts > 1) or ( + isinstance(config.num_experts, list) and max( + config.num_experts) > 1)) and layer_idx >= config.moe_layer_num_skipped: + self.mlp = HunyuanMoE(config, layer_idx=layer_idx) + else: + self.mlp = HunyuanMLP(config, layer_idx=layer_idx, is_shared_mlp=False, is_moe=False) + if config.norm_type == 'hf_rms' or config.norm_type == 'rms': + self.input_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + elif config.norm_type == 'fused' or config.norm_type == 'torch_nn': + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + assert False, "other norm_type are not supported" + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + **kwargs, + ) -> Tuple[torch.FloatTensor | Any]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + position_ids (`torch.LongTensor`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + custom_pos_emb (`Tuple[torch.FloatTensor]`, *optional*): custom position embedding for rotary + position embedding + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use " + "`attention_mask` instead.`" + ) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + custom_pos_emb=custom_pos_emb, + **kwargs, + ) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +@add_start_docstrings( + "The bare Hunyuan Image 3 Model outputting raw hidden-states without any specific head on top.", + Hunyuan_START_DOCSTRING, +) +class HunyuanImage3PreTrainedModel(PreTrainedModel): + config_class = HunyuanImage3Config + base_model_prefix = "" + supports_gradient_checkpointing = True + _no_split_modules = ["HunyuanImage3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +Hunyuan_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Hunyuan Model outputting raw hidden-states without any specific head on top.", + Hunyuan_START_DOCSTRING, +) +class HunyuanImage3Model(HunyuanImage3PreTrainedModel): + def __init__(self, config: HunyuanImage3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.add_classification_head = config.add_classification_head + self.wte = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [HunyuanImage3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + if not config.add_classification_head: + self.ln_f = HunyuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + self.shared_tensor = None + + @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + custom_pos_emb: Optional[Tuple[torch.FloatTensor]] = None, + mode: str = "gen_text", + first_step: Optional[bool] = None, + post_token_len: int = None, + num_image_tokens: int = None, + gen_timestep_scatter_index: Optional[torch.Tensor] = None, + num_special_tokens: int = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for layer_idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + custom_pos_emb=custom_pos_emb, + mode=mode, + first_step=first_step, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if not self.add_classification_head: + # Do ln_f outside of the model for compatibility with image generation. 
+ pass + # hidden_states = self.ln_f(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class HunyuanImage3ForCausalMM(HunyuanImage3PreTrainedModel, GenerationMixin): + def __init__(self, config: HunyuanImage3Config, skip_load_module:set[str]={}, use_dist_vae=False, wgt_path=""): + """HunyuanImage3ForCausalMM + + Args: + config (HunyuanImage3Config): model config to initialize the model + skip_load_module (set[str], optional): + modules to skip loading, used for vllm inference. Defaults to {}. + + Raises: + ValueError: if config is invalid + """ + super().__init__(config) + self.config = config + self._tokenizer: Optional[HunyuanImage3TokenizerFast] = None + + #self.generation_config = GenerationConfig.from_model_config(config) + + # Initialize image preprocessor (for conditional images) + self.image_processor = HunyuanImage3ImageProcessor(config) + + if 'all' in skip_load_module: + skip_load_module = { + 'vae', + 'vit', + 'timestep_emb', + 'patch_embed', + 'time_embed', + 'final_layer', + 'time_embed_2', + 'transformers', + } + if 'vae' not in skip_load_module: + # vae and gen_image pipeline + if not use_dist_vae: + self.vae = AutoencoderKLConv3D.from_config(config.vae) + self.vae_dtype = getattr(torch, config.vae_dtype) + self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype) + self.vae = self.vae.eval() + for param in self.vae.parameters(): + param.requires_grad = False # + else: + self.vae = AutoencoderKLConv3D_Dist.from_config(config.vae) + self.vae_dtype = getattr(torch, config.vae_dtype) + self.vae_autocast_dtype = getattr(torch, config.vae_autocast_dtype) + self.vae.create_dist(wgt_path, config.vae) + self._pipeline = None + + if 'vit' not in skip_load_module: + # vit + self.vision_model = Siglip2VisionTransformer(config.vit) + self.vision_aligner = LightProjector(config.vit_aligner) + + if 'timestep_emb' not in skip_load_module: + # image generation related + self.timestep_emb = TimestepEmbedder(hidden_size=config.hidden_size) + + if self.config.cfg_distilled: + self.guidance_emb = TimestepEmbedder(hidden_size=config.hidden_size) + if self.config.use_meanflow: + self.timestep_r_emb = TimestepEmbedder(hidden_size=config.hidden_size) + + if config.img_proj_type == "unet": + if 'patch_embed' not in skip_load_module: + self.patch_embed = UNetDown( + patch_size=config.patch_size, + emb_channels=config.hidden_size, + in_channels=config.vae["latent_channels"], + hidden_channels=config.patch_embed_hidden_dim, + out_channels=config.hidden_size, + ) + if 'time_embed' not in skip_load_module: + self.time_embed = TimestepEmbedder(hidden_size=config.hidden_size) + + if 'final_layer' not in skip_load_module: + self.final_layer = UNetUp( + patch_size=config.patch_size, + emb_channels=config.hidden_size, + in_channels=config.hidden_size, + hidden_channels=config.patch_embed_hidden_dim, + out_channels=config.vae["latent_channels"], + out_norm=True, + ) + if 'time_embed_2' not in skip_load_module: + self.time_embed_2 = TimestepEmbedder(hidden_size=config.hidden_size) + else: + raise ValueError(f"Unknown img_proj_type 
{config.img_proj_type}") + + if 'transformers' not in skip_load_module: + # transformer backbone + self.model = HunyuanImage3Model(config) + # linear head + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.pad_id = config.pad_id + self.vocab_size = config.vocab_size + + # Taylor Cache + self.use_taylor_cache = False + + self.num_image_tokens = None + self.num_special_tokens = None + # Initialize cached rope, supporting automatic cache update + self.cached_rope = CachedRoPE(config) + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def from_config(cls, config: HunyuanImage3Config, skip_load_module:set[str]={}): + return cls(config, skip_load_module=skip_load_module) + + @property + def tokenizer(self): + if self._tokenizer is None: + raise ValueError("Attribute `tokenizer` has not been initialized yet. Please set it first.") + return self._tokenizer + + def load_tokenizer(self, tokenizer): + self._tokenizer = HunyuanImage3TokenizerFast.from_pretrained(tokenizer) + + @property + def pipeline(self): + if self._pipeline is None: + self.scheduler = FlowMatchDiscreteScheduler( + shift=self.generation_config.flow_shift, reverse=True, solver="euler", + ) + self._pipeline = HunyuanImage3Text2ImagePipeline( + model=self, scheduler=self.scheduler, vae=self.vae, + ) + return self._pipeline + + def instantiate_vae_image_tokens( + self, + hidden_states: torch.Tensor, + timesteps: BatchRaggedTensor, + images: BatchRaggedImages, + image_mask: torch.Tensor, + guidance: torch.Tensor = None, + timesteps_r: torch.Tensor = None, + ): + """ + Instantiate the VAE image embeddings into the input embedding sequence. + + Args: + hidden_states: input sequence, (batch_size, seq_len, n_embd) + images: BatchRaggedImages + images can be a 4-D tensor, or a list of 4-D tensors, or a list of lists of 3-D tensors. 
+ timesteps: BatchRaggedTensor + ts can be a 1-D tensor, or a list of 1-D tensors + image_mask: (batch_size, seq_len) + """ + if hidden_states is None: + # Only for inference in non-first step image generation + t_emb = self.time_embed(timesteps) + image_emb = self.patch_embed(images, t_emb)[0] + timestep_emb = self.timestep_emb(timesteps).reshape(images.size(0), -1, self.config.hidden_size) + cat_list = [timestep_emb, image_emb] + + if guidance is not None: + guidance_src = self.guidance_emb(guidance.reshape(-1)) # (bsz * n, n_embd) + guidance_emb = guidance_src.reshape(images.size(0), -1, self.config.hidden_size) + if timesteps_r is not None: + timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1)) # (bsz * n, n_embd) + timesteps_r_emb = timesteps_r_src.reshape(images.size(0), -1, self.config.hidden_size) + + if guidance is not None and timesteps_r is not None: + cat_list = [timestep_emb, guidance_emb, timesteps_r_emb, image_emb] + elif guidance is not None: + cat_list = [timestep_emb, guidance_emb, image_emb] + elif timesteps_r is not None: + cat_list = [timestep_emb, timesteps_r_emb, image_emb] + hidden_states = torch.cat(cat_list, dim=1) + return hidden_states + + bsz, seqlen, n_embd = hidden_states.shape + assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}" + + if isinstance(images, torch.Tensor): + assert images.ndim == 4, f"images should be a 4-D tensor, got {images.ndim}-D tensor" + assert isinstance(timesteps, torch.Tensor), f"timesteps should be 1-D tensor, got {type(timesteps)}" + + bsz, seqlen, n_embd = hidden_states.shape + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) # (bsz, seqlen) + t_emb = self.time_embed(timesteps) # (bsz, n_embd) + image_seq, token_h, token_w = self.patch_embed(images, t_emb) # (bsz, num_patches, n_embd) + image_scatter_index = index.masked_select(image_mask.bool()).reshape(bsz, -1) # (bsz, num_patches) + hidden_states.scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_seq, + ) + + else: # list + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) # (bsz, seqlen) + for i, (image_i, t_i) in enumerate(zip(images, timesteps)): + t_i_emb = self.time_embed(t_i) # (n_i, n_embd) + + if isinstance(image_i, torch.Tensor): + image_i_seq, _, _ = self.patch_embed(image_i, t_i_emb) # (n_i, num_patches, n_embd) + + elif isinstance(image_i, list): + image_i_seq_list = [] + for j in range(len(image_i)): + image_ij = image_i[j].unsqueeze(0) + assert image_ij.ndim == 4, \ + f"image_ij should have size of (1, C, H, W), got {list(image_ij.size())}" + image_i_seq_j = self.patch_embed(image_ij, t_i_emb[j:j + 1])[0] # (1, num_patches, n_embd) + image_i_seq_list.append(image_i_seq_j) + image_i_seq = torch.cat(image_i_seq_list, dim=1) # (1, Σj num_patches_j, n_embd) + + else: + raise TypeError(f"image_i should be a torch.Tensor or a list, got {type(image_i)}") + + image_i_index = index[i:i + 1].masked_select(image_mask[i:i + 1].bool()).reshape(1, -1) # (1, img_seqlen) + hidden_states[i:i + 1].scatter_( + dim=1, + index=image_i_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_i_seq.reshape(1, -1, n_embd), # (1, img_seqlen, n_embd) + ) + + return hidden_states + + def _forward_vision_encoder(self, images, **image_kwargs): + image_embeds = self.vision_model(images, **image_kwargs).last_hidden_state + image_embeds = self.vision_aligner(image_embeds) + + return image_embeds + + def 
instantiate_vit_image_tokens( + self, + hidden_states: torch.Tensor, + images: torch.Tensor | list[torch.Tensor], + image_masks: torch.Tensor, + image_kwargs: dict[str, torch.Tensor], + ): + """ + Encode images using vision encoder(vit), and then instantiate the image embeddings into + the input embedding sequence. + + Args: + hidden_states (torch.Tensor): input sequence, (bsz, seqlen, n_embd) + images (torch.Tensor | list[torch.Tensor]): images can be a 3-D or 4-D tensor, or a list of tensors. + image_masks (torch.Tensor): mask for the images, (bsz, seqlen) + image_kwargs (dict[str, torch.Tensor]): additional keyword arguments for the image encoder + + Returns: + Instantiated input sequence + """ + bsz, seqlen, n_embd = hidden_states.shape + index = torch.arange(seqlen, device=hidden_states.device).unsqueeze(0).repeat(bsz, 1) + + if isinstance(images, torch.Tensor): + assert images.ndim in [3, 4, 5], f"images should be a 3-D, 4-D, or 5-D tensor, got {images.ndim}-D tensor." + if images.ndim in [4, 5]: + bsz, n = images.shape[:2] + images = images.view(bsz * n, *images.shape[2:]) + image_kwargs = image_kwargs if image_kwargs is not None else {} + for k, v in image_kwargs.items(): + image_kwargs[k] = v.reshape(bsz * n, *v.shape[2:]) + else: + n = 1 + image_embeds = self._forward_vision_encoder(images, **image_kwargs) + image_seqlen = image_embeds.size(1) + + image_scatter_index = index.masked_select(image_masks.bool()).reshape(bsz, -1) + hidden_states.scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_embeds.reshape(bsz, n * image_seqlen, n_embd), + ) + + elif isinstance(images, list): + for i, (image, image_mask) in enumerate(zip(images, image_masks)): + cur_kwargs = {k: v[i] for k, v in image_kwargs.items()} if image_kwargs is not None else {} + image_embed = self._forward_vision_encoder(image, **cur_kwargs) + n, image_seqlen, n_embd = image_embed.shape + image_embed = image_embed.reshape(n * image_seqlen, n_embd) + + image_scatter_index = index[i:i+1].masked_select(image_mask.bool()).reshape(1, -1) + hidden_states[i:i+1].scatter_( + dim=1, + index=image_scatter_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=image_embed.reshape(1, -1, n_embd), + ) + else: + raise ValueError(f"und_images should be Tensor or List, but got {type(images)}") + + return hidden_states + + def instantiate_continuous_tokens( + self, + hidden_states: torch.Tensor, + timesteps: Optional[BatchRaggedTensor] = None, + timesteps_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + if isinstance(timesteps, list): + for i, timestep in enumerate(timesteps): + timestep_src = self.timestep_emb(timestep) # (n, n_embd) + hidden_states[i:i+1].scatter_( + dim=1, + index=timesteps_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd), + src=timestep_src.reshape(1, -1, n_embd), + ) + else: + timesteps_src = self.timestep_emb(timesteps.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=timesteps_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=timesteps_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + def instantiate_guidance_tokens( + self, + hidden_states: torch.Tensor, + guidance: Optional[BatchRaggedTensor] = None, + guidance_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + guidance_src = self.guidance_emb(guidance.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=guidance_index.unsqueeze(-1).repeat(1, 1, n_embd), + 
src=guidance_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + + def instantiate_timestep_r_tokens( + self, + hidden_states: torch.Tensor, + timesteps_r: Optional[BatchRaggedTensor] = None, + timesteps_r_index: Optional[BatchRaggedTensor] = None, + ): + bsz, seqlen, n_embd = hidden_states.shape + + if isinstance(timesteps_r, list): + for i, timestep_r in enumerate(timesteps_r): + timestep_r_src = self.timestep_r_emb(timestep_r) # (n, n_embd) + hidden_states[i:i+1].scatter_( + dim=1, + index=timesteps_r_index[i].unsqueeze(0).unsqueeze(-1).repeat(1, 1, n_embd), + src=timestep_r_src.reshape(1, -1, n_embd), + ) + else: + timesteps_r_src = self.timestep_r_emb(timesteps_r.reshape(-1)) # (bsz * n, n_embd) + hidden_states.scatter_( + dim=1, + index=timesteps_r_index.unsqueeze(-1).repeat(1, 1, n_embd), + src=timesteps_r_src.reshape(bsz, -1, n_embd), + ) + + return hidden_states + + def get_image_tokens_hw(self, images: BatchRaggedImages): + assert isinstance(images, (torch.Tensor, list)), f"images should be BatchRaggedImages, got {type(images)}" + if isinstance(images, torch.Tensor): + token_h = images.shape[-2] // self.config.patch_size + token_w = images.shape[-1] // self.config.patch_size + else: + token_h, token_w = [], [] + for image_i in images: + assert isinstance(image_i, (torch.Tensor, list)), \ + f"image_i should be a tensor or a list of tensors, got {type(image_i)}" + if isinstance(image_i, torch.Tensor): + token_h.append(image_i.shape[-2] // self.config.patch_size) + token_w.append(image_i.shape[-1] // self.config.patch_size) + else: + token_h.append([]) + token_w.append([]) + for j in range(len(image_i)): + token_h[-1].append(image_i[j].shape[-2] // self.config.patch_size) + token_w[-1].append(image_i[j].shape[-1] // self.config.patch_size) + return token_h, token_w + + def ragged_final_layer(self, hidden_states, image_mask, timesteps, token_h, token_w, first_step=None): + n_embd = hidden_states.size(-1) + if isinstance(timesteps, torch.Tensor): + # Only one target image. + t_emb = self.time_embed_2(timesteps) + if first_step is False: + # only for gen_image non-first-step inference + image_output = hidden_states[:, self.num_special_tokens:, :] + else: # first_step is True or None + image_output = hidden_states.masked_select( + image_mask.unsqueeze(-1).bool()).reshape(-1, token_h * token_w, n_embd) + pred = self.final_layer(image_output, t_emb, token_h, token_w) + else: + # Multiple target images(interleave data). + # In this case, each line of the image_mask may contain different number of Trues, leading + # the `reshape(batch_size, ...)` is not possible. 
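+            # Work around this by flattening the selected image tokens and splitting them per sample, using the
+            # number of image positions in each row of `image_mask` as the section lengths.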
+ sections = image_mask.sum(1).tolist() + image_output = hidden_states.masked_select( + image_mask.unsqueeze(-1).bool()).reshape(-1, n_embd).split(sections) + pred = [] + for image_output_i, t_i, token_h_i, token_w_i in zip(image_output, timesteps, token_h, token_w): + t_emb_i = self.time_embed_2(t_i) + if isinstance(token_h_i, int): + image_output_i = image_output_i.reshape(-1, token_h_i * token_w_i, n_embd) + pred_i = self.final_layer(image_output_i, t_emb_i, token_h_i, token_w_i) + pred.append(pred_i) + else: + subsections = [token_h_ij * token_w_ij for token_h_ij, token_w_ij in zip(token_h_i, token_w_i)] + image_output_i = image_output_i.split(subsections) + pred_i = [] + for j, image_output_ij in enumerate(image_output_i): + pred_ij = self.final_layer(image_output_ij[None], t_emb_i[j:j+1], token_h_i[j], token_w_i[j]) + pred_i.append(pred_ij) + pred.append(pred_i) + return pred + + @staticmethod + def _check_inputs(cond, target, check_list): + if cond: + for name, item in check_list: + assert item is not None, f"`{name}` should be provided when `{target}`." + + @add_start_docstrings_to_model_forward(Hunyuan_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, # bsz x seqlen + attention_mask: Optional[torch.Tensor] = None, # bsz x 1 x seqlen x seqlen + rope_image_info: Optional[list[list[tuple[slice, tuple[int, int]]]]] = None, + return_dict: bool = True, + # for gen images + images: Optional[BatchRaggedImages] = None, # bsz x c x h x w, or bsz x (n_i x (c x h_ij x w_ij)) + image_mask: Optional[torch.Tensor] = None, # bsz x seqlen + timesteps: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + timesteps_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + timesteps_r: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + timesteps_r_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + guidance: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (n_i) + guidance_index: Optional[BatchRaggedTensor] = None, # bsz x k, or bsz x (k_i) + # for cond images + cond_vae_images: Optional[BatchRaggedImages] = None, # bsz x c x h x w, or bsz x (m_i x (c x h_ij x w_ij)) + cond_vae_image_mask: Optional[torch.Tensor] = None, # bsz x seqlen + cond_timesteps: Optional[BatchRaggedTensor] = None, # bsz, or bsz x (m_i) + cond_timesteps_index: Optional[BatchRaggedTensor] = None, + cond_vit_images: Optional[BatchRaggedImages] = None, + cond_vit_image_mask: Optional[torch.Tensor] = None, + cond_vit_image_kwargs: Optional[dict[str, Any]] = None, + # only for inference + position_ids: Optional[torch.Tensor] = None, # bsz x seq_len-1, used for KVCache + past_key_values: Optional[HunyuanStaticCache] = None, + mode: Optional[str] = None, + first_step: Optional[bool] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_dic = None, + gen_timestep_scatter_index: Optional[torch.Tensor] = None, + ) -> Union[Tuple, CausalMMOutputWithPast]: + + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Sanity Check of Inputs + self._check_inputs(mode == "gen_image", "in `gen_image` mode", [ + ("images", images), ("timesteps", timesteps), + ]) + self._check_inputs(mode == "gen_image" and first_step, "in `gen_image` mode at the first step", [ + ("image_mask", image_mask), ("timesteps_index", timesteps_index), + ]) + self._check_inputs(cond_vae_images is not None, "`cond_vae_images` is provided", [ + ("cond_timesteps", 
cond_timesteps), ("cond_vae_image_mask", cond_vae_image_mask), + ("cond_timesteps_index", cond_timesteps_index), + ]) + self._check_inputs(cond_vit_images is not None, "`cond_vit_images` is provided", [ + ("cond_vit_image_mask", cond_vit_image_mask), + ]) + if input_ids is None and images is None: + raise ValueError("Either input_ids or images should be provided.") + if input_ids is not None: + device = input_ids.device + else: + device = get_device(images) + if self.training: + seqlen = input_ids.size(1) + else: + # For inference, we always set seqlen to maximum length to simplify the rope cache handling + seqlen = self.config.max_position_embeddings + assert self.config.max_position_embeddings >= seqlen, ( + f"Cannot forward sequence of length {seqlen}, " + f"max position embeddings is only {self.config.max_position_embeddings}, " + f"try set --max-position-embeddings to a larger value." + ) + + # Calculate multimodal 2d rope + cos, sin = self.cached_rope( + seqlen, device, rope_image_info=rope_image_info, position_ids=position_ids, + ) + # === Map token ids to embeddings === + if input_ids is not None: + hidden_states = self.model.wte(input_ids) # (bsz, seqlen, n_embd) + else: + hidden_states = None # only for non-first step inference of the image generation + + # === Input layers === + if images is not None: + if self.config.cfg_distilled and input_ids is None: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask, guidance, timesteps_r) + else: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, timesteps, images, image_mask) + + if cond_vae_images is not None: + hidden_states = self.instantiate_vae_image_tokens(hidden_states, cond_timesteps, cond_vae_images, + cond_vae_image_mask) + + if cond_vit_images is not None: + hidden_states = self.instantiate_vit_image_tokens(hidden_states, cond_vit_images, cond_vit_image_mask, + cond_vit_image_kwargs) + if timesteps_index is not None: + hidden_states = self.instantiate_continuous_tokens(hidden_states, timesteps, timesteps_index) + + # guidance token + if guidance_index is not None: + hidden_states = self.instantiate_guidance_tokens(hidden_states, guidance, guidance_index) + + # timestep r token + if timesteps_r_index is not None: + hidden_states = self.instantiate_timestep_r_tokens(hidden_states, timesteps_r, timesteps_r_index) + + if cond_timesteps_index is not None: + hidden_states = self.instantiate_continuous_tokens(hidden_states, cond_timesteps, cond_timesteps_index) + if mode == "gen_text": + first_step = True + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + if not self.use_taylor_cache: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=hidden_states, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + custom_pos_emb=(cos, sin), + mode=mode, + first_step=first_step, + post_token_len = self.post_token_len, + num_image_tokens = self.num_image_tokens, + gen_timestep_scatter_index = gen_timestep_scatter_index, + num_special_tokens = self.num_special_tokens, + ) + hidden_states = outputs[0] + else: + if not hasattr(self.model, "taylor_cache"): + self.model.taylor_cache = CacheWithFreqsContainer(cache_dic['max_order']) + if not hasattr(self.model, "counter"): + self.model.counter = 0 + + full_computation = (cache_dic['current_step'] == 0) \ + or 
(self.model.counter == cache_dic['cache_interval'] -1) \ + or (cache_dic['enable_first_enhance'] and cache_dic['current_step'] < cache_dic['first_enhance_steps']) \ + or (cache_dic['enable_tailing_enhance'] and cache_dic['current_step'] >= cache_dic['num_steps'] - cache_dic['tailing_enhance_steps']) + if not hasattr(self.model, "last_full_computation_step"): + self.model.last_full_computation_step = 0 + if full_computation: + self.model.counter = 0 + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=hidden_states, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + custom_pos_emb=(cos, sin), + mode=mode, + first_step=first_step, + post_token_len = self.post_token_len, + num_image_tokens = self.num_image_tokens, + gen_timestep_scatter_index = gen_timestep_scatter_index, + num_special_tokens = self.num_special_tokens, + ) + hidden_states = outputs[0] + + if cache_dic['enable_first_enhance'] and (cache_dic['current_step'] < (cache_dic['first_enhance_steps']-1)): + pass + else: + self.model.taylor_cache.derivatives_computation(hidden_states, distance = cache_dic['current_step'] - self.model.last_full_computation_step, low_freqs_order=cache_dic['low_freqs_order'], high_freqs_order=cache_dic['high_freqs_order']) + + self.model.last_full_computation_step = cache_dic['current_step'] + self.model.taylor_cache.last_past_key_values = outputs.past_key_values + else: + self.model.counter += 1 + hidden_states = self.model.taylor_cache.taylor_formula(distance = self.model.counter) + outputs = BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=self.model.taylor_cache.last_past_key_values, + hidden_states=None, + attentions=None, + ) + if cache_dic['current_step'] == cache_dic['num_steps'] - 1: + self.model.taylor_cache.clear_derivatives() + + + # === Output layers === + # -- image tokens + if images is not None: + token_h, token_w = self.get_image_tokens_hw(images) + hidden_states = hidden_states.to(device=get_device(images)) + diff_pred = self.ragged_final_layer( + hidden_states, image_mask, timesteps, token_h, token_w, first_step) + else: + diff_pred = None + # -- text tokens + if input_ids is None or mode == "gen_image": + logits = None + else: + hidden_states = self.model.ln_f(hidden_states) + logits = self.lm_head(hidden_states) # (bsz, seqlen, vocab_size) + # -- for inference + if not return_dict: + return (logits.float(),) + outputs[1:] + (diff_pred,) + return CausalMMOutputWithPast( + logits=logits.float() if logits is not None else None, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + diffusion_prediction=diff_pred, + ) + + @staticmethod + def check_inputs(prompt=None, image=None, message_list=None): + if prompt is None and message_list is None: + raise ValueError("Either `prompt` or `message_list` should be provided.") + if prompt is not None and message_list is not None: + raise ValueError("`prompt` and `message_list` cannot be provided at the same time.") + if message_list is not None: + if not isinstance(message_list, list): + raise ValueError(f"`message_list` should be a list of messages, but got {type(message_list)}.") + assert len(message_list) > 0, "`message_list` should be a non-empty list." 
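+            # Entries may be message dicts (a single conversation) or lists of message dicts (a batch of conversations).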
+ for message in message_list: + assert isinstance(message, list) or isinstance(message, dict), \ + f"Each message should be a list of dicts or a dict, but got {type(message)}." + if image is not None: + error_msg = \ + "`image` should be a PIL Image, a string path, a base64 string, bytes, or a list of them, but got {}." + if isinstance(image, list): + for im in image: + assert isinstance(im, (Image.Image, str, bytes)), error_msg.format(type(im)) + else: + assert isinstance(image, (Image.Image, str, bytes)), error_msg.format(type(image)) + + @staticmethod + def _validate_and_batchify_text(text, name, check_batch_size=None): + if text is None: + return text + assert isinstance(text, str) or isinstance(text, list), \ + f"Input `{name}` should be a string or a list of strings, but got {type(text)}." + if isinstance(text, str): + text = [text] + #assert len(text) > 0 and all(isinstance(p, str) and len(p) > 0 for p in text), \ + # f"Input `{name}` should be a non-empty list of non-empty strings, got {text}." + if check_batch_size is not None: + assert len(text) == check_batch_size, \ + f"Input `{name}` should have the same batch size as other inputs({check_batch_size}), got {len(text)}." + return text + + @staticmethod + def _validate_and_batchify_image(image, name, check_batch_size=None): + if image is None: + return image + assert isinstance(image, (InputImage, list)), \ + f"Input `{name}` should be a image or a list of images, but got {type(image)}." + if not isinstance(image, list): + image = [image] + batch_image_list = [image] if not isinstance(image[0], list) else image + for image_list in batch_image_list: + assert all(isinstance(im, InputImage) for im in image_list), \ + (f"Each item in `{name}` should be a PIL Image, a string path, a base64 string, or bytes, " + f"got {[type(im) for im in image_list]}.") + if check_batch_size is not None: + assert len(batch_image_list) == check_batch_size, \ + f"Input `{name}` should have the same batch size as other inputs({check_batch_size})" + return batch_image_list + + @staticmethod + def prepare_seed(seed, batch_size): + if isinstance(seed, torch.Tensor): + seed = seed.tolist() + if seed is None: + seeds = [random.randint(0, 10_000_000) for _ in range(batch_size)] + elif isinstance(seed, int): + seeds = [seed for _ in range(batch_size)] + elif isinstance(seed, (list, tuple)): + if len(seed) == batch_size: + seeds = [int(seed[i]) for i in range(batch_size)] + else: + raise ValueError(f"Length of seed must be equal to the batch_size({batch_size}), got {seed}.") + else: + raise ValueError(f"Seed must be an integer, a list of integers, or None, got {seed}.") + return seeds + + def build_batch_rope_image_info(self, output, sections): + # Rope 1D. No need to build rope_image_info + if self.config.rope_type == "default": + return None + + # Rope 2D + assert self.config.rope_type == "2d", \ + f"Rope type {self.config.rope_type} not supported by method 'build_batch_rope_image_info'." + rope_image_info = [] + for image_slices, sections_i in zip(output.all_image_slices, sections): + rope_2d_image_slices = [] + rope_2d_image_shapes = [] + image_idx = 0 + + for section in sections_i: + if section['type'] in ["gen_image", "cond_vae_image", "cond_vit_image"]: + assert image_idx < len(image_slices), \ + f"Image index {image_idx} out of range for image slices with length {len(image_slices)}." 
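+                    # Each gen/cond image section contributes one (token slice, (token_h, token_w)) pair to the 2D RoPE info.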
+ rope_2d_image_slices.append(image_slices[image_idx]) + rope_2d_image_shapes.append((section['token_height'], section['token_width'])) + image_idx += 1 + + elif section['type'] == "cond_joint_image": + assert image_idx + 1 < len(image_slices), \ + f"Image index {image_idx + 1} out of range for image slices with length {len(image_slices)}." + assert len(section['token_height']) == len(section['token_width']), \ + (f"token_height and token_width should have the same length, " + f"but got {len(section['token_height'])} and {len(section['token_width'])}") + + if self.image_processor.cond_token_attn_type in ["full", "joint_full"]: + rope_2d_image_slices.extend([image_slices[image_idx], image_slices[image_idx + 1]]) + rope_2d_image_shapes.extend(list(zip(section['token_height'], section['token_width']))) + elif self.image_processor.cond_token_attn_type == "full_causal": + rope_2d_image_slices.append(image_slices[image_idx]) + rope_2d_image_shapes.append((section['token_height'][0], section['token_width'][0])) + elif self.image_processor.cond_token_attn_type == "causal": + pass + else: + raise NotImplementedError( + f"cond_token_attn_type {self.image_processor.cond_token_attn_type} not supported " + f"by method 'build_batch_rope_image_info'." + ) + image_idx += 2 + + rope_image_info.append(list(zip(rope_2d_image_slices, rope_2d_image_shapes))) + + return rope_image_info + + def vae_encode(self, image, cfg_factor=1, generator=None): + config = self.vae.config + + with torch.autocast( + device_type="cuda", dtype=self.vae_autocast_dtype, # noqa + enabled=self.vae_autocast_dtype != torch.float32 + ): + vae_encode_result = self.vae.encode(image) + if isinstance(vae_encode_result, torch.Tensor): + latents = vae_encode_result + else: + latents = vae_encode_result.latent_dist.sample(generator) + if hasattr(config, 'shift_factor') and config.shift_factor: + latents.sub_(config.shift_factor) + if hasattr(config, 'scaling_factor') and config.scaling_factor: + latents.mul_(config.scaling_factor) + + if hasattr(self.vae, "ffactor_temporal"): + assert latents.shape[2] == 1, "latents should have shape [B, C, T, H, W] and T should be 1" + latents = latents.squeeze(2) + + # Here we always use t=0 to declare it is a clean conditional image + t = torch.zeros((latents.shape[0],)) + + if cfg_factor > 1: + t = t.repeat(cfg_factor) + latents = latents.repeat(cfg_factor, 1, 1, 1) + + return t, latents + + def _encode_cond_image( + self, + batch_cond_images: list[list[Union[ImageTensor, CondImage]]], + cfg_factor: int = 1, + generator=None, + ): + if batch_cond_images is None or len(batch_cond_images[0]) == 0: + return None, None, None + + first_image = batch_cond_images[0][0] + + # 1. 
If vae_image presents + if first_image.section_type in ["cond_vae_image", "cond_joint_image"]: + # VAE encode one by one, as we assume cond images have different sizes + batch_cond_vae_images, batch_cond_t = [], [] + for cond_images in batch_cond_images: + cond_vae_image_list, cond_t_list = [], [] + for cond_image in cond_images: + vae_image = ( + cond_image.vae_image + if cond_image.section_type == "cond_joint_image" + else cond_image + ) + cond_t_, cond_vae_image_ = self.vae_encode( + vae_image[None].to(self.device), + generator=generator, + ) + cond_vae_image_list.append(cond_vae_image_.squeeze(0)) + cond_t_list.append(cond_t_) + batch_cond_vae_images.append(cond_vae_image_list) + batch_cond_t.append(cond_t_list) + + # If only one cond image for each sample and all have the same size, we can batch them together + # In this case, cond_vae_images is a 4-D tensor. + if all([len(items) == 1 for items in batch_cond_vae_images]) and all( + items[0].shape == batch_cond_vae_images[0][0].shape for items in batch_cond_vae_images): + cond_vae_images = torch.stack([items[0] for items in batch_cond_vae_images], dim=0) + cond_t = torch.cat([items[0] for items in batch_cond_t], dim=0) + if cfg_factor > 1: + cond_t = cond_t.repeat(cfg_factor) + cond_vae_images = cond_vae_images.repeat(cfg_factor, 1, 1, 1) + else: + # In this case, cond_vae_images is a list of 4-D tensors or a list of lists of 3-D tensors. + cond_t = [torch.cat(item, dim=0) for item in batch_cond_t] + cond_vae_images = [] + for items in batch_cond_vae_images: + if all(items[0].shape == item.shape for item in items): + cond_vae_images.append(torch.stack(items, dim=0)) + else: + cond_vae_images.append(items) + if cfg_factor > 1: + cond_t = cond_t * cfg_factor + cond_vae_images = cond_vae_images * cfg_factor + + else: + cond_vae_images = None + cond_t = None + + # 2. If vit_image presents + if first_image.section_type in ["cond_vit_image", "cond_joint_image"]: + cond_vit_images = [] + for cond_images in batch_cond_images: + cond_vit_image_list = [] + for cond_image in cond_images: + vit_image = ( + cond_image.vit_image + if cond_image.section_type == "cond_joint_image" + else cond_image + ) + cond_vit_image_list.append(vit_image) + # Here we force convert the tensor to dtype + cond_vit_images.append( + torch.stack(cond_vit_image_list, dim=0).to(dtype=self.dtype) + ) + + if cfg_factor > 1: + cond_vit_images = cond_vit_images * cfg_factor + + else: + cond_vit_images = None + + return cond_vae_images, cond_t, cond_vit_images + + @staticmethod + def _prepare_vit_image_kwargs(batch_cond_images, cfg_factor): + if batch_cond_images is None or len(batch_cond_images[0]) == 0: + return None + first_image = batch_cond_images[0][0] + if first_image.section_type == "cond_joint_image": + vit_image = first_image.vit_image + else: + vit_image = first_image + if not hasattr(vit_image, "vision_encoder_kwargs") or len(vit_image.vision_encoder_kwargs) == 0: + return None + + # Pack vit kwargs. Siglip2-so requires spatial_shapes and attention_mask for inference. 
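+        # Pack per-sample stacks: `spatial_shapes` comes from each image's `vision_encoder_kwargs["spatial_shapes"]`
+        # and `attention_mask` from `vision_encoder_kwargs["pixel_attention_mask"]`; both lists are then repeated
+        # `cfg_factor` times to line up with the CFG-duplicated batch.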
+ cond_vit_image_kwargs = {"spatial_shapes": [], "attention_mask": []} + for cond_images in batch_cond_images: + cond_vit_image_kwargs["spatial_shapes"].append( + torch.stack([ + cond_image.vit_image.vision_encoder_kwargs["spatial_shapes"] + for cond_image in cond_images + ])) + cond_vit_image_kwargs["attention_mask"].append( + torch.stack([ + cond_image.vit_image.vision_encoder_kwargs["pixel_attention_mask"] + for cond_image in cond_images + ])) + if cfg_factor > 1: + cond_vit_image_kwargs["spatial_shapes"] = cond_vit_image_kwargs["spatial_shapes"] * cfg_factor + cond_vit_image_kwargs["attention_mask"] = cond_vit_image_kwargs["attention_mask"] * cfg_factor + return cond_vit_image_kwargs + + @torch.no_grad() + def prepare_message_list( + self, + message_list, + cond_images: list[CondImage] = None, + gen_image_info: ImageInfo = None, + ): + """ Convert a batch message list of OpenAI style to the internal format. """ + inner_message_list = [] + image_idx = 0 + for message in message_list: + content = message["content"] + if isinstance(content, str): + inner_message_list.append(dict(role=message["role"], type="text", content=content)) + elif isinstance(content, list): + for item in content: + if item["type"] == "text": + inner_message_list.append(dict(role=message["role"], type="text", content=item['text'])) + elif item["type"] == "image": + if all(key not in item for key in ["image", "url", "path", "base64"]): + continue + assert cond_images is not None and image_idx < len(cond_images), \ + f"Image index {image_idx} out of range for cond images with length {len(cond_images)}." + image = cond_images[image_idx] + inner_message_list.append(dict(role="assistant", type=image.section_type, content=image.i)) + image_idx += 1 + else: + raise NotImplementedError(f"Message content type {item['type']} not supported.") + else: + raise ValueError(f"Message content should be str or list, but got {type(content)}.") + + if gen_image_info is not None: + inner_message_list.append(dict(role="assistant", type="gen_image", content=gen_image_info)) + + return inner_message_list + + def preprocess_inputs( + self, + prompt: str | list[str] = None, + image: InputImage | list[InputImage] = None, + cot_text=None, + message_list=None, + cfg_factor=1, + bot_task='auto', + system_prompt=None, + max_new_tokens=None, + mode="gen_text", + image_size="auto", + infer_align_image_size=False, + device=None, + **kwargs, + ): + # 1. Sanity check + self.check_inputs(prompt, image, message_list) + + # 2. Format inputs + batch_message_list = message_list + batch_prompt = prompt + batch_cot_text = cot_text + batch_system_prompt = system_prompt + + # -- 2.1 message_list + batch_cond_images = kwargs.get('batch_cond_images', None) + if batch_message_list is not None: + if isinstance(batch_message_list[0], dict): + batch_message_list = [batch_message_list] + batch_size = len(batch_message_list) + + # Multiple cond images are allowed. 
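+            # One list of `CondImage` objects is built per message list; image items are consumed in order of
+            # appearance when the OpenAI-style messages are converted (see `prepare_message_list`).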
+ if batch_cond_images is None: + batch_cond_images = [ + self.image_processor.build_cond_images( + message_list=message_list_, + infer_align_image_size=infer_align_image_size, + ) + for message_list_ in batch_message_list + ] + if mode == "gen_image": + batch_gen_image_info = [ + self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size) + ] + else: + batch_gen_image_info = [None] * batch_size + # Convert OpenAI message list into inner message list + batch_message_list = [ + self.prepare_message_list(message_list_, cond_images, gen_image_info) + for message_list_, cond_images, gen_image_info in zip( + batch_message_list, batch_cond_images, batch_gen_image_info + ) + ] + + # -- 2.2 Prompt, image, cot text, system prompt + else: + batch_prompt = self._validate_and_batchify_text(batch_prompt, 'prompt') + batch_size = len(batch_prompt) + + batch_cot_text = self._validate_and_batchify_text(batch_cot_text, 'cot_text', batch_size) + batch_system_prompt = self._validate_and_batchify_text(batch_system_prompt, 'system_prompt', batch_size) + + batch_image_list = self._validate_and_batchify_image(image, 'image', batch_size) + if batch_cond_images is None: + batch_cond_images = [ + self.image_processor.build_cond_images( + image_list=image_list, + infer_align_image_size=infer_align_image_size + ) + for image_list in batch_image_list + ] if batch_image_list is not None else None + + if mode == "gen_image": + batch_gen_image_info = [ + self.image_processor.build_gen_image_info(image_size, add_guidance_token=self.config.cfg_distilled, add_timestep_r_token=self.config.use_meanflow) for _ in range(batch_size) + ] + else: + batch_gen_image_info = [None] * batch_size + + # Apply batched prompt or batched message_list to build input sequence with associated info. + # If `drop_think` enabled, always drop parts in the context. + drop_think = kwargs.get('drop_think', getattr(self.generation_config, 'drop_think', False)) + out = self._tokenizer.apply_chat_template( + batch_prompt=batch_prompt, + batch_message_list=batch_message_list, + mode=mode, + batch_gen_image_info=batch_gen_image_info, + batch_cond_images=batch_cond_images, + batch_system_prompt=batch_system_prompt, + batch_cot_text=batch_cot_text, + max_length=kwargs.get('max_length', self.generation_config.max_length), + bot_task=bot_task, + image_base_size=( + None if mode == "gen_text" and bot_task == "auto" else self.image_processor.vae_reso_group.base_size + ), + sequence_template=getattr(self.generation_config, 'sequence_template', 'pretrain'), + cfg_factor=cfg_factor, + drop_think=drop_think, + ) + out['batch_size'] = batch_size + out['batch_cond_images'] = batch_cond_images + out['batch_gen_image_info'] = batch_gen_image_info + + # 8. 
Define stop tokens by tasks + tkw = self._tokenizer + if bot_task == "auto": + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids, + ) + else: + if image_size == "auto": + extra_auto_stops = [tkw.ratio_token_id(i) for i in range(33)] + else: + extra_auto_stops = [tkw.boi_token_id] + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops, + recaption=[tkw.end_of_recaption_token_id], + think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id], + img_ratio=extra_auto_stops, + ) + out['stop_token_id'] = stop_token_id + + return out + + def prepare_model_inputs( + self, + prompt: str | list[str] = None, + image: InputImage | list[InputImage] = None, + mode="gen_text", + system_prompt=None, + cot_text=None, + image_size="auto", + message_list=None, + device=None, + max_new_tokens=None, + **kwargs, + ): + device = default(device, self.device) + + # 1. apply chat template + cfg_factor = {"gen_text": 1, "gen_image": 2} + if self.config.cfg_distilled: + cfg_factor["gen_image"] = 1 + + bot_task = kwargs.pop("bot_task", "auto") + + out = kwargs.pop('tokenizer_output', None) + if out is None: + out = self.preprocess_inputs( + prompt=prompt, + image=image, + mode=mode, + system_prompt=system_prompt, + cot_text=cot_text, + image_size=image_size, + message_list=message_list, + cfg_factor=cfg_factor[mode], + bot_task=bot_task, + **kwargs, + ) + output, sections = out['output'], out['sections'] + + batch_size = out['batch_size'] + batch_cond_images = out['batch_cond_images'] + batch_gen_image_info = out['batch_gen_image_info'] + stop_token_id = out['stop_token_id'] + #if batch_gen_image_info[0] is not None: + # print("batch_gen_image_info image_token_length:", batch_gen_image_info[0].image_token_length) + # -- 2.3 seed + seeds = self.prepare_seed(seed=kwargs.get('seed'), batch_size=batch_size) + generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds] + + # 4. Encode conditional images + cond_vae_images, cond_timesteps, cond_vit_images = self._encode_cond_image( + batch_cond_images, cfg_factor[mode], generator=generator, + ) + cond_vit_image_kwargs = self._prepare_vit_image_kwargs(batch_cond_images, cfg_factor[mode]) + + # 5. Build position embeddings + rope_image_info = self.build_batch_rope_image_info(output, sections) + + # 6. Build kv cache + if mode == "gen_image": + # Image generation will not extend sequence length, using token length as max_cache_len is enough. + max_cache_len = output.tokens.shape[1] + else: + max_cache_len = output.tokens.shape[1] + default(max_new_tokens, self.generation_config.max_length) + cache = HunyuanStaticCache( + config=self.config, + max_batch_size=batch_size * cfg_factor[mode], + max_cache_len=max_cache_len, + dtype=self.dtype, + dynamic=mode == "gen_text", + ) + + # 7. Build position ids + batch_position_ids = torch.arange( + 0, output.tokens.shape[1], dtype=torch.long, device=device)[None].expand( + batch_size * cfg_factor[mode], -1) # use expand to share indices to save memory + + # 8. 
Define stop tokens by tasks + tkw = self._tokenizer + if mode == "gen_image": + eos_token_id = None # don't need to define eos_token_id for image generation + else: + if bot_task == "auto": + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids, + ) + else: + if image_size == "auto": + extra_auto_stops = tkw.get_all_ratio_token_ids() + else: + extra_auto_stops = [tkw.boi_token_id] + stop_token_id = dict( + auto=self._tokenizer.conversation.stop_token_ids + extra_auto_stops, + recaption=[tkw.end_of_recaption_token_id], + think=[tkw.end_of_think_token_id, tkw.end_of_recaption_token_id], + img_ratio=extra_auto_stops, + ) + eos_token_id = stop_token_id[bot_task] + + # 9. Build model input kwargs + model_input_kwargs = dict( + input_ids=output.tokens.to(device), + position_ids=batch_position_ids, + past_key_values=cache, + mode=mode, + rope_image_info=rope_image_info, + image_mask=to_device(output.gen_image_mask, device), + timesteps_index=to_device(output.gen_timestep_scatter_index, device), + guidance_index=to_device(output.guidance_scatter_index, device), + timesteps_r_index=to_device(output.gen_timestep_r_scatter_index, device), + cond_vae_images=to_device(cond_vae_images, device), + cond_vae_image_mask=to_device(output.vae_image_mask, device), + cond_timesteps=to_device(cond_timesteps, device), + cond_timesteps_index=to_device(output.cond_timestep_scatter_index, device), + cond_vit_images=to_device(cond_vit_images, device), + cond_vit_image_mask=to_device(output.vit_image_mask, device), + cond_vit_image_kwargs=to_device(cond_vit_image_kwargs, device), + # for inner usage + tokenizer_output=output, + batch_gen_image_info=batch_gen_image_info, + generator=generator, + batch_cond_images=batch_cond_images, + # generation config + eos_token_id=eos_token_id, + max_new_tokens=max_new_tokens, + gen_timestep_scatter_index=to_device(output.gen_timestep_scatter_index, device), + ) + + return model_input_kwargs + + def _prepare_attention_mask_for_generation( + self, + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: dict[str, Any], + ) -> Optional[torch.Tensor]: + # create `4d` bool attention mask (b, 1, seqlen, seqlen) using this implementation to bypass the 2d requirement + # in the `transformers.generation_utils.GenerationMixin.generate`. + # This implementation can handle sequences with text and image modalities, where text tokens use causal + # attention and image tokens use full attention. 
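+        # Illustrative example: for a 5-token sequence where positions 2-3 belong to an image, the causal
+        # lower-triangular mask is widened so rows 2-3 fully attend to the whole image block:
+        #   [[1, 0, 0, 0, 0],
+        #    [1, 1, 0, 0, 0],
+        #    [1, 1, 1, 1, 0],
+        #    [1, 1, 1, 1, 0],
+        #    [1, 1, 1, 1, 1]]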
+ bsz, seq_len = inputs_tensor.shape + tokenizer_output = model_kwargs["tokenizer_output"] + batch_full_attn_slices = [ + self.image_processor.prepare_full_attn_slices(tokenizer_output, i) + for i in range(bsz) + ] + #if len(batch_full_attn_slices[0]) == 0: + # return None + + attention_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=inputs_tensor.device).tril( + diagonal=0).repeat(bsz, 1, 1) + for i in range(bsz): + for j, image_slice in enumerate(batch_full_attn_slices[i]): + attention_mask[i, image_slice, image_slice] = True + attention_mask = attention_mask.unsqueeze(1) + return attention_mask + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, + tokenizer_output=None, batch_gen_image_info=None, batch_cond_images=None, + infer_align_image_size=False, generator=None, **kwargs + ): + position_ids = kwargs.get("position_ids") + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + assert position_ids is not None, "position_ids must be provided in kwargs." + if input_ids is not None and input_ids.shape[1] != position_ids.shape[1]: # in decode steps + input_ids = torch.gather(input_ids, dim=1, index=position_ids) + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "attention_mask": attention_mask, + "position_ids": position_ids, + "past_key_values": past_key_values, + # "use_cache": kwargs.get("use_cache"), + "rope_image_info": kwargs["rope_image_info"], + "mode": kwargs["mode"], + "images": kwargs.get("images"), + "image_mask": kwargs.get("image_mask"), + "timesteps": kwargs.get("timesteps"), + "timesteps_index": kwargs.get("timesteps_index"), + "timesteps_r": kwargs.get("timesteps_r"), + "timesteps_r_index": kwargs.get("timesteps_r_index"), + "guidance": kwargs.get("guidance"), + "guidance_index": kwargs.get("guidance_index"), + "cond_vae_images": kwargs.get("cond_vae_images"), + "cond_vae_image_mask": kwargs.get("cond_vae_image_mask"), + "cond_timesteps": kwargs.get("cond_timesteps"), + "cond_timesteps_index": kwargs.get("cond_timesteps_index"), + "cond_vit_images": kwargs.get("cond_vit_images"), + "cond_vit_image_mask": kwargs.get("cond_vit_image_mask"), + "cond_vit_image_kwargs": kwargs.get("cond_vit_image_kwargs"), + "cache_dic": kwargs.get("cache_dic"), + "gen_timestep_scatter_index": kwargs.get("gen_timestep_scatter_index"), + } + ) + + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> dict[str, Any]: + """ This function is run after each step of model forward. It updates model kwargs for next forward step. 
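+        After the prefill step, `position_ids` are re-derived (the valid end positions for `gen_text`; the
+        timestep/guidance/image token positions for `gen_image`). During decode steps, `gen_text` advances
+        `position_ids` by one and drops the 4-D attention mask, while `gen_image` keeps `position_ids` and
+        `attention_mask` unchanged.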
+ """ + mode = model_kwargs["mode"] + + updated_model_kwargs = { + "mode": mode, + "rope_image_info": model_kwargs["rope_image_info"], + } + + # update past_key_values keeping its naming used in model code + for possible_cache_name in ALL_CACHE_NAMES: + if possible_cache_name in outputs: + # TODO (joao): remove output/input mismatch when these old models (xlnet, reformer) are deprecated + if possible_cache_name in ("past_buckets_states", "mems"): + cache_name = "past_key_values" + else: + cache_name = possible_cache_name + updated_model_kwargs[cache_name] = getattr(outputs, possible_cache_name) + break + + if "tokenizer_output" in model_kwargs: + # After prefill step + if mode == "gen_text": + # When enable batching, we use right padding, which requires a real_pos to index the valid + # end position of the sequence. If tokenizer_output in model_kwargs, it means we are in the + # prefill step of generation. + real_pos = to_device(model_kwargs["tokenizer_output"].real_pos, self.device) + updated_model_kwargs["position_ids"] = real_pos + else: + # inputs_pos + image_mask = model_kwargs["image_mask"] + bsz, seq_len = image_mask.shape + index = torch.arange(seq_len, device=image_mask.device).unsqueeze(0).repeat(bsz, 1) + position_ids = index.masked_select(image_mask.bool()).reshape(bsz, -1) + timestep_position_ids = \ + index[torch.arange(bsz), model_kwargs["timesteps_index"][:, -1]].unsqueeze(-1) + pos_cat_list = [timestep_position_ids, ] + if self.config.cfg_distilled: + guidance_position_ids = index[torch.arange(bsz), model_kwargs["guidance_index"][:, -1]].unsqueeze(-1) + pos_cat_list.append(guidance_position_ids) + if self.config.use_meanflow: + timestep_r_position_ids = index[torch.arange(bsz), model_kwargs["timesteps_r_index"][:, -1]].unsqueeze(-1) + pos_cat_list.append(timestep_r_position_ids) + pos_cat_list.append(position_ids) + updated_model_kwargs["position_ids"] = torch.cat(pos_cat_list, dim=1) + + # attention mask + mask_list = [] + for attention_mask_i, position_ids_i in zip( + model_kwargs["attention_mask"], updated_model_kwargs["position_ids"]): + mask_list.append(torch.index_select(attention_mask_i, dim=1, index=position_ids_i.reshape(-1))) + attention_mask = torch.stack(mask_list, dim=0) + updated_model_kwargs["attention_mask"] = attention_mask + updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"] + else: + # After decode steps + if mode == "gen_text": + # Now we are in the decode steps. 
+ updated_model_kwargs["position_ids"] = model_kwargs["position_ids"] + 1 + # Remove attention mask to use full attention of 1 x seqlen in decode steps + else: + updated_model_kwargs["position_ids"] = model_kwargs["position_ids"] + updated_model_kwargs["attention_mask"] = model_kwargs["attention_mask"] + updated_model_kwargs["gen_timestep_scatter_index"] = model_kwargs["gen_timestep_scatter_index"] + return updated_model_kwargs + + class _StageTransitionLogitsProcessor(LogitsProcessor): + def __init__(self, stage_transitions: list[tuple[int, list[int]]], batch_size: int): + self.transition_map = {stop_id: list(append_ids) for stop_id, append_ids in stage_transitions} + self.pending_tokens = [[] for _ in range(batch_size)] + self.completed = [set() for _ in range(batch_size)] + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + batch_size = input_ids.shape[0] + last_tokens = input_ids[:, -1] + device = scores.device + min_score = torch.finfo(scores.dtype).min + + for i in range(batch_size): + last_token = last_tokens[i].item() + + # Consume pending tokens if the last token matches the head. + if self.pending_tokens[i] and last_token == self.pending_tokens[i][0]: + self.pending_tokens[i].pop(0) + + # If pending tokens remain, force the next token. + if self.pending_tokens[i]: + scores[i].fill_(min_score) + scores[i, self.pending_tokens[i][0]] = 0 + continue + + # Trigger stage transition if needed. + if last_token in self.transition_map and last_token not in self.completed[i]: + self.completed[i].add(last_token) + next_tokens = self.transition_map[last_token] + if next_tokens: + self.pending_tokens[i] = list(next_tokens) + scores[i].fill_(min_score) + scores[i, self.pending_tokens[i][0]] = 0 + + scores[i] = scores[i].to(device) + + return scores + + class _ConditionalSliceVocabLogitsProcessor(LogitsProcessor): + def __init__( + self, + trigger_token_ids: list[int], + vocab_start: int, + vocab_end: int, + other_slices: Optional[list[tuple[int, int]]] = None, + force_greedy: bool = False, + ): + self.trigger_token_ids = set(trigger_token_ids) + self.vocab_start = vocab_start + self.vocab_end = vocab_end + self.other_slices = other_slices or [] + self.force_greedy = force_greedy + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.Tensor: + last_tokens = input_ids[:, -1] + min_score = torch.finfo(scores.dtype).min + for i in range(scores.size(0)): + if last_tokens[i].item() not in self.trigger_token_ids: + continue + original_scores = scores[i].clone() + scores[i].fill_(min_score) + scores[i, self.vocab_start:self.vocab_end] = original_scores[self.vocab_start:self.vocab_end] + for start, end in self.other_slices: + scores[i, start:end] = original_scores[start:end] + if self.force_greedy: + max_token_id = scores[i].argmax().item() + scores[i].fill_(min_score) + scores[i, max_token_id] = 0 + return scores + + def _get_ratio_index_from_token(self, ratio_token_id: int, tokenizer) -> int: + if hasattr(tokenizer, "get_all_ratio_token_ids"): + ratio_token_ids = tokenizer.get_all_ratio_token_ids() + try: + ratio_index = ratio_token_ids.index(ratio_token_id) + except ValueError as exc: + raise ValueError(f"Unknown ratio token id {ratio_token_id}") from exc + else: + ratio_index = ratio_token_id - tokenizer.ratio_token_id(0) + if ratio_index < 0 or ratio_index >= len(self.image_processor.vae_reso_group): + raise ValueError(f"ratio_index {ratio_index} out of range for vae_reso_group") + return ratio_index + + @torch.no_grad() + def generate( + 
self, + inputs: Optional[torch.Tensor] = None, + generation_config: Optional[GenerationConfig] = None, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None, + synced_gpus: Optional[bool] = None, + assistant_model: Optional["PreTrainedModel"] = None, + streamer: Optional["BaseStreamer"] = None, + negative_prompt_ids: Optional[torch.Tensor] = None, + negative_prompt_attention_mask: Optional[torch.Tensor] = None, + use_model_defaults: Optional[bool] = None, + generator: Optional[list[torch.Generator]] = None, + decode_text: bool = False, + verbose: int = 0, + stage_transitions: Optional[list[tuple[int, list[int]]]] = None, + final_stop_tokens: Optional[list[int]] = None, + **kwargs, + ): + gen_config = default(generation_config, self.generation_config) + mode = kwargs.get("mode", "gen_text") + output = kwargs["tokenizer_output"] + indices = torch.where(output.tokens[0] == self._tokenizer.encode("")[0])[0] + if indices.shape[0] > 0: + last_idx = indices[-1] + self.post_token_len = int(output.tokens[0].shape[0] - 1 - last_idx) + else: + self.post_token_len = None + # Log info + if verbose >= 1: + context = self._tokenizer.decode(output.tokens[0], skip_special_tokens=False) + # Replace ... with []{number} + img_token = self._tokenizer.get_img_token() + context = re.sub(f"({img_token})+", lambda m: f"[{img_token}]{{{len(m.group(0)) // 5}}}", context) + info_list = [ + ("token shape", output.tokens.shape), + ("context[0]", context), + ] + if mode == "gen_image": + if generator is not None: + info_list.extend([ + ("seed", [g.initial_seed() for g in generator]), + ]) + info_list.extend([ + ("image_size", + [f"{info.image_height}x{info.image_width}" for info in kwargs["batch_gen_image_info"]]), + ("infer_steps", gen_config.diff_infer_steps), + ("guidance_scale", gen_config.diff_guidance_scale), + ("flow_shift", gen_config.flow_shift), + ]) + else: + info_list.extend([ + ("do_sample", kwargs.get("do_sample", gen_config.do_sample)), + ("max_new_tokens", kwargs.get("max_new_tokens", gen_config.max_new_tokens)), + ("top_k", kwargs.get("top_k", gen_config.top_k)), + ("top_p", kwargs.get("top_p", gen_config.top_p)), + ("temperature", kwargs.get("temperature", gen_config.temperature)), + ("repetition_penalty", kwargs.get("repetition_penalty", gen_config.repetition_penalty)), + ]) + max_key_len = max(len(k) for k, _ in info_list) + info_str = "=" * 50 + \ + "\nModel input info:\n" + \ + "\n".join([f" {k.rjust(max_key_len)}: {v}" for k, v in info_list]) + \ + "\n--------------------------------------------------" + print(info_str, flush=True) + start_time = time.time() + + if mode == "gen_text": + if verbose >= 2 and streamer is None: + streamer = TextStreamer(self._tokenizer, skip_prompt=True, skip_special_tokens=False) # noqa + + with torch.autocast(device_type="cuda", dtype=self.dtype, enabled=self.dtype != torch.float32): + if stage_transitions is not None: + if final_stop_tokens is None: + raise ValueError("`final_stop_tokens` must be provided when `stage_transitions` is set.") + if logits_processor is None: + logits_processor = LogitsProcessorList() + elif not isinstance(logits_processor, LogitsProcessorList): + logits_processor = LogitsProcessorList(logits_processor) + input_ids = kwargs.get("input_ids") + if input_ids is None: + raise ValueError("`input_ids` must be provided for multi-stage generation.") + logits_processor.append( + 
self._StageTransitionLogitsProcessor(stage_transitions, input_ids.shape[0]) + ) + kwargs["eos_token_id"] = final_stop_tokens + + samples = super().generate( + inputs=inputs, + generation_config=gen_config, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + synced_gpus=synced_gpus, + assistant_model=assistant_model, + streamer=streamer, + negative_prompt_ids=negative_prompt_ids, + negative_prompt_attention_mask=negative_prompt_attention_mask, + use_model_defaults=use_model_defaults, + **kwargs, + ) + if decode_text: + samples = self.decode_text(samples, input_length=kwargs["input_ids"].shape[1]) + + elif mode == "gen_image": + batch_gen_image_info: list[ImageInfo] = kwargs.get("batch_gen_image_info") + if batch_gen_image_info is None: + raise ValueError("`batch_gen_image_info` should be provided when `mode` is `gen_image`.") + self.num_image_tokens = (batch_gen_image_info[0].image_token_length) + # + (1 if batch_gen_image_info[0].add_timestep_token else 0) + # + (1 if batch_gen_image_info[0].add_guidance_token else 0) ) + self.num_special_tokens = ((1 if batch_gen_image_info[0].add_timestep_token else 0) + + (1 if batch_gen_image_info[0].add_guidance_token else 0) + + (1 if batch_gen_image_info[0].add_timestep_r_token else 0) ) + results = self.pipeline( + batch_size=len(batch_gen_image_info), + image_size=[batch_gen_image_info[0].image_height, batch_gen_image_info[0].image_width], + num_inference_steps=gen_config.diff_infer_steps, + guidance_scale=gen_config.diff_guidance_scale, + generator=generator, + meanflow=self.config.use_meanflow, + model_kwargs=kwargs, + cfg_distilled = self.config.cfg_distilled, + ) + samples = results[0] + + else: + raise ValueError(f"Unknown mode {mode}, only `gen_text` and `gen_image` are supported.") + + if verbose >= 1: + end_time = time.time() + print(f"Generation completed in {end_time - start_time:.2f} seconds.", flush=True) # noqa + + return samples + + def decode_text(self, output: torch.Tensor, input_length: int = None): + if output.ndim == 2: + assert output.size(0) == 1, "Batch decoding is not supported yet." 
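+            # Decode row-by-row; each row is handled by the 1-D branch below.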
+ return [self.decode_text(output_i, input_length) for output_i in output] + elif output.ndim == 1: + if input_length is not None: + output = output[input_length:] + text = self._tokenizer.decode(output) + return text + else: + raise ValueError(f"output should be 1D or 2D tensor, but got {output.ndim}D tensor.") + + @torch.no_grad() + def generate_image( + self, + prompt=None, + image=None, + message_list=None, + seed=None, + image_size="auto", + use_system_prompt=None, + system_prompt=None, + bot_task=None, + infer_align_image_size=False, + use_taylor_cache=False, + taylor_cache_interval=None, + taylor_cache_order=None, + taylor_cache_enable_first_enhance=None, + taylor_cache_first_enhance_steps=None, + taylor_cache_enable_tailing_enhance=None, + taylor_cache_tailing_enhance_steps=None, + taylor_cache_low_freqs_order=None, + taylor_cache_high_freqs_order=None, + **kwargs, + ): + max_new_tokens = kwargs.pop("max_new_tokens", 2048) + cot_text = kwargs.pop("cot_text", None) + + use_system_prompt = default(use_system_prompt, self.generation_config.use_system_prompt) + bot_task = default(bot_task, self.generation_config.bot_task) + system_prompt = get_system_prompt(use_system_prompt, bot_task, system_prompt) + system_prompt = system_prompt.strip() + + self.taylor_cache_interval = taylor_cache_interval + self.taylor_cache_order = taylor_cache_order + self.taylor_cache_enable_first_enhance = taylor_cache_enable_first_enhance + self.taylor_cache_first_enhance_steps = taylor_cache_first_enhance_steps + self.taylor_cache_enable_tailing_enhance = taylor_cache_enable_tailing_enhance + self.taylor_cache_tailing_enhance_steps = taylor_cache_tailing_enhance_steps + self.taylor_cache_low_freqs_order = taylor_cache_low_freqs_order + self.taylor_cache_high_freqs_order = taylor_cache_high_freqs_order + self.use_taylor_cache = False + + batch_cond_images_cache = None + tkw = self._tokenizer + need_ratio = image_size == "auto" or bot_task == "img_ratio" + if bot_task in ["think", "recaption", "think_recaption"]: + first_bot_task = bot_task.split("_")[0] + stage_transitions = [] + + if first_bot_task == "think" and "recaption" in bot_task: + stage_transitions.append( + (tkw.end_of_think_token_id, [tkw.convert_tokens_to_ids(tkw.recaption_token)]) + ) + + if need_ratio: + answer_prefix_tokens = [] + if getattr(self.generation_config, "sequence_template", "pretrain") == "instruct": + answer_prefix_tokens = [tkw.convert_tokens_to_ids(tkw.answer_token)] + image_base_size = self.image_processor.vae_reso_group.base_size + if "recaption" in bot_task: + transition_id = tkw.end_of_recaption_token_id + else: + transition_id = tkw.end_of_think_token_id + stage_transitions.append( + (transition_id, answer_prefix_tokens + [tkw.boi_token_id, tkw.size_token_id(image_base_size)]) + ) + final_stop_tokens = list(range(tkw.start_ratio_token_id, tkw.end_ratio_token_id + 1)) + for start, end in getattr(tkw, "ratio_token_other_slices", []): + final_stop_tokens.extend(range(start, end)) + else: + if "recaption" in bot_task: + final_stop_tokens = [tkw.end_of_recaption_token_id] + else: + final_stop_tokens = [tkw.end_of_think_token_id, tkw.end_of_recaption_token_id] + + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, message_list=message_list, system_prompt=system_prompt, + max_new_tokens=max_new_tokens, mode="gen_text", bot_task=first_bot_task, + batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + 
logits_processor = None + if need_ratio: + image_base_size = self.image_processor.vae_reso_group.base_size + logits_processor = LogitsProcessorList([ + self._ConditionalSliceVocabLogitsProcessor( + trigger_token_ids=[tkw.size_token_id(image_base_size)], + vocab_start=tkw.start_ratio_token_id, + vocab_end=tkw.end_ratio_token_id + 1, + other_slices=getattr(tkw, "ratio_token_other_slices", []), + force_greedy=True, + ) + ]) + + input_length = model_inputs["input_ids"].shape[1] + if stage_transitions: + outputs = self.generate( + **model_inputs, + decode_text=False, + stage_transitions=stage_transitions, + final_stop_tokens=final_stop_tokens, + logits_processor=logits_processor, + **kwargs, + ) + else: + outputs = self.generate(**model_inputs, decode_text=False, logits_processor=logits_processor, **kwargs) + + generated_tokens = outputs[:, input_length:] + if "recaption" in bot_task: + end_token_id = tkw.end_of_recaption_token_id + else: + end_token_id = tkw.end_of_think_token_id + end_positions = (generated_tokens[0] == end_token_id).nonzero(as_tuple=False) + if end_positions.numel() > 0: + end_pos = end_positions[0].item() + cot_tokens = generated_tokens[0, :end_pos + 1] + else: + cot_tokens = generated_tokens[0] + cot_text_gen = self._tokenizer.decode(cot_tokens) + + if first_bot_task == "think": + cot_text = [tkw.think_token + cot_text_gen] + else: + cot_text = [tkw.recaption_token + cot_text_gen] + + if self.generation_config.drop_think and tkw.think_token in cot_text[0]: + if tkw.recaption_token in cot_text[0]: + recaption_part = cot_text[0].split(tkw.recaption_token)[1] + if tkw.end_of_recaption_token in recaption_part: + recaption_part = recaption_part.split(tkw.end_of_recaption_token)[0] + cot_text = [tkw.recaption_token + recaption_part + tkw.end_of_recaption_token] + + if system_prompt: + system_prompt = get_system_prompt("en_recaption", bot_task) + + if need_ratio: + ratio_token_id = outputs[0, -1].item() # get the original ratio index from the generated tokens + ratio_index = self._get_ratio_index_from_token(ratio_token_id, tkw) + reso = self.image_processor.vae_reso_group[ratio_index] + image_size = reso.height, reso.width + + elif need_ratio: + self.image_processor.build_img_ratio_slice_logits_processor(self.tokenizer) + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, max_new_tokens=1, + system_prompt=system_prompt, seed=seed, mode="gen_text", bot_task="img_ratio", + batch_cond_images=batch_cond_images_cache, infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + outputs = self.generate(**model_inputs, do_sample=False, logits_processor=self.image_processor.img_ratio_slice_logits_processor, **kwargs) + ratio_index = outputs[0, -1].item() + reso = self.image_processor.vae_reso_group[ratio_index] + image_size = reso.height, reso.width + + # Generate image + self.use_taylor_cache = use_taylor_cache + model_inputs = self.prepare_model_inputs( + prompt=prompt, image=image, cot_text=cot_text, message_list=message_list, system_prompt=system_prompt, + seed=seed, image_size=image_size, mode="gen_image", batch_cond_images=batch_cond_images_cache, + infer_align_image_size=infer_align_image_size, + ) + batch_cond_images_cache = model_inputs['batch_cond_images'] + outputs = self.generate(**model_inputs, **kwargs) + self.image_processor.postprocess_outputs( + outputs, + batch_cond_images=batch_cond_images_cache, + infer_align_image_size=infer_align_image_size, + ) + 
        return cot_text, outputs
+
+
+__all__ = [
+    "HunyuanImage3ForCausalMM",
+    "HunyuanImage3Model",
+    "HunyuanImage3PreTrainedModel",
+    "TimestepEmbedder",
+    "UNetDown",
+    "UNetUp",
+    "CachedRoPE",
+    "apply_rotary_pos_emb",
+    "build_batch_2d_rope",
+]
+
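+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch only. The checkpoint path below is a placeholder,
+    # and loading through AutoModelForCausalLM with trust_remote_code is an
+    # assumption about how this remote-code model is exposed; how the tokenizer
+    # and image processor are attached to the model is not shown in this file and
+    # follows the repository's own loading procedure.
+    from transformers import AutoModelForCausalLM
+
+    model = AutoModelForCausalLM.from_pretrained(
+        "path/to/HunyuanImage-3.0",  # hypothetical local checkpoint directory
+        trust_remote_code=True,
+        torch_dtype="auto",
+        device_map="auto",
+    )
+    # `generate_image` returns the optional think/recaption text and the
+    # post-processed image outputs (exact return type depends on the image processor).
+    cot_text, images = model.generate_image(
+        prompt="A watercolor lighthouse at dawn",
+        seed=42,
+        image_size="auto",   # let the model predict an aspect-ratio token
+        bot_task="think",    # run the think stage before image synthesis
+        verbose=1,
+    )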