avtc committed on
Commit
83685eb
·
verified ·
1 Parent(s): 80c6cfe

Upload folder using huggingface_hub

Browse files
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. __init__.py +26 -0
  3. added_tokens.json +56 -0
  4. chat_template.jinja +159 -0
  5. config.json +147 -0
  6. configuration_minimax_m2.py +131 -0
  7. generation_config.json +5 -0
  8. merges.txt +0 -0
  9. model-00001-of-00033.safetensors +3 -0
  10. model-00002-of-00033.safetensors +3 -0
  11. model-00003-of-00033.safetensors +3 -0
  12. model-00004-of-00033.safetensors +3 -0
  13. model-00005-of-00033.safetensors +3 -0
  14. model-00006-of-00033.safetensors +3 -0
  15. model-00007-of-00033.safetensors +3 -0
  16. model-00008-of-00033.safetensors +3 -0
  17. model-00009-of-00033.safetensors +3 -0
  18. model-00010-of-00033.safetensors +3 -0
  19. model-00011-of-00033.safetensors +3 -0
  20. model-00012-of-00033.safetensors +3 -0
  21. model-00013-of-00033.safetensors +3 -0
  22. model-00014-of-00033.safetensors +3 -0
  23. model-00015-of-00033.safetensors +3 -0
  24. model-00016-of-00033.safetensors +3 -0
  25. model-00017-of-00033.safetensors +3 -0
  26. model-00018-of-00033.safetensors +3 -0
  27. model-00019-of-00033.safetensors +3 -0
  28. model-00020-of-00033.safetensors +3 -0
  29. model-00021-of-00033.safetensors +3 -0
  30. model-00022-of-00033.safetensors +3 -0
  31. model-00023-of-00033.safetensors +3 -0
  32. model-00024-of-00033.safetensors +3 -0
  33. model-00025-of-00033.safetensors +3 -0
  34. model-00026-of-00033.safetensors +3 -0
  35. model-00027-of-00033.safetensors +3 -0
  36. model-00028-of-00033.safetensors +3 -0
  37. model-00029-of-00033.safetensors +3 -0
  38. model-00030-of-00033.safetensors +3 -0
  39. model-00031-of-00033.safetensors +3 -0
  40. model-00032-of-00033.safetensors +3 -0
  41. model-00033-of-00033.safetensors +3 -0
  42. model.safetensors.index.json +3 -0
  43. modeling_minimax_m2.py +843 -0
  44. quant_log.csv +0 -0
  45. quantize_config.json +27 -0
  46. special_tokens_map.json +76 -0
  47. test_minimax_m2_hf.py +178 -0
  48. tokenizer.json +3 -0
  49. tokenizer_config.json +498 -0
  50. vocab.json +0 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2
+ # SPDX-FileCopyrightText: 2024-2025 [email protected]
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Contact: [email protected], x.com/qubitium
5
+ #
6
+ # """MiniMax M2 Hugging Face remote code support."""
7
+
8
+ from .configuration_minimax_m2 import MiniMaxM2Config
9
+ from .modeling_minimax_m2 import (
10
+ MiniMaxForCausalLM,
11
+ MiniMaxM2ForCausalLM,
12
+ MiniMaxM2Model,
13
+ MiniMaxM2PreTrainedModel,
14
+ MiniMaxModel,
15
+ MiniMaxPreTrainedModel,
16
+ )
17
+
18
+ __all__ = [
19
+ "MiniMaxM2Config",
20
+ "MiniMaxM2PreTrainedModel",
21
+ "MiniMaxM2Model",
22
+ "MiniMaxM2ForCausalLM",
23
+ "MiniMaxPreTrainedModel",
24
+ "MiniMaxModel",
25
+ "MiniMaxForCausalLM",
26
+ ]
added_tokens.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</minimax:tool_call>": 200053,
3
+ "</think>": 200051,
4
+ "<add_file>": 200036,
5
+ "<code_context>": 200043,
6
+ "<code_interpreter>": 200023,
7
+ "<commit_after>": 200018,
8
+ "<commit_before>": 200016,
9
+ "<commit_message>": 200040,
10
+ "<commit_msg>": 200017,
11
+ "<delete_file>": 200037,
12
+ "<edit_file>": 200039,
13
+ "<empty_output>": 200015,
14
+ "<empty_source_file>": 200041,
15
+ "<file_content>": 200044,
16
+ "<file_sep>": 200049,
17
+ "<filename>": 200006,
18
+ "<filepath>": 200048,
19
+ "<fim_middle>": 200002,
20
+ "<fim_pad>": 200004,
21
+ "<fim_prefix>": 200001,
22
+ "<fim_suffix>": 200003,
23
+ "<function_call>": 200022,
24
+ "<gh_stars>": 200007,
25
+ "<issue_closed>": 200010,
26
+ "<issue_comment>": 200009,
27
+ "<issue_start>": 200008,
28
+ "<jupyter_code>": 200013,
29
+ "<jupyter_error>": 200035,
30
+ "<jupyter_output>": 200014,
31
+ "<jupyter_start>": 200011,
32
+ "<jupyter_text>": 200012,
33
+ "<minimax:tool_call>": 200052,
34
+ "<pr_start>": 200046,
35
+ "<rename_file>": 200038,
36
+ "<repo_struct>": 200042,
37
+ "<reponame>": 200005,
38
+ "<review_comment>": 200047,
39
+ "<source_files>": 200045,
40
+ "<think>": 200050,
41
+ "[e~[": 200020,
42
+ "]!d~[": 200021,
43
+ "]!p~[": 200000,
44
+ "]<]end of image[>[": 200030,
45
+ "]<]end of speech[>[": 200028,
46
+ "]<]end of video[>[": 200032,
47
+ "]<]image[>[": 200025,
48
+ "]<]speech[>[": 200024,
49
+ "]<]start of image[>[": 200029,
50
+ "]<]start of speech[>[": 200027,
51
+ "]<]start of video[>[": 200031,
52
+ "]<]video[>[": 200026,
53
+ "]<]vision pad[>[": 200033,
54
+ "]~!b[": 200034,
55
+ "]~b]": 200019
56
+ }
chat_template.jinja ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# ------------- special token variables ------------- #}
2
+ {%- set toolcall_begin_token = '<minimax:tool_call>' -%}
3
+ {%- set toolcall_end_token = '</minimax:tool_call>' -%}
4
+ {#- Tool Rendering Functions ============================================== -#}
5
+ {%- macro render_tool_namespace(namespace_name, tool_list) -%}
6
+ {%- for tool in tool_list -%}
7
+ <tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
8
+ {% endfor -%}
9
+ {%- endmacro -%}
10
+ {%- macro visible_text(content) -%}
11
+ {%- if content is string -%}
12
+ {{ content }}
13
+ {%- elif content is iterable and content is not mapping -%}
14
+ {%- for item in content -%}
15
+ {%- if item is mapping and item.type == 'text' -%}
16
+ {{- item.text }}
17
+ {%- elif item is string -%}
18
+ {{- item }}
19
+ {%- endif -%}
20
+ {%- endfor -%}
21
+ {%- else -%}
22
+ {{- content }}
23
+ {%- endif -%}
24
+ {%- endmacro -%}
25
+ {#- System Message Construction ============================================ -#}
26
+ {%- macro build_system_message(system_message) -%}
27
+ {%- if system_message and system_message.content -%}
28
+ {{- visible_text(system_message.content) }}
29
+ {%- else -%}
30
+ {%- if model_identity is not defined -%}
31
+ {%- set model_identity = "You are a helpful assistant." -%}
32
+ {%- endif -%}
33
+ {{- model_identity }}
34
+ {%- endif -%}
35
+
36
+ {#- Handle current_date -#}
37
+ {%- if system_message and system_message.current_date -%}
38
+ {{- '\n' ~ 'Current date: ' + system_message.current_date }}
39
+ {%- endif -%}
40
+ {#- Handle current_location -#}
41
+ {%- if system_message and system_message.current_location -%}
42
+ {{- '\n' ~ 'Current location: ' + system_message.current_location }}
43
+ {%- endif -%}
44
+ {%- endmacro -%}
45
+ {#- Main Template Logic ================================================= -#}
46
+ {#- Extract system message (only first message if it's system) -#}
47
+ {%- set system_message = none -%}
48
+ {%- set conversation_messages = messages -%}
49
+ {%- if messages and messages[0].role == "system" -%}
50
+ {%- set system_message = messages[0] -%}
51
+ {%- set conversation_messages = messages[1:] -%}
52
+ {%- endif -%}
53
+ {#- Get the last user message turn, for interleaved thinking -#}
54
+ {%- set ns = namespace(last_user_index=-1) %}
55
+ {% for m in conversation_messages %}
56
+ {%- if m.role == 'user' %}
57
+ {% set ns.last_user_index = loop.index0 -%}
58
+ {%- endif %}
59
+ {%- endfor %}
60
+ {#- Render system message -#}
61
+ {{- ']~!b[' ~ ']~b]system' ~ '\n' }}
62
+ {{- build_system_message(system_message) }}
63
+ {#- Render tools if available -#}
64
+ {%- if tools -%}
65
+ {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
66
+ {{- '\n' ~ '<tools>' ~ '\n' }}
67
+ {{- render_tool_namespace("functions", tools) }}
68
+ {{- '</tools>' ~ '\n\n' }}
69
+ {{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
70
+ {{- '\n' ~ toolcall_begin_token }}
71
+ <invoke name="tool-name-1">
72
+ <parameter name="param-key-1">param-value-1</parameter>
73
+ <parameter name="param-key-2">param-value-2</parameter>
74
+ ...
75
+ </invoke>
76
+ {{- '\n' ~ toolcall_end_token }}
77
+ {%- endif -%}
78
+ {{- '[e~[\n' }}
79
+
80
+ {#- Render messages -#}
81
+ {%- set last_tool_call = namespace(name=none) -%}
82
+ {%- for message in conversation_messages -%}
83
+ {%- if message.role == 'assistant' -%}
84
+ {#- Only render reasoning_content if no user message follows -#}
85
+ {{- ']~b]ai' ~ '\n' }}
86
+
87
+ {%- set reasoning_content = '' %}
88
+ {%- set content = visible_text(message.content) %}
89
+ {%- if message.reasoning_content is string %}
90
+ {%- set reasoning_content = message.reasoning_content %}
91
+ {%- else %}
92
+ {%- if '</think>' in content %}
93
+ {%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
94
+ {%- set content = content.split('</think>')[-1].strip('\n') %}
95
+ {%- endif %}
96
+ {%- endif %}
97
+ {%- if reasoning_content and loop.index0 > ns.last_user_index -%}
98
+ {{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
99
+ {%- endif -%}
100
+ {%- if content -%}
101
+ {{- content }}
102
+ {%- endif -%}
103
+ {%- if message.tool_calls -%}
104
+ {{- '\n' ~ toolcall_begin_token ~ '\n' }}
105
+
106
+ {%- for tool_call in message.tool_calls -%}
107
+ {%- if tool_call.function %}
108
+ {%- set tool_call = tool_call.function %}
109
+ {%- endif %}
110
+ {{- '<invoke name="' + tool_call.name + '">' }}
111
+ {% set _args = tool_call.arguments %}
112
+ {%- for k, v in _args.items() %}
113
+ {{- '<parameter name="' + k + '">' }}
114
+ {{- v | tojson(ensure_ascii=False) if v is not string else v }}
115
+ {{- '</parameter>' }}
116
+ {% endfor %}
117
+ {{- '</invoke>' ~ '\n' }}
118
+ {%- endfor -%}
119
+
120
+ {{- toolcall_end_token}}
121
+ {%- set last_tool_call.name = message.tool_calls[-1].name -%}
122
+ {%- else -%}
123
+ {%- set last_tool_call.name = none -%}
124
+ {%- endif -%}
125
+ {{- '[e~[' ~ '\n' }}
126
+
127
+ {%- elif message.role == 'tool' -%}
128
+ {%- if last_tool_call.name is none -%}
129
+ {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
130
+ {%- endif -%}
131
+ {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
132
+ {{- ']~b]tool' }}
133
+ {%- endif -%}
134
+ {%- if message.content is string -%}
135
+ {{- '\n<response>' }}
136
+ {{- message.content }}
137
+ {{- '</response>' }}
138
+ {%- else -%}
139
+ {%- for tr in message.content -%}
140
+ {{- '\n<response>' }}
141
+ {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
142
+ {{- '\n</response>' }}
143
+ {%- endfor -%}
144
+ {%- endif -%}
145
+ {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
146
+ {{- '[e~[\n' -}}
147
+ {%- endif -%}
148
+
149
+ {%- elif message.role == 'user' -%}
150
+ {{- ']~b]user' ~ '\n' }}
151
+ {{- visible_text(message.content) }}
152
+ {{- '[e~[' ~ '\n' }}
153
+ {%- endif -%}
154
+ {%- endfor -%}
155
+
156
+ {#- Generation prompt -#}
157
+ {%- if add_generation_prompt -%}
158
+ {{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
159
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MiniMaxM2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_type_list": [
7
+ 1,
8
+ 1,
9
+ 1,
10
+ 1,
11
+ 1,
12
+ 1,
13
+ 1,
14
+ 1,
15
+ 1,
16
+ 1,
17
+ 1,
18
+ 1,
19
+ 1,
20
+ 1,
21
+ 1,
22
+ 1,
23
+ 1,
24
+ 1,
25
+ 1,
26
+ 1,
27
+ 1,
28
+ 1,
29
+ 1,
30
+ 1,
31
+ 1,
32
+ 1,
33
+ 1,
34
+ 1,
35
+ 1,
36
+ 1,
37
+ 1,
38
+ 1,
39
+ 1,
40
+ 1,
41
+ 1,
42
+ 1,
43
+ 1,
44
+ 1,
45
+ 1,
46
+ 1,
47
+ 1,
48
+ 1,
49
+ 1,
50
+ 1,
51
+ 1,
52
+ 1,
53
+ 1,
54
+ 1,
55
+ 1,
56
+ 1,
57
+ 1,
58
+ 1,
59
+ 1,
60
+ 1,
61
+ 1,
62
+ 1,
63
+ 1,
64
+ 1,
65
+ 1,
66
+ 1,
67
+ 1,
68
+ 1
69
+ ],
70
+ "attn_window_size": null,
71
+ "auto_map": {
72
+ "AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
73
+ "AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
74
+ },
75
+ "dtype": "bfloat16",
76
+ "head_dim": 128,
77
+ "hidden_act": "silu",
78
+ "hidden_size": 3072,
79
+ "initializer_range": 0.02,
80
+ "intermediate_size": 1536,
81
+ "layernorm_full_attention_beta": 1.0,
82
+ "layernorm_linear_attention_beta": 1.0,
83
+ "layernorm_mlp_beta": 1.0,
84
+ "max_model_len": null,
85
+ "max_position_embeddings": 196608,
86
+ "mlp_intermediate_size": 8192,
87
+ "model_type": "minimax",
88
+ "mtp_transformer_layers": 1,
89
+ "num_attention_heads": 48,
90
+ "num_expert_group": null,
91
+ "num_experts_per_tok": 8,
92
+ "num_hidden_layers": 62,
93
+ "num_key_value_heads": 8,
94
+ "num_local_experts": 256,
95
+ "num_mtp_modules": 3,
96
+ "output_router_logits": false,
97
+ "partial_rotary_factor": 0.5,
98
+ "qk_norm_type": "per_layer",
99
+ "quantization_config": {
100
+ "bits": 4,
101
+ "checkpoint_format": "gptq",
102
+ "desc_act": false,
103
+ "dynamic": {
104
+ "-:.*self_attn": {}
105
+ },
106
+ "group_size": 32,
107
+ "lm_head": false,
108
+ "meta": {
109
+ "act_group_aware": true,
110
+ "damp_auto_increment": 0.01,
111
+ "damp_percent": 0.01,
112
+ "mse": 0.0,
113
+ "quantizer": [
114
+ "gptqmodel:5.0.0-dev0"
115
+ ],
116
+ "static_groups": false,
117
+ "true_sequential": true,
118
+ "uri": "https://github.com/modelcloud/gptqmodel",
119
+ "v2": false,
120
+ "v2_alpha": 0.25
121
+ },
122
+ "pack_dtype": "int32",
123
+ "quant_method": "gptq",
124
+ "sym": true
125
+ },
126
+ "rms_norm_eps": 1e-06,
127
+ "rope_scaling": null,
128
+ "rope_theta": 5000000,
129
+ "rotary_dim": 64,
130
+ "routed_scaling_factor": 1.0,
131
+ "router_aux_loss_coef": 0.001,
132
+ "router_jitter_noise": 0.0,
133
+ "scoring_func": "sigmoid",
134
+ "shared_intermediate_size": 0,
135
+ "shared_moe_mode": "sigmoid",
136
+ "sliding_window": null,
137
+ "swa_rope_theta": -1.0,
138
+ "tie_word_embeddings": false,
139
+ "topk_group": null,
140
+ "transformers_version": "4.57.1",
141
+ "use_cache": true,
142
+ "use_grouped_topk": true,
143
+ "use_mtp": true,
144
+ "use_qk_norm": true,
145
+ "use_routing_bias": true,
146
+ "vocab_size": 200064
147
+ }
configuration_minimax_m2.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2
+ # SPDX-FileCopyrightText: 2024-2025 [email protected]
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Contact: [email protected], x.com/qubitium
5
+
6
+ """Configuration for the MiniMax M2 architecture."""
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import List, Optional, Union
11
+
12
+ from transformers.configuration_utils import PretrainedConfig
13
+
14
+
15
class MiniMaxM2Config(PretrainedConfig):
    """Model configuration for the MiniMax M2 mixture-of-experts transformer.

    Stores the architecture hyper-parameters (hidden sizes, attention layout,
    MoE routing options, rotary-embedding geometry, MTP settings) consumed by
    ``modeling_minimax_m2``. All arguments are keyword-compatible with the
    values serialized in ``config.json``.
    """

    model_type = "minimax"

    def __init__(
        self,
        vocab_size: int = 200_064,
        hidden_size: int = 3_072,
        intermediate_size: int = 1_536,
        mlp_intermediate_size: int = 8_192,
        num_hidden_layers: int = 62,
        num_attention_heads: int = 48,
        num_key_value_heads: int = 8,
        head_dim: Optional[int] = 128,
        num_local_experts: int = 256,
        num_experts_per_tok: int = 8,
        attn_type_list: Optional[List[int]] = None,
        attention_dropout: float = 0.0,
        hidden_act: str = "silu",
        rms_norm_eps: float = 1e-6,
        max_position_embeddings: int = 196_608,
        rope_theta: float = 5_000_000.0,
        rotary_dim: int = 64,
        rope_scaling: Optional[dict] = None,
        use_qk_norm: bool = True,
        qk_norm_type: str = "per_layer",
        use_routing_bias: bool = True,
        scoring_func: str = "sigmoid",
        router_aux_loss_coef: float = 0.001,
        router_jitter_noise: float = 0.0,
        output_router_logits: bool = False,
        use_grouped_topk: bool = True,
        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
        routed_scaling_factor: float = 1.0,
        layernorm_full_attention_beta: float = 1.0,
        layernorm_linear_attention_beta: float = 1.0,
        layernorm_mlp_beta: float = 1.0,
        shared_intermediate_size: int = 0,
        shared_moe_mode: str = "sigmoid",
        use_mtp: bool = True,
        num_mtp_modules: int = 3,
        mtp_transformer_layers: int = 1,
        attn_window_size: Optional[Union[int, List[int]]] = None,
        swa_rope_theta: float = -1.0,
        sliding_window: Optional[int] = None,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = False,
        max_model_len: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        use_cache: bool = True,
        **kwargs,
    ) -> None:
        # Pop these before the parent constructor consumes **kwargs so they can
        # be re-attached verbatim below (PretrainedConfig would otherwise
        # handle them with its own semantics).
        quantization_config = kwargs.pop("quantization_config", None)
        transformers_version = kwargs.pop("transformers_version", None)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            **kwargs,
        )

        # --- core transformer dimensions ---
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.mlp_intermediate_size = mlp_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # Fall back to the conventional per-head size when head_dim is unset/0.
        self.head_dim = head_dim or hidden_size // num_attention_heads

        # --- mixture-of-experts routing ---
        self.num_local_experts = num_local_experts
        self.num_experts_per_tok = num_experts_per_tok
        # Default: every layer uses full attention (type 1).
        self.attn_type_list = attn_type_list or [1] * num_hidden_layers
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps

        # --- rotary position embeddings ---
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rotary_dim = rotary_dim
        self.rope_scaling = rope_scaling

        # --- QK-norm and router behavior ---
        self.use_qk_norm = use_qk_norm
        self.qk_norm_type = qk_norm_type
        self.use_routing_bias = use_routing_bias
        self.scoring_func = scoring_func
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        self.output_router_logits = output_router_logits
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.routed_scaling_factor = routed_scaling_factor

        # --- residual/layernorm scaling betas ---
        self.layernorm_full_attention_beta = layernorm_full_attention_beta
        self.layernorm_linear_attention_beta = layernorm_linear_attention_beta
        self.layernorm_mlp_beta = layernorm_mlp_beta

        # --- shared expert / multi-token prediction ---
        self.shared_intermediate_size = shared_intermediate_size
        self.shared_moe_mode = shared_moe_mode
        self.use_mtp = use_mtp
        self.num_mtp_modules = num_mtp_modules
        self.mtp_transformer_layers = mtp_transformer_layers

        # --- sliding-window attention ---
        self.attn_window_size = attn_window_size
        self.swa_rope_theta = swa_rope_theta
        self.sliding_window = sliding_window

        self.initializer_range = initializer_range
        self.max_model_len = max_model_len
        self.use_cache = use_cache

        # Convenient accessor used by the rotary embedding helper.
        self.partial_rotary_factor = float(self.rotary_dim) / float(self.head_dim)
        if quantization_config is not None:
            self.quantization_config = quantization_config
        self.transformers_version = transformers_version
129
+
130
+
131
+ __all__ = ["MiniMaxM2Config"]
generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "top_k": 40,
4
+ "transformers_version": "4.57.1"
5
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0d505fe80382b35a99d7859384ff7d7dda1405064814454b3276965792a0f42
3
+ size 4278383871
model-00002-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b069194c6331a3ec757baed2327bec020bba55a270113231a6415d0b5a5ff5f9
3
+ size 4293815342
model-00003-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baf552df9e60ffcf767ff837719feab6e2e626ef7d803ed52de5c1c394e31eeb
3
+ size 4294262965
model-00004-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6efdb4fae1c827e418e057e858cb079daeb4212c00177ddbd3ebdac499b5c569
3
+ size 4294256821
model-00005-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67cbff78bb3b5f1deac16772922914e6c2f4e68119b15740172b192bbb05f5db
3
+ size 4294256821
model-00006-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03787e807b9c0a845791354cd13471089812c608cfe2ab91f53d3ac7e3d74985
3
+ size 4294263087
model-00007-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39eca192d119daf9d11035c033fd4c16d5c47cbae525f7c97ef9c5de687c129a
3
+ size 4294258400
model-00008-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8bf1b24512cf51b7f59f579a177684f7b6bd2ec230fb20f174ad7a7ffe51193
3
+ size 4294263223
model-00009-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2cf119cc0c3e4083726545d857a637c82b540d4934bda9ec432c83d1d82dcd
3
+ size 4294269367
model-00010-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4827b9c3a7e1dbc5ba2f218911bd1008988b6d657ebbbf6d746944480b4e64f7
3
+ size 4294263221
model-00011-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82bcc01bd984477f24002a0f17a226ba77d5208eef4de959bfe5832826f8b310
3
+ size 4294263221
model-00012-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ac2f442b66f840a2d8f76826c3cd9537eedd29373fc794210f3844640c1f10c
3
+ size 4294269365
model-00013-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40210fa4c5eb5fe1c55d78238702aa796bdabb3b8b365c5d3fd059685da12dc3
3
+ size 4294263221
model-00014-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9f53fa80b8c7880ff2773fa065ada21494eb66b74495cb6cebbfcf82738f51
3
+ size 4294263221
model-00015-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0569ef9e36468cd087baf26e6ff6be9bd2803cfa6a1c68e6858f8ecffbd7e3b
3
+ size 4294269365
model-00016-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f411f6798f66ed49b1c4802ed633a06564ab6a9ad2a4597b97b61e4adb190cf
3
+ size 4294263221
model-00017-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3f2a12e333cac19054e6f85fa3015c0fd5c765d807f12677790790374e40d19
3
+ size 4294263221
model-00018-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e8054767737b7831e4d6e3ae6127b278f0fc257d65d3a42c355656cab597faf
3
+ size 4294269365
model-00019-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c754a4022a3a819e1f789cb5d82bf7fa9ae0af5d946295cf952dfb771e24ce7b
3
+ size 4294263221
model-00020-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:835197416f0e33e14dbd03e2f3e32b3200a630ee6967f22ab0401c78f14a49cc
3
+ size 4294263379
model-00021-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5125dec669fc82eed7c98a60d5891f4b6af0808e489291afe96bdbb26a1087f8
3
+ size 4294269161
model-00022-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:880bcd6b2b33484e60c109345e585a68f78aa6d3af1f9a24d660537d1c6eb29b
3
+ size 4294263095
model-00023-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:496fdf032750c6e77e7378c64890c35d0585c08d427530e47f39cb99f41c5382
3
+ size 4294263095
model-00024-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd79b9693394711ea6568859ccd2c97a5713e4d1452489d6d18e9228bb0c1fac
3
+ size 4294269239
model-00025-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df4b971ef373c5fca6f3fbd22259e33ac6ce7e555fadebfa6e1d2a73bda08bef
3
+ size 4294263095
model-00026-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:742241f5c9d1f390a852fb8959843e19755a64a4faef8719fe153cfbd325d3fe
3
+ size 4294263095
model-00027-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c53a58a4752575f2a8569111aeb7c571f907e6d72c53e56410a5ecc84c1ef0c6
3
+ size 4294269239
model-00028-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:117ee235261df56dda987e2d542c263cb2167ad619963d801e5f8d4b0a4c164e
3
+ size 4294263095
model-00029-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f47c62161256fe2bd48acf3456f7921cfbe8cabc0fd2b0499784eb434f4f94b3
3
+ size 4294263095
model-00030-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88360d0aad2edf9c99be71e421993832b58b1d2f614907756763def8d75f91f
3
+ size 4294269361
model-00031-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce9d25d5e30662c2768358fde7312c8c3224beea6cac8fd3c2c1554f2097cf39
3
+ size 4294263223
model-00032-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9c0524572bdb09fa0a0ee1f5fc22a97b6d923de4b0c07ab4f3901a739ea5a04
3
+ size 4294263223
model-00033-of-00033.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc2164af38e9eb6290d44668c5c16977c9888da1132eee5cd5753080a182457e
3
+ size 1023964621
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24c77bb30c16a5837aeb31b288170eb7227b8a64cd667e117a2f1cef4aa4ad12
3
+ size 18611843
modeling_minimax_m2.py ADDED
@@ -0,0 +1,843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2
+ # SPDX-FileCopyrightText: 2024-2025 [email protected]
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Contact: [email protected], x.com/qubitium
5
+
6
+ """PyTorch implementation of the MiniMax M2 architecture for Hugging Face Transformers."""
7
+
8
+ from __future__ import annotations
9
+
10
+ import copy
11
+ import time
12
+ from typing import Optional, Tuple, Union
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+ from torch import nn
17
+
18
+ from transformers.activations import ACT2FN
19
+ from transformers.cache_utils import Cache, DynamicCache
20
+ from transformers.generation import GenerationMixin
21
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
22
+ from transformers.modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
23
+ from transformers.modeling_utils import PreTrainedModel
24
+ from transformers.utils import logging
25
+
26
+ from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, repeat_kv, rotate_half
27
+
28
+ from .configuration_minimax_m2 import MiniMaxM2Config
29
+
30
+ logger = logging.get_logger(__name__)
31
+
32
+ _CONFIG_FOR_DOC = "MiniMaxM2Config"
33
+ _CHECKPOINT_FOR_DOC = "MiniMaxAI/MiniMax-M2"
34
+
35
+
36
+ def load_balancing_loss_func(
37
+ gate_logits: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
38
+ num_experts: int,
39
+ top_k: int,
40
+ attention_mask: Optional[torch.Tensor] = None,
41
+ ) -> torch.Tensor:
42
+ if gate_logits is None:
43
+ return torch.tensor(0.0)
44
+ if isinstance(gate_logits, torch.Tensor):
45
+ logits = gate_logits
46
+ else:
47
+ logits = torch.cat([layer_gate.to(gate_logits[0].device) for layer_gate in gate_logits], dim=0)
48
+
49
+ routing_weights = torch.softmax(logits, dim=-1, dtype=torch.float32)
50
+ _, selected = torch.topk(routing_weights, top_k, dim=-1)
51
+ expert_mask = torch.nn.functional.one_hot(selected, num_experts)
52
+
53
+ if attention_mask is None:
54
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
55
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
56
+ else:
57
+ batch_size, seq_len = attention_mask.shape
58
+ num_layers = logits.shape[0] // (batch_size * seq_len)
59
+
60
+ expanded_mask = (
61
+ attention_mask[None, :, :, None, None]
62
+ .expand(num_layers, batch_size, seq_len, top_k, num_experts)
63
+ .reshape(-1, top_k, num_experts)
64
+ .to(logits.device)
65
+ )
66
+ tokens_per_expert = torch.sum(expert_mask.float() * expanded_mask, dim=0) / torch.sum(expanded_mask, dim=0)
67
+
68
+ router_mask = (
69
+ attention_mask[None, :, :, None]
70
+ .expand(num_layers, batch_size, seq_len, num_experts)
71
+ .reshape(-1, num_experts)
72
+ .to(logits.device)
73
+ )
74
+ router_prob_per_expert = torch.sum(routing_weights * router_mask, dim=0) / torch.sum(router_mask, dim=0)
75
+
76
+ loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
77
+ return loss * num_experts
78
+
79
+
80
def apply_rotary_pos_emb_partial(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
    rotary_dim: int,
    unsqueeze_dim: int = 2,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply rotary position embeddings to the first ``rotary_dim`` channels of q/k.

    The channels from ``rotary_dim`` onward pass through unchanged (partial RoPE).
    ``cos``/``sin`` get a broadcast axis inserted at ``unsqueeze_dim`` (default 2,
    the head axis of a ``(batch, seq, heads, head_dim)`` layout).
    """

    def _rotate_half(x: torch.Tensor) -> torch.Tensor:
        # Same transform as transformers' rotate_half: swap halves, negate the second.
        half = x.shape[-1] // 2
        return torch.cat((-x[..., half:], x[..., :half]), dim=-1)

    cos_r = cos.unsqueeze(unsqueeze_dim)[..., :rotary_dim]
    sin_r = sin.unsqueeze(unsqueeze_dim)[..., :rotary_dim]

    q_rotated = q[..., :rotary_dim] * cos_r + _rotate_half(q[..., :rotary_dim]) * sin_r
    k_rotated = k[..., :rotary_dim] * cos_r + _rotate_half(k[..., :rotary_dim]) * sin_r

    q_out = torch.cat((q_rotated, q[..., rotary_dim:]), dim=-1)
    k_out = torch.cat((k_rotated, k[..., rotary_dim:]), dim=-1)
    return q_out, k_out
99
+
100
+
101
class MiniMaxM2RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learned per-channel gain.

    Statistics are computed in float32 for stability; the result is cast back
    to the input dtype.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        original_dtype = hidden_states.dtype
        states = hidden_states.to(torch.float32)
        mean_square = states.pow(2).mean(dim=-1, keepdim=True)
        normalized = states * torch.rsqrt(mean_square + self.variance_epsilon)
        return (self.weight * normalized).to(original_dtype)
113
+
114
+
115
class MiniMaxM2MLP(nn.Module):
    """SwiGLU-style feed-forward block: ``w2(act(w1(x)) * w3(x))``."""

    def __init__(self, config: MiniMaxM2Config) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Gate / down / up projections, all bias-free.
        self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        gated = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        return self.w2(gated)
132
+
133
+
134
class MiniMaxM2SparseMoeBlock(nn.Module):
    """Sparse Mixture-of-Experts FFN block with optional bias-corrected routing.

    Routing follows the DeepSeek-V3-style scheme also used by vLLM/SGLang:
    ``e_score_correction_bias`` (and, optionally, grouped top-k restriction) only
    influences which experts are *selected*; the *combination weights* of the
    selected experts always come from the unbiased scores.
    """

    def __init__(self, config: MiniMaxM2Config) -> None:
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.experts = nn.ModuleList([MiniMaxM2MLP(config) for _ in range(config.num_local_experts)])
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok
        self.jitter_noise = config.router_jitter_noise
        self.use_routing_bias = config.use_routing_bias
        self.scoring_func = getattr(config, "scoring_func", "softmax")
        self.use_grouped_topk = getattr(config, "use_grouped_topk", False)
        self.num_expert_group = getattr(config, "num_expert_group", None)
        self.topk_group = getattr(config, "topk_group", None)
        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)

        # Normalize grouped-topk hyper-parameters so forward() can rely on them.
        if self.use_grouped_topk:
            if self.num_expert_group is None or self.num_expert_group <= 0:
                self.num_expert_group = 1
            if self.topk_group is None or self.topk_group <= 0:
                self.topk_group = min(self.num_expert_group, self.top_k)
        else:
            self.num_expert_group = 1
            self.topk_group = 1

        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
        if self.use_routing_bias:
            self.e_score_correction_bias = nn.Parameter(torch.zeros(self.num_experts, dtype=torch.float32))
        else:
            self.register_parameter("e_score_correction_bias", None)

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Route each token to ``top_k`` experts and mix their outputs.

        Args:
            hidden_states: ``(batch, seq_len, hidden_dim)`` activations. NOTE: the
                tensor is mutated in place by jitter noise during training.

        Returns:
            Tuple of the mixed output ``(batch, seq_len, hidden_dim)`` and the raw
            router logits ``(batch * seq_len, num_experts)``.
        """
        batch_size, seq_len, hidden_dim = hidden_states.shape
        if self.training and self.jitter_noise > 0:
            noise = torch.empty_like(hidden_states).uniform_(
                1.0 - self.jitter_noise,
                1.0 + self.jitter_noise,
            )
            hidden_states.mul_(noise)
            del noise

        hidden_states = hidden_states.view(-1, hidden_dim)
        gate_dtype = self.gate.weight.dtype
        router_logits = self.gate(hidden_states.to(gate_dtype)).to(torch.float32)
        if self.e_score_correction_bias is not None:
            # Bias is applied after scoring (see vLLM/SGLang implementations).
            correction_bias = self.e_score_correction_bias.to(router_logits.device, router_logits.dtype)
        else:
            correction_bias = None

        if self.scoring_func == "sigmoid":
            scores = torch.sigmoid(router_logits)
        elif self.scoring_func == "softmax":
            scores = torch.softmax(router_logits, dim=-1)
        else:
            raise ValueError(f"Unsupported scoring function: {self.scoring_func}")

        original_scores = scores
        if correction_bias is not None:
            # BUGFIX: add the bias OUT-OF-PLACE. The previous in-place
            # ``scores.add_(correction_bias)`` also mutated ``original_scores``
            # (it aliased the same tensor), so the final routing weights were
            # gathered from the *biased* scores instead of the raw ones.
            scores = scores + correction_bias

        topk_scores: torch.Tensor
        if self.use_grouped_topk and self.num_expert_group > 1:
            # Restrict selection to the best ``topk_group`` expert groups first.
            experts_per_group = scores.size(-1) // self.num_expert_group
            scores_grouped = scores.view(scores.size(0), self.num_expert_group, experts_per_group)
            if correction_bias is not None:
                # Group quality = sum of the two best (biased) scores in the group.
                topk_in_group = min(2, experts_per_group)
                if topk_in_group > 0:
                    group_scores = scores_grouped.topk(topk_in_group, dim=-1)[0].sum(dim=-1)
                else:
                    group_scores = torch.zeros_like(scores_grouped[..., 0])
            else:
                group_scores = scores_grouped.max(dim=-1).values
            group_mask = torch.zeros_like(group_scores)
            selected_groups = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=True).indices
            group_mask.scatter_(1, selected_groups, 1.0)
            mask = group_mask.unsqueeze(-1).expand(-1, -1, experts_per_group).reshape(scores.size())
            masked_scores = scores.masked_fill(mask == 0, float("-inf"))
            topk_scores, selected_experts = torch.topk(masked_scores, self.top_k, dim=-1, sorted=True)
        else:
            topk_scores, selected_experts = torch.topk(scores, self.top_k, dim=-1, sorted=True)

        if correction_bias is not None:
            # Combination weights come from the unbiased scores of the selection.
            routing_weights = original_scores.gather(1, selected_experts)
        else:
            routing_weights = topk_scores
        del scores, original_scores, topk_scores

        # Renormalize the selected weights (clamp guards against an all-zero row).
        routing_weights.div_(routing_weights.sum(dim=-1, keepdim=True).clamp(min=1e-12))
        if self.routed_scaling_factor != 1.0:
            routing_weights.mul_(self.routed_scaling_factor)
        routing_weights = routing_weights.to(hidden_states.dtype)
        selected_experts = selected_experts.to(torch.long)

        final_hidden_states = torch.zeros_like(hidden_states)
        # (num_experts, top_k, tokens) membership mask used to find each expert's tokens.
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
        del selected_experts
        expert_hit = torch.nonzero(expert_mask.sum(dim=(-1, -2)) > 0, as_tuple=False).flatten()

        # To further reduce memory, process tokens routed to each expert in chunks
        # instead of all at once. A chunk size of 1024 is a reasonable default.
        EXPERT_CHUNK_SIZE = 1024

        for expert_idx in expert_hit.tolist():
            expert_layer = self.experts[expert_idx]
            idx_full, top_x_full = torch.where(expert_mask[expert_idx].squeeze(0))

            for i in range(0, top_x_full.size(0), EXPERT_CHUNK_SIZE):
                top_x = top_x_full[i : i + EXPERT_CHUNK_SIZE]
                idx = idx_full[i : i + EXPERT_CHUNK_SIZE]

                token_states = hidden_states.index_select(0, top_x)
                expert_output = expert_layer(token_states)

                weights = routing_weights[top_x, idx].unsqueeze(-1)
                expert_output.mul_(weights)

                final_hidden_states.index_add_(0, top_x, expert_output.to(final_hidden_states.dtype))
                del expert_output, token_states, idx, top_x, weights

            del idx_full, top_x_full
        del hidden_states, routing_weights, expert_mask, expert_hit
        final_hidden_states = final_hidden_states.view(batch_size, seq_len, hidden_dim)
        return final_hidden_states, router_logits
258
+
259
+
260
class MiniMaxM2Attention(nn.Module):
    """Multi-head attention with grouped-query KV heads, partial rotary embeddings
    and optional per-layer sliding-window masking.

    The forward pass is computed head-by-head and in query chunks to keep peak
    memory low; no fused SDPA/flash kernels are used (see the base model flags).
    """

    def __init__(self, config: MiniMaxM2Config, layer_idx: int) -> None:
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.head_dim = config.head_dim
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        # GQA: number of query heads that share one KV head (max guards div-by-zero).
        self.num_key_value_groups = self.num_heads // max(1, self.num_key_value_heads)
        self.rotary_dim = config.rotary_dim
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Let max_model_len extend the rotary cache beyond max_position_embeddings.
        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
        max_model_len = getattr(config, "max_model_len", None)
        if max_model_len is not None:
            max_position_embeddings = max(max_position_embeddings, max_model_len)

        # attn_window_size may be a single int for all layers or a per-layer list;
        # non-positive values disable sliding-window attention for this layer.
        attn_window_size = getattr(config, "attn_window_size", None)
        if isinstance(attn_window_size, list):
            sliding_window = attn_window_size[layer_idx]
        else:
            sliding_window = attn_window_size
        if sliding_window is not None and sliding_window <= 0:
            sliding_window = None
        self.sliding_window = sliding_window

        # Sliding-window layers may override the RoPE base frequency.
        swa_rope_theta = getattr(config, "swa_rope_theta", -1.0)
        rope_theta = config.rope_theta
        if self.sliding_window is not None and swa_rope_theta > 0:
            rope_theta = swa_rope_theta

        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)

        # Optional RMS normalization over the flattened Q/K projections.
        self.use_qk_norm = config.use_qk_norm
        if self.use_qk_norm:
            self.q_norm = MiniMaxM2RMSNorm(self.num_heads * self.head_dim, eps=config.rms_norm_eps)
            self.k_norm = MiniMaxM2RMSNorm(self.num_key_value_heads * self.head_dim, eps=config.rms_norm_eps)

        # Deep-copied config so the Llama rotary helper sees this layer's effective
        # theta / max positions and the partial-rotary fraction (rotary_dim / head_dim).
        rope_config = copy.deepcopy(config)
        rope_config.hidden_size = config.hidden_size
        rope_config.num_attention_heads = config.num_attention_heads
        rope_config.partial_rotary_factor = float(config.rotary_dim) / float(self.head_dim)
        rope_config.rope_theta = rope_theta
        rope_config.max_position_embeddings = max_position_embeddings
        self.rotary_emb = LlamaRotaryEmbedding(rope_config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Compute the attention output and, optionally, the attention weights.

        ``attention_mask`` is treated as an additive float mask broadcastable to
        (bsz, 1, q_len, key_len), as produced by ``create_causal_mask``.
        """
        bsz, q_len, _ = hidden_states.size()
        device = hidden_states.device

        # projections
        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        del hidden_states

        # optional QK normalization
        # Norm is applied over the flattened (heads * head_dim) axis, then reshaped back.
        if self.use_qk_norm:
            q_flat = query_states.transpose(1, 2).reshape(bsz * q_len, -1)
            k_flat = key_states.transpose(1, 2).reshape(bsz * q_len, -1)
            q_flat = self.q_norm(q_flat)
            k_flat = self.k_norm(k_flat)
            query_states = q_flat.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
            key_states = k_flat.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        # rotary embeddings
        if position_embeddings is None:
            cos, sin = self.rotary_emb(value_states, position_ids)
        else:
            cos, sin = position_embeddings

        # Rotation is applied in (bsz, seq, heads, head_dim) layout, then transposed back.
        query_states, key_states = apply_rotary_pos_emb_partial(
            query_states.transpose(1, 2), key_states.transpose(1, 2), cos, sin, self.rotary_dim
        )
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        # handle cache
        if past_key_values is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand KV heads so every query head has a matching key/value head.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        query_dtype = query_states.dtype
        key_len = key_states.shape[-2]

        # precompute sliding-window mask
        # Built only on the cache-less path; with a cache the model-level mask is
        # presumably what enforces the window — TODO confirm.
        window_mask = None
        if self.sliding_window is not None and past_key_values is None:
            q_pos = torch.arange(q_len, device=device).view(1, 1, q_len, 1)
            k_pos = torch.arange(key_len, device=device).view(1, 1, 1, key_len)
            wm = k_pos < (q_pos - self.sliding_window)
            if wm.any():
                window_mask = wm.squeeze(1)  # (1, q_len, key_len)
            del q_pos, k_pos, wm

        attn_output_parts = []
        attn_weights_list = [] if output_attentions else None

        # One head at a time to keep the (q_len, key_len) score matrix small.
        for h in range(self.num_heads):
            # (bsz, q_len, key_len)
            q = query_states[:, h, :, :]
            k = key_states[:, h, :, :]
            v = value_states[:, h, :, :]

            # Chunked attention computation to reduce peak memory usage
            out_parts = []
            attn_parts = [] if output_attentions else None

            # A smaller chunk size reduces memory but may be slightly slower
            chunk_size = 1024
            for i in range(0, q.size(1), chunk_size):
                q_chunk = q[:, i:i + chunk_size, :]

                # attn_chunk has shape (bsz, chunk_size, key_len)
                attn_chunk = torch.matmul(q_chunk, k.transpose(-2, -1))
                attn_chunk.mul_(self.scaling)

                # Apply masks to the chunk
                if attention_mask is not None:
                    attn_chunk.add_(attention_mask.squeeze(1)[:, i:i + chunk_size, :])

                if window_mask is not None:
                    attn_chunk.masked_fill_(window_mask[:, i:i + chunk_size, :], float("-inf"))

                # Softmax in float32 for numerical stability, then back to input dtype.
                attn_chunk = torch.softmax(attn_chunk, dim=-1, dtype=torch.float32).to(query_dtype)

                if self.training and self.attention_dropout > 0:
                    attn_chunk = F.dropout(attn_chunk, p=self.attention_dropout, training=True)

                if output_attentions:
                    attn_parts.append(attn_chunk)

                # output_chunk has shape (bsz, chunk_size, head_dim)
                out_chunk = torch.matmul(attn_chunk, v)
                out_parts.append(out_chunk)

                del q_chunk, attn_chunk, out_chunk

            out = torch.cat(out_parts, dim=1)
            attn_output_parts.append(out)

            if output_attentions:
                attn = torch.cat(attn_parts, dim=1)
                attn_weights_list.append(attn)
                del attn, attn_parts

            del q, k, v, out, out_parts

        attn_output = torch.stack(attn_output_parts, dim=1)
        del attn_output_parts, query_states, key_states, value_states

        attn_weights = torch.stack(attn_weights_list, dim=1) if output_attentions else None

        # Merge heads and project back to the model dimension.
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)
        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights
436
+
437
+
438
class MiniMaxM2LogitsProcessor(nn.Module):
    """Applies an optional constant scaling factor to the LM head output."""

    def __init__(self, config: MiniMaxM2Config) -> None:
        super().__init__()
        # Defaults to 1.0 (no scaling) when the config does not define logits_scale.
        self.scale = getattr(config, "logits_scale", 1.0)

    def forward(self, lm_head: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
        scores = lm_head(hidden_states)
        return scores if self.scale == 1.0 else scores * self.scale
448
+
449
+
450
class MiniMaxM2DecoderLayer(nn.Module):
    """One transformer block: pre-norm self-attention followed by a pre-norm
    sparse-MoE feed-forward, each with its own residual connection."""

    def __init__(self, config: MiniMaxM2Config, layer_idx: int) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = MiniMaxM2Attention(config, layer_idx)
        self.block_sparse_moe = MiniMaxM2SparseMoeBlock(config)
        self.input_layernorm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: bool = False,
        residual: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        # Attention sub-block; an explicit ``residual`` (if given) replaces the
        # layer input as the skip connection.
        skip = hidden_states if residual is None else residual
        attn_out, attn_weights = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = skip + attn_out

        # MoE sub-block with its own pre-norm and residual.
        moe_out, router_logits = self.block_sparse_moe(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + moe_out

        # The activation is returned twice: callers feed the second value back in
        # as ``residual`` for the next layer.
        return hidden_states, hidden_states, router_logits, attn_weights
492
+
493
+
494
class MiniMaxM2PreTrainedModel(PreTrainedModel):
    """Base class wiring MiniMax-M2 modules into the Transformers loading machinery.

    Besides the standard weight-init hook, it overrides ``load_state_dict`` to
    (1) drop quantization-scale tensors carried by FP8 checkpoints and
    (2) split fused ``qkv_proj`` weights into the separate q/k/v projections
    used by this implementation.
    """

    config_class = MiniMaxM2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MiniMaxM2DecoderLayer"]
    # Only the eager, chunked attention implemented in MiniMaxM2Attention is supported.
    _supports_flash_attn = False
    _supports_sdpa = False
    _supports_attention_backend = False

    def _init_weights(self, module: nn.Module) -> None:
        # Standard normal init for linear/embedding weights, zeros for biases and
        # the padding embedding row.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _remap_qkv_weights(self, state_dict):
        # Split fused attention weights (``qkv_proj``) from exported checkpoints
        # into separate q/k/v projections, row-wise in (q, k, v) order.
        # ``setdefault`` keeps any already-present split weights untouched.
        num_q = self.config.num_attention_heads * self.config.head_dim
        num_kv = self.config.num_key_value_heads * self.config.head_dim

        for layer_idx in range(self.config.num_hidden_layers):
            prefix = f"model.layers.{layer_idx}.self_attn"
            weight_key = f"{prefix}.qkv_proj.weight"
            if weight_key in state_dict:
                qkv_weight = state_dict.pop(weight_key)
                q_weight, k_weight, v_weight = qkv_weight.split([num_q, num_kv, num_kv], dim=0)
                state_dict.setdefault(f"{prefix}.q_proj.weight", q_weight)
                state_dict.setdefault(f"{prefix}.k_proj.weight", k_weight)
                state_dict.setdefault(f"{prefix}.v_proj.weight", v_weight)

    def load_state_dict(self, state_dict, strict: bool = True):
        """Filter quantization artifacts, remap fused weights, then defer to PyTorch."""
        if not isinstance(state_dict, dict):
            raise TypeError(f"Expected state_dict to be dict, got {type(state_dict)}")

        # Drop FP8/quantization scale tensors that have no matching parameter here.
        # NOTE(review): suffix matching is purely string-based; a legitimate key
        # ending in e.g. "scales" would also be dropped — assumed not to occur.
        filtered_state_dict = {}
        drop_suffixes = ("weight_scale_inv", "weight_scale", "input_scale", "scales", "amax")
        for key, value in state_dict.items():
            if key.endswith(drop_suffixes) or "fp8" in key:
                continue
            filtered_state_dict[key] = value

        self._remap_qkv_weights(filtered_state_dict)

        # ``logging`` here is transformers.utils.logging, which mirrors stdlib levels.
        if logger.isEnabledFor(logging.INFO):
            logger.info(
                "MiniMaxM2: loading %d tensors (filtered from %d original).",
                len(filtered_state_dict),
                len(state_dict),
            )

        # Time the actual load for observability on these very large checkpoints.
        load_start = time.perf_counter()
        result = super().load_state_dict(filtered_state_dict, strict=strict)
        load_elapsed = time.perf_counter() - load_start
        if logger.isEnabledFor(logging.INFO):
            logger.info("MiniMaxM2: state_dict load finished in %.2f seconds.", load_elapsed)

        return result
554
+
555
+
556
class MiniMaxM2Model(MiniMaxM2PreTrainedModel):
    """Decoder-only MiniMax-M2 backbone: token embeddings, a stack of
    ``MiniMaxM2DecoderLayer`` blocks, and a final RMS norm."""

    def __init__(self, config: MiniMaxM2Config) -> None:
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MiniMaxM2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = MiniMaxM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MoeModelOutputWithPast, Tuple]:
        """Run the decoder stack.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be provided.
        Returns a ``MoeModelOutputWithPast`` (or an equivalent tuple when
        ``return_dict=False``).
        """
        if (input_ids is None) == (inputs_embeds is None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")

        # Fall back to the config for unset flags.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # Lazily create a cache on the first forward pass when caching is requested.
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        # Absolute positions of the current tokens, offset by the cached length.
        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # Build the additive causal mask once; the sliding-window variant is used
        # when the config declares a global sliding window.
        if self.config.sliding_window is not None:
            causal_mask = create_sliding_window_causal_mask(
                config=self.config,
                input_embeds=inputs_embeds,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=past_key_values,
                position_ids=position_ids,
            )
        else:
            causal_mask = create_causal_mask(
                config=self.config,
                input_embeds=inputs_embeds,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=past_key_values,
                position_ids=position_ids,
            )

        hidden_states = inputs_embeds

        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_router_logits = () if output_router_logits else None

        # Each layer returns its activation twice; the second copy is threaded back
        # in as ``residual`` for the next layer.
        residual = None
        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=None,
                output_attentions=output_attentions,
                residual=residual,
            )

            hidden_states, residual, router_logits, attn_weights = layer_outputs

            if output_router_logits:
                all_router_logits = all_router_logits + (router_logits,)
            if output_attentions:
                all_attentions = all_attentions + (attn_weights,)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Tuple output mirrors the dataclass field order below.
        if not return_dict:
            outputs = (hidden_states, past_key_values)
            if output_hidden_states:
                outputs += (all_hidden_states,)
            if output_attentions:
                outputs += (all_attentions,)
            if output_router_logits:
                outputs += (all_router_logits,)
            return outputs

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            router_logits=all_router_logits,
        )
686
+
687
+
688
class MiniMaxM2ForCausalLM(MiniMaxM2PreTrainedModel, GenerationMixin):
    """MiniMax-M2 backbone plus a language-modeling head and (optional) MoE
    auxiliary load-balancing loss."""

    def __init__(self, config: MiniMaxM2Config) -> None:
        super().__init__(config)
        self.model = MiniMaxM2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_local_experts
        self.num_experts_per_tok = config.num_experts_per_tok
        self.logits_processor = MiniMaxM2LogitsProcessor(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # With a live cache, only the newest token needs to be re-encoded.
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]
            if attention_mask is not None:
                # NOTE(review): keeps the trailing (cached_len + 1) mask columns;
                # presumably matched to how the mask utilities consume it — verify.
                attention_mask = attention_mask[:, -past_key_values.get_seq_length() - 1 :]

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "inputs_embeds": inputs_embeds,
        }

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
    ) -> Union[MoeCausalLMOutputWithPast, Tuple]:
        """Run the backbone and LM head; optionally compute LM + auxiliary loss.

        ``logits_to_keep``: a positive int keeps only that many trailing positions
        before the head projection (memory saver during generation); 0 keeps all.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        model_outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_router_logits=output_router_logits,
            return_dict=True,
        )

        hidden_states = model_outputs.last_hidden_state
        # Project only the requested trailing positions through the (scaled) head.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) and logits_to_keep > 0 else slice(None)
        logits = self.logits_processor(self.lm_head, hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            # Standard next-token shift: position t predicts label t+1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.vocab_size), shift_labels.view(-1))

        # MoE auxiliary load-balancing loss, folded into the LM loss when present.
        aux_loss = None
        if output_router_logits and model_outputs.router_logits is not None:
            aux_loss = load_balancing_loss_func(
                model_outputs.router_logits,
                num_experts=self.num_experts,
                top_k=self.num_experts_per_tok,
                attention_mask=attention_mask,
            )
            if loss is not None:
                loss = loss + self.router_aux_loss_coef * aux_loss.to(loss.device)

        if not return_dict:
            output = (logits,) + (model_outputs.past_key_values,)
            if output_hidden_states:
                output += (model_outputs.hidden_states,)
            if output_attentions:
                output += (model_outputs.attentions,)
            if output_router_logits:
                output += (model_outputs.router_logits,)
            return ((loss,) + output) if loss is not None else output

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=model_outputs.past_key_values,
            hidden_states=model_outputs.hidden_states,
            attentions=model_outputs.attentions,
            router_logits=model_outputs.router_logits,
        )
809
+
810
# -----------------------------------------------------------------------------
# Backward compatibility aliases
# -----------------------------------------------------------------------------

# Older checkpoints / configs refer to these classes without the "M2" infix.
MiniMaxRMSNorm = MiniMaxM2RMSNorm
MiniMaxSparseMoeBlock = MiniMaxM2SparseMoeBlock
MiniMaxAttention = MiniMaxM2Attention
MiniMaxDecoderLayer = MiniMaxM2DecoderLayer
MiniMaxMLP = MiniMaxM2MLP
MiniMaxPreTrainedModel = MiniMaxM2PreTrainedModel
MiniMaxModel = MiniMaxM2Model


class MiniMaxForCausalLM(MiniMaxM2ForCausalLM):
    """Alias for compatibility with checkpoints exporting MiniMaxForCausalLM."""


# Consistency fix: previously the aliases "MiniMaxMLP"/"MiniMaxModel" were
# exported while their M2 targets "MiniMaxM2MLP" and the public
# "MiniMaxM2LogitsProcessor" were not; export both sides of every alias.
__all__ = [
    "MiniMaxM2RMSNorm",
    "MiniMaxM2MLP",
    "MiniMaxM2SparseMoeBlock",
    "MiniMaxM2Attention",
    "MiniMaxM2DecoderLayer",
    "MiniMaxM2LogitsProcessor",
    "MiniMaxM2Model",
    "MiniMaxM2ForCausalLM",
    "MiniMaxM2PreTrainedModel",
    "MiniMaxRMSNorm",
    "MiniMaxSparseMoeBlock",
    "MiniMaxAttention",
    "MiniMaxDecoderLayer",
    "MiniMaxPreTrainedModel",
    "MiniMaxModel",
    "MiniMaxMLP",
    "MiniMaxForCausalLM",
]
quant_log.csv ADDED
The diff for this file is too large to render. See raw diff
 
quantize_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bits": 4,
3
+ "dynamic": {
4
+ "-:.*self_attn": {}
5
+ },
6
+ "group_size": 32,
7
+ "desc_act": false,
8
+ "sym": true,
9
+ "lm_head": false,
10
+ "quant_method": "gptq",
11
+ "checkpoint_format": "gptq",
12
+ "pack_dtype": "int32",
13
+ "meta": {
14
+ "quantizer": [
15
+ "gptqmodel:5.0.0-dev0"
16
+ ],
17
+ "uri": "https://github.com/modelcloud/gptqmodel",
18
+ "damp_percent": 0.01,
19
+ "damp_auto_increment": 0.01,
20
+ "static_groups": false,
21
+ "true_sequential": true,
22
+ "mse": 0.0,
23
+ "v2": false,
24
+ "v2_alpha": 0.25,
25
+ "act_group_aware": true
26
+ }
27
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<code_interpreter>",
4
+ "<commit_after>",
5
+ "<commit_before>",
6
+ "<commit_msg>",
7
+ "<empty_output>",
8
+ "<filename>",
9
+ "<fim_middle>",
10
+ "<fim_pad>",
11
+ "<fim_prefix>",
12
+ "<fim_suffix>",
13
+ "<function_call>",
14
+ "<gh_stars>",
15
+ "]<]speech[>[",
16
+ "]<]image[>[",
17
+ "]<]video[>[",
18
+ "]<]start of speech[>[",
19
+ "]<]end of speech[>[",
20
+ "]<]start of image[>[",
21
+ "]<]end of image[>[",
22
+ "]<]start of video[>[",
23
+ "]<]end of video[>[",
24
+ "]<]vision pad[>[",
25
+ "]~!b[",
26
+ "<issue_closed>",
27
+ "<issue_comment>",
28
+ "<issue_start>",
29
+ "<jupyter_code>",
30
+ "<jupyter_output>",
31
+ "<jupyter_start>",
32
+ "<jupyter_text>",
33
+ "<reponame>",
34
+ "[e~[",
35
+ "]!d~[",
36
+ "]!p~[",
37
+ "]~b]",
38
+ "<jupyter_error>",
39
+ "<add_file>",
40
+ "<delete_file>",
41
+ "<rename_file>",
42
+ "<edit_file>",
43
+ "<commit_message>",
44
+ "<empty_source_file>",
45
+ "<repo_struct>",
46
+ "<code_context>",
47
+ "<file_content>",
48
+ "<source_files>",
49
+ "<pr_start>",
50
+ "<review_comment>",
51
+ "<filepath>",
52
+ "<file_sep>"
53
+ ],
54
+ "bos_token": {
55
+ "content": "]~!b[",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false
60
+ },
61
+ "eos_token": {
62
+ "content": "[e~[",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false
67
+ },
68
+ "pad_token": "[e~[",
69
+ "unk_token": {
70
+ "content": "]!d~[",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false
75
+ }
76
+ }
test_minimax_m2_hf.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
2
+ # SPDX-FileCopyrightText: 2024-2025 [email protected]
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Contact: [email protected], x.com/qubitium
5
+
6
+ """
7
+ MiniMax-M2 Hugging Face checkpoint sanity check with streaming output.
8
+
9
+ Usage:
10
+ python test_minimax_m2_hf.py \
11
+ --model-path /monster/data/model/MiniMax-M2-bf16 \
12
+ --question "How many letter A are there in the word Alphabet? Reply with the number only."
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import threading
19
+ from pathlib import Path
20
+
21
+ import torch.nn as nn
22
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
23
+
24
+ # from gptqmodel.hf_minimax_m2.modeling_minimax_m2 import (
25
+ # MiniMaxAttention,
26
+ # MiniMaxDecoderLayer,
27
+ # MiniMaxForCausalLM,
28
+ # MiniMaxMLP,
29
+ # MiniMaxM2Attention,
30
+ # MiniMaxM2DecoderLayer,
31
+ # MiniMaxM2ForCausalLM,
32
+ # MiniMaxM2MLP,
33
+ # MiniMaxM2RMSNorm,
34
+ # MiniMaxM2SparseMoeBlock,
35
+ # MiniMaxRMSNorm,
36
+ # MiniMaxSparseMoeBlock,
37
+ # )
38
+
39
+
40
def parse_args() -> argparse.Namespace:
    """Collect command-line options for the MiniMax-M2 smoke test.

    Returns:
        argparse.Namespace with ``model_path``, ``question`` and
        ``max_new_tokens`` attributes.
    """
    parser = argparse.ArgumentParser(description="MiniMax-M2 HF checkpoint smoke test.")
    # (flag, value type, default, help text) — registered in a single loop.
    option_specs = (
        (
            "--model-path",
            str,
            "/monster/data/model/MiniMax-M2-bf16",
            "Path to the MiniMax-M2 Hugging Face checkpoint directory.",
        ),
        (
            "--question",
            str,
            "How many letter A are there in the word Alphabet? Reply with the number only.",
            "User question to send through the chat template.",
        ),
        (
            "--max-new-tokens",
            int,
            512,
            "Maximum number of new tokens to sample from the model.",
        ),
    )
    for flag, value_type, default, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)
    return parser.parse_args()
61
+
62
+
63
def build_prompt(tokenizer: AutoTokenizer, question: str) -> str:
    """Render a two-turn chat (fixed system prompt + user question) as text.

    The tokenizer's chat template is applied with the generation prompt
    appended, and the untokenized string is returned.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    return tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )
69
+
70
+
71
+ # def assert_module_types(model: MiniMaxM2ForCausalLM) -> None:
72
+ # causal_lm_types = (MiniMaxM2ForCausalLM, MiniMaxForCausalLM)
73
+ # decoder_layer_types = (MiniMaxM2DecoderLayer, MiniMaxDecoderLayer)
74
+ # attention_types = (MiniMaxM2Attention, MiniMaxAttention)
75
+ # moe_block_types = (MiniMaxM2SparseMoeBlock, MiniMaxSparseMoeBlock)
76
+ # norm_types = (MiniMaxM2RMSNorm, MiniMaxRMSNorm)
77
+ # mlp_types = (MiniMaxM2MLP, MiniMaxMLP)
78
+ #
79
+ # assert isinstance(
80
+ # model, causal_lm_types
81
+ # ), f"Expected MiniMaxM2ForCausalLM/MiniMaxForCausalLM, received {type(model).__name__}"
82
+ #
83
+ # decoder = getattr(model, "model", None)
84
+ # assert decoder is not None, "Model is missing the `model` attribute with decoder layers."
85
+ #
86
+ # for layer_idx, layer in enumerate(decoder.layers):
87
+ # assert isinstance(
88
+ # layer, decoder_layer_types
89
+ # ), f"Layer {layer_idx}: expected MiniMax(M2)DecoderLayer, got {type(layer).__name__}"
90
+ # assert isinstance(
91
+ # layer.self_attn, attention_types
92
+ # ), f"Layer {layer_idx}: unexpected self_attn type {type(layer.self_attn).__name__}"
93
+ # assert isinstance(
94
+ # layer.block_sparse_moe, moe_block_types
95
+ # ), f"Layer {layer_idx}: unexpected MoE block type {type(layer.block_sparse_moe).__name__}"
96
+ # assert isinstance(
97
+ # layer.input_layernorm, norm_types
98
+ # ), f"Layer {layer_idx}: unexpected input_layernorm type {type(layer.input_layernorm).__name__}"
99
+ # assert isinstance(
100
+ # layer.post_attention_layernorm, norm_types
101
+ # ), f"Layer {layer_idx}: unexpected post_attention_layernorm type {type(layer.post_attention_layernorm).__name__}"
102
+ #
103
+ # moe_block = layer.block_sparse_moe
104
+ # assert isinstance(
105
+ # moe_block.experts, nn.ModuleList
106
+ # ), f"Layer {layer_idx}: expected experts to be a ModuleList, got {type(moe_block.experts).__name__}"
107
+ # for expert_idx, expert in enumerate(moe_block.experts):
108
+ # assert isinstance(
109
+ # expert, mlp_types
110
+ # ), f"Layer {layer_idx} expert {expert_idx}: expected MiniMax(M2)MLP, got {type(expert).__name__}"
111
+ #
112
+
113
def main() -> None:
    """Load the MiniMax-M2 checkpoint, run one chat prompt, and stream the reply."""
    args = parse_args()
    model_path = Path(args.model_path).expanduser().resolve()

    print(f"Loading tokenizer from {model_path}...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    print(f"Loading model from {model_path}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype="bfloat16",
        device_map="auto",
        trust_remote_code=True,
    )

    # Uncomment to enforce module type checks.
    # print("Validating module types...")
    # assert_module_types(model)

    prompt = build_prompt(tokenizer, args.question)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    print("Running generation (streaming)...\n")
    # skip_special_tokens=False keeps special/marker tokens in the streamed text.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=False)
    # Normalize eos_token_id (None | int | list) to a list so it can be extended.
    eos_ids = model.generation_config.eos_token_id
    if eos_ids is None:
        eos_ids = []
    elif isinstance(eos_ids, int):
        eos_ids = [eos_ids]
    # Also stop generation at </think>.
    # NOTE(review): some tokenizers return the unk id (not None) for tokens
    # missing from the vocab — assumes </think> is a real vocab entry; confirm.
    think_end_id = tokenizer.convert_tokens_to_ids("</think>")
    if think_end_id is not None and think_end_id not in eos_ids:
        eos_ids = eos_ids + [think_end_id]

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=args.max_new_tokens,
        streamer=streamer,
        eos_token_id=eos_ids if eos_ids else None,
    )

    # generate() blocks, so run it on a worker thread and consume the streamer
    # on the main thread.
    generation_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    generation_thread.start()

    completion = []
    first_chunk = True
    seen_end_reasoning = False
    for text in streamer:
        if first_chunk:
            # Print an opening <think> before the first streamed chunk —
            # presumably the chat template pre-fills it so the model only emits
            # the closing </think>; TODO confirm against the template.
            print("<think>", end="", flush=True)
            completion.append("<think>")
            first_chunk = False
        print(text, end="", flush=True)
        completion.append(text)
        if "</think>" in text:
            seen_end_reasoning = True

    generation_thread.join()
    print("\n\n=== Completed Response ===")
    final_text = "".join(completion).strip()
    print(final_text or "<empty response>")
    if not seen_end_reasoning:
        print("\n[warning] No </think> token detected in streamed output.", flush=True)


if __name__ == "__main__":
    main()
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7b90ed7f55d905175bc26771d6d7d33b40b46742f073675bc816fedaf482ea1
3
+ size 15522763
tokenizer_config.json ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "200000": {
5
+ "content": "]!p~[",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "200001": {
13
+ "content": "<fim_prefix>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "200002": {
21
+ "content": "<fim_middle>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "200003": {
29
+ "content": "<fim_suffix>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "200004": {
37
+ "content": "<fim_pad>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "200005": {
45
+ "content": "<reponame>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "200006": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "200007": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "200008": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "200009": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "200010": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "200011": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "200012": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "200013": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "200014": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "200015": {
125
+ "content": "<empty_output>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "200016": {
133
+ "content": "<commit_before>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "200017": {
141
+ "content": "<commit_msg>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "200018": {
149
+ "content": "<commit_after>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "200019": {
157
+ "content": "]~b]",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "200020": {
165
+ "content": "[e~[",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "200021": {
173
+ "content": "]!d~[",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "200022": {
181
+ "content": "<function_call>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "200023": {
189
+ "content": "<code_interpreter>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "200024": {
197
+ "content": "]<]speech[>[",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "200025": {
205
+ "content": "]<]image[>[",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "200026": {
213
+ "content": "]<]video[>[",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "200027": {
221
+ "content": "]<]start of speech[>[",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "200028": {
229
+ "content": "]<]end of speech[>[",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "200029": {
237
+ "content": "]<]start of image[>[",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "200030": {
245
+ "content": "]<]end of image[>[",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "200031": {
253
+ "content": "]<]start of video[>[",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "200032": {
261
+ "content": "]<]end of video[>[",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "200033": {
269
+ "content": "]<]vision pad[>[",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "200034": {
277
+ "content": "]~!b[",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "200035": {
285
+ "content": "<jupyter_error>",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "200036": {
293
+ "content": "<add_file>",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "200037": {
301
+ "content": "<delete_file>",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "200038": {
309
+ "content": "<rename_file>",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "200039": {
317
+ "content": "<edit_file>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "200040": {
325
+ "content": "<commit_message>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "200041": {
333
+ "content": "<empty_source_file>",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "200042": {
341
+ "content": "<repo_struct>",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "200043": {
349
+ "content": "<code_context>",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "200044": {
357
+ "content": "<file_content>",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "200045": {
365
+ "content": "<source_files>",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "200046": {
373
+ "content": "<pr_start>",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "200047": {
381
+ "content": "<review_comment>",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "200048": {
389
+ "content": "<filepath>",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "200049": {
397
+ "content": "<file_sep>",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "200050": {
405
+ "content": "<think>",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": false
411
+ },
412
+ "200051": {
413
+ "content": "</think>",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": false
419
+ },
420
+ "200052": {
421
+ "content": "<minimax:tool_call>",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": false
427
+ },
428
+ "200053": {
429
+ "content": "</minimax:tool_call>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": false
435
+ }
436
+ },
437
+ "additional_special_tokens": [
438
+ "<code_interpreter>",
439
+ "<commit_after>",
440
+ "<commit_before>",
441
+ "<commit_msg>",
442
+ "<empty_output>",
443
+ "<filename>",
444
+ "<fim_middle>",
445
+ "<fim_pad>",
446
+ "<fim_prefix>",
447
+ "<fim_suffix>",
448
+ "<function_call>",
449
+ "<gh_stars>",
450
+ "]<]speech[>[",
451
+ "]<]image[>[",
452
+ "]<]video[>[",
453
+ "]<]start of speech[>[",
454
+ "]<]end of speech[>[",
455
+ "]<]start of image[>[",
456
+ "]<]end of image[>[",
457
+ "]<]start of video[>[",
458
+ "]<]end of video[>[",
459
+ "]<]vision pad[>[",
460
+ "]~!b[",
461
+ "<issue_closed>",
462
+ "<issue_comment>",
463
+ "<issue_start>",
464
+ "<jupyter_code>",
465
+ "<jupyter_output>",
466
+ "<jupyter_start>",
467
+ "<jupyter_text>",
468
+ "<reponame>",
469
+ "[e~[",
470
+ "]!d~[",
471
+ "]!p~[",
472
+ "]~b]",
473
+ "<jupyter_error>",
474
+ "<add_file>",
475
+ "<delete_file>",
476
+ "<rename_file>",
477
+ "<edit_file>",
478
+ "<commit_message>",
479
+ "<empty_source_file>",
480
+ "<repo_struct>",
481
+ "<code_context>",
482
+ "<file_content>",
483
+ "<source_files>",
484
+ "<pr_start>",
485
+ "<review_comment>",
486
+ "<filepath>",
487
+ "<file_sep>"
488
+ ],
489
+ "bos_token": "]~!b[",
490
+ "clean_up_tokenization_spaces": false,
491
+ "eos_token": "[e~[",
492
+ "extra_special_tokens": {},
493
+ "model_max_length": 40960000,
494
+ "pad_token": "[e~[",
495
+ "tokenizer_class": "GPT2TokenizerFast",
496
+ "unk_token": "]!d~[",
497
+ "_commit_hash": null
498
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff