EVA Gen 1.11
Collection
1 item • Updated
EVA Qwen3-Next v0.0
Experimental RP/story full-parameter finetune of Qwen3-Coder-Next-Base, trained on a refined version of the EVA Gen 0.0 data. While this model may not be as smart as the original Qwen3-Next (due to being an SFT-only checkpoint trained from the base model), we hope it will compensate with much better creativity and more natural prose. Perhaps, in the future, we will build on this checkpoint with online RL to improve it further. The model currently doesn't support reasoning.
Training data:
Older datasets received an additional cleanup pass, and lower-quality rows were regenerated.
Licensing
Both the base model and this derivative are licensed under the Apache 2.0 license.
Training Config (Megatron)
_target_: megatron.bridge.training.config.ConfigContainer
checkpoint:
_target_: megatron.bridge.training.config.CheckpointConfig
async_save: false
ckpt_assume_constant_structure: false
ckpt_convert_format: null
ckpt_convert_save: null
ckpt_format: torch_dist
ckpt_step: null
dist_ckpt_optim_fully_reshardable: false
dist_ckpt_strictness: assume_ok_unexpected
distrib_optim_fully_reshardable_mem_efficient: false
exit_on_missing_checkpoint: false
finetune: false
fully_parallel_load: false
fully_parallel_save: true
load: workspace/results/qwen3_coder_next_base_lora/checkpoints
load_main_params_from_ckpt: false
load_optim: true
load_rng: true
most_recent_k: -1
non_persistent_ckpt_type: null
non_persistent_global_ckpt_dir: null
non_persistent_local_ckpt_algo: fully_parallel
non_persistent_local_ckpt_dir: null
non_persistent_save_interval: null
pretrained_checkpoint: workspace/models/Qwen3-Coder-Next-Base
replication: false
replication_factor: 2
replication_jump: null
save: workspace/results/qwen3_coder_next_base_lora/checkpoints
save_interval: 200
save_optim: true
save_rng: true
save_tokenizer_assets: true
strict_fsdp_dtensor_load: false
use_checkpoint_args: false
use_persistent_ckpt_worker: true
comm_overlap: null
dataset:
_target_: megatron.bridge.training.config.FinetuningDatasetConfig
data_sharding: true
dataloader_type: single
dataset_kwargs: null
dataset_root: workspace/datasets/lilith-v0.2/bridge_jsonl
do_test: true
do_validation: true
max_train_samples: null
memmap_workers: 1
num_workers: 8
packed_sequence_specs: null
persistent_workers: false
pin_memory: true
seed: 5678
seq_length: 2048
trust_remote_code: null
ddp:
_target_: megatron.core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig
align_param_gather: false
average_in_collective: true
bucket_size: null
check_for_large_grads: false
check_for_nan_in_grad: true
data_parallel_sharding_strategy: optim_grads_params
delay_wgrad_compute: false
disable_symmetric_registration: false
fp8_param_gather: false
fsdp_db_use_persist_buf_on_alloc_fail: false
fsdp_double_buffer: false
fsdp_manual_registration: false
grad_reduce_in_fp32: true
gradient_reduce_div_fusion: true
keep_fp8_transpose_cache: false
nccl_ub: false
num_distributed_optimizer_instances: 1
outer_dp_sharding_strategy: no_shard
overlap_grad_reduce: false
overlap_param_gather: false
pad_buckets_for_high_nccl_busbw: false
preserve_fp32_weights: true
reduce_scatter_with_fp32_accumulation: false
reuse_grad_buf_for_mxfp8_param_ag: false
suggested_communication_unit_size: null
use_custom_fsdp: false
use_distributed_optimizer: true
use_megatron_fsdp: false
dist:
_target_: megatron.bridge.training.config.DistributedInitConfig
align_grad_reduce: true
disable_jit_fuser: false
distributed_backend: nccl
distributed_timeout_minutes: 10
distributed_timeout_seconds_after_init: null
enable_megatron_core_experimental: false
external_gpu_device_mapping: false
high_priority_stream_groups: null
lazy_init: false
local_rank: 0
nccl_communicator_config_path: null
sharp_enabled_group: null
use_decentralized_pg: false
use_gloo_process_groups: true
use_megatron_fsdp: false
use_sharp: false
use_torch_fsdp2: false
use_tp_pp_dp_mapping: false
ft: null
inprocess_restart: null
logger:
_target_: megatron.bridge.training.config.LoggerConfig
filter_warnings: true
log_energy: false
log_interval: 1
log_l2_norm_grad_to_tensorboard: false
log_loss_scale_to_tensorboard: true
log_memory_to_tensorboard: false
log_params_norm: false
log_progress: false
log_runtime_to_tensorboard: false
log_throughput: false
log_throughput_to_tensorboard: false
log_timers_to_tensorboard: true
log_validation_ppl_to_tensorboard: false
log_world_size_to_tensorboard: false
logging_level: 20
memory_keys: null
mlflow_experiment: null
mlflow_run_name: null
mlflow_tags: null
mlflow_tracking_uri: null
modules_to_filter: null
runtime_time_unit: hours
save_config_filepath: null
set_level_for_all_loggers: false
skip_train_metrics_log: false
tensorboard_dir: workspace/results/qwen3_coder_next_base_lora/tb_logs
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
throughput_window_size: 100
timing_log_level: 0
timing_log_option: minmax
wandb_entity: nottlespike
wandb_exp_name: qwen3-coder-next-lora-lr1e3
wandb_project: qwen3-coder-next-lora
wandb_save_dir: null
mixed_precision:
_target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
autocast_dtype: null
autocast_enabled: false
bf16: true
first_last_layers_bf16: false
fp16: false
fp32: false
fp4: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_param_gather: false
fp8_recipe: tensorwise
fp8_wgrad: true
grad_reduce_in_fp32: true
hysteresis: 2
initial_loss_scale: 4294967296
loss_scale: null
loss_scale_window: 1000
min_loss_scale: 1.0
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
params_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
reuse_grad_buf_for_mxfp8_param_ag: false
model:
_target_: megatron.bridge.models.qwen.qwen_provider.Qwen3NextModelProvider
account_for_embedding_in_pipeline_split: false
account_for_loss_in_pipeline_split: false
activation_func:
_call_: false
_target_: torch.nn.functional.silu
activation_func_clamp_value: null
activation_func_fp8_input_store: false
add_bias_linear: false
add_qkv_bias: false
apply_query_key_layer_scaling: false
apply_residual_connection_post_layernorm: false
apply_rope_fusion: false
async_tensor_model_parallel_allreduce: true
attention_backend:
_args_:
- 5
_call_: true
_name_: auto
_target_: megatron.core.transformer.enums.AttnBackend
attention_dropout: 0.0
attention_output_gate: true
attention_softmax_in_fp32: false
autocast_dtype:
_call_: false
_target_: torch.bfloat16
barrier_with_L1_time: true
batch_invariant_mode: false
batch_p2p_comm: true
batch_p2p_sync: true
bf16: true
bias_activation_fusion: false
bias_dropout_fusion: false
calculate_per_token_loss: false
clone_scatter_output_in_embedding: true
config_logger_dir: ''
context_parallel_size: 1
cp_comm_type: null
cpu_offloading: false
cpu_offloading_activations: true
cpu_offloading_double_buffering: false
cpu_offloading_num_layers: 0
cpu_offloading_weights: false
cross_entropy_fusion_impl: native
cross_entropy_loss_fusion: true
cuda_graph_impl: none
cuda_graph_retain_backward_graph: false
cuda_graph_scope: []
cuda_graph_use_single_mempool: false
cuda_graph_warmup_steps: 3
deallocate_pipeline_outputs: true
defer_embedding_wgrad_compute: false
delay_wgrad_compute: false
deterministic_mode: false
disable_bf16_reduced_precision_matmul: false
disable_parameter_transpose_cache: false
distribute_saved_activations: false
dsa_indexer_head_dim: null
dsa_indexer_loss_coeff: null
dsa_indexer_n_heads: null
dsa_indexer_topk: null
dsa_indexer_use_sparse_loss: false
embedding_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
embedding_init_method_std: 0.02
enable_autocast: false
enable_cuda_graph: false
ep_overlap_early_attn_memory_release: false
experimental_attention_variant: gated_delta_net
expert_model_parallel_size: 8
expert_tensor_parallel_size: 1
external_cuda_graph: false
ffn_hidden_size: 5120
finalize_model_grads_func:
_args_: []
_partial_: true
_target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
pg_collection:
_call_: true
_target_: megatron.core.process_groups_config.ProcessGroupCollection
fine_grained_activation_offloading: false
first_last_layers_bf16: false
flash_decode: false
fp16: false
fp16_lm_cross_entropy: false
fp32_residual_connection: false
fp4: null
fp4_param: false
fp4_quantizer_factory: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_interval: 1
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_quantizer_factory: null
fp8_recipe: tensorwise
fp8_wgrad: true
fused_single_qkv_rope: false
gated_linear_unit: true
glu_linear_offset: 0.0
grad_scale_func:
_call_: false
_target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
grad_sync_func: null
gradient_accumulation_fusion: false
hetereogenous_dist_checkpoint: true
heterogeneous_block_specs: false
hf_model_id: Qwen/Qwen3-Coder-Next-Base
hidden_dropout: 0.0
hidden_size: 2048
hierarchical_context_parallel_sizes: null
hybrid_context_parallel: false
inference_fuse_tp_communication: false
inference_rng_tracker: false
inference_sampling_seed: 42
init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
init_method_std: 0.02
init_model_with_meta_device: false
is_hybrid_model: false
kitchen_attention_backend: sdpa
kv_channels: 256
layernorm_epsilon: 1.0e-06
layernorm_zero_centered_gamma: true
linear_attention_freq: 4
linear_conv_kernel_dim: 4
linear_key_head_dim: 128
linear_num_key_heads: 16
linear_num_value_heads: 32
linear_value_head_dim: 128
log_max_attention_logit: false
make_vocab_size_divisible_by: 128
mamba_head_dim: 64
mamba_num_groups: 8
mamba_num_heads: null
mamba_state_dim: 128
masked_softmax_fusion: true
max_position_embeddings: 40960
max_seqlen_per_dp_cp_rank: null
memory_efficient_layer_norm: false
microbatch_group_size_per_vp_stage: 1
min_offloaded_tensor_size: 1048576
mlp_chunks_for_prefill: 1
moe_apply_probs_on_input: false
moe_aux_loss_coeff: 0.001
moe_deepep_num_sms: 20
moe_enable_deepep: false
moe_enable_routing_replay: false
moe_expert_capacity_factor: null
moe_extended_tp: false
moe_ffn_hidden_size: 512
moe_flex_dispatcher_backend: deepep
moe_grouped_gemm: true
moe_hybridep_num_sms: 16
moe_input_jitter_eps: null
moe_latent_size: null
moe_layer_freq: 1
moe_layer_recompute: false
moe_pad_expert_input_to_capacity: false
moe_pad_experts_for_cuda_graph_inference: false
moe_per_layer_logging: false
moe_permute_fusion: true
moe_router_bias_update_rate: 0.001
moe_router_dtype: fp32
moe_router_enable_expert_bias: false
moe_router_force_load_balancing: false
moe_router_fusion: false
moe_router_group_topk: null
moe_router_load_balancing_type: global_aux_loss
moe_router_num_groups: null
moe_router_padding_for_fp8: false
moe_router_padding_for_quantization: false
moe_router_pre_softmax: false
moe_router_score_function: softmax
moe_router_topk: 10
moe_router_topk_limited_devices: null
moe_router_topk_scaling_factor: null
moe_shared_expert_gate: true
moe_shared_expert_intermediate_size: 512
moe_shared_expert_overlap: false
moe_token_dispatcher_type: alltoall
moe_token_drop_policy: probs
moe_token_dropping: false
moe_use_legacy_grouped_gemm: false
moe_z_loss_coeff: null
mrope_section: null
mtp_enabled: false
mtp_hybrid_override_pattern: null
mtp_loss_scaling_factor: 0.1
mtp_num_layers: null
mtp_standalone: false
mtp_use_repeated_layer: false
multi_latent_attention: false
nccl_all_reduce_for_prefill: false
no_rope_freq: null
no_sync_func: null
normalization: RMSNorm
num_attention_heads: 16
num_layers: 48
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
num_microbatches_with_partial_activation_checkpoints: null
num_moe_experts: 512
num_query_groups: 2
offload_modules: []
output_layer_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.0020412414523193153
overlap_moe_expert_parallel_comm: false
overlap_p2p_comm: false
overlap_p2p_comm_warmup_flush: false
parallel_output: true
param_sync_func: null
params_dtype:
_call_: false
_target_: torch.bfloat16
perform_initialization: true
persist_layer_norm: false
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_model_parallel_comm_backend: null
pipeline_model_parallel_layout: null
pipeline_model_parallel_size: 1
position_embedding_type: rope
qk_clip: false
qk_clip_alpha: 0.5
qk_clip_threshold: 100
qk_l2_norm: false
qk_layernorm: true
quant_recipe: null
recompute_granularity: selective
recompute_method: null
recompute_modules:
- layernorm
- moe
- moe_act
recompute_num_layers: null
restore_modelopt_state: false
rope_scaling: false
rope_scaling_factor: 1.0
rotary_base: 5000000
rotary_interleaved: false
rotary_percent: 0.25
rotary_scaling_factor: null
scatter_embedding_sequence_parallel: true
seq_len_interpolation_factor: null
seq_length: 2048
sequence_parallel: false
share_embeddings_and_output_weights: false
should_pad_vocab: false
softmax_scale: null
softmax_type: vanilla
symmetric_ar_type: null
tensor_model_parallel_size: 1
test_mode: false
timers:
_call_: true
_target_: megatron.core.timers.Timers
tp_comm_atomic_ag: false
tp_comm_atomic_rs: false
tp_comm_bootstrap_backend: nccl
tp_comm_bulk_dgrad: true
tp_comm_bulk_wgrad: true
tp_comm_overlap: false
tp_comm_overlap_ag: true
tp_comm_overlap_cfg: null
tp_comm_overlap_disable_fc1: false
tp_comm_overlap_disable_qkv: false
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_split_rs: true
tp_only_amax_red: false
transformer_impl: transformer_engine
transformer_layer_spec:
_call_: false
_target_: megatron.core.models.gpt.experimental_attention_variant_module_specs.get_transformer_block_with_experimental_attention_variant_spec
use_arbitrary_attention_mask: null
use_cpu_initialization: false
use_fused_weighted_squared_relu: false
use_inference_optimized_layers: false
use_kitchen: false
use_kitchen_attention: false
use_mamba_mem_eff_path: true
use_ring_exchange_p2p: false
use_te_activation_func: false
use_te_rng_tracker: false
use_transformer_engine_full_layer_spec: false
use_transformer_engine_op_fuser: false
variable_seq_lengths: false
virtual_pipeline_model_parallel_size: null
vocab_size: 151936
wgrad_deferral_limit: 0
window_attn_skip_freq: null
window_size: null
nvrx_straggler: null
optimizer:
_target_: megatron.bridge.training.config.OptimizerConfig
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1.0e-08
apply_wd_to_qk_layernorm: false
barrier_with_L1_time: false
bf16: true
clip_grad: 1.0
config_logger_dir: ''
decoupled_lr: null
decoupled_min_lr: null
decoupled_weight_decay: true
exp_avg_dtype:
_call_: false
_target_: torch.float32
exp_avg_sq_dtype:
_call_: false
_target_: torch.float32
fp16: false
fp8_recipe: tensorwise
hysteresis: 2
initial_loss_scale: 4294967296
log_num_zeros_in_grad: false
loss_scale: null
loss_scale_window: 1000
lr: 0.001
main_grads_dtype:
_call_: false
_target_: torch.float32
main_params_dtype:
_call_: false
_target_: torch.float32
min_loss_scale: 1.0
min_lr: 0.0001
muon_extra_scale_factor: 1.0
muon_fp32_matmul_prec: medium
muon_momentum: 0.95
muon_num_ns_steps: 5
muon_scale_mode: spectral
muon_split_qkv: true
muon_tp_mode: blockwise
muon_use_nesterov: false
optimizer: adam
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0
overlap_cpu_optimizer_d2h_h2d: false
overlap_param_gather: false
overlap_param_gather_with_optimizer_step: false
params_dtype:
_call_: false
_target_: torch.bfloat16
pin_cpu_grads: true
pin_cpu_params: true
reuse_grad_buf_for_mxfp8_param_ag: false
sgd_momentum: 0.9
store_param_remainders: true
timers:
_call_: true
_target_: megatron.core.timers.Timers
use_distributed_optimizer: true
use_precision_aware_optimizer: false
use_torch_optimizer_for_cpu_offload: false
weight_decay: 0.1
optimizer_config_override_provider:
_target_: megatron.bridge.training.config.OptimizerConfigOverrideProvider
peft:
_target_: megatron.bridge.peft.lora.LoRA
a2a_experimental: false
alpha: 64
canonical_mapping: {}
dim: 64
dropout: 0.0
dropout_position: pre
exclude_modules: []
lora_A_init_method: xavier
lora_B_init_method: zero
lora_dtype: null
params_to_save: !!set
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
target_modules:
- linear_qkv
- linear_proj
- linear_fc1
- linear_fc2
profiling:
_target_: megatron.bridge.training.config.ProfilingConfig
memory_snapshot_path: snapshot.pickle
nvtx_ranges: false
profile_ranks: []
profile_step_end: 12
profile_step_start: 10
pytorch_profiler_collect_callstack: false
pytorch_profiler_collect_chakra: false
pytorch_profiler_collect_shapes: false
record_memory_history: false
record_shapes: false
use_nsys_profiler: false
use_pytorch_profiler: false
rerun_state_machine:
_target_: megatron.bridge.training.config.RerunStateMachineConfig
check_for_nan_in_loss: true
check_for_spiky_loss: false
error_injection_rate: 0
error_injection_type: transient_error
rerun_mode: disabled
spiky_loss_factor: 10.0
rng:
_target_: megatron.bridge.training.config.RNGConfig
data_parallel_random_init: false
inference_rng_tracker: false
seed: 5678
te_rng_tracker: false
scheduler:
_target_: megatron.bridge.training.config.SchedulerConfig
end_weight_decay: 0.033
lr_decay_iters: 2000
lr_decay_samples: null
lr_decay_steps: 64000
lr_decay_style: cosine
lr_warmup_fraction: null
lr_warmup_init: 0.0
lr_warmup_iters: 50
lr_warmup_samples: 0
lr_warmup_steps: 1600
lr_wsd_decay_iters: null
lr_wsd_decay_samples: null
lr_wsd_decay_style: exponential
no_weight_decay_cond_type: qwen3_next
override_opt_param_scheduler: true
start_weight_decay: 0.033
use_checkpoint_opt_param_scheduler: false
wd_incr_steps: 64000
weight_decay_incr_style: constant
wsd_decay_steps: null
straggler: null
tensor_inspect: null
tokenizer:
_target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
chat_template: null
force_system_message: false
hf_tokenizer_kwargs: {}
image_tag_type: null
merge_file: null
metadata_path: null
sp_tokenizer_kwargs: {}
special_tokens: null
tiktoken_num_special_tokens: 1000
tiktoken_pattern: null
tiktoken_special_tokens: null
tokenizer_model: Qwen/Qwen3-Coder-Next-Base
tokenizer_prompt_format: null
tokenizer_type: HuggingFaceTokenizer
vocab_extra_ids: 0
vocab_file: null
vocab_size: null
train:
_target_: megatron.bridge.training.config.TrainingConfig
check_optimizer_step_success: true
check_weight_hash_across_dp_replicas_interval: null
decrease_batch_size_if_needed: false
empty_unused_memory_level: 0
eval_interval: null
eval_iters: null
exit_duration_in_mins: null
exit_interval: null
exit_signal:
_args_:
- 15
_call_: true
_name_: SIGTERM
_target_: signal.Signals
exit_signal_handler: false
exit_signal_handler_for_dataloader: false
global_batch_size: 32
iterations_to_skip: []
manual_gc: true
manual_gc_eval: 100
manual_gc_interval: 100
micro_batch_size: 1
rampup_batch_size: null
skip_sync_grad_norm_across_mp: false
skip_train: null
train_iters: 2000
train_samples: null
train_sync_interval: null
validation:
_target_: megatron.bridge.training.config.ValidationConfig
eval_interval: 9999
eval_iters: 32
skip_train: false