EVA Gen 1.11
Collection
1 item • Updated
EVA Qwen3-Next v0.0
Experimental RP/story full-parameter finetune of Qwen3-Coder-Next-Base, trained on a refined version of the EVA Gen 0.0 data. While this model may not be as smart as the original Qwen3-Next (due to being an SFT-only checkpoint trained from the base model), we hope it will compensate with much better creativity and more natural prose. Perhaps, in the future, we will build on this checkpoint with online RL to improve it further. The model currently doesn't support reasoning.
Training data:
Older datasets received an additional cleanup pass, and lower-quality rows were regenerated.
Licensing
Both the base model and this derivative are licensed under the Apache 2.0 license.
Training Config (Megatron)
_target_: megatron.bridge.training.config.ConfigContainer
checkpoint:
_target_: megatron.bridge.training.config.CheckpointConfig
async_save: false
ckpt_assume_constant_structure: false
ckpt_convert_format: null
ckpt_convert_save: null
ckpt_format: torch_dist
ckpt_step: null
dist_ckpt_optim_fully_reshardable: false
dist_ckpt_strictness: assume_ok_unexpected
distrib_optim_fully_reshardable_mem_efficient: false
exit_on_missing_checkpoint: false
finetune: false
fully_parallel_load: false
fully_parallel_save: true
load: workspace/results/qwen3_coder_next_base_lora/checkpoints
load_main_params_from_ckpt: false
load_optim: true
load_rng: true
most_recent_k: -1
non_persistent_ckpt_type: null
non_persistent_global_ckpt_dir: null
non_persistent_local_ckpt_algo: fully_parallel
non_persistent_local_ckpt_dir: null
non_persistent_save_interval: null
pretrained_checkpoint: workspace/models/Qwen3-Coder-Next-Base
replication: false
replication_factor: 2
replication_jump: null
save: workspace/results/qwen3_coder_next_base_lora/checkpoints
save_interval: 200
save_optim: true
save_rng: true
save_tokenizer_assets: true
strict_fsdp_dtensor_load: false
use_checkpoint_args: false
use_persistent_ckpt_worker: true
comm_overlap: null
dataset:
_target_: megatron.bridge.training.config.FinetuningDatasetConfig
data_sharding: true
dataloader_type: single
dataset_kwargs: null
dataset_root: workspace/datasets/lilith-v0.2/bridge_jsonl
do_test: true
do_validation: true
max_train_samples: null
memmap_workers: 1
num_workers: 8
packed_sequence_specs: null
persistent_workers: false
pin_memory: true
seed: 5678
seq_length: 2048
trust_remote_code: null
ddp:
_target_: megatron.core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig
align_param_gather: false
average_in_collective: true
bucket_size: null
check_for_large_grads: false
check_for_nan_in_grad: true
data_parallel_sharding_strategy: optim_grads_params
delay_wgrad_compute: false
disable_symmetric_registration: false
fp8_param_gather: false
fsdp_db_use_persist_buf_on_alloc_fail: false
fsdp_double_buffer: false
fsdp_manual_registration: false
grad_reduce_in_fp32: true
gradient_reduce_div_fusion: true
keep_fp8_transpose_cache: false
nccl_ub: false
num_distributed_optimizer_instances: 1
outer_dp_sharding_strategy: no_shard
overlap_grad_reduce: false
overlap_param_gather: false
pad_buckets_for_high_nccl_busbw: false
preserve_fp32_weights: true
reduce_scatter_with_fp32_accumulation: false
reuse_grad_buf_for_mxfp8_param_ag: false
suggested_communication_unit_size: null
use_custom_fsdp: false
use_distributed_optimizer: true
use_megatron_fsdp: false
dist:
_target_: megatron.bridge.training.config.DistributedInitConfig
align_grad_reduce: true
disable_jit_fuser: false
distributed_backend: nccl
distributed_timeout_minutes: 10
distributed_timeout_seconds_after_init: null
enable_megatron_core_experimental: false
external_gpu_device_mapping: false
high_priority_stream_groups: null
lazy_init: false
local_rank: 0
nccl_communicator_config_path: null
sharp_enabled_group: null
use_decentralized_pg: false
use_gloo_process_groups: true
use_megatron_fsdp: false
use_sharp: false
use_torch_fsdp2: false
use_tp_pp_dp_mapping: false
ft: null
inprocess_restart: null
logger:
_target_: megatron.bridge.training.config.LoggerConfig
filter_warnings: true
log_energy: false
log_interval: 1
log_l2_norm_grad_to_tensorboard: false
log_loss_scale_to_tensorboard: true
log_memory_to_tensorboard: false
log_params_norm: false
log_progress: false
log_runtime_to_tensorboard: false
log_throughput: false
log_throughput_to_tensorboard: false
log_timers_to_tensorboard: true
log_validation_ppl_to_tensorboard: false
log_world_size_to_tensorboard: false
logging_level: 20
memory_keys: null
mlflow_experiment: null
mlflow_run_name: null
mlflow_tags: null
mlflow_tracking_uri: null
modules_to_filter: null
runtime_time_unit: hours
save_config_filepath: null
set_level_for_all_loggers: false
skip_train_metrics_log: false
tensorboard_dir: workspace/results/qwen3_coder_next_base_lora/tb_logs
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
throughput_window_size: 100
timing_log_level: 0
timing_log_option: minmax
wandb_entity: nottlespike
wandb_exp_name: qwen3-coder-next-lora-lr1e3
wandb_project: qwen3-coder-next-lora
wandb_save_dir: null
mixed_precision:
_target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
autocast_dtype: null
autocast_enabled: false
bf16: true
first_last_layers_bf16: false
fp16: false
fp32: false
fp4: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_param_gather: false
fp8_recipe: tensorwise
fp8_wgrad: true
grad_reduce_in_fp32: true
hysteresis: 2
initial_loss_scale: 4294967296
loss_scale: null
loss_scale_window: 1000
min_loss_scale: 1.0
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
params_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
reuse_grad_buf_for_mxfp8_param_ag: false
model:
_target_: megatron.bridge.models.qwen.qwen_provider.Qwen3NextModelProvider
account_for_embedding_in_pipeline_split: false
account_for_loss_in_pipeline_split: false
activation_func:
_call_: false
_target_: torch.nn.functional.silu
activation_func_clamp_value: null
activation_func_fp8_input_store: false
add_bias_linear: false
add_qkv_bias: false
apply_query_key_layer_scaling: false
apply_residual_connection_post_layernorm: false
apply_rope_fusion: false
async_tensor_model_parallel_allreduce: true
attention_backend:
_args_:
- 5
_call_: true
_name_: auto
_target_: megatron.core.transformer.enums.AttnBackend
attention_dropout: 0.0
attention_output_gate: true
attention_softmax_in_fp32: false
autocast_dtype:
_call_: false
_target_: torch.bfloat16
barrier_with_L1_time: true
batch_invariant_mode: false
batch_p2p_comm: true
batch_p2p_sync: true
bf16: true
bias_activation_fusion: false
bias_dropout_fusion: false
calculate_per_token_loss: false
clone_scatter_output_in_embedding: true
config_logger_dir: ''
context_parallel_size: 1
cp_comm_type: null
cpu_offloading: false
cpu_offloading_activations: true
cpu_offloading_double_buffering: false
cpu_offloading_num_layers: 0
cpu_offloading_weights: false
cross_entropy_fusion_impl: native
cross_entropy_loss_fusion: true
cuda_graph_impl: none
cuda_graph_retain_backward_graph: false
cuda_graph_scope: []
cuda_graph_use_single_mempool: false
cuda_graph_warmup_steps: 3
deallocate_pipeline_outputs: true
defer_embedding_wgrad_compute: false
delay_wgrad_compute: false
deterministic_mode: false
disable_bf16_reduced_precision_matmul: false
disable_parameter_transpose_cache: false
distribute_saved_activations: false
dsa_indexer_head_dim: null
dsa_indexer_loss_coeff: null
dsa_indexer_n_heads: null
dsa_indexer_topk: null
dsa_indexer_use_sparse_loss: false
embedding_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
embedding_init_method_std: 0.02
enable_autocast: false
enable_cuda_graph: false
ep_overlap_early_attn_memory_release: false
experimental_attention_variant: gated_delta_net
expert_model_parallel_size: 8
expert_tensor_parallel_size: 1
external_cuda_graph: false
ffn_hidden_size: 5120
finalize_model_grads_func:
_args_: []
_partial_: true
_target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
pg_collection:
_call_: true
_target_: megatron.core.process_groups_config.ProcessGroupCollection
fine_grained_activation_offloading: false
first_last_layers_bf16: false
flash_decode: false
fp16: false
fp16_lm_cross_entropy: false
fp32_residual_connection: false
fp4: null
fp4_param: false
fp4_quantizer_factory: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_interval: 1
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_quantizer_factory: null
fp8_recipe: tensorwise
fp8_wgrad: true
fused_single_qkv_rope: false
gated_linear_unit: true
glu_linear_offset: 0.0
grad_scale_func:
_call_: false
_target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
grad_sync_func: null
gradient_accumulation_fusion: false
hetereogenous_dist_checkpoint: true
heterogeneous_block_specs: false
hf_model_id: Qwen/Qwen3-Coder-Next-Base
hidden_dropout: 0.0
hidden_size: 2048
hierarchical_context_parallel_sizes: null
hybrid_context_parallel: false
inference_fuse_tp_communication: false
inference_rng_tracker: false
inference_sampling_seed: 42
init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
init_method_std: 0.02
init_model_with_meta_device: false
is_hybrid_model: false
kitchen_attention_backend: sdpa
kv_channels: 256
layernorm_epsilon: 1.0e-06
layernorm_zero_centered_gamma: true
linear_attention_freq: 4
linear_conv_kernel_dim: 4
linear_key_head_dim: 128
linear_num_key_heads: 16
linear_num_value_heads: 32
linear_value_head_dim: 128
log_max_attention_logit: false
make_vocab_size_divisible_by: 128
mamba_head_dim: 64
mamba_num_groups: 8
mamba_num_heads: null
mamba_state_dim: 128
masked_softmax_fusion: true
max_position_embeddings: 40960
max_seqlen_per_dp_cp_rank: null
memory_efficient_layer_norm: false
microbatch_group_size_per_vp_stage: 1
min_offloaded_tensor_size: 1048576
mlp_chunks_for_prefill: 1
moe_apply_probs_on_input: false
moe_aux_loss_coeff: 0.001
moe_deepep_num_sms: 20
moe_enable_deepep: false
moe_enable_routing_replay: false
moe_expert_capacity_factor: null
moe_extended_tp: false
moe_ffn_hidden_size: 512
moe_flex_dispatcher_backend: deepep
moe_grouped_gemm: true
moe_hybridep_num_sms: 16
moe_input_jitter_eps: null
moe_latent_size: null
moe_layer_freq: 1
moe_layer_recompute: false
moe_pad_expert_input_to_capacity: false
moe_pad_experts_for_cuda_graph_inference: false
moe_per_layer_logging: false
moe_permute_fusion: true
moe_router_bias_update_rate: 0.001
moe_router_dtype: fp32
moe_router_enable_expert_bias: false
moe_router_force_load_balancing: false
moe_router_fusion: false
moe_router_group_topk: null
moe_router_load_balancing_type: global_aux_loss
moe_router_num_groups: null
moe_router_padding_for_fp8: false
moe_router_padding_for_quantization: false
moe_router_pre_softmax: false
moe_router_score_function: softmax
moe_router_topk: 10
moe_router_topk_limited_devices: null
moe_router_topk_scaling_factor: null
moe_shared_expert_gate: true
moe_shared_expert_intermediate_size: 512
moe_shared_expert_overlap: false
moe_token_dispatcher_type: alltoall
moe_token_drop_policy: probs
moe_token_dropping: false
moe_use_legacy_grouped_gemm: false
moe_z_loss_coeff: null
mrope_section: null
mtp_enabled: false
mtp_hybrid_override_pattern: null
mtp_loss_scaling_factor: 0.1
mtp_num_layers: null
mtp_standalone: false
mtp_use_repeated_layer: false
multi_latent_attention: false
nccl_all_reduce_for_prefill: false
no_rope_freq: null
no_sync_func: null
normalization: RMSNorm
num_attention_heads: 16
num_layers: 48
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
num_microbatches_with_partial_activation_checkpoints: null
num_moe_experts: 512
num_query_groups: 2
offload_modules: []
output_layer_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.0020412414523193153
overlap_moe_expert_parallel_comm: false
overlap_p2p_comm: false
overlap_p2p_comm_warmup_flush: false
parallel_output: true
param_sync_func: null
params_dtype:
_call_: false
_target_: torch.bfloat16
perform_initialization: true
persist_layer_norm: false
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_model_parallel_comm_backend: null
pipeline_model_parallel_layout: null
pipeline_model_parallel_size: 1
position_embedding_type: rope
qk_clip: false
qk_clip_alpha: 0.5
qk_clip_threshold: 100
qk_l2_norm: false
qk_layernorm: true
quant_recipe: null
recompute_granularity: selective
recompute_method: null
recompute_modules:
- layernorm
- moe
- moe_act
recompute_num_layers: null
restore_modelopt_state: false
rope_scaling: false
rope_scaling_factor: 1.0
rotary_base: 5000000
rotary_interleaved: false
rotary_percent: 0.25
rotary_scaling_factor: null
scatter_embedding_sequence_parallel: true
seq_len_interpolation_factor: null
seq_length: 2048
sequence_parallel: false
share_embeddings_and_output_weights: false
should_pad_vocab: false
softmax_scale: null
softmax_type: vanilla
symmetric_ar_type: null
tensor_model_parallel_size: 1
test_mode: false
timers:
_call_: true
_target_: megatron.core.timers.Timers
tp_comm_atomic_ag: false
tp_comm_atomic_rs: false
tp_comm_bootstrap_backend: nccl
tp_comm_bulk_dgrad: true
tp_comm_bulk_wgrad: true
tp_comm_overlap: false
tp_comm_overlap_ag: true
tp_comm_overlap_cfg: null
tp_comm_overlap_disable_fc1: false
tp_comm_overlap_disable_qkv: false
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_split_rs: true
tp_only_amax_red: false
transformer_impl: transformer_engine
transformer_layer_spec:
_call_: false
_target_: megatron.core.models.gpt.experimental_attention_variant_module_specs.get_transformer_block_with_experimental_attention_variant_spec
use_arbitrary_attention_mask: null
use_cpu_initialization: false
use_fused_weighted_squared_relu: false
use_inference_optimized_layers: false
use_kitchen: false
use_kitchen_attention: false
use_mamba_mem_eff_path: true
use_ring_exchange_p2p: false
use_te_activation_func: false
use_te_rng_tracker: false
use_transformer_engine_full_layer_spec: false
use_transformer_engine_op_fuser: false
variable_seq_lengths: false
virtual_pipeline_model_parallel_size: null
vocab_size: 151936
wgrad_deferral_limit: 0
window_attn_skip_freq: null
window_size: null
nvrx_straggler: null
optimizer:
_target_: megatron.bridge.training.config.OptimizerConfig
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1.0e-08
apply_wd_to_qk_layernorm: false
barrier_with_L1_time: false
bf16: true
clip_grad: 1.0
config_logger_dir: ''
decoupled_lr: null
decoupled_min_lr: null
decoupled_weight_decay: true
exp_avg_dtype:
_call_: false
_target_: torch.float32
exp_avg_sq_dtype:
_call_: false
_target_: torch.float32
fp16: false
fp8_recipe: tensorwise
hysteresis: 2
initial_loss_scale: 4294967296
log_num_zeros_in_grad: false
loss_scale: null
loss_scale_window: 1000
lr: 0.001
main_grads_dtype:
_call_: false
_target_: torch.float32
main_params_dtype:
_call_: false
_target_: torch.float32
min_loss_scale: 1.0
min_lr: 0.0001
muon_extra_scale_factor: 1.0
muon_fp32_matmul_prec: medium
muon_momentum: 0.95
muon_num_ns_steps: 5
muon_scale_mode: spectral
muon_split_qkv: true
muon_tp_mode: blockwise
muon_use_nesterov: false
optimizer: adam
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0
overlap_cpu_optimizer_d2h_h2d: false
overlap_param_gather: false
overlap_param_gather_with_optimizer_step: false
params_dtype:
_call_: false
_target_: torch.bfloat16
pin_cpu_grads: true
pin_cpu_params: true
reuse_grad_buf_for_mxfp8_param_ag: false
sgd_momentum: 0.9
store_param_remainders: true
timers:
_call_: true
_target_: megatron.core.timers.Timers
use_distributed_optimizer: true
use_precision_aware_optimizer: false
use_torch_optimizer_for_cpu_offload: false
weight_decay: 0.1
optimizer_config_override_provider:
_target_: megatron.bridge.training.config.OptimizerConfigOverrideProvider
peft:
_target_: megatron.bridge.peft.lora.LoRA
a2a_experimental: false
alpha: 64
canonical_mapping: {}
dim: 64
dropout: 0.0
dropout_position: pre
exclude_modules: []
lora_A_init_method: xavier
lora_B_init_method: zero
lora_dtype: null
params_to_save: !!set
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
target_modules:
- linear_qkv
- linear_proj
- linear_fc1
- linear_fc2
profiling:
_target_: megatron.bridge.training.config.ProfilingConfig
memory_snapshot_path: snapshot.pickle
nvtx_ranges: false
profile_ranks: []
profile_step_end: 12
profile_step_start: 10
pytorch_profiler_collect_callstack: false
pytorch_profiler_collect_chakra: false
pytorch_profiler_collect_shapes: false
record_memory_history: false
record_shapes: false
use_nsys_profiler: false
use_pytorch_profiler: false
rerun_state_machine:
_target_: megatron.bridge.training.config.RerunStateMachineConfig
check_for_nan_in_loss: true
check_for_spiky_loss: false
error_injection_rate: 0
error_injection_type: transient_error
rerun_mode: disabled
spiky_loss_factor: 10.0
rng:
_target_: megatron.bridge.training.config.RNGConfig
data_parallel_random_init: false
inference_rng_tracker: false
seed: 5678
te_rng_tracker: false
scheduler:
_target_: megatron.bridge.training.config.SchedulerConfig
end_weight_decay: 0.033
lr_decay_iters: 2000
lr_decay_samples: null
lr_decay_steps: 64000
lr_decay_style: cosine
lr_warmup_fraction: null
lr_warmup_init: 0.0
lr_warmup_iters: 50
lr_warmup_samples: 0
lr_warmup_steps: 1600
lr_wsd_decay_iters: null
lr_wsd_decay_samples: null
lr_wsd_decay_style: exponential
no_weight_decay_cond_type: qwen3_next
override_opt_param_scheduler: true
start_weight_decay: 0.033
use_checkpoint_opt_param_scheduler: false
wd_incr_steps: 64000
weight_decay_incr_style: constant
wsd_decay_steps: null
straggler: null
tensor_inspect: null
tokenizer:
_target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
chat_template: null
force_system_message: false
hf_tokenizer_kwargs: {}
image_tag_type: null
merge_file: null
metadata_path: null
sp_tokenizer_kwargs: {}
special_tokens: null
tiktoken_num_special_tokens: 1000
tiktoken_pattern: null
tiktoken_special_tokens: null
tokenizer_model: Qwen/Qwen3-Coder-Next-Base
tokenizer_prompt_format: null
tokenizer_type: HuggingFaceTokenizer
vocab_extra_ids: 0
vocab_file: null
vocab_size: null
train:
_target_: megatron.bridge.training.config.TrainingConfig
check_optimizer_step_success: true
check_weight_hash_across_dp_replicas_interval: null
decrease_batch_size_if_needed: false
empty_unused_memory_level: 0
eval_interval: null
eval_iters: null
exit_duration_in_mins: null
exit_interval: null
exit_signal:
_args_:
- 15
_call_: true
_name_: SIGTERM
_target_: signal.Signals
exit_signal_handler: false
exit_signal_handler_for_dataloader: false
global_batch_size: 32
iterations_to_skip: []
manual_gc: true
manual_gc_eval: 100
manual_gc_interval: 100
micro_batch_size: 1
rampup_batch_size: null
skip_sync_grad_norm_across_mp: false
skip_train: null
train_iters: 2000
train_samples: null
train_sync_interval: null
validation:
_target_: megatron.bridge.training.config.ValidationConfig
eval_interval: 9999
eval_iters: 32
skip_train: false