Text Generation
Transformers
Safetensors
PyTorch
nemotron_h
nvidia
conversational
custom_code

roagrawal

#33
by roagrawal - opened
Files changed (3)
  1. README.md +2 -1
  2. config.json +4 -0
  3. modeling_nemotron_h.py +0 -4
README.md CHANGED
@@ -61,7 +61,8 @@ We want to hear from you! Share your ideas, vote on what matters, and help [shap
61
 
62
  ## License/Terms of Use
63
 
64
- Governing Terms: Use of this model is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/).
 
65
 
66
  ## Evaluation Results
67
 
 
61
 
62
  ## License/Terms of Use
63
 
64
+ GOVERNING TERMS: This trial service is governed by the [NVIDIA API Trial Terms of Service](https://assets.ngc.nvidia.com/products/api-catalog/legal/NVIDIA%20API%20Trial%20Terms%20of%20Service.pdf). Use of this model is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/).
65
+
66
 
67
  ## Evaluation Results
68
 
config.json CHANGED
@@ -43,6 +43,10 @@
43
  "ssm_state_size": 128,
44
  "tie_word_embeddings": false,
45
  "time_step_floor": 0.0001,
 
 
 
 
46
  "time_step_max": 0.1,
47
  "time_step_min": 0.001,
48
  "time_step_rank": 256,
 
43
  "ssm_state_size": 128,
44
  "tie_word_embeddings": false,
45
  "time_step_floor": 0.0001,
46
+ "time_step_limit": [
47
+ 0.0,
48
+ Infinity
49
+ ],
50
  "time_step_max": 0.1,
51
  "time_step_min": 0.001,
52
  "time_step_rank": 256,
modeling_nemotron_h.py CHANGED
@@ -1117,8 +1117,6 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1117
  def _init_weights(self, module):
1118
  """Initialize the weights."""
1119
  if isinstance(module, NemotronHMamba2Mixer):
1120
- if getattr(module.dt_bias, "_is_hf_initialized", False):
1121
- return
1122
  module.A_log._no_weight_decay = True
1123
  module.D._no_weight_decay = True
1124
 
@@ -1150,8 +1148,6 @@ class NemotronHPreTrainedModel(PreTrainedModel):
1150
  #
1151
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1152
  for name, p in module.named_parameters():
1153
- if getattr(p, "_is_hf_initialized", False):
1154
- continue
1155
  if name in ["out_proj.weight"]:
1156
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1157
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
 
1117
  def _init_weights(self, module):
1118
  """Initialize the weights."""
1119
  if isinstance(module, NemotronHMamba2Mixer):
 
 
1120
  module.A_log._no_weight_decay = True
1121
  module.D._no_weight_decay = True
1122
 
 
1148
  #
1149
  # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
1150
  for name, p in module.named_parameters():
 
 
1151
  if name in ["out_proj.weight"]:
1152
  # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
1153
  # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)