arcinstitute
/

evo2_7b_262k

Model card Files Files and versions

gbrixi commited on Sep 12, 2025

Commit

27c4f05

·

verified ·

1 Parent(s): f733268

Add model configuration

Files changed (1) hide show

config.yml +63 -0

config.yml ADDED Viewed

	@@ -0,0 +1,63 @@

+model_name: shc-evo2-7b-8k-2T-v2
+vocab_size: 512
+hidden_size: 4096
+# Number of long convolution filters in each hyena block. Can be smaller than `hidden_size`
+num_filters: 4096
+hcl_layer_idxs: [2,6,9,13,16,20,23,27,30]
+hcm_layer_idxs: [1,5,8,12,15,19,22,26,29]
+hcs_layer_idxs: [0,4,7,11,14,18,21,25,28]
+attn_layer_idxs: [3,10,17,24,31]
+hcm_filter_length: 128
+hcl_filter_groups: 4096
+hcm_filter_groups: 256
+hcs_filter_groups: 256
+hcs_filter_length: 7
+num_layers: 32
+# Length of the short, depthwise FIR applied to input projections
+short_filter_length: 3
+num_attention_heads: 32
+short_filter_bias: false # add bias to FIR
+mlp_init_method: torch.nn.init.zeros_
+mlp_output_init_method: torch.nn.init.zeros_
+eps: 0.000001
+state_size: 16
+rotary_emb_base: 1000000000
+rotary_emb_scaling_factor: 32
+use_interpolated_rotary_pos_emb: True
+make_vocab_size_divisible_by: 8
+inner_size_multiple_of: 16  # force GLU inner_size to be a multiple of
+inner_mlp_size: 11008
+log_intermediate_values: False
+# Number of groups in GQA
+proj_groups: 1
+# Number of groups in grouped
+hyena_filter_groups: 1
+# Split strategy for channels
+column_split_hyena: False
+column_split: True
+interleave: True
+# Legacy options for MP / PP inference
+model_parallel_size: 1
+pipe_parallel_size: 1
+tie_embeddings: True
+mha_out_proj_bias: True
+hyena_out_proj_bias: True
+hyena_flip_x1x2: False
+qkv_proj_bias: False
+use_fp8_input_projections: True
+max_seqlen: 262144
+max_batch_size: 1
+final_norm: True
+use_flash_attn: True
+use_flash_rmsnorm: False
+use_flash_depthwise: False
+use_flashfft: False
+use_laughing_hyena: False
+inference_mode: True
+tokenizer_type: CharLevelTokenizer
+prefill_style: fft
+mlp_activation: gelu
+print_activations: False