gbrixi committed on
Commit
27c4f05
·
verified ·
1 Parent(s): f733268

Add model configuration

Browse files
Files changed (1) hide show
  1. config.yml +63 -0
config.yml ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name: shc-evo2-7b-8k-2T-v2
2
+
3
+ vocab_size: 512
4
+ hidden_size: 4096
5
+ # Number of long convolution filters in each hyena block. Can be smaller than `hidden_size`
6
+ num_filters: 4096
7
+ hcl_layer_idxs: [2,6,9,13,16,20,23,27,30]
8
+ hcm_layer_idxs: [1,5,8,12,15,19,22,26,29]
9
+ hcs_layer_idxs: [0,4,7,11,14,18,21,25,28]
10
+ attn_layer_idxs: [3,10,17,24,31]
11
+
12
+ hcm_filter_length: 128
13
+ hcl_filter_groups: 4096
14
+ hcm_filter_groups: 256
15
+ hcs_filter_groups: 256
16
+ hcs_filter_length: 7
17
+ num_layers: 32
18
+
19
+ # Length of the short, depthwise FIR applied to input projections
20
+ short_filter_length: 3
21
+ num_attention_heads: 32
22
+ short_filter_bias: false # add bias to FIR
23
+ mlp_init_method: torch.nn.init.zeros_
24
+ mlp_output_init_method: torch.nn.init.zeros_
25
+ eps: 0.000001
26
+ state_size: 16
27
+ rotary_emb_base: 1000000000
28
+ rotary_emb_scaling_factor: 32
29
+ use_interpolated_rotary_pos_emb: True
30
+ make_vocab_size_divisible_by: 8
31
+ inner_size_multiple_of: 16 # force GLU inner_size to be a multiple of this value
32
+ inner_mlp_size: 11008
33
+ log_intermediate_values: False
34
+ # Number of groups in GQA
35
+ proj_groups: 1
36
+ # Number of groups in grouped hyena filters
37
+ hyena_filter_groups: 1
38
+ # Split strategy for channels
39
+ column_split_hyena: False
40
+ column_split: True
41
+ interleave: True
42
+ # Legacy options for MP / PP inference
43
+ model_parallel_size: 1
44
+ pipe_parallel_size: 1
45
+ tie_embeddings: True
46
+ mha_out_proj_bias: True
47
+ hyena_out_proj_bias: True
48
+ hyena_flip_x1x2: False
49
+ qkv_proj_bias: False
50
+ use_fp8_input_projections: True
51
+ max_seqlen: 262144
52
+ max_batch_size: 1
53
+ final_norm: True
54
+ use_flash_attn: True
55
+ use_flash_rmsnorm: False
56
+ use_flash_depthwise: False
57
+ use_flashfft: False
58
+ use_laughing_hyena: False
59
+ inference_mode: True
60
+ tokenizer_type: CharLevelTokenizer
61
+ prefill_style: fft
62
+ mlp_activation: gelu
63
+ print_activations: False