# Boolean switch; disabled for this run.
# NOTE(review): "RC" presumably stands for reverse-complement — confirm against
# the code that reads this key.
RC_augmentation: false
# Per-dataset settings keyed by dataset name. The top-level `dataset` key picks
# one entry with `${_dataset_cfg_lookup[${data}]}`.
_dataset_cfg_lookup:
  # The five `dlb_cmp_*` entries below are identical except for `hf_path` and
  # `path`; every other field (splits, label/mask keys, loader flags,
  # `reference_id`, `type`, hbins flags) carries the same value in each.
  dlb_cmp_gm12878:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_gm12878
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_gm12878
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_h1hesc:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_h1hesc
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_h1hesc
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_hct116:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_hct116
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hct116
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_hff:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_hff
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hff
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_imr90:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_imr90
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_imr90
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  # The two `gencode_human_*` entries carry only `hf_path`, `path` and `type`.
  # Their key names contain a `.`; the `dataset` interpolation addresses them
  # with bracket syntax. `data: gencode_human_12.8k` in this file selects the
  # first one.
  gencode_human_12.8k:
    hf_path: andyjzhao/gencode_human_12.8k
    path: data/gencode_human_12.8k
    type: refseq
  gencode_human_128k:
    hf_path: andyjzhao/gencode_human_128k
    path: data/gencode_human_128k
    type: refseq
# Maps short upper-/mixed-case labels to the matching `dlb_cmp_*` keys of
# `_dataset_cfg_lookup`.
# NOTE(review): the top-level `subset` (HFF in this file) is presumably looked
# up here by project code — nothing in this file interpolates from this table,
# so confirm against the consumer.
_dlb_cmp_data:
  GM12878: dlb_cmp_gm12878
  H1hESC: dlb_cmp_h1hesc
  HCT116: dlb_cmp_hct116
  HFF: dlb_cmp_hff
  IMR90: dlb_cmp_imr90
# Model presets keyed by preset name. The top-level keys `model`, `model_cfg`,
# `k_max_list` and `k_min_list` each interpolate from
# `_model_lookup.${model_name}`, so `model_name` selects which preset is active.
#
# The two presets share the same structure and the same `ssm_cfg`,
# `window_size`, `tie_embeddings`, `vocab_size`, `k_max_list` and `k_min_list`
# values; they differ in `arch_layout` (m15 vs m40), `num_heads`,
# `rotary_emb_dim`, `d_intermediate` and `d_model`.
_model_lookup:
  hnet_mamba_64m_2dc:
    k_max_list:
    - 0
    - 0
    k_min_list:
    - 8
    - 8
    model:
      arch: hnet
      name: hnet_mamba_64m_2dc
    model_cfg:
      # Nested sequence; parses as [m2, [m2, [m15], m2], m2].
      arch_layout:
      - m2
      - - m2
        - - m15
        - m2
      - m2
      # Each list under `attn_cfg`, and `d_intermediate` / `d_model` below, has
      # three entries.
      # NOTE(review): presumably one entry per nesting level of `arch_layout`
      # — confirm against the model code.
      attn_cfg:
        num_heads:
        - 8
        - 8
        - 12
        rotary_emb_dim:
        - 16
        - 16
        - 24
        window_size:
        - 511
        - 511
        - -1
      d_intermediate:
      - 0
      - 0
      - 2048
      d_model:
      - 512
      - 512
      - 768
      # `${k_max_list}` / `${k_min_list}` are absolute (root-level) references,
      # not references to this preset's own lists above. The root-level keys in
      # turn resolve to the lists of whichever preset `model_name` selects.
      k_max_list: ${k_max_list}
      k_min_list: ${k_min_list}
      ssm_cfg:
        chunk_size: 256
        d_conv: 4
        d_state: 64
        expand: 2
        head_dim: 64
      tie_embeddings: true
      vocab_size: 12
  hnet_mamba_650m_2dc:
    k_max_list:
    - 0
    - 0
    k_min_list:
    - 8
    - 8
    model:
      arch: hnet
      name: hnet_mamba_650m_2dc
    model_cfg:
      # Nested sequence; parses as [m2, [m2, [m40], m2], m2].
      arch_layout:
      - m2
      - - m2
        - - m40
        - m2
      - m2
      attn_cfg:
        num_heads:
        - 8
        - 8
        - 16
        rotary_emb_dim:
        - 32
        - 32
        - 48
        window_size:
        - 511
        - 511
        - -1
      d_intermediate:
      - 0
      - 0
      - 4096
      d_model:
      - 1024
      - 1024
      - 1536
      # Root-level references, as in the preset above: these follow the preset
      # chosen by `model_name`, which is not necessarily this one.
      k_max_list: ${k_max_list}
      k_min_list: ${k_min_list}
      ssm_cfg:
        chunk_size: 256
        d_conv: 4
        d_state: 64
        expand: 2
        head_dim: 64
      tie_embeddings: true
      vocab_size: 12
# Three lists of key-name patterns: exact names (`fields`), name suffixes
# (`postfix`) and name prefixes (`prefix`).
# NOTE(review): presumably project code uses these to ignore run-specific or
# infrastructure keys (e.g. when comparing or identifying runs) — the consumer
# is not visible in this file; confirm before relying on this.
_unimportant_cfg:
  # Exact key names. Most of them also appear as top-level keys in this file
  # (e.g. `uid`, `rank`, `world_size`, `wandb`, `hf_repo`).
  fields:
  - gpus
  - debug
  - wandb
  - env
  - uid
  - local_rank
  - is_distributed
  - master_port
  - device_type
  - cluster
  - world_size
  - train_dataset
  - eval_datasets
  - user_cfg
  - rank
  - device
  - hf_access_token
  - hf_private
  - hf_repo
  - hf_user
  - hf_token
  - save_every
  - eval_steps
  - save_steps
  - upload_to_hf
  - logging
  - log_every
  - use_wandb
  - project_root
  - version
  # Key-name suffixes.
  postfix:
  - _path
  - _file
  - _dir
  - _alias
  - _prefix
  # Key-name prefixes; `_` matches the underscore-prefixed lookup tables in this
  # file (`_dataset_cfg_lookup`, `_dlb_cmp_data`, `_model_lookup`,
  # `_unimportant_cfg`).
  prefix:
  - _
# Run identity and basic data/model selection.
# `alias` is reused in `dirs.output` and matches `wandb.name`.
alias: GeneZip-70M-reproduce
alpha: 0.03
# Resolves through `model` to `_model_lookup.<model_name>.model.arch`.
arch: ${model.arch}
batch_size: 32
bin_size: 2048
bp_per_token: 32
cell_type: Artery_Tibial
ckpt: andyjzhao/GeneZip-70M-12.8K
cleanup_checkpoints: true
cluster: mila
# Recorded launch command. This is a multi-line plain scalar: YAML folds the
# indented continuation lines into one space-separated string. The overrides it
# lists (data, model_name, max_len, batch_size, grad_acc_steps, max_train_steps,
# eval_steps, num_valid_samples, upload_to_hf, use_wandb, wandb.project,
# bp_per_token, region_info, alias, hf_repo) match the values set in this file.
cmd: python src/scripts/pretrain_genezip.py task=pretrain data=gencode_human_12.8k
  model_name=hnet_mamba_64m_2dc max_len=12800 batch_size=32 grad_acc_steps=2 max_train_steps=5000
  eval_steps=500 num_valid_samples=512 upload_to_hf=true use_wandb=true wandb.project=DNAFM_v2
  bp_per_token=32 region_info=promoter1_cds1_utr2_exon2_intron8_nig8_dig16 alias=GeneZip-70M-reproduce
  hf_repo=jzshared/GeneZip-70M-reproduce
conjoin_test: false
# Key into `_dataset_cfg_lookup`; see `dataset` below.
data: gencode_human_12.8k
# Same value as `data`; used in `dirs.output`.
data_alias: ${data}
data_max_length: 1280000
data_root: ./data/dnalongbench
# Selects the `_dataset_cfg_lookup` entry named by `data`. Bracket syntax is
# used for the lookup; the selected key here contains a `.`.
dataset: ${_dataset_cfg_lookup[${data}]}
device: cuda
device_type: GPU
# Directory layout. Every path is built from `project_root`; `hf_cache` and
# `wandb_cache` take an environment variable first and fall back to a path
# under `project_root` when that variable is unset.
dirs:
  data_cache: ${project_root}/data_cache/
  data_storage: ${project_root}/data/
  # $HF_HOME if set, else the project-local default.
  hf_cache: ${oc.env:HF_HOME,${project_root}/temp/hf_home/}
  hydra: ${project_root}/temp/hydra/
  # Namespaced by `data_alias` and `alias`.
  output: ${project_root}/output/${data_alias}/${alias}/
  # Namespaced by the run `uid`.
  temp: ${project_root}/temp/working_dir/${uid}/
  # $WANDB_CACHE_DIR if set, else the project-local default. Referenced by
  # `wandb.dir`.
  wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}
# Assorted top-level run settings (alphabetical).
disable_triton_autotune: true
dnalongbench_repo: andyjzhao/dnalongbench
dtype: bfloat16
early_stopping_epochs: 10
# Relative cache path assembled from `subset`, `ckpt`, `modeling`, `pooling`,
# `emb_position`, `tokenizer` and `trunk_len`.
# NOTE(review): `ckpt` as set in this file contains a `/`, so the resolved path
# gains an extra directory level; `subset` is inserted verbatim (HFF here),
# whereas the `dlb_cmp_*` dataset keys are lowercase — confirm this is intended.
emb_cache: data_cache/dlb_cmp_${subset}/${ckpt}_${modeling}_${pooling}_${emb_position}_tok${tokenizer}_trunk${trunk_len}/
emb_position: before_lm_head
encode_batch_size: 40
epochs: 200
# Follows `batch_size`.
eval_batch_size: ${batch_size}
eval_steps: 500
freeze_encoder: true
grad_acc_steps: 2
# Settings for the prediction head, split into a `cnn1d` group, a `cnn2d` group
# and a few shared scalars.
# NOTE(review): presumably which group applies is chosen by the top-level
# `head_type` (2dcnn in this file) — confirm against the model code.
head:
  cnn1d:
    channels: 128
    dropout: 0.1
    enabled: true
    kernel_size: 5
    layers: 2
  cnn2d:
    # A list of three values here, unlike the scalar `cnn1d.channels`.
    channels:
    - 64
    - 64
    - 64
    dropout: 0.1
    kernel_size: 3
  dim: 128
  dist_emb_dim: 16
  symmetrize: true
# Assorted top-level run settings (alphabetical).
head_type: 2dcnn
hf_repo: jzshared/GeneZip-70M-reproduce
# $HF_USERNAME if set, otherwise null.
hf_user: ${oc.env:HF_USERNAME,null}
hnet_dtype: bfloat16
hnet_strict: false
is_distributed: true
janusdna_add_special_tokens: false
janusdna_config: null
janusdna_drop_special_tokens: false
# Taken from the preset selected by `model_name`. Each preset's `model_cfg`
# refers back to these root-level keys.
k_max_list: ${_model_lookup.${model_name}.k_max_list}
k_min_list: ${_model_lookup.${model_name}.k_min_list}
local_rank: 0
# Feeds `training.logging_steps`.
log_every: 10
# Logging options: a level name and a flag that, going by its name, mirrors
# W&B metrics to stdout (NOTE(review): confirm against the logging code).
logging:
  level: info
  log_wandb_metric_to_stdout: true
# Nested LoRA settings.
# NOTE(review): this file also defines flat top-level `lora_*` keys with
# different values (`lora_alpha: 64` vs `alpha: 16` here, `lora_r: 32` vs
# `rank: 8` here) and a top-level `use_lora: false` alongside `enabled: true`
# here. Which set the code actually reads cannot be determined from this file.
lora:
  alpha: 16
  dropout: 0.05
  enabled: true
  rank: 8
  target_modules: null
# Flat `lora_*` keys. These overlap with the nested `lora:` mapping above but
# carry different values; see `use_lora` for the flat on/off switch.
lora_alpha: 64
lora_bias: none
lora_dropout: 0.05
lora_r: 32
# Empty string, not null (the nested `lora.target_modules` is null).
lora_target_modules: ''
lora_task_type: SEQ_CLS
# Referenced by `optimizer.lr`, `training.learning_rate` and `training_alias`.
lr: 0.001
# Quoted, so this is the string '55851' rather than an integer.
master_port: '55851'
max_data_samples: null
# Follows `num_valid_samples`.
max_eval_samples: ${num_valid_samples}
# Feeds `training.max_grad_norm`.
max_grad_norm: 2.0
max_len: 12800
# Same value as `max_len`.
max_length: ${max_len}
max_test_records: null
max_train_records: null
# Feeds `training.max_train_steps` and `training_alias`.
max_train_steps: 5000
max_valid_records: null
mixed_precision: bf16
# Used in `training_alias` and `wandb.tags`.
mode: Formal
# `model` and `model_cfg` come from the preset selected by `model_name`.
model: ${_model_lookup.${model_name}.model}
model_alias: ${model.name}
model_cfg: ${_model_lookup.${model_name}.model_cfg}
# Key into `_model_lookup`.
model_name: hnet_mamba_64m_2dc
model_tag: null
modeling: identity
name: glm_stage1
num_bins: 448
num_test_samples: 0
num_train_samples: 0
num_valid_samples: 512
# Optimizer settings; the learning rate follows the top-level `lr`.
# NOTE(review): `weight_decay` is 0.0 here but `training.weight_decay` is 0.1;
# which one takes effect depends on the consuming code — confirm.
optimizer:
  lr: ${lr}
  name: adamw
  weight_decay: 0.0
# Assorted top-level run settings (alphabetical).
orca_output_dir: null
# Name prefix assembled from `ckpt`, `modeling`, `head_type` and `trunk_len`.
# NOTE(review): `ckpt` as set in this file contains a `/`, so the resolved
# prefix contains a path separator — confirm consumers handle that.
output_prefix: ${ckpt}_${modeling}_${head_type}_trunk${trunk_len}_dlb
# `pooling` and `pooling_method` are separate keys with the same value;
# `emb_cache` interpolates `pooling`.
pooling: mean
pooling_method: mean
precompute_embeddings: true
pretrained_ckpt: null
private: false
# Uses the `hydra` resolver, so resolving it needs Hydra to be initialised.
# Every entry under `dirs` is built on top of this value.
project_root: ${hydra:runtime.cwd}
rank: 0
reference_loss: null
# Underscore-separated tokens of the form <region-name><number>; the region
# names are the same seven that key `region_label_map`.
# NOTE(review): the meaning of the numbers is not visible in this file.
region_info: promoter1_cds1_utr2_exon2_intron8_nig8_dig16
# Maps each region name to a distinct integer id in 0..6. Keys are listed
# alphabetically, not in id order; by id: promoter=0, cds=1, utr=2, exon=3,
# intron=4, nig=5, dig=6. The same seven names appear in `region_info`.
region_label_map:
  cds: 1
  dig: 6
  exon: 3
  intron: 4
  nig: 5
  promoter: 0
  utr: 2
# Assorted top-level run settings (alphabetical).
routing_step: null
save_dir: ./temp/dnalongbench_ckpt
# Follows `save_steps`.
save_every: ${save_steps}
# Also feeds `training.save_steps`.
save_steps: 3000
seed: 0
# Resolves to the `type` field of the selected dataset entry.
source: ${dataset.type}
# Also feeds `tokenizer_strict`.
strict: false
# NOTE(review): matches a key of `_dlb_cmp_data`; presumably looked up there by
# project code — confirm.
subset: HFF
task: pretrain
task_name: eqtl_prediction
test_load: auto
tokenizer: fast
tokenizer_dtype: ${dtype}
tokenizer_strict: ${strict}
# Distinct from `max_train_steps` (5000 in this file); `training` interpolates
# `max_train_steps`, and nothing in this file references `train_steps`.
train_steps: 9999999
# Trainer arguments. Most tunables are interpolated from top-level keys, so
# command-line overrides of those keys propagate here; the rest are literals.
# NOTE(review): the key names resemble Hugging Face `TrainingArguments` fields,
# plus some project-specific ones (`hnet_*`, `use_lr_multiplier`, `overrides`)
# — confirm against the trainer code.
training:
  adam_beta1: 0.9
  adam_beta2: 0.95
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 1
  disable_tqdm: false
  do_train: true
  eval_steps: ${eval_steps}
  eval_strategy: steps
  gradient_accumulation_steps: ${grad_acc_steps}
  gradient_checkpointing: false
  group_by_length: false
  hnet_initializer_range: 0.02
  hnet_lr_multiplier: null
  label_names:
  - input_ids
  learning_rate: ${lr}
  logging_steps: ${log_every}
  lr_scheduler_type: linear
  max_grad_norm: ${max_grad_norm}
  max_train_steps: ${max_train_steps}
  num_train_epochs: ${epochs}
  # Empty mapping (not null).
  overrides: {}
  per_device_eval_batch_size: ${eval_batch_size}
  per_device_train_batch_size: ${batch_size}
  remove_unused_columns: false
  report_to: null
  save_steps: ${save_steps}
  save_strategy: steps
  use_lr_multiplier: true
  warmup_steps: 500
  # Literal value, not tied to `optimizer.weight_decay` (0.0 in this file).
  weight_decay: 0.1
# Descriptive run label assembled from `mode`, `region_info`, `bp_per_token`,
# `alpha`, `lr`, `epochs`, `max_train_steps` and `max_len`.
training_alias: ${mode}_glm_s1_${region_info}_bp${bp_per_token}_a${alpha}_lr${lr}_e${epochs}_ms${max_train_steps}_maxlen${max_len}
# Used in `emb_cache` and `output_prefix`; equal to `max_len` in this file.
trunk_len: 12800
# Run identifier; same value as `wandb.id`, and used in `dirs.temp`.
uid: jkq9sddw
upload_to_hf: true
upper_offset: 2
use_cache: true
# Flat LoRA switch; the nested `lora.enabled` is true in this file.
use_lora: false
use_routing_ceiling: false
use_routing_floor: true
use_wandb: true
valid_test_downsample: null
validate_spans: false
version: release
# Weights & Biases run settings. `id` equals the top-level `uid`, `name` equals
# the top-level `alias`, and the trailing path segment of `url` is the same id.
wandb:
  dir: ${dirs.wandb_cache}
  # $WANDB_ENTITY if set, otherwise null.
  entity: ${oc.env:WANDB_ENTITY,null}
  id: jkq9sddw
  mode: online
  name: GeneZip-70M-reproduce
  project: DNAFM_v2
  step_metric: null
  # Tags follow the top-level `task` and `mode` values.
  tags:
  - ${task}
  - ${mode}
  url: https://wandb.ai/jzshared/DNAFM_v2/runs/jkq9sddw
# Recorded alongside `rank: 0`, `local_rank: 0` and `is_distributed: true`.
# NOTE(review): presumably the number of distributed processes — confirm.
world_size: 4