---
# Run configuration snapshot. Values of the form ${...} are interpolations
# (references to other keys in this file, or resolver calls such as
# ${oc.env:...} and ${hydra:...}) and are left unresolved here.
#
# NOTE(review): the block structure below was rebuilt from a copy of this file
# whose line breaks and indentation had been lost. Nesting was inferred from
# the alphabetical key order at each level and from the interpolation paths
# used in this file -- confirm against the original dump if it is available.

RC_augmentation: false

# Per-dataset settings, keyed by dataset name; selected via `dataset` below.
# The gencode keys contain a dot, which is why `dataset` uses bracket lookup.
_dataset_cfg_lookup:
  dlb_cmp_gm12878:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_gm12878
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_gm12878
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_h1hesc:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_h1hesc
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_h1hesc
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_hct116:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_hct116
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hct116
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_hff:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_hff
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_hff
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  dlb_cmp_imr90:
    cache_hbins: true
    eval_split: validation
    hf_path: andyjzhao/dlb_cmp_imr90
    label_key: label_ut
    mask_key: mask_ut
    num_workers: 0
    path: data/dlb_cmp_imr90
    pin_memory: true
    reference_id: hg38
    sequence_format: string
    sequence_key: sequence
    shuffle: true
    test_split: test
    train_split: train
    type: cmp_seq
    use_hbins: true
  gencode_human_12.8k:
    hf_path: andyjzhao/gencode_human_12.8k
    path: data/gencode_human_12.8k
    type: refseq
  gencode_human_128k:
    hf_path: andyjzhao/gencode_human_128k
    path: data/gencode_human_128k
    type: refseq

# Maps a subset label to a key of _dataset_cfg_lookup.
_dlb_cmp_data:
  GM12878: dlb_cmp_gm12878
  H1hESC: dlb_cmp_h1hesc
  HCT116: dlb_cmp_hct116
  HFF: dlb_cmp_hff
  IMR90: dlb_cmp_imr90

# Per-model presets; the entry named by `model_name` feeds the top-level
# `model`, `model_cfg`, `k_max_list` and `k_min_list` keys.
_model_lookup:
  hnet_mamba_64m_2dc:
    k_max_list:
      - 0
      - 0
    k_min_list:
      - 8
      - 8
    model:
      arch: hnet
      name: hnet_mamba_64m_2dc
    model_cfg:
      # Nested list: [m2, [m2, [m15], m2], m2]
      arch_layout:
        - m2
        - - m2
          - - m15
          - m2
        - m2
      attn_cfg:
        num_heads:
          - 8
          - 8
          - 12
        rotary_emb_dim:
          - 16
          - 16
          - 24
        window_size:
          - 511
          - 511
          - -1
      d_intermediate:
        - 0
        - 0
        - 2048
      d_model:
        - 512
        - 512
        - 768
      k_max_list: ${k_max_list}
      k_min_list: ${k_min_list}
      ssm_cfg:
        chunk_size: 256
        d_conv: 4
        d_state: 64
        expand: 2
        head_dim: 64
      tie_embeddings: true
      vocab_size: 12
  hnet_mamba_650m_2dc:
    k_max_list:
      - 0
      - 0
    k_min_list:
      - 8
      - 8
    model:
      arch: hnet
      name: hnet_mamba_650m_2dc
    model_cfg:
      # Nested list: [m2, [m2, [m40], m2], m2]
      arch_layout:
        - m2
        - - m2
          - - m40
          - m2
        - m2
      attn_cfg:
        num_heads:
          - 8
          - 8
          - 16
        rotary_emb_dim:
          - 32
          - 32
          - 48
        window_size:
          - 511
          - 511
          - -1
      d_intermediate:
        - 0
        - 0
        - 4096
      d_model:
        - 1024
        - 1024
        - 1536
      k_max_list: ${k_max_list}
      k_min_list: ${k_min_list}
      ssm_cfg:
        chunk_size: 256
        d_conv: 4
        d_state: 64
        expand: 2
        head_dim: 64
      tie_embeddings: true
      vocab_size: 12

# NOTE(review): presumably the keys (exact names, suffixes, prefixes) that the
# consumer treats as non-essential, e.g. when comparing runs -- confirm.
_unimportant_cfg:
  fields:
    - gpus
    - debug
    - wandb
    - env
    - uid
    - local_rank
    - is_distributed
    - master_port
    - device_type
    - cluster
    - world_size
    - train_dataset
    - eval_datasets
    - user_cfg
    - rank
    - device
    - hf_access_token
    - hf_private
    - hf_repo
    - hf_user
    - hf_token
    - save_every
    - eval_steps
    - save_steps
    - upload_to_hf
    - logging
    - log_every
    - use_wandb
    - project_root
    - version
  postfix:
    - _path
    - _file
    - _dir
    - _alias
    - _prefix
  prefix:
    - _

alias: GeneZip-70M-reproduce
alpha: 0.03
arch: ${model.arch}
batch_size: 32
bin_size: 2048
bp_per_token: 32
cell_type: Artery_Tibial
ckpt: andyjzhao/GeneZip-70M-12.8K
cleanup_checkpoints: true
cluster: mila
# Command line recorded with this run; folded into one space-separated string.
cmd: >-
  python src/scripts/pretrain_genezip.py
  task=pretrain
  data=gencode_human_12.8k
  model_name=hnet_mamba_64m_2dc
  max_len=12800
  batch_size=32
  grad_acc_steps=2
  max_train_steps=5000
  eval_steps=500
  num_valid_samples=512
  upload_to_hf=true
  use_wandb=true
  wandb.project=DNAFM_v2
  bp_per_token=32
  region_info=promoter1_cds1_utr2_exon2_intron8_nig8_dig16
  alias=GeneZip-70M-reproduce
  hf_repo=jzshared/GeneZip-70M-reproduce
conjoin_test: false
data: gencode_human_12.8k
data_alias: ${data}
data_max_length: 1280000
data_root: ./data/dnalongbench
dataset: ${_dataset_cfg_lookup[${data}]}
device: cuda
device_type: GPU

dirs:
  data_cache: ${project_root}/data_cache/
  data_storage: ${project_root}/data/
  hf_cache: ${oc.env:HF_HOME,${project_root}/temp/hf_home/}
  hydra: ${project_root}/temp/hydra/
  output: ${project_root}/output/${data_alias}/${alias}/
  temp: ${project_root}/temp/working_dir/${uid}/
  wandb_cache: ${oc.env:WANDB_CACHE_DIR,${project_root}/temp/wandb_cache/}

disable_triton_autotune: true
dnalongbench_repo: andyjzhao/dnalongbench
dtype: bfloat16
early_stopping_epochs: 10
emb_cache: data_cache/dlb_cmp_${subset}/${ckpt}_${modeling}_${pooling}_${emb_position}_tok${tokenizer}_trunk${trunk_len}/
emb_position: before_lm_head
encode_batch_size: 40
epochs: 200
eval_batch_size: ${batch_size}
eval_steps: 500
freeze_encoder: true
grad_acc_steps: 2

head:
  cnn1d:
    channels: 128
    dropout: 0.1
    enabled: true
    kernel_size: 5
    layers: 2
  cnn2d:
    channels:
      - 64
      - 64
      - 64
    dropout: 0.1
    kernel_size: 3
  dim: 128
  dist_emb_dim: 16
  symmetrize: true

head_type: 2dcnn
hf_repo: jzshared/GeneZip-70M-reproduce
hf_user: ${oc.env:HF_USERNAME,null}
hnet_dtype: bfloat16
hnet_strict: false
is_distributed: true
janusdna_add_special_tokens: false
janusdna_config: null
janusdna_drop_special_tokens: false
k_max_list: ${_model_lookup.${model_name}.k_max_list}
k_min_list: ${_model_lookup.${model_name}.k_min_list}
local_rank: 0
log_every: 10

logging:
  level: info
  log_wandb_metric_to_stdout: true

lora:
  alpha: 16
  dropout: 0.05
  enabled: true
  rank: 8
  target_modules: null

lora_alpha: 64
# Plain scalar `none` is the string "none", not a YAML null.
lora_bias: none
lora_dropout: 0.05
lora_r: 32
lora_target_modules: ''
lora_task_type: SEQ_CLS
lr: 0.001
# Quoted so the port stays a string rather than an integer.
master_port: '55851'
max_data_samples: null
max_eval_samples: ${num_valid_samples}
max_grad_norm: 2.0
max_len: 12800
max_length: ${max_len}
max_test_records: null
max_train_records: null
max_train_steps: 5000
max_valid_records: null
mixed_precision: bf16
mode: Formal
model: ${_model_lookup.${model_name}.model}
model_alias: ${model.name}
model_cfg: ${_model_lookup.${model_name}.model_cfg}
model_name: hnet_mamba_64m_2dc
model_tag: null
modeling: identity
name: glm_stage1
num_bins: 448
num_test_samples: 0
num_train_samples: 0
num_valid_samples: 512

optimizer:
  lr: ${lr}
  name: adamw
  weight_decay: 0.0

orca_output_dir: null
output_prefix: ${ckpt}_${modeling}_${head_type}_trunk${trunk_len}_dlb
pooling: mean
pooling_method: mean
precompute_embeddings: true
pretrained_ckpt: null
private: false
project_root: ${hydra:runtime.cwd}
rank: 0
reference_loss: null
region_info: promoter1_cds1_utr2_exon2_intron8_nig8_dig16

region_label_map:
  cds: 1
  dig: 6
  exon: 3
  intron: 4
  nig: 5
  promoter: 0
  utr: 2

routing_step: null
save_dir: ./temp/dnalongbench_ckpt
save_every: ${save_steps}
save_steps: 3000
seed: 0
source: ${dataset.type}
strict: false
subset: HFF
task: pretrain
task_name: eqtl_prediction
test_load: auto
tokenizer: fast
tokenizer_dtype: ${dtype}
tokenizer_strict: ${strict}
train_steps: 9999999

training:
  adam_beta1: 0.9
  adam_beta2: 0.95
  bf16: true
  dataloader_drop_last: true
  dataloader_num_workers: 1
  disable_tqdm: false
  do_train: true
  eval_steps: ${eval_steps}
  eval_strategy: steps
  gradient_accumulation_steps: ${grad_acc_steps}
  gradient_checkpointing: false
  group_by_length: false
  hnet_initializer_range: 0.02
  hnet_lr_multiplier: null
  label_names:
    - input_ids
  learning_rate: ${lr}
  logging_steps: ${log_every}
  lr_scheduler_type: linear
  max_grad_norm: ${max_grad_norm}
  max_train_steps: ${max_train_steps}
  num_train_epochs: ${epochs}
  overrides: {}
  per_device_eval_batch_size: ${eval_batch_size}
  per_device_train_batch_size: ${batch_size}
  remove_unused_columns: false
  report_to: null
  save_steps: ${save_steps}
  save_strategy: steps
  use_lr_multiplier: true
  warmup_steps: 500
  weight_decay: 0.1

training_alias: >-
  ${mode}_glm_s1_${region_info}_bp${bp_per_token}_a${alpha}_lr${lr}_e${epochs}_ms${max_train_steps}_maxlen${max_len}
trunk_len: 12800
uid: jkq9sddw
upload_to_hf: true
upper_offset: 2
use_cache: true
use_lora: false
use_routing_ceiling: false
use_routing_floor: true
use_wandb: true
valid_test_downsample: null
validate_spans: false
version: release

wandb:
  dir: ${dirs.wandb_cache}
  entity: ${oc.env:WANDB_ENTITY,null}
  id: jkq9sddw
  mode: online
  name: GeneZip-70M-reproduce
  project: DNAFM_v2
  step_metric: null
  tags:
    - ${task}
    - ${mode}
  url: https://wandb.ai/jzshared/DNAFM_v2/runs/jkq9sddw

world_size: 4