Tharun156 committed
Commit f7400bf · verified · 1 parent: 859f17b

Upload 149 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +5 -0
  2. configs/beat2_rvqvae.yaml +134 -0
  3. configs/diffuser_rvqvae_128.yaml +96 -0
  4. configs/model_config.yaml +71 -0
  5. configs/sc_model_config.yaml +37 -0
  6. configs/sc_model_holistic_config.yaml +37 -0
  7. configs/sc_reflow_model_config.yaml +37 -0
  8. configs/shortcut.yaml +96 -0
  9. configs/shortcut_hf.yaml +96 -0
  10. configs/shortcut_holistic.yaml +96 -0
  11. configs/shortcut_reflow.yaml +96 -0
  12. configs/shortcut_reflow_test.yaml +96 -0
  13. configs/shortcut_rvqvae_128.yaml +96 -0
  14. configs/shortcut_rvqvae_128_hf.yaml +96 -0
  15. dataloaders/__pycache__/beat_sep_single.cpython-312.pyc +0 -0
  16. dataloaders/__pycache__/build_vocab.cpython-312.pyc +0 -0
  17. dataloaders/__pycache__/data_tools.cpython-312.pyc +0 -0
  18. dataloaders/beat_dataset_new.py +373 -0
  19. dataloaders/beat_sep.py +772 -0
  20. dataloaders/beat_sep_lower.py +430 -0
  21. dataloaders/beat_sep_single.py +693 -0
  22. dataloaders/beat_smplx2020.py +763 -0
  23. dataloaders/build_vocab.py +199 -0
  24. dataloaders/data_tools.py +1756 -0
  25. dataloaders/mix_sep.py +301 -0
  26. dataloaders/pymo/Quaternions.py +468 -0
  27. dataloaders/pymo/__init__.py +0 -0
  28. dataloaders/pymo/__pycache__/Quaternions.cpython-312.pyc +0 -0
  29. dataloaders/pymo/__pycache__/__init__.cpython-312.pyc +0 -0
  30. dataloaders/pymo/__pycache__/data.cpython-312.pyc +0 -0
  31. dataloaders/pymo/__pycache__/parsers.cpython-312.pyc +0 -0
  32. dataloaders/pymo/__pycache__/preprocessing.cpython-312.pyc +0 -0
  33. dataloaders/pymo/__pycache__/rotation_tools.cpython-312.pyc +0 -0
  34. dataloaders/pymo/__pycache__/viz_tools.cpython-312.pyc +0 -0
  35. dataloaders/pymo/data.py +53 -0
  36. dataloaders/pymo/features.py +43 -0
  37. dataloaders/pymo/parsers.py +274 -0
  38. dataloaders/pymo/preprocessing.py +726 -0
  39. dataloaders/pymo/rotation_tools.py +153 -0
  40. dataloaders/pymo/rotation_tools.py! +69 -0
  41. dataloaders/pymo/viz_tools.py +236 -0
  42. dataloaders/pymo/writers.py +55 -0
  43. dataloaders/utils/__pycache__/audio_features.cpython-312.pyc +0 -0
  44. dataloaders/utils/__pycache__/other_tools.cpython-312.pyc +0 -0
  45. dataloaders/utils/__pycache__/rotation_conversions.cpython-312.pyc +0 -0
  46. dataloaders/utils/audio_features.py +80 -0
  47. dataloaders/utils/data_sample.py +175 -0
  48. dataloaders/utils/mis_features.py +64 -0
  49. dataloaders/utils/motion_rep_transfer.py +236 -0
  50. dataloaders/utils/other_tools.py +748 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ demo/examples/2_scott_0_1_1.wav filter=lfs diff=lfs merge=lfs -text
37
+ demo/examples/2_scott_0_2_2.wav filter=lfs diff=lfs merge=lfs -text
38
+ demo/examples/2_scott_0_3_3.wav filter=lfs diff=lfs merge=lfs -text
39
+ demo/examples/2_scott_0_4_4.wav filter=lfs diff=lfs merge=lfs -text
40
+ demo/examples/2_scott_0_5_5.wav filter=lfs diff=lfs merge=lfs -text
configs/beat2_rvqvae.yaml ADDED
@@ -0,0 +1,134 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
8
+ e_path: weights/AESKConv_240_100.bin
9
+ eval_model: motion_representation
10
+ e_name: VAESKConv
11
+ test_ckpt: ./outputs/audio2pose/custom/0112_001634_emage/last_500.bin
12
+ data_path_1: ./datasets/hub/
13
+
14
+ vae_test_len: 32
15
+ vae_test_dim: 330
16
+ vae_test_stride: 20
17
+ vae_length: 240
18
+ vae_codebook_size: 256
19
+ vae_layer: 4
20
+ vae_grow: [1,1,2,1]
21
+ variational: False
22
+
23
+ # data config
24
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] #[2]
25
+ additional_data: False
26
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_rvqvae/
27
+ dataset: mix_sep
28
+ new_cache: True
29
+ use_amass: False
30
+ # motion config
31
+ ori_joints: beat_smplx_joints
32
+ tar_joints: beat_smplx_full
33
+ pose_rep: smplxflame_30
34
+ pose_norm: False
35
+ pose_fps: 30
36
+ rot6d: True
37
+ pre_frames: 4
38
+ pose_dims: 330
39
+ pose_length: 64
40
+ stride: 20
41
+ test_length: 64
42
+ motion_f: 256
43
+ m_pre_encoder: null
44
+ m_encoder: null
45
+ m_fix_pre: False
46
+
47
+ # audio config
48
+ audio_rep: onset+amplitude
49
+ audio_sr: 16000
50
+ audio_fps: 16000
51
+ audio_norm: False
52
+ audio_f: 256
53
+ # a_pre_encoder: tcn_camn
54
+ # a_encoder: none
55
+ # a_fix_pre: False
56
+
57
+ # text config
58
+ word_rep: textgrid
59
+ word_index_num: 11195
60
+ word_dims: 300
61
+ freeze_wordembed: False
62
+ word_f: 256
63
+ t_pre_encoder: fasttext
64
+ t_encoder: null
65
+ t_fix_pre: False
66
+
67
+ # facial config
68
+ facial_rep: smplxflame_30
69
+ facial_dims: 100
70
+ facial_norm: False
71
+ facial_f: 0
72
+ f_pre_encoder: null
73
+ f_encoder: null
74
+ f_fix_pre: False
75
+
76
+ # speaker config
77
+ id_rep: onehot
78
+ speaker_f: 0
79
+
80
+ # model config
81
+ batch_size: 80 #80
82
+ # warmup_epochs: 1
83
+ # warmup_lr: 1e-6
84
+ lr_base: 4e-4
85
+ model: motion_representation
86
+ g_name: VQVAEConvZero
87
+ trainer: ae_total
88
+ hidden_size: 768
89
+ n_layer: 1
90
+
91
+ rec_weight: 1
92
+ grad_norm: 0.99
93
+ epochs: 200
94
+ test_period: 20
95
+ ll: 3
96
+ lf: 3
97
+ lu: 3
98
+ lh: 3
99
+ cl: 1
100
+ cf: 0
101
+ cu: 1
102
+ ch: 1
103
+
104
+
105
+
106
+ #below is vavae config, copy from QPGESTURE
107
+ #Codebook Configs
108
+ levels: 1
109
+ downs_t: [3]
110
+ strides_t : [2]
111
+ emb_width : 512
112
+ l_bins : 512
113
+ l_mu : 0.99
114
+ commit : 0.1
115
+ hvqvae_multipliers : [1]
116
+ width: 512
117
+ depth: 3
118
+ m_conv : 1.0
119
+ dilation_growth_rate : 3
120
+ sample_length: 80
121
+ use_bottleneck: True
122
+ joint_channel: 6
123
+ # depth: 3
124
+ # width: 128
125
+ # m_conv: 1.0
126
+ # dilation_growth_rate: 1
127
+ # dilation_cycle: None
128
+ vel: 1 # 1 -> 0
129
+ acc: 1 # 1 -> 0
130
+ vqvae_reverse_decoder_dilation: True
131
+
132
+
133
+ ## below is special for emage
134
+ rec_pos_weight : 1.0
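Note: the config above is a flat key/value file, so it can be read straight into an attribute-style namespace. A minimal sketch of that (assuming PyYAML is available; the helper name below is illustrative and not part of this commit):

import yaml
from types import SimpleNamespace

def load_config(path):
    # yaml.safe_load returns a plain dict of the keys shown above
    with open(path, "r") as f:
        cfg = yaml.safe_load(f)
    return SimpleNamespace(**cfg)   # values become attributes, e.g. args.pose_fps

args = load_config("configs/beat2_rvqvae.yaml")
print(args.pose_rep, args.pose_fps, args.vae_codebook_size)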
configs/diffuser_rvqvae_128.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_diffusion.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_lower
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: diffuser_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
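Note: with pose_norm: True, the mean/std .npy files referenced above are presumably used to standardize the 330-dim pose vectors before training and to undo the scaling at inference. A rough sketch of that convention (the actual call sites live in the trainer, which is not among the files shown; shapes are assumed to match pose_dims):

import numpy as np

mean_pose = np.load("./mean_std/beatx_2_330_mean.npy")  # assumed shape (330,)
std_pose = np.load("./mean_std/beatx_2_330_std.npy")    # assumed shape (330,)

def normalize(pose):
    # pose: (T, 330) array of rotation features
    return (pose - mean_pose) / (std_pose + 1e-8)

def denormalize(pose_norm):
    return pose_norm * (std_pose + 1e-8) + mean_pose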
configs/model_config.yaml ADDED
@@ -0,0 +1,71 @@
1
+ model:
2
+ model_name: GestureDiffuse
3
+ g_name: GestureDiffusion
4
+ do_classifier_free_guidance: False
5
+ guidance_scale: 1.5
6
+
7
+ denoiser:
8
+ target: models.denoiser.GestureDenoiser
9
+ params:
10
+ input_dim: 128
11
+ latent_dim: 256
12
+ ff_size: 1024
13
+ num_layers: 8
14
+ num_heads: 4
15
+ dropout: 0.1
16
+ activation: "gelu"
17
+ n_seed: 8
18
+ flip_sin_to_cos: True
19
+ freq_shift: 0.0
20
+
21
+
22
+
23
+ modality_encoder:
24
+ target: models.modality_encoder.ModalityEncoder
25
+ params:
26
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
27
+ t_fix_pre: False
28
+ audio_dim: 256
29
+ audio_in: 2
30
+ raw_audio: False
31
+ latent_dim: 256
32
+ audio_fps: 30
33
+
34
+
35
+ scheduler:
36
+ target: diffusers.DDIMScheduler
37
+ num_inference_steps: 20
38
+ eta: 0.0
39
+ params:
40
+ num_train_timesteps: 1000
41
+ # with 'linear' or 'scaled_linear', beta_start and beta_end matter; with the cosine schedule ('squaredcos_cap_v2') they are ignored
42
+ beta_start: 0.00085
43
+ beta_end: 0.012
44
+ # 'linear' or 'squaredcos_cap_v2' or 'scaled_linear'
45
+ beta_schedule: 'squaredcos_cap_v2'
46
+ prediction_type: 'sample'
47
+ clip_sample: false
48
+ # 'leading' or 'trailing' or 'linspace'
49
+ timestep_spacing: 'leading'
50
+ # below are for ddim
51
+ set_alpha_to_one: True
52
+ steps_offset: 0
53
+
54
+
55
+ # use ddpm scheduler
56
+ # scheduler:
57
+ # target: diffusers.DDPMScheduler
58
+ # num_inference_steps: 50
59
+ # eta: 0.0
60
+ # params:
61
+ # num_train_timesteps: 1000
62
+ # beta_start: 0.00085
63
+ # beta_end: 0.012
64
+ # beta_schedule: 'squaredcos_cap_v2' # 'squaredcos_cap_v2'
65
+ # prediction_type: 'sample'
66
+ # clip_sample: false
67
+ # variance_type: 'fixed_small_log'
68
+ # # below are for ddim
69
+ # # set_alpha_to_one: True
70
+ # # steps_offset: 1
71
+
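Note: the scheduler block above maps onto the constructor of diffusers.DDIMScheduler; num_inference_steps and eta are consumed at sampling time rather than at construction. A minimal sketch of how such a block is typically instantiated (assuming the diffusers package is installed; the wiring is illustrative, not the repo's own loader):

from diffusers import DDIMScheduler

scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="squaredcos_cap_v2",  # beta_start/beta_end are ignored for this schedule
    prediction_type="sample",
    clip_sample=False,
    timestep_spacing="leading",
    set_alpha_to_one=True,
    steps_offset=0,
)
scheduler.set_timesteps(20)  # num_inference_steps from the config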
configs/sc_model_config.yaml ADDED
@@ -0,0 +1,37 @@
1
+ model:
2
+ model_name: LSM
3
+ g_name: GestureLSM
4
+ do_classifier_free_guidance: False
5
+ guidance_scale: 2
6
+ n_steps: 20
7
+ use_exp: False
8
+
9
+ denoiser:
10
+ target: models.denoiser.GestureDenoiser
11
+ params:
12
+ input_dim: 128
13
+ latent_dim: 256
14
+ ff_size: 1024
15
+ num_layers: 8
16
+ num_heads: 4
17
+ dropout: 0.1
18
+ activation: "gelu"
19
+ n_seed: 8
20
+ flip_sin_to_cos: True
21
+ freq_shift: 0.0
22
+ cond_proj_dim: 256
23
+ use_exp: ${model.use_exp}
24
+
25
+
26
+ modality_encoder:
27
+ target: models.modality_encoder.ModalityEncoder
28
+ params:
29
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
30
+ t_fix_pre: False
31
+ audio_dim: 256
32
+ audio_in: 2
33
+ raw_audio: False
34
+ latent_dim: 256
35
+ audio_fps: 30
36
+ use_exp: ${model.use_exp}
37
+
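Note: the ${model.use_exp} entries are OmegaConf-style interpolations, so both sub-configs inherit the top-level flag. A minimal sketch of how they resolve (whether this repo actually reads the file with OmegaConf is an assumption):

from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/sc_model_config.yaml")
print(cfg.model.denoiser.params.use_exp)          # False, copied from model.use_exp
print(cfg.model.modality_encoder.params.use_exp)  # False as well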
configs/sc_model_holistic_config.yaml ADDED
@@ -0,0 +1,37 @@
1
+ model:
2
+ model_name: LSM
3
+ g_name: GestureLSM
4
+ do_classifier_free_guidance: False
5
+ guidance_scale: 2
6
+ n_steps: 25
7
+ use_exp: True
8
+
9
+ denoiser:
10
+ target: models.denoiser.GestureDenoiser
11
+ params:
12
+ input_dim: 128
13
+ latent_dim: 256
14
+ ff_size: 1024
15
+ num_layers: 8
16
+ num_heads: 4
17
+ dropout: 0.1
18
+ activation: "gelu"
19
+ n_seed: 8
20
+ flip_sin_to_cos: True
21
+ freq_shift: 0.0
22
+ cond_proj_dim: 256
23
+ use_exp: ${model.use_exp}
24
+
25
+
26
+ modality_encoder:
27
+ target: models.modality_encoder.ModalityEncoder
28
+ params:
29
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
30
+ t_fix_pre: False
31
+ audio_dim: 256
32
+ audio_in: 2
33
+ raw_audio: False
34
+ latent_dim: 256
35
+ audio_fps: 30
36
+ use_exp: ${model.use_exp}
37
+
configs/sc_reflow_model_config.yaml ADDED
@@ -0,0 +1,37 @@
1
+ model:
2
+ model_name: LSM
3
+ g_name: GestureLSM
4
+ do_classifier_free_guidance: False
5
+ guidance_scale: 2
6
+ n_steps: 2
7
+ use_exp: False
8
+
9
+ denoiser:
10
+ target: models.denoiser.GestureDenoiser
11
+ params:
12
+ input_dim: 128
13
+ latent_dim: 256
14
+ ff_size: 1024
15
+ num_layers: 8
16
+ num_heads: 4
17
+ dropout: 0.1
18
+ activation: "gelu"
19
+ n_seed: 8
20
+ flip_sin_to_cos: True
21
+ freq_shift: 0.0
22
+ cond_proj_dim: 256
23
+ use_exp: ${model.use_exp}
24
+
25
+
26
+ modality_encoder:
27
+ target: models.modality_encoder.ModalityEncoder
28
+ params:
29
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
30
+ t_fix_pre: False
31
+ audio_dim: 256
32
+ audio_in: 2
33
+ raw_audio: False
34
+ latent_dim: 256
35
+ audio_fps: 30
36
+ use_exp: ${model.use_exp}
37
+
configs/shortcut.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_shortcut.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+ vqvae_face_path: ./ckpt/net_300000_face.pth
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_lower
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
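Note: this training config points at configs/sc_model_config.yaml via cfg:, whose target:/params: entries follow the common instantiate-from-string pattern. A sketch of that pattern (the helper name is illustrative; the repo's actual builder is not among the files shown):

import importlib

def instantiate_from_config(node):
    # node is e.g. {"target": "models.denoiser.GestureDenoiser", "params": {...}}
    module_name, cls_name = node["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**node.get("params", {}))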
configs/shortcut_hf.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_shortcut.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_single
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
configs/shortcut_holistic.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_shortcut_holistic.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_holistic_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+ vqvae_face_path: ./ckpt/net_300000_face.pth
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_lower
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
configs/shortcut_reflow.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./outputs/audio2pose/custom/0212_125039_shortcut_reflow/last_20.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+ vqvae_face_path: ./ckpt/net_300000_face.pth
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_reflow
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 1
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
configs/shortcut_reflow_test.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/shortcut_reflow.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_reflow_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+ vqvae_face_path: ./ckpt/net_300000_face.pth
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_lower
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 1
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
configs/shortcut_rvqvae_128.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_shortcut.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_lower
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
configs/shortcut_rvqvae_128_hf.yaml ADDED
@@ -0,0 +1,96 @@
1
+ is_train: True
2
+ ddp: False
3
+ stat: ts
4
+ root_path: ./
5
+ out_path: ./outputs/audio2pose/
6
+ project: s2g
7
+ e_path: weights/AESKConv_240_100.bin
8
+ eval_model: motion_representation
9
+ e_name: VAESKConv
10
+ data_path: ./datasets/BEAT_SMPL/beat_v2.0.0/beat_english_v2.0.0/
11
+ test_ckpt: ./ckpt/new_540_shortcut.bin
12
+ data_path_1: ./datasets/hub/
13
+ pose_norm: True
14
+ cfg: configs/sc_model_config.yaml
15
+
16
+
17
+ mean_pose_path: ./mean_std/beatx_2_330_mean.npy
18
+ std_pose_path: ./mean_std/beatx_2_330_std.npy
19
+
20
+ mean_trans_path: ./mean_std/beatx_2_trans_mean.npy
21
+ std_trans_path: ./mean_std/beatx_2_trans_std.npy
22
+
23
+
24
+ vqvae_upper_path: ./ckpt/net_300000_upper.pth
25
+ vqvae_hands_path: ./ckpt/net_300000_hands.pth
26
+ vqvae_lower_path: ./ckpt/net_300000_lower.pth
27
+
28
+ vqvae_lower_trans_path: ./ckpt/net_300000_lower_trans.pth
29
+ use_trans: True
30
+
31
+ decay_epoch: 500
32
+
33
+ vqvae_squeeze_scale: 4
34
+ vqvae_latent_scale: 5
35
+
36
+ vae_test_len: 32
37
+ vae_test_dim: 330
38
+ vae_test_stride: 20
39
+ vae_length: 240
40
+ vae_codebook_size: 256
41
+ vae_layer: 4
42
+ vae_grow: [1,1,2,1]
43
+ variational: False
44
+
45
+ # data config
46
+ training_speakers: [2] #[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
47
+ additional_data: False
48
+ cache_path: datasets/beat_cache/beat_smplx_en_emage_2_128/
49
+ dataset: beat_sep_single
50
+ new_cache: False
51
+
52
+ # motion config
53
+ ori_joints: beat_smplx_joints
54
+ tar_joints: beat_smplx_full
55
+ pose_rep: smplxflame_30
56
+ pose_fps: 30
57
+ rot6d: True
58
+ pre_frames: 4
59
+ pose_dims: 330
60
+ pose_length: 128
61
+ stride: 20
62
+ test_length: 128
63
+ m_fix_pre: False
64
+
65
+
66
+ audio_rep: onset+amplitude
67
+ audio_sr: 16000
68
+ audio_fps: 16000
69
+ audio_norm: False
70
+ audio_f: 256
71
+ audio_raw: None
72
+
73
+
74
+ word_rep: textgrid
75
+ word_dims: 300
76
+ t_pre_encoder: fasttext
77
+
78
+
79
+ facial_rep: smplxflame_30
80
+ facial_dims: 100
81
+ facial_norm: False
82
+ facial_f: 0
83
+
84
+
85
+ id_rep: onehot
86
+ speaker_f: 0
87
+
88
+
89
+ batch_size: 128
90
+ lr_base: 2e-4
91
+ trainer: shortcut_rvqvae
92
+
93
+ rec_weight: 1
94
+ grad_norm: 0.99
95
+ epochs: 1000
96
+ test_period: 20
dataloaders/__pycache__/beat_sep_single.cpython-312.pyc ADDED
Binary file (42 kB).
 
dataloaders/__pycache__/build_vocab.cpython-312.pyc ADDED
Binary file (10.3 kB).
 
dataloaders/__pycache__/data_tools.cpython-312.pyc ADDED
Binary file (43.4 kB).
 
dataloaders/beat_dataset_new.py ADDED
@@ -0,0 +1,373 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import textgrid as tg
8
+ import pandas as pd
9
+ import torch
10
+ import glob
11
+ import json
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ import pickle
18
+ import smplx
19
+ from .utils.audio_features import AudioProcessor
20
+ from .utils.other_tools import MultiLMDBManager
21
+ from .utils.motion_rep_transfer import process_smplx_motion
22
+ from .utils.mis_features import process_semantic_data, process_emotion_data
23
+ from .utils.text_features import process_word_data
24
+ from .utils.data_sample import sample_from_clip
25
+ from .utils import rotation_conversions as rc
26
+
27
+ class CustomDataset(Dataset):
28
+ def __init__(self, args, loader_type, build_cache=True):
29
+ self.args = args
30
+ self.loader_type = loader_type
31
+ self.rank = dist.get_rank()
32
+
33
+ self.ori_stride = self.args.stride
34
+ self.ori_length = self.args.pose_length
35
+
36
+ # Initialize basic parameters
37
+ self.ori_stride = self.args.stride
38
+ self.ori_length = self.args.pose_length
39
+ self.alignment = [0,0] # for trinity
40
+
41
+ # Initialize SMPLX model
42
+ self.smplx = smplx.create(
43
+ self.args.data_path_1+"smplx_models/",
44
+ model_type='smplx',
45
+ gender='NEUTRAL_2020',
46
+ use_face_contour=False,
47
+ num_betas=300,
48
+ num_expression_coeffs=100,
49
+ ext='npz',
50
+ use_pca=False,
51
+ ).cuda().eval()
52
+
53
+ self.avg_vel = np.load(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
54
+
55
+ # Load and process split rules
56
+ self._process_split_rules()
57
+
58
+ # Initialize data directories and lengths
59
+ self._init_data_paths()
60
+
61
+ # Build or load cache
62
+ self._init_cache(build_cache)
63
+
64
+
65
+ def _process_split_rules(self):
66
+ """Process dataset split rules."""
67
+ split_rule = pd.read_csv(self.args.data_path+"train_test_split.csv")
68
+ self.selected_file = split_rule.loc[
69
+ (split_rule['type'] == self.loader_type) &
70
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
71
+ ]
72
+
73
+ if self.args.additional_data and self.loader_type == 'train':
74
+ split_b = split_rule.loc[
75
+ (split_rule['type'] == 'additional') &
76
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
77
+ ]
78
+ self.selected_file = pd.concat([self.selected_file, split_b])
79
+
80
+ if self.selected_file.empty:
81
+ logger.warning(f"{self.loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
82
+ self.selected_file = split_rule.loc[
83
+ (split_rule['type'] == 'train') &
84
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
85
+ ]
86
+ self.selected_file = self.selected_file.iloc[0:8]
87
+
88
+ def _init_data_paths(self):
89
+ """Initialize data directories and lengths."""
90
+ self.data_dir = self.args.data_path
91
+
92
+ if self.loader_type == "test":
93
+ self.args.multi_length_training = [1.0]
94
+
95
+ self.max_length = int(self.args.pose_length * self.args.multi_length_training[-1])
96
+ self.max_audio_pre_len = math.floor(self.args.pose_length / self.args.pose_fps * self.args.audio_sr)
97
+
98
+ if self.max_audio_pre_len > self.args.test_length * self.args.audio_sr:
99
+ self.max_audio_pre_len = self.args.test_length * self.args.audio_sr
100
+
101
+ if self.args.test_clip and self.loader_type == "test":
102
+ self.preloaded_dir = self.args.root_path + self.args.cache_path + self.loader_type + "_clip" + f"/{self.args.pose_rep}_cache"
103
+ else:
104
+ self.preloaded_dir = self.args.root_path + self.args.cache_path + self.loader_type + f"/{self.args.pose_rep}_cache"
105
+
106
+
107
+
108
+ def _init_cache(self, build_cache):
109
+ """Initialize or build cache."""
110
+ self.lmdb_envs = {}
111
+ self.mapping_data = None
112
+
113
+ if build_cache and self.rank == 0:
114
+ self.build_cache(self.preloaded_dir)
115
+
116
+ self.load_db_mapping()
117
+
118
+ def build_cache(self, preloaded_dir):
119
+ """Build the dataset cache."""
120
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
121
+ logger.info("Reading data '{}'...".format(self.data_dir))
122
+ logger.info("Creating the dataset cache...")
123
+
124
+ if self.args.new_cache and os.path.exists(preloaded_dir):
125
+ shutil.rmtree(preloaded_dir)
126
+
127
+ if os.path.exists(preloaded_dir):
128
+ # if the dir is empty, that means we still need to build the cache
129
+ if not os.listdir(preloaded_dir):
130
+ self.cache_generation(
131
+ preloaded_dir,
132
+ self.args.disable_filtering,
133
+ self.args.clean_first_seconds,
134
+ self.args.clean_final_seconds,
135
+ is_test=False
136
+ )
137
+ else:
138
+ logger.info("Found the cache {}".format(preloaded_dir))
139
+
140
+ elif self.loader_type == "test":
141
+ self.cache_generation(preloaded_dir, True, 0, 0, is_test=True)
142
+ else:
143
+ self.cache_generation(
144
+ preloaded_dir,
145
+ self.args.disable_filtering,
146
+ self.args.clean_first_seconds,
147
+ self.args.clean_final_seconds,
148
+ is_test=False
149
+ )
150
+
151
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
152
+ """Generate cache for the dataset."""
153
+ if not os.path.exists(out_lmdb_dir):
154
+ os.makedirs(out_lmdb_dir)
155
+
156
+ self.audio_processor = AudioProcessor(layer=self.args.n_layer, use_distill=self.args.use_distill)
157
+
158
+ # Initialize the multi-LMDB manager
159
+ lmdb_manager = MultiLMDBManager(out_lmdb_dir, max_db_size=10*1024*1024*1024)
160
+
161
+ self.n_out_samples = 0
162
+ n_filtered_out = defaultdict(int)
163
+
164
+ for index, file_name in self.selected_file.iterrows():
165
+ f_name = file_name["id"]
166
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
167
+ pose_file = os.path.join(self.data_dir, self.args.pose_rep, f_name + ext)
168
+
169
+ # Process data
170
+ data = self._process_file_data(f_name, pose_file, ext)
171
+ if data is None:
172
+ continue
173
+
174
+ # Sample from clip
175
+ filtered_result, self.n_out_samples = sample_from_clip(
176
+ lmdb_manager=lmdb_manager,
177
+ audio_file=pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav"),
178
+ audio_each_file=data['audio_tensor'],
179
+ high_each_file=data['high_level'],
180
+ low_each_file=data['low_level'],
181
+ pose_each_file=data['pose'],
182
+ rep15d_each_file=data['rep15d'],
183
+ trans_each_file=data['trans'],
184
+ trans_v_each_file=data['trans_v'],
185
+ shape_each_file=data['shape'],
186
+ facial_each_file=data['facial'],
187
+ aligned_text_each_file=data['aligned_text'],
188
+ word_each_file=data['word'] if self.args.word_rep is not None else None,
189
+ vid_each_file=data['vid'],
190
+ emo_each_file=data['emo'],
191
+ sem_each_file=data['sem'],
192
+ intention_each_file=data['intention'] if data['intention'] is not None else None,
193
+ audio_onset_each_file=data['audio_onset'] if self.args.onset_rep else None,
194
+ args=self.args,
195
+ ori_stride=self.ori_stride,
196
+ ori_length=self.ori_length,
197
+ disable_filtering=disable_filtering,
198
+ clean_first_seconds=clean_first_seconds,
199
+ clean_final_seconds=clean_final_seconds,
200
+ is_test=is_test,
201
+ n_out_samples=self.n_out_samples
202
+ )
203
+
204
+ for type_key in filtered_result:
205
+ n_filtered_out[type_key] += filtered_result[type_key]
206
+
207
+ lmdb_manager.close()
208
+
209
+ def _process_file_data(self, f_name, pose_file, ext):
210
+ """Process all data for a single file."""
211
+ data = {
212
+ 'pose': None, 'trans': None, 'trans_v': None, 'shape': None,
213
+ 'audio': None, 'facial': None, 'word': None, 'emo': None,
214
+ 'sem': None, 'vid': None
215
+ }
216
+
217
+ # Process motion data
218
+ logger.info(colored(f"# ---- Building cache for Pose {f_name} ---- #", "blue"))
219
+ if "smplx" in self.args.pose_rep:
220
+ motion_data = process_smplx_motion(pose_file, self.smplx, self.args.pose_fps, self.args.facial_rep)
221
+ else:
222
+ raise ValueError(f"Unknown pose representation '{self.args.pose_rep}'.")
223
+
224
+ if motion_data is None:
225
+ return None
226
+
227
+ data.update(motion_data)
228
+
229
+ # Process speaker ID
230
+ if self.args.id_rep is not None:
231
+ speaker_id = int(f_name.split("_")[0]) - 1
232
+ data['vid'] = np.repeat(np.array(speaker_id).reshape(1, 1), data['pose'].shape[0], axis=0)
233
+ else:
234
+ data['vid'] = np.array([-1])
235
+
236
+ # Process audio if needed
237
+ if self.args.audio_rep is not None:
238
+ audio_file = pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav")
239
+ audio_data = self.audio_processor.get_wav2vec_from_16k_wav(audio_file, aligned_text=True)
240
+ if audio_data is None:
241
+ return None
242
+ data.update(audio_data)
243
+
244
+ if getattr(self.args, "onset_rep", False):
245
+ audio_file = pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav")
246
+ onset_data = self.audio_processor.calculate_onset_amplitude(audio_file, data)
247
+ if onset_data is None:
248
+ return None
249
+ data.update(onset_data)
250
+
251
+ # Process emotion if needed
252
+ if self.args.emo_rep is not None:
253
+ data = process_emotion_data(f_name, data, self.args)
254
+ if data is None:
255
+ return None
256
+
257
+ # Process word data if needed
258
+ if self.args.word_rep is not None:
259
+ word_file = f"{self.data_dir}{self.args.word_rep}/{f_name}.TextGrid"
260
+ data = process_word_data(self.data_dir, word_file, self.args, data, f_name, self.selected_file)
261
+ if data is None:
262
+ return None
263
+
264
+
265
+ # Process semantic data if needed
266
+ if self.args.sem_rep is not None:
267
+ sem_file = f"{self.data_dir}{self.args.sem_rep}/{f_name}.txt"
268
+ data = process_semantic_data(sem_file, self.args, data, f_name)
269
+ if data is None:
270
+ return None
271
+
272
+ return data
273
+
274
+ def load_db_mapping(self):
275
+ """Load database mapping from file."""
276
+ mapping_path = os.path.join(self.preloaded_dir, "sample_db_mapping.pkl")
277
+ with open(mapping_path, 'rb') as f:
278
+ self.mapping_data = pickle.load(f)
279
+
280
+
281
+ # Update paths from test to test_clip if needed
282
+ if self.loader_type == "test" and self.args.test_clip:
283
+ updated_paths = []
284
+ for path in self.mapping_data['db_paths']:
285
+ updated_path = path.replace("test/", "test_clip/")
286
+ updated_paths.append(updated_path)
287
+ self.mapping_data['db_paths'] = updated_paths
288
+
289
+ # Re-save the updated mapping_data to the same pickle file
290
+ with open(mapping_path, 'wb') as f:
291
+ pickle.dump(self.mapping_data, f)
292
+
293
+ self.n_samples = len(self.mapping_data['mapping'])
294
+
295
+ def get_lmdb_env(self, db_idx):
296
+ """Get LMDB environment for given database index."""
297
+ if db_idx not in self.lmdb_envs:
298
+ db_path = self.mapping_data['db_paths'][db_idx]
299
+ self.lmdb_envs[db_idx] = lmdb.open(db_path, readonly=True, lock=False)
300
+ return self.lmdb_envs[db_idx]
301
+
302
+ def __len__(self):
303
+ """Return the total number of samples in the dataset."""
304
+ return self.n_samples
305
+
306
+ def __getitem__(self, idx):
307
+ """Get a single sample from the dataset."""
308
+ db_idx = self.mapping_data['mapping'][idx]
309
+ lmdb_env = self.get_lmdb_env(db_idx)
310
+
311
+ with lmdb_env.begin(write=False) as txn:
312
+ key = "{:008d}".format(idx).encode("ascii")
313
+ sample = txn.get(key)
314
+ sample = pickle.loads(sample)
315
+
316
+
317
+ tar_pose, in_audio, in_audio_high, in_audio_low, tar_rep15d, in_facial, in_shape, in_aligned_text, in_word, emo, sem, vid, trans, trans_v, intention, audio_name, audio_onset = sample
318
+
319
+
320
+ # Convert data to tensors with appropriate types
321
+ processed_data = self._convert_to_tensors(
322
+ tar_pose, tar_rep15d, in_audio, in_audio_high, in_audio_low, in_facial, in_shape, in_aligned_text, in_word,
323
+ emo, sem, vid, trans, trans_v, intention, audio_onset
324
+ )
325
+
326
+ processed_data['audio_name'] = audio_name
327
+ return processed_data
328
+
329
+ def _convert_to_tensors(self, tar_pose, tar_rep15d, in_audio, in_audio_high, in_audio_low, in_facial, in_shape, in_aligned_text, in_word,
330
+ emo, sem, vid, trans, trans_v, intention=None, audio_onset=None):
331
+ """Convert numpy arrays to tensors with appropriate types."""
332
+ data = {
333
+ 'emo': torch.from_numpy(emo).int(),
334
+ 'sem': torch.from_numpy(sem).float(),
335
+ 'audio_tensor': torch.from_numpy(in_audio).float(),
336
+ 'bert_time_aligned': torch.from_numpy(in_aligned_text).float()
337
+ }
338
+ tar_pose = torch.from_numpy(tar_pose).float()
339
+
340
+ if self.loader_type == "test":
341
+ data.update({
342
+ 'pose': tar_pose,
343
+ 'rep15d': torch.from_numpy(tar_rep15d).float(),
344
+ 'trans': torch.from_numpy(trans).float(),
345
+ 'trans_v': torch.from_numpy(trans_v).float(),
346
+ 'facial': torch.from_numpy(in_facial).float(),
347
+ 'id': torch.from_numpy(vid).float(),
348
+ 'beta': torch.from_numpy(in_shape).float()
349
+ })
350
+ else:
351
+ data.update({
352
+ 'pose': tar_pose,
353
+ 'rep15d': torch.from_numpy(tar_rep15d).reshape((tar_rep15d.shape[0], -1)).float(),
354
+ 'trans': torch.from_numpy(trans).reshape((trans.shape[0], -1)).float(),
355
+ 'trans_v': torch.from_numpy(trans_v).reshape((trans_v.shape[0], -1)).float(),
356
+ 'facial': torch.from_numpy(in_facial).reshape((in_facial.shape[0], -1)).float(),
357
+ 'id': torch.from_numpy(vid).reshape((vid.shape[0], -1)).float(),
358
+ 'beta': torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
359
+ })
360
+
361
+
362
+ # Handle audio onset
363
+ if audio_onset is not None:
364
+ data['audio_onset'] = torch.from_numpy(audio_onset).float()
365
+ else:
366
+ data['audio_onset'] = torch.tensor([-1])
367
+
368
+ if in_word is not None:
369
+ data['word'] = torch.from_numpy(in_word).int()
370
+ else:
371
+ data['word'] = torch.tensor([-1])
372
+
373
+ return data
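Note: a rough usage sketch for the CustomDataset above. Its __init__ calls dist.get_rank() and builds a CUDA SMPL-X model, so a (possibly single-process) process group and a GPU are required; the argument plumbing is illustrative and `args` must carry the fields the dataset reads (data paths, stride, pose_length, and so on):

import torch.distributed as dist
from torch.utils.data import DataLoader
from dataloaders.beat_dataset_new import CustomDataset

if not dist.is_initialized():
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)

# `args` would come from one of the YAML configs above (see the loading sketch earlier)
dataset = CustomDataset(args, loader_type="train", build_cache=True)
loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
batch = next(iter(loader))
print(batch["pose"].shape, batch["audio_tensor"].shape)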
dataloaders/beat_sep.py ADDED
@@ -0,0 +1,772 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import textgrid as tg
8
+ import pandas as pd
9
+ import torch
10
+ import glob
11
+ import json
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ #import pyarrow
18
+ import pickle
19
+ import librosa
20
+ import smplx
21
+
22
+ from .build_vocab import Vocab
23
+ from .utils.audio_features import Wav2Vec2Model
24
+ from .data_tools import joints_list
25
+ from .utils import rotation_conversions as rc
26
+ from .utils import other_tools
27
+
28
+ class CustomDataset(Dataset):
29
+ def __init__(self, args, loader_type, augmentation=None, kwargs=None, build_cache=True):
30
+ self.args = args
31
+ self.loader_type = loader_type
32
+
33
+ self.rank = dist.get_rank()
34
+ self.ori_stride = self.args.stride
35
+ self.ori_length = self.args.pose_length
36
+ self.alignment = [0,0] # for trinity
37
+
38
+ self.ori_joint_list = joints_list[self.args.ori_joints]
39
+ self.tar_joint_list = joints_list[self.args.tar_joints]
40
+ if 'smplx' in self.args.pose_rep:
41
+ self.joint_mask = np.zeros(len(list(self.ori_joint_list.keys()))*3)
42
+ self.joints = len(list(self.tar_joint_list.keys()))
43
+ for joint_name in self.tar_joint_list:
44
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
45
+ else:
46
+ self.joints = len(list(self.ori_joint_list.keys()))+1
47
+ self.joint_mask = np.zeros(self.joints*3)
48
+ for joint_name in self.tar_joint_list:
49
+ if joint_name == "Hips":
50
+ self.joint_mask[3:6] = 1
51
+ else:
52
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
53
+ # select trainable joints
54
+
55
+ split_rule = pd.read_csv(args.data_path+"train_test_split.csv")
56
+ self.selected_file = split_rule.loc[(split_rule['type'] == loader_type) & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
57
+ if args.additional_data and loader_type == 'train':
58
+ split_b = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
59
+ #self.selected_file = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
60
+ self.selected_file = pd.concat([self.selected_file, split_b])
61
+ if self.selected_file.empty:
62
+ logger.warning(f"{loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
63
+ self.selected_file = split_rule.loc[(split_rule['type'] == 'train') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
64
+ self.selected_file = self.selected_file.iloc[0:8]
65
+ self.data_dir = args.data_path
66
+
67
+ if loader_type == "test":
68
+ self.args.multi_length_training = [1.0]
69
+ self.max_length = int(args.pose_length * self.args.multi_length_training[-1])
70
+ self.max_audio_pre_len = math.floor(args.pose_length / args.pose_fps * self.args.audio_sr)
71
+ if self.max_audio_pre_len > self.args.test_length*self.args.audio_sr:
72
+ self.max_audio_pre_len = self.args.test_length*self.args.audio_sr
73
+
74
+ if args.word_rep is not None:
75
+ with open(f"{args.data_path}weights/vocab.pkl", 'rb') as f:
76
+ self.lang_model = pickle.load(f)
77
+
78
+ preloaded_dir = self.args.root_path + self.args.cache_path + loader_type + f"/{args.pose_rep}_cache"
79
+ # if args.pose_norm:
80
+ # # careful for rotation vectors
81
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy"):
82
+ # self.calculate_mean_pose()
83
+ # self.mean_pose = np.load(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy")
84
+ # self.std_pose = np.load(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_std.npy")
85
+ # if args.audio_norm:
86
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/bvh_mean.npy"):
87
+ # self.calculate_mean_audio()
88
+ # self.mean_audio = np.load(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/npy_mean.npy")
89
+ # self.std_audio = np.load(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/npy_std.npy")
90
+ # if args.facial_norm:
91
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy"):
92
+ # self.calculate_mean_face()
93
+ # self.mean_facial = np.load(args.data_path+args.mean_pose_path+f"{args.facial_rep}/json_mean.npy")
94
+ # self.std_facial = np.load(args.data_path+args.mean_pose_path+f"{args.facial_rep}/json_std.npy")
95
+ if self.args.beat_align:
96
+ if not os.path.exists(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy"):
97
+ self.calculate_mean_velocity(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
98
+ self.avg_vel = np.load(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
99
+
100
+ if build_cache and self.rank == 0:
101
+ self.build_cache(preloaded_dir)
102
+ self.lmdb_env = lmdb.open(preloaded_dir, readonly=True, lock=False)
103
+ with self.lmdb_env.begin() as txn:
104
+ self.n_samples = txn.stat()["entries"]
105
+
106
+
107
+ def calculate_mean_velocity(self, save_path):
108
+ self.smplx = smplx.create(
109
+ self.args.data_path_1+"smplx_models/",
110
+ model_type='smplx',
111
+ gender='NEUTRAL_2020',
112
+ use_face_contour=False,
113
+ num_betas=300,
114
+ num_expression_coeffs=100,
115
+ ext='npz',
116
+ use_pca=False,
117
+ ).cuda().eval()
118
+ dir_p = self.data_dir + self.args.pose_rep + "/"
119
+ all_list = []
120
+ from tqdm import tqdm
121
+ for tar in tqdm(os.listdir(dir_p)):
122
+ if tar.endswith(".npz"):
123
+ m_data = np.load(dir_p+tar, allow_pickle=True)
124
+ betas, poses, trans, exps = m_data["betas"], m_data["poses"], m_data["trans"], m_data["expressions"]
125
+ n, c = poses.shape[0], poses.shape[1]
126
+ betas = betas.reshape(1, 300)
127
+ betas = np.tile(betas, (n, 1))
128
+ betas = torch.from_numpy(betas).cuda().float()
129
+ poses = torch.from_numpy(poses.reshape(n, c)).cuda().float()
130
+ exps = torch.from_numpy(exps.reshape(n, 100)).cuda().float()
131
+ trans = torch.from_numpy(trans.reshape(n, 3)).cuda().float()
132
+ max_length = 128
133
+ s, r = n//max_length, n%max_length
134
+ #print(n, s, r)
135
+ all_tensor = []
136
+ for i in range(s):
137
+ with torch.no_grad():
138
+ joints = self.smplx(
139
+ betas=betas[i*max_length:(i+1)*max_length],
140
+ transl=trans[i*max_length:(i+1)*max_length],
141
+ expression=exps[i*max_length:(i+1)*max_length],
142
+ jaw_pose=poses[i*max_length:(i+1)*max_length, 66:69],
143
+ global_orient=poses[i*max_length:(i+1)*max_length,:3],
144
+ body_pose=poses[i*max_length:(i+1)*max_length,3:21*3+3],
145
+ left_hand_pose=poses[i*max_length:(i+1)*max_length,25*3:40*3],
146
+ right_hand_pose=poses[i*max_length:(i+1)*max_length,40*3:55*3],
147
+ return_verts=True,
148
+ return_joints=True,
149
+ leye_pose=poses[i*max_length:(i+1)*max_length, 69:72],
150
+ reye_pose=poses[i*max_length:(i+1)*max_length, 72:75],
151
+ )['joints'][:, :55, :].reshape(max_length, 55*3)
152
+ all_tensor.append(joints)
153
+ if r != 0:
154
+ with torch.no_grad():
155
+ joints = self.smplx(
156
+ betas=betas[s*max_length:s*max_length+r],
157
+ transl=trans[s*max_length:s*max_length+r],
158
+ expression=exps[s*max_length:s*max_length+r],
159
+ jaw_pose=poses[s*max_length:s*max_length+r, 66:69],
160
+ global_orient=poses[s*max_length:s*max_length+r,:3],
161
+ body_pose=poses[s*max_length:s*max_length+r,3:21*3+3],
162
+ left_hand_pose=poses[s*max_length:s*max_length+r,25*3:40*3],
163
+ right_hand_pose=poses[s*max_length:s*max_length+r,40*3:55*3],
164
+ return_verts=True,
165
+ return_joints=True,
166
+ leye_pose=poses[s*max_length:s*max_length+r, 69:72],
167
+ reye_pose=poses[s*max_length:s*max_length+r, 72:75],
168
+ )['joints'][:, :55, :].reshape(r, 55*3)
169
+ all_tensor.append(joints)
170
+ joints = torch.cat(all_tensor, axis=0)
171
+ joints = joints.permute(1, 0)
172
+ dt = 1/30
173
+ # first steps is forward diff (t+1 - t) / dt
174
+ init_vel = (joints[:, 1:2] - joints[:, :1]) / dt
175
+ # middle steps are second order (t+1 - t-1) / 2dt
176
+ middle_vel = (joints[:, 2:] - joints[:, 0:-2]) / (2 * dt)
177
+ # last step is backward diff (t - t-1) / dt
178
+ final_vel = (joints[:, -1:] - joints[:, -2:-1]) / dt
179
+ #print(joints.shape, init_vel.shape, middle_vel.shape, final_vel.shape)
180
+ vel_seq = torch.cat([init_vel, middle_vel, final_vel], dim=1).permute(1, 0).reshape(n, 55, 3)
181
+ #print(vel_seq.shape)
182
+ #.permute(1, 0).reshape(n, 55, 3)
183
+ vel_seq_np = vel_seq.cpu().numpy()
184
+ vel_joints_np = np.linalg.norm(vel_seq_np, axis=2) # n * 55
185
+ all_list.append(vel_joints_np)
186
+ avg_vel = np.mean(np.concatenate(all_list, axis=0),axis=0) # 55
187
+ np.save(save_path, avg_vel)
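The block above estimates per-joint average speed with a standard finite-difference scheme: a forward difference at the first frame, central differences in between, and a backward difference at the last frame. A minimal self-contained sketch of the same scheme on a synthetic trajectory (the toy array and function name are illustrative, not part of the repository):

import numpy as np

def finite_diff_speed(joints, fps=30):
    # joints: (T, J, 3) joint positions; returns (T, J) per-frame speeds
    dt = 1.0 / fps
    vel = np.empty_like(joints)
    vel[0] = (joints[1] - joints[0]) / dt               # forward difference
    vel[1:-1] = (joints[2:] - joints[:-2]) / (2 * dt)   # central difference
    vel[-1] = (joints[-1] - joints[-2]) / dt            # backward difference
    return np.linalg.norm(vel, axis=2)

toy = np.cumsum(np.random.randn(10, 55, 3) * 0.01, axis=0)   # smooth-ish toy motion
print(finite_diff_speed(toy).mean(axis=0).shape)              # (55,), one mean speed per joint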
188
+
189
+
190
+ def build_cache(self, preloaded_dir):
191
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
192
+ logger.info("Reading data '{}'...".format(self.data_dir))
193
+ logger.info("Creating the dataset cache...")
194
+ if self.args.new_cache:
195
+ if os.path.exists(preloaded_dir):
196
+ shutil.rmtree(preloaded_dir)
197
+ if os.path.exists(preloaded_dir):
198
+ logger.info("Found the cache {}".format(preloaded_dir))
199
+ elif self.loader_type == "test":
200
+ self.cache_generation(
201
+ preloaded_dir, True,
202
+ 0, 0,
203
+ is_test=True)
204
+ else:
205
+ self.cache_generation(
206
+ preloaded_dir, self.args.disable_filtering,
207
+ self.args.clean_first_seconds, self.args.clean_final_seconds,
208
+ is_test=False)
209
+
210
+ def __len__(self):
211
+ return self.n_samples
212
+
213
+
214
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
215
+ # if "wav2vec2" in self.args.audio_rep:
216
+ # self.wav2vec_model = Wav2Vec2Model.from_pretrained(f"{self.args.data_path_1}/hub/transformer/wav2vec2-base-960h")
217
+ # self.wav2vec_model.feature_extractor._freeze_parameters()
218
+ # self.wav2vec_model = self.wav2vec_model.cuda()
219
+ # self.wav2vec_model.eval()
220
+
221
+ self.n_out_samples = 0
222
+ # create db for samples
223
+ if not os.path.exists(out_lmdb_dir): os.makedirs(out_lmdb_dir)
224
+ dst_lmdb_env = lmdb.open(out_lmdb_dir, map_size= int(1024 ** 3 * 50))# 50G
225
+ n_filtered_out = defaultdict(int)
226
+
227
+ for index, file_name in self.selected_file.iterrows():
228
+ f_name = file_name["id"]
229
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
230
+ pose_file = self.data_dir + self.args.pose_rep + "/" + f_name + ext
231
+ pose_each_file = []
232
+ trans_each_file = []
233
+ shape_each_file = []
234
+ audio_each_file = []
235
+ facial_each_file = []
236
+ word_each_file = []
237
+ emo_each_file = []
238
+ sem_each_file = []
239
+ vid_each_file = []
240
+ id_pose = f_name #1_wayne_0_1_1
241
+
242
+ logger.info(colored(f"# ---- Building cache for Pose {id_pose} ---- #", "blue"))
243
+ if "smplx" in self.args.pose_rep:
244
+ pose_data = np.load(pose_file, allow_pickle=True)
245
+ assert 30%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 30'
246
+ stride = int(30/self.args.pose_fps)
247
+ pose_each_file = pose_data["poses"][::stride] * self.joint_mask
248
+ pose_each_file = pose_each_file[:, self.joint_mask.astype(bool)]
249
+ # print(pose_each_file.shape)
250
+ trans_each_file = pose_data["trans"][::stride]
251
+ shape_each_file = np.repeat(pose_data["betas"].reshape(1, 300), pose_each_file.shape[0], axis=0)
252
+ if self.args.facial_rep is not None:
253
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
254
+ facial_each_file = pose_data["expressions"][::stride]
255
+ if self.args.facial_norm:
256
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
257
+
258
+ else:
259
+ assert 120%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 120'
260
+ stride = int(120/self.args.pose_fps)
261
+ with open(pose_file, "r") as pose_data:
262
+ for j, line in enumerate(pose_data.readlines()):
263
+ if j < 431: continue
264
+ if j%stride != 0:continue
265
+ data = np.fromstring(line, dtype=float, sep=" ")
266
+ rot_data = rc.euler_angles_to_matrix(torch.from_numpy(np.deg2rad(data)).reshape(-1, self.joints,3), "XYZ")
267
+ rot_data = rc.matrix_to_axis_angle(rot_data).reshape(-1, self.joints*3)
268
+ rot_data = rot_data.numpy() * self.joint_mask
269
+
270
+ pose_each_file.append(rot_data)
271
+ trans_each_file.append(data[:3])
272
+
273
+ pose_each_file = np.array(pose_each_file)
274
+ # print(pose_each_file.shape)
275
+ trans_each_file = np.array(trans_each_file)
276
+ shape_each_file = np.repeat(np.array(-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
277
+ if self.args.facial_rep is not None:
278
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
279
+ facial_file = pose_file.replace(self.args.pose_rep, self.args.facial_rep).replace("bvh", "json")
280
+                    assert 60%self.args.pose_fps == 0, 'pose_fps should be a divisor of 60'
281
+ stride = int(60/self.args.pose_fps)
282
+ if not os.path.exists(facial_file):
283
+ logger.warning(f"# ---- file not found for Facial {id_pose}, skip all files with the same id ---- #")
284
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
285
+ continue
286
+ with open(facial_file, 'r') as facial_data_file:
287
+ facial_data = json.load(facial_data_file)
288
+ for j, frame_data in enumerate(facial_data['frames']):
289
+ if j%stride != 0:continue
290
+ facial_each_file.append(frame_data['weights'])
291
+ facial_each_file = np.array(facial_each_file)
292
+ if self.args.facial_norm:
293
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
294
+
295
+ if self.args.id_rep is not None:
296
+ vid_each_file = np.repeat(np.array(int(f_name.split("_")[0])-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
297
+
298
+ if self.args.audio_rep is not None:
299
+ logger.info(f"# ---- Building cache for Audio {id_pose} and Pose {id_pose} ---- #")
300
+ audio_file = pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav")
301
+ if not os.path.exists(audio_file):
302
+ logger.warning(f"# ---- file not found for Audio {id_pose}, skip all files with the same id ---- #")
303
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
304
+ continue
305
+ audio_each_file, sr = librosa.load(audio_file)
306
+ audio_each_file = librosa.resample(audio_each_file, orig_sr=sr, target_sr=self.args.audio_sr)
307
+ if self.args.audio_rep == "onset+amplitude":
308
+ from numpy.lib import stride_tricks
309
+ frame_length = 1024
310
+ # hop_length = 512
311
+ shape = (audio_each_file.shape[-1] - frame_length + 1, frame_length)
312
+ strides = (audio_each_file.strides[-1], audio_each_file.strides[-1])
313
+ rolling_view = stride_tricks.as_strided(audio_each_file, shape=shape, strides=strides)
314
+ amplitude_envelope = np.max(np.abs(rolling_view), axis=1)
315
+ # pad the last frame_length-1 samples
316
+ amplitude_envelope = np.pad(amplitude_envelope, (0, frame_length-1), mode='constant', constant_values=amplitude_envelope[-1])
317
+ audio_onset_f = librosa.onset.onset_detect(y=audio_each_file, sr=self.args.audio_sr, units='frames')
318
+ onset_array = np.zeros(len(audio_each_file), dtype=float)
319
+ onset_array[audio_onset_f] = 1.0
320
+ # print(amplitude_envelope.shape, audio_each_file.shape, onset_array.shape)
321
+ audio_each_file = np.concatenate([amplitude_envelope.reshape(-1, 1), onset_array.reshape(-1, 1)], axis=1)
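The amplitude envelope above is a rolling maximum built with `stride_tricks.as_strided`. On NumPy 1.20+ the same envelope can be computed with `sliding_window_view`, which avoids manual stride arithmetic; a hedged sketch (the synthetic waveform and frame length are placeholders):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

signal = np.random.randn(16000).astype(np.float32)   # 1 s of fake 16 kHz audio
frame_length = 1024

windows = sliding_window_view(signal, frame_length)   # (N - frame_length + 1, frame_length)
envelope = np.abs(windows).max(axis=1)
envelope = np.pad(envelope, (0, frame_length - 1), mode="edge")  # same trailing pad as above
assert envelope.shape == signal.shape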
322
+ elif self.args.audio_rep == "mfcc":
323
+ audio_each_file = librosa.feature.melspectrogram(y=audio_each_file, sr=self.args.audio_sr, n_mels=128, hop_length=int(self.args.audio_sr/self.args.audio_fps))
324
+ audio_each_file = audio_each_file.transpose(1, 0)
325
+ # print(audio_each_file.shape, pose_each_file.shape)
326
+ if self.args.audio_norm and self.args.audio_rep == "wave16k":
327
+ audio_each_file = (audio_each_file - self.mean_audio) / self.std_audio
328
+ # print(audio_each_file.shape)
329
+ time_offset = 0
330
+ if self.args.word_rep is not None:
331
+ logger.info(f"# ---- Building cache for Word {id_pose} and Pose {id_pose} ---- #")
332
+ word_file = f"{self.data_dir}{self.args.word_rep}/{id_pose}.TextGrid"
333
+ if not os.path.exists(word_file):
334
+ logger.warning(f"# ---- file not found for Word {id_pose}, skip all files with the same id ---- #")
335
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
336
+ continue
337
+ tgrid = tg.TextGrid.fromFile(word_file)
338
+ if self.args.t_pre_encoder == "bert":
339
+ from transformers import AutoTokenizer, BertModel
340
+ tokenizer = AutoTokenizer.from_pretrained(self.args.data_path_1 + "hub/bert-base-uncased", local_files_only=True)
341
+ model = BertModel.from_pretrained(self.args.data_path_1 + "hub/bert-base-uncased", local_files_only=True).eval()
342
+ list_word = []
343
+ all_hidden = []
344
+ max_len = 400
345
+ last = 0
346
+ word_token_mapping = []
347
+ first = True
348
+ for i, word in enumerate(tgrid[0]):
349
+ last = i
350
+ if (i%max_len != 0) or (i==0):
351
+ if word.mark == "":
352
+ list_word.append(".")
353
+ else:
354
+ list_word.append(word.mark)
355
+ else:
356
+ max_counter = max_len
357
+ str_word = ' '.join(map(str, list_word))
358
+ if first:
359
+ global_len = 0
360
+ end = -1
361
+ offset_word = []
362
+ for k, wordvalue in enumerate(list_word):
363
+ start = end+1
364
+ end = start+len(wordvalue)
365
+ offset_word.append((start, end))
366
+ #print(offset_word)
367
+ token_scan = tokenizer.encode_plus(str_word, return_offsets_mapping=True)['offset_mapping']
368
+ #print(token_scan)
369
+ for start, end in offset_word:
370
+ sub_mapping = []
371
+ for i, (start_t, end_t) in enumerate(token_scan[1:-1]):
372
+ if int(start) <= int(start_t) and int(end_t) <= int(end):
373
+ #print(i+global_len)
374
+ sub_mapping.append(i+global_len)
375
+ word_token_mapping.append(sub_mapping)
376
+ #print(len(word_token_mapping))
377
+ global_len = word_token_mapping[-1][-1] + 1
378
+ list_word = []
379
+ if word.mark == "":
380
+ list_word.append(".")
381
+ else:
382
+ list_word.append(word.mark)
383
+
384
+ with torch.no_grad():
385
+ inputs = tokenizer(str_word, return_tensors="pt")
386
+ outputs = model(**inputs)
387
+ last_hidden_states = outputs.last_hidden_state.reshape(-1, 768).cpu().numpy()[1:-1, :]
388
+ all_hidden.append(last_hidden_states)
389
+
390
+ #list_word = list_word[:10]
391
+ if list_word == []:
392
+ pass
393
+ else:
394
+ if first:
395
+ global_len = 0
396
+ str_word = ' '.join(map(str, list_word))
397
+ end = -1
398
+ offset_word = []
399
+ for k, wordvalue in enumerate(list_word):
400
+ start = end+1
401
+ end = start+len(wordvalue)
402
+ offset_word.append((start, end))
403
+ #print(offset_word)
404
+ token_scan = tokenizer.encode_plus(str_word, return_offsets_mapping=True)['offset_mapping']
405
+ #print(token_scan)
406
+ for start, end in offset_word:
407
+ sub_mapping = []
408
+ for i, (start_t, end_t) in enumerate(token_scan[1:-1]):
409
+ if int(start) <= int(start_t) and int(end_t) <= int(end):
410
+ sub_mapping.append(i+global_len)
411
+ #print(sub_mapping)
412
+ word_token_mapping.append(sub_mapping)
413
+ #print(len(word_token_mapping))
414
+ with torch.no_grad():
415
+ inputs = tokenizer(str_word, return_tensors="pt")
416
+ outputs = model(**inputs)
417
+ last_hidden_states = outputs.last_hidden_state.reshape(-1, 768).cpu().numpy()[1:-1, :]
418
+ all_hidden.append(last_hidden_states)
419
+ last_hidden_states = np.concatenate(all_hidden, axis=0)
420
+
421
+ for i in range(pose_each_file.shape[0]):
422
+ found_flag = False
423
+ current_time = i/self.args.pose_fps + time_offset
424
+ j_last = 0
425
+ for j, word in enumerate(tgrid[0]):
426
+ word_n, word_s, word_e = word.mark, word.minTime, word.maxTime
427
+ if word_s<=current_time and current_time<=word_e:
428
+ if self.args.word_cache and self.args.t_pre_encoder == 'bert':
429
+ mapping_index = word_token_mapping[j]
430
+ #print(mapping_index, word_s, word_e)
431
+ s_t = np.linspace(word_s, word_e, len(mapping_index)+1)
432
+ #print(s_t)
433
+ for tt, t_sep in enumerate(s_t[1:]):
434
+ if current_time <= t_sep:
435
+ #if len(mapping_index) > 1: print(mapping_index[tt])
436
+ word_each_file.append(last_hidden_states[mapping_index[tt]])
437
+ break
438
+ else:
439
+ if word_n == " ":
440
+ word_each_file.append(self.lang_model.PAD_token)
441
+ else:
442
+ word_each_file.append(self.lang_model.get_word_index(word_n))
443
+ found_flag = True
444
+ j_last = j
445
+ break
446
+ else: continue
447
+ if not found_flag:
448
+ if self.args.word_cache and self.args.t_pre_encoder == 'bert':
449
+ word_each_file.append(last_hidden_states[j_last])
450
+ else:
451
+ word_each_file.append(self.lang_model.UNK_token)
452
+ word_each_file = np.array(word_each_file)
453
+ #print(word_each_file.shape)
454
+
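The per-frame word loop above assigns one label per pose frame by finding the TextGrid interval that contains the frame's timestamp. A compact sketch of the same interval-to-frame alignment on made-up intervals (interval list, fps and word indices are illustrative):

import numpy as np

def align_words_to_frames(intervals, n_frames, fps, pad_idx=0):
    # intervals: list of (start_s, end_s, word_idx); returns one word index per frame
    labels = np.full(n_frames, pad_idx, dtype=np.int64)
    for i in range(n_frames):
        t = i / fps
        for start, end, idx in intervals:
            if start <= t <= end:
                labels[i] = idx
                break
    return labels

print(align_words_to_frames([(0.0, 0.4, 7), (0.4, 0.9, 12)], n_frames=30, fps=30))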
455
+ if self.args.emo_rep is not None:
456
+ logger.info(f"# ---- Building cache for Emo {id_pose} and Pose {id_pose} ---- #")
457
+ rtype, start = int(id_pose.split('_')[3]), int(id_pose.split('_')[3])
458
+ if rtype == 0 or rtype == 2 or rtype == 4 or rtype == 6:
459
+ if start >= 1 and start <= 64:
460
+ score = 0
461
+ elif start >= 65 and start <= 72:
462
+ score = 1
463
+ elif start >= 73 and start <= 80:
464
+ score = 2
465
+ elif start >= 81 and start <= 86:
466
+ score = 3
467
+ elif start >= 87 and start <= 94:
468
+ score = 4
469
+ elif start >= 95 and start <= 102:
470
+ score = 5
471
+ elif start >= 103 and start <= 110:
472
+ score = 6
473
+ elif start >= 111 and start <= 118:
474
+ score = 7
475
+                    else: score = 0  # fallback for out-of-range sequence numbers
476
+ else:
477
+ # you may denote as unknown in the future
478
+ score = 0
479
+ emo_each_file = np.repeat(np.array(score).reshape(1, 1), pose_each_file.shape[0], axis=0)
480
+ #print(emo_each_file)
481
+
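The if/elif ladder above maps the clip's sequence number to one of eight emotion classes. The same table can be written as a small range lookup, which is easier to audit; the ranges are copied from the code above, while the helper name is made up:

EMOTION_RANGES = [
    (1, 64, 0), (65, 72, 1), (73, 80, 2), (81, 86, 3),
    (87, 94, 4), (95, 102, 5), (103, 110, 6), (111, 118, 7),
]

def emotion_score(seq_number):
    for lo, hi, score in EMOTION_RANGES:
        if lo <= seq_number <= hi:
            return score
    return 0  # out-of-range sequences default to neutral

assert emotion_score(70) == 1 and emotion_score(118) == 7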
482
+ if self.args.sem_rep is not None:
483
+ logger.info(f"# ---- Building cache for Sem {id_pose} and Pose {id_pose} ---- #")
484
+ sem_file = f"{self.data_dir}{self.args.sem_rep}/{id_pose}.txt"
485
+ sem_all = pd.read_csv(sem_file,
486
+ sep='\t',
487
+ names=["name", "start_time", "end_time", "duration", "score", "keywords"])
488
+ # we adopt motion-level semantic score here.
489
+ for i in range(pose_each_file.shape[0]):
490
+ found_flag = False
491
+ for j, (start, end, score) in enumerate(zip(sem_all['start_time'],sem_all['end_time'], sem_all['score'])):
492
+ current_time = i/self.args.pose_fps + time_offset
493
+ if start<=current_time and current_time<=end:
494
+ sem_each_file.append(score)
495
+ found_flag=True
496
+ break
497
+ else: continue
498
+ if not found_flag: sem_each_file.append(0.)
499
+ sem_each_file = np.array(sem_each_file)
500
+ #print(sem_each_file)
501
+
502
+ filtered_result = self._sample_from_clip(
503
+ dst_lmdb_env,
504
+ audio_each_file, pose_each_file, trans_each_file, shape_each_file, facial_each_file, word_each_file,
505
+ vid_each_file, emo_each_file, sem_each_file,
506
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
507
+ )
508
+ for type in filtered_result.keys():
509
+ n_filtered_out[type] += filtered_result[type]
510
+
511
+ with dst_lmdb_env.begin() as txn:
512
+ logger.info(colored(f"no. of samples: {txn.stat()['entries']}", "cyan"))
513
+ n_total_filtered = 0
514
+ for type, n_filtered in n_filtered_out.items():
515
+ logger.info("{}: {}".format(type, n_filtered))
516
+ n_total_filtered += n_filtered
517
+ logger.info(colored("no. of excluded samples: {} ({:.1f}%)".format(
518
+ n_total_filtered, 100 * n_total_filtered / (txn.stat()["entries"] + n_total_filtered)), "cyan"))
519
+ dst_lmdb_env.sync()
520
+ dst_lmdb_env.close()
521
+
522
+ def _sample_from_clip(
523
+ self, dst_lmdb_env, audio_each_file, pose_each_file, trans_each_file, shape_each_file, facial_each_file, word_each_file,
524
+ vid_each_file, emo_each_file, sem_each_file,
525
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
526
+ ):
527
+ """
528
+ for data cleaning, we ignore the data for first and final n s
529
+        for data cleaning, we ignore the first and final n seconds of each clip
530
+ """
531
+ # audio_start = int(self.alignment[0] * self.args.audio_fps)
532
+ # pose_start = int(self.alignment[1] * self.args.pose_fps)
533
+ #logger.info(f"before: {audio_each_file.shape} {pose_each_file.shape}")
534
+ # audio_each_file = audio_each_file[audio_start:]
535
+ # pose_each_file = pose_each_file[pose_start:]
536
+ # trans_each_file =
537
+ #logger.info(f"after alignment: {audio_each_file.shape} {pose_each_file.shape}")
538
+ #print(pose_each_file.shape)
539
+ round_seconds_skeleton = pose_each_file.shape[0] // self.args.pose_fps # assume 1500 frames / 15 fps = 100 s
540
+ #print(round_seconds_skeleton)
541
+        if len(audio_each_file) != 0:  # empty list when audio_rep is None, otherwise a numpy array
542
+            if self.args.audio_rep == "mfcc":
543
+                round_seconds_audio = audio_each_file.shape[0] // self.args.audio_fps
544
+            elif self.args.audio_rep != "wave16k":
545
+                round_seconds_audio = len(audio_each_file) // self.args.audio_fps # e.g. 1,600,000 samples / 16,000 = 100 s
546
+            else:
547
+                round_seconds_audio = audio_each_file.shape[0] // self.args.audio_sr
548
+            if len(facial_each_file) != 0:
549
+ round_seconds_facial = facial_each_file.shape[0] // self.args.pose_fps
550
+ logger.info(f"audio: {round_seconds_audio}s, pose: {round_seconds_skeleton}s, facial: {round_seconds_facial}s")
551
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
552
+ max_round = max(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
553
+ if round_seconds_skeleton != max_round:
554
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
555
+ else:
556
+ logger.info(f"pose: {round_seconds_skeleton}s, audio: {round_seconds_audio}s")
557
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton)
558
+ max_round = max(round_seconds_audio, round_seconds_skeleton)
559
+ if round_seconds_skeleton != max_round:
560
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
561
+
562
+ clip_s_t, clip_e_t = clean_first_seconds, round_seconds_skeleton - clean_final_seconds # assume [10, 90]s
563
+ clip_s_f_audio, clip_e_f_audio = self.args.audio_fps * clip_s_t, clip_e_t * self.args.audio_fps # [160,000,90*160,000]
564
+ clip_s_f_pose, clip_e_f_pose = clip_s_t * self.args.pose_fps, clip_e_t * self.args.pose_fps # [150,90*15]
565
+
566
+
567
+ for ratio in self.args.multi_length_training:
568
+ if is_test:# stride = length for test
569
+ cut_length = clip_e_f_pose - clip_s_f_pose
570
+ self.args.stride = cut_length
571
+ self.max_length = cut_length
572
+ else:
573
+ self.args.stride = int(ratio*self.ori_stride)
574
+ cut_length = int(self.ori_length*ratio)
575
+
576
+ num_subdivision = math.floor((clip_e_f_pose - clip_s_f_pose - cut_length) / self.args.stride) + 1
577
+ logger.info(f"pose from frame {clip_s_f_pose} to {clip_e_f_pose}, length {cut_length}")
578
+            logger.info(f"{num_subdivision} clips are expected with stride {self.args.stride}")
579
+
580
+            if len(audio_each_file) != 0:
581
+ audio_short_length = math.floor(cut_length / self.args.pose_fps * self.args.audio_fps)
582
+ """
583
+ for audio sr = 16000, fps = 15, pose_length = 34,
584
+ audio short length = 36266.7 -> 36266
585
+ this error is fine.
586
+ """
587
+ logger.info(f"audio from frame {clip_s_f_audio} to {clip_e_f_audio}, length {audio_short_length}")
588
+
589
+ n_filtered_out = defaultdict(int)
590
+ sample_pose_list = []
591
+ sample_audio_list = []
592
+ sample_facial_list = []
593
+ sample_shape_list = []
594
+ sample_word_list = []
595
+ sample_emo_list = []
596
+ sample_sem_list = []
597
+ sample_vid_list = []
598
+ sample_trans_list = []
599
+
600
+            for i in range(num_subdivision): # cut the sequence into clips of cut_length frames
601
+ start_idx = clip_s_f_pose + i * self.args.stride
602
+ fin_idx = start_idx + cut_length
603
+ sample_pose = pose_each_file[start_idx:fin_idx]
604
+ sample_trans = trans_each_file[start_idx:fin_idx]
605
+ sample_shape = shape_each_file[start_idx:fin_idx]
606
+ # print(sample_pose.shape)
607
+ if self.args.audio_rep is not None:
608
+ audio_start = clip_s_f_audio + math.floor(i * self.args.stride * self.args.audio_fps / self.args.pose_fps)
609
+ audio_end = audio_start + audio_short_length
610
+ sample_audio = audio_each_file[audio_start:audio_end]
611
+ else:
612
+ sample_audio = np.array([-1])
613
+ sample_facial = facial_each_file[start_idx:fin_idx] if self.args.facial_rep is not None else np.array([-1])
614
+ sample_word = word_each_file[start_idx:fin_idx] if self.args.word_rep is not None else np.array([-1])
615
+ sample_emo = emo_each_file[start_idx:fin_idx] if self.args.emo_rep is not None else np.array([-1])
616
+ sample_sem = sem_each_file[start_idx:fin_idx] if self.args.sem_rep is not None else np.array([-1])
617
+ sample_vid = vid_each_file[start_idx:fin_idx] if self.args.id_rep is not None else np.array([-1])
618
+
619
+                if sample_pose is not None:
620
+ # filtering motion skeleton data
621
+ sample_pose, filtering_message = MotionPreprocessor(sample_pose).get()
622
+                    is_correct_motion = len(sample_pose) > 0  # MotionPreprocessor returns [] for filtered clips
623
+ if is_correct_motion or disable_filtering:
624
+ sample_pose_list.append(sample_pose)
625
+ sample_audio_list.append(sample_audio)
626
+ sample_facial_list.append(sample_facial)
627
+ sample_shape_list.append(sample_shape)
628
+ sample_word_list.append(sample_word)
629
+ sample_vid_list.append(sample_vid)
630
+ sample_emo_list.append(sample_emo)
631
+ sample_sem_list.append(sample_sem)
632
+ sample_trans_list.append(sample_trans)
633
+ else:
634
+ n_filtered_out[filtering_message] += 1
635
+
636
+ if len(sample_pose_list) > 0:
637
+ with dst_lmdb_env.begin(write=True) as txn:
638
+ for pose, audio, facial, shape, word, vid, emo, sem, trans in zip(
639
+ sample_pose_list,
640
+ sample_audio_list,
641
+ sample_facial_list,
642
+ sample_shape_list,
643
+ sample_word_list,
644
+ sample_vid_list,
645
+ sample_emo_list,
646
+ sample_sem_list,
647
+ sample_trans_list,):
648
+ k = "{:005}".format(self.n_out_samples).encode("ascii")
649
+ v = [pose, audio, facial, shape, word, emo, sem, vid, trans]
650
+ v = pickle.dumps(v,5)
651
+ txn.put(k, v)
652
+ self.n_out_samples += 1
653
+ return n_filtered_out
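Samples are written to LMDB under zero-padded ASCII keys so that lexicographic key order matches numeric sample order, and read back in `__getitem__` with the same format string. A tiny round-trip sketch of that convention (throwaway temp directory, toy payload):

import pickle
import tempfile

import lmdb
import numpy as np

with tempfile.TemporaryDirectory() as tmp:
    env = lmdb.open(tmp, map_size=1 << 24)
    with env.begin(write=True) as txn:
        for idx in range(3):
            txn.put("{:005}".format(idx).encode("ascii"),
                    pickle.dumps(np.full(2, idx), protocol=5))
    with env.begin() as txn:
        print(txn.stat()["entries"])            # 3
        print(pickle.loads(txn.get(b"00001")))  # [1 1]
    env.close()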
654
+
655
+ def __getitem__(self, idx):
656
+ with self.lmdb_env.begin(write=False) as txn:
657
+ key = "{:005}".format(idx).encode("ascii")
658
+ sample = txn.get(key)
659
+ sample = pickle.loads(sample)
660
+ tar_pose, in_audio, in_facial, in_shape, in_word, emo, sem, vid, trans = sample
661
+ #print(in_shape)
662
+ #vid = torch.from_numpy(vid).int()
663
+ emo = torch.from_numpy(emo).int()
664
+ sem = torch.from_numpy(sem).float()
665
+ in_audio = torch.from_numpy(in_audio).float()
666
+ in_word = torch.from_numpy(in_word).float() if self.args.word_cache else torch.from_numpy(in_word).int()
667
+ if self.loader_type == "test":
668
+ tar_pose = torch.from_numpy(tar_pose).float()
669
+ trans = torch.from_numpy(trans).float()
670
+ in_facial = torch.from_numpy(in_facial).float()
671
+ vid = torch.from_numpy(vid).float()
672
+ in_shape = torch.from_numpy(in_shape).float()
673
+ else:
674
+ in_shape = torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
675
+ trans = torch.from_numpy(trans).reshape((trans.shape[0], -1)).float()
676
+ vid = torch.from_numpy(vid).reshape((vid.shape[0], -1)).float()
677
+ tar_pose = torch.from_numpy(tar_pose).reshape((tar_pose.shape[0], -1)).float()
678
+ in_facial = torch.from_numpy(in_facial).reshape((in_facial.shape[0], -1)).float()
679
+ return {"pose":tar_pose, "audio":in_audio, "facial":in_facial, "beta": in_shape, "word":in_word, "id":vid, "emo":emo, "sem":sem, "trans":trans}
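A minimal usage sketch of this dataset with a PyTorch DataLoader; `args` is a stand-in for the config namespace expected by `__init__`, so the construction is left commented:

from torch.utils.data import DataLoader

# Hypothetical usage; `args` must provide the fields read in CustomDataset.__init__.
# dataset = CustomDataset(args, loader_type="train")
# loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4, drop_last=True)
# for batch in loader:
#     pose, audio = batch["pose"], batch["audio"]  # (B, T, C_pose), (B, T_audio, C_audio)
#     break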
680
+
681
+
682
+ class MotionPreprocessor:
683
+ def __init__(self, skeletons):
684
+ self.skeletons = skeletons
685
+ #self.mean_pose = mean_pose
686
+ self.filtering_message = "PASS"
687
+
688
+ def get(self):
689
+ assert (self.skeletons is not None)
690
+
691
+ # filtering
692
+ if self.skeletons != []:
693
+ if self.check_pose_diff():
694
+ self.skeletons = []
695
+ self.filtering_message = "pose"
696
+ # elif self.check_spine_angle():
697
+ # self.skeletons = []
698
+ # self.filtering_message = "spine angle"
699
+ # elif self.check_static_motion():
700
+ # self.skeletons = []
701
+ # self.filtering_message = "motion"
702
+
703
+ # if self.skeletons != []:
704
+ # self.skeletons = self.skeletons.tolist()
705
+ # for i, frame in enumerate(self.skeletons):
706
+ # assert not np.isnan(self.skeletons[i]).any() # missing joints
707
+
708
+ return self.skeletons, self.filtering_message
709
+
710
+ def check_static_motion(self, verbose=True):
711
+ def get_variance(skeleton, joint_idx):
712
+ wrist_pos = skeleton[:, joint_idx]
713
+ variance = np.sum(np.var(wrist_pos, axis=0))
714
+ return variance
715
+
716
+ left_arm_var = get_variance(self.skeletons, 6)
717
+ right_arm_var = get_variance(self.skeletons, 9)
718
+
719
+ th = 0.0014 # exclude 13110
720
+ # th = 0.002 # exclude 16905
721
+ if left_arm_var < th and right_arm_var < th:
722
+ if verbose:
723
+ print("skip - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
724
+ return True
725
+ else:
726
+ if verbose:
727
+ print("pass - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
728
+ return False
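check_static_motion flags clips whose wrist trajectories barely move by thresholding the summed positional variance. A toy run of the same test (joint indices 6/9 and the 0.0014 threshold come from the method above; the skeleton arrays are synthetic):

import numpy as np

def is_static(skeleton, joint_idx=(6, 9), threshold=0.0014):
    # skeleton: (T, J, 3) joint positions
    return all(np.sum(np.var(skeleton[:, j], axis=0)) < threshold for j in joint_idx)

still = np.zeros((30, 10, 3))
moving = still + np.random.randn(30, 10, 3) * 0.1
print(is_static(still), is_static(moving))  # True False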
729
+
730
+
731
+ def check_pose_diff(self, verbose=False):
732
+ # diff = np.abs(self.skeletons - self.mean_pose) # 186*1
733
+ # diff = np.mean(diff)
734
+
735
+ # # th = 0.017
736
+ # th = 0.02 #0.02 # exclude 3594
737
+ # if diff < th:
738
+ # if verbose:
739
+ # print("skip - check_pose_diff {:.5f}".format(diff))
740
+ # return True
741
+ # # th = 3.5 #0.02 # exclude 3594
742
+ # # if 3.5 < diff < 5:
743
+ # # if verbose:
744
+ # # print("skip - check_pose_diff {:.5f}".format(diff))
745
+ # # return True
746
+ # else:
747
+ # if verbose:
748
+ # print("pass - check_pose_diff {:.5f}".format(diff))
749
+ return False
750
+
751
+
752
+ def check_spine_angle(self, verbose=True):
753
+ def angle_between(v1, v2):
754
+ v1_u = v1 / np.linalg.norm(v1)
755
+ v2_u = v2 / np.linalg.norm(v2)
756
+ return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))
757
+
758
+ angles = []
759
+ for i in range(self.skeletons.shape[0]):
760
+ spine_vec = self.skeletons[i, 1] - self.skeletons[i, 0]
761
+ angle = angle_between(spine_vec, [0, -1, 0])
762
+ angles.append(angle)
763
+
764
+ if np.rad2deg(max(angles)) > 30 or np.rad2deg(np.mean(angles)) > 20: # exclude 4495
765
+ # if np.rad2deg(max(angles)) > 20: # exclude 8270
766
+ if verbose:
767
+ print("skip - check_spine_angle {:.5f}, {:.5f}".format(max(angles), np.mean(angles)))
768
+ return True
769
+ else:
770
+ if verbose:
771
+ print("pass - check_spine_angle {:.5f}".format(max(angles)))
772
+ return False
dataloaders/beat_sep_lower.py ADDED
@@ -0,0 +1,430 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import pandas as pd
8
+ import torch
9
+ import glob
10
+ import json
11
+ from dataloaders.build_vocab import Vocab
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ import pickle
18
+ import smplx
19
+ from .utils.audio_features import process_audio_data
20
+ from .data_tools import joints_list
21
+ from .utils.other_tools import MultiLMDBManager
22
+ from .utils.motion_rep_transfer import process_smplx_motion
23
+ from .utils.mis_features import process_semantic_data, process_emotion_data
24
+ from .utils.text_features import process_word_data
25
+ from .utils.data_sample import sample_from_clip
26
+ import time
27
+
28
+
29
+ class CustomDataset(Dataset):
30
+ def __init__(self, args, loader_type, augmentation=None, kwargs=None, build_cache=True):
31
+ self.args = args
32
+ self.loader_type = loader_type
33
+
34
+ # Set rank safely - handle cases where distributed training is not yet initialized
35
+ try:
36
+ if torch.distributed.is_initialized():
37
+ self.rank = torch.distributed.get_rank()
38
+ else:
39
+ self.rank = 0
40
+ except:
41
+ self.rank = 0
42
+
43
+ self.ori_stride = self.args.stride
44
+ self.ori_length = self.args.pose_length
45
+
46
+ # Initialize basic parameters
47
+ self.ori_stride = self.args.stride
48
+ self.ori_length = self.args.pose_length
49
+ self.alignment = [0,0] # for trinity
50
+
51
+ """Initialize SMPLX model."""
52
+ self.smplx = smplx.create(
53
+ self.args.data_path_1+"smplx_models/",
54
+ model_type='smplx',
55
+ gender='NEUTRAL_2020',
56
+ use_face_contour=False,
57
+ num_betas=300,
58
+ num_expression_coeffs=100,
59
+ ext='npz',
60
+ use_pca=False,
61
+ ).cuda().eval()
62
+
63
+ if self.args.word_rep is not None:
64
+ with open(f"{self.args.data_path}weights/vocab.pkl", 'rb') as f:
65
+ self.lang_model = pickle.load(f)
66
+
67
+ # Load and process split rules
68
+ self._process_split_rules()
69
+
70
+ # Initialize data directories and lengths
71
+ self._init_data_paths()
72
+
73
+ if self.args.beat_align:
74
+ if not os.path.exists(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy"):
75
+ self.calculate_mean_velocity(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
76
+ self.avg_vel = np.load(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
77
+
78
+ # Build or load cache
79
+ self._init_cache(build_cache)
80
+
81
+ def _process_split_rules(self):
82
+ """Process dataset split rules."""
83
+ split_rule = pd.read_csv(self.args.data_path+"train_test_split.csv")
84
+ self.selected_file = split_rule.loc[
85
+ (split_rule['type'] == self.loader_type) &
86
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
87
+ ]
88
+
89
+ if self.args.additional_data and self.loader_type == 'train':
90
+ split_b = split_rule.loc[
91
+ (split_rule['type'] == 'additional') &
92
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
93
+ ]
94
+ self.selected_file = pd.concat([self.selected_file, split_b])
95
+
96
+ if self.selected_file.empty:
97
+ logger.warning(f"{self.loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
98
+ self.selected_file = split_rule.loc[
99
+ (split_rule['type'] == 'train') &
100
+ (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))
101
+ ]
102
+ self.selected_file = self.selected_file.iloc[0:8]
103
+
104
+ def _init_data_paths(self):
105
+ """Initialize data directories and lengths."""
106
+ self.data_dir = self.args.data_path
107
+
108
+ if self.loader_type == "test":
109
+ self.args.multi_length_training = [1.0]
110
+
111
+ self.max_length = int(self.args.pose_length * self.args.multi_length_training[-1])
112
+ self.max_audio_pre_len = math.floor(self.args.pose_length / self.args.pose_fps * self.args.audio_sr)
113
+
114
+ if self.max_audio_pre_len > self.args.test_length * self.args.audio_sr:
115
+ self.max_audio_pre_len = self.args.test_length * self.args.audio_sr
116
+
117
+ if self.args.test_clip and self.loader_type == "test":
118
+ self.preloaded_dir = self.args.root_path + self.args.cache_path + self.loader_type + "_clip" + f"/{self.args.pose_rep}_cache"
119
+ else:
120
+ self.preloaded_dir = self.args.root_path + self.args.cache_path + self.loader_type + f"/{self.args.pose_rep}_cache"
121
+
122
+ def _init_cache(self, build_cache):
123
+ """Initialize or build cache."""
124
+ self.lmdb_envs = {}
125
+ self.mapping_data = None
126
+
127
+ if build_cache and self.rank == 0:
128
+ self.build_cache(self.preloaded_dir)
129
+
130
+ # In DDP mode, ensure all processes wait for cache building to complete
131
+ if torch.distributed.is_initialized():
132
+ torch.distributed.barrier()
133
+
134
+ # Try to regenerate cache if corrupted (only on rank 0 to avoid race conditions)
135
+ if self.rank == 0:
136
+ self.regenerate_cache_if_corrupted()
137
+
138
+ # Wait for cache regeneration to complete
139
+ if torch.distributed.is_initialized():
140
+ torch.distributed.barrier()
141
+
142
+ self.load_db_mapping()
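The cache logic above follows the usual rank-0-builds, everyone-waits pattern so that only one process writes to disk under DDP. A stripped-down sketch of that pattern in isolation (the helper and `build_fn` are placeholders, not part of this repository):

import torch.distributed as dist

def build_once_then_sync(rank, build_fn):
    # Only rank 0 builds the cache; every rank then waits at the barrier,
    # so readers never open a half-written cache.
    if rank == 0:
        build_fn()
    if dist.is_initialized():
        dist.barrier()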
143
+
144
+ def build_cache(self, preloaded_dir):
145
+ """Build the dataset cache."""
146
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
147
+ logger.info("Reading data '{}'...".format(self.data_dir))
148
+ logger.info("Creating the dataset cache...")
149
+
150
+ if self.args.new_cache and os.path.exists(preloaded_dir):
151
+ shutil.rmtree(preloaded_dir)
152
+
153
+ if os.path.exists(preloaded_dir):
154
+ # if the dir is empty, that means we still need to build the cache
155
+ if not os.listdir(preloaded_dir):
156
+ self.cache_generation(
157
+ preloaded_dir,
158
+ self.args.disable_filtering,
159
+ self.args.clean_first_seconds,
160
+ self.args.clean_final_seconds,
161
+ is_test=False
162
+ )
163
+ else:
164
+ logger.info("Found the cache {}".format(preloaded_dir))
165
+
166
+ elif self.loader_type == "test":
167
+ self.cache_generation(preloaded_dir, True, 0, 0, is_test=True)
168
+ else:
169
+ self.cache_generation(
170
+ preloaded_dir,
171
+ self.args.disable_filtering,
172
+ self.args.clean_first_seconds,
173
+ self.args.clean_final_seconds,
174
+ is_test=False
175
+ )
176
+
177
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
178
+ """Generate cache for the dataset."""
179
+ if not os.path.exists(out_lmdb_dir):
180
+ os.makedirs(out_lmdb_dir)
181
+
182
+ # Initialize the multi-LMDB manager
183
+ lmdb_manager = MultiLMDBManager(out_lmdb_dir, max_db_size=10*1024*1024*1024)
184
+
185
+ self.n_out_samples = 0
186
+ n_filtered_out = defaultdict(int)
187
+
188
+ for index, file_name in self.selected_file.iterrows():
189
+ f_name = file_name["id"]
190
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
191
+ pose_file = os.path.join(self.data_dir, self.args.pose_rep, f_name + ext)
192
+
193
+ # Process data
194
+ data = self._process_file_data(f_name, pose_file, ext)
195
+ if data is None:
196
+ continue
197
+
198
+ # Sample from clip
199
+ filtered_result, self.n_out_samples = sample_from_clip(
200
+ lmdb_manager=lmdb_manager,
201
+ audio_file=pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav"),
202
+ audio_each_file=data['audio'],
203
+ pose_each_file=data['pose'],
204
+ trans_each_file=data['trans'],
205
+ trans_v_each_file=data['trans_v'],
206
+ shape_each_file=data['shape'],
207
+ facial_each_file=data['facial'],
208
+ word_each_file=data['word'],
209
+ vid_each_file=data['vid'],
210
+ emo_each_file=data['emo'],
211
+ sem_each_file=data['sem'],
212
+ args=self.args,
213
+ ori_stride=self.ori_stride,
214
+ ori_length=self.ori_length,
215
+ disable_filtering=disable_filtering,
216
+ clean_first_seconds=clean_first_seconds,
217
+ clean_final_seconds=clean_final_seconds,
218
+ is_test=is_test,
219
+ n_out_samples=self.n_out_samples
220
+ )
221
+
222
+ for type_key in filtered_result:
223
+ n_filtered_out[type_key] += filtered_result[type_key]
224
+
225
+ lmdb_manager.close()
226
+
227
+ def _process_file_data(self, f_name, pose_file, ext):
228
+ """Process all data for a single file."""
229
+ data = {
230
+ 'pose': None, 'trans': None, 'trans_v': None, 'shape': None,
231
+ 'audio': None, 'facial': None, 'word': None, 'emo': None,
232
+ 'sem': None, 'vid': None
233
+ }
234
+
235
+ # Process motion data
236
+ logger.info(colored(f"# ---- Building cache for Pose {f_name} ---- #", "blue"))
237
+ if "smplx" in self.args.pose_rep:
238
+ motion_data = process_smplx_motion(pose_file, self.smplx, self.args.pose_fps, self.args.facial_rep)
239
+ else:
240
+ raise ValueError(f"Unknown pose representation '{self.args.pose_rep}'.")
241
+
242
+ if motion_data is None:
243
+ return None
244
+
245
+ data.update(motion_data)
246
+
247
+ # Process speaker ID
248
+ if self.args.id_rep is not None:
249
+ speaker_id = int(f_name.split("_")[0]) - 1
250
+ data['vid'] = np.repeat(np.array(speaker_id).reshape(1, 1), data['pose'].shape[0], axis=0)
251
+ else:
252
+ data['vid'] = np.array([-1])
253
+
254
+ # Process audio if needed
255
+ if self.args.audio_rep is not None:
256
+ audio_file = pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav")
257
+ data = process_audio_data(audio_file, self.args, data, f_name, self.selected_file)
258
+ if data is None:
259
+ return None
260
+
261
+ # Process emotion if needed
262
+ if self.args.emo_rep is not None:
263
+ data = process_emotion_data(f_name, data, self.args)
264
+ if data is None:
265
+ return None
266
+
267
+ # Process word data if needed
268
+ if self.args.word_rep is not None:
269
+ word_file = f"{self.data_dir}{self.args.word_rep}/{f_name}.TextGrid"
270
+ data = process_word_data(self.data_dir, word_file, self.args, data, f_name, self.selected_file, self.lang_model)
271
+ if data is None:
272
+ return None
273
+
274
+ # Process semantic data if needed
275
+ if self.args.sem_rep is not None:
276
+ sem_file = f"{self.data_dir}{self.args.sem_rep}/{f_name}.txt"
277
+ data = process_semantic_data(sem_file, self.args, data, f_name)
278
+ if data is None:
279
+ return None
280
+
281
+ return data
282
+
283
+ def load_db_mapping(self):
284
+ """Load database mapping from file."""
285
+ mapping_path = os.path.join(self.preloaded_dir, "sample_db_mapping.pkl")
286
+ backup_path = os.path.join(self.preloaded_dir, "sample_db_mapping_backup.pkl")
287
+
288
+ # Check if file exists and is readable
289
+ if not os.path.exists(mapping_path):
290
+ raise FileNotFoundError(f"Mapping file not found: {mapping_path}")
291
+
292
+ # Check file size to ensure it's not empty
293
+ file_size = os.path.getsize(mapping_path)
294
+ if file_size == 0:
295
+ raise ValueError(f"Mapping file is empty: {mapping_path}")
296
+
297
+ print(f"Loading mapping file: {mapping_path} (size: {file_size} bytes)")
298
+
299
+ # Add error handling and retry logic for pickle loading
300
+ max_retries = 3
301
+ for attempt in range(max_retries):
302
+ try:
303
+ with open(mapping_path, 'rb') as f:
304
+ self.mapping_data = pickle.load(f)
305
+ print(f"Successfully loaded mapping data with {len(self.mapping_data.get('mapping', []))} samples")
306
+ break
307
+ except (EOFError, pickle.UnpicklingError) as e:
308
+ if attempt < max_retries - 1:
309
+ print(f"Warning: Failed to load pickle file (attempt {attempt + 1}/{max_retries}): {e}")
310
+ print(f"File path: {mapping_path}")
311
+
312
+ # Try backup file if main file is corrupted
313
+ if os.path.exists(backup_path) and os.path.getsize(backup_path) > 0:
314
+ print("Trying backup file...")
315
+ try:
316
+ with open(backup_path, 'rb') as f:
317
+ self.mapping_data = pickle.load(f)
318
+ print(f"Successfully loaded mapping data from backup with {len(self.mapping_data.get('mapping', []))} samples")
319
+ break
320
+ except Exception as backup_e:
321
+ print(f"Backup file also failed: {backup_e}")
322
+
323
+ print("Retrying...")
324
+ time.sleep(1) # Wait a bit before retrying
325
+ else:
326
+ print(f"Error: Failed to load pickle file after {max_retries} attempts: {e}")
327
+ print(f"File path: {mapping_path}")
328
+ print("Please check if the file is corrupted or incomplete.")
329
+ print("You may need to regenerate the cache files.")
330
+ raise
331
+
332
+ # Update paths from test to test_clip if needed
333
+ if self.loader_type == "test" and self.args.test_clip:
334
+ updated_paths = []
335
+ for path in self.mapping_data['db_paths']:
336
+ updated_path = path.replace("test/", "test_clip/")
337
+ updated_paths.append(updated_path)
338
+ self.mapping_data['db_paths'] = updated_paths
339
+
340
+ # In DDP mode, avoid modifying shared files to prevent race conditions
341
+ # Instead, just update the in-memory data
342
+ print(f"Updated test paths for test_clip mode (avoiding file modification in DDP)")
343
+
344
+ self.n_samples = len(self.mapping_data['mapping'])
345
+
346
+ def get_lmdb_env(self, db_idx):
347
+ """Get LMDB environment for given database index."""
348
+ if db_idx not in self.lmdb_envs:
349
+ db_path = self.mapping_data['db_paths'][db_idx]
350
+ self.lmdb_envs[db_idx] = lmdb.open(db_path, readonly=True, lock=False)
351
+ return self.lmdb_envs[db_idx]
352
+
353
+ def __len__(self):
354
+ """Return the total number of samples in the dataset."""
355
+ return self.n_samples
356
+
357
+ def __getitem__(self, idx):
358
+ """Get a single sample from the dataset."""
359
+ db_idx = self.mapping_data['mapping'][idx]
360
+ lmdb_env = self.get_lmdb_env(db_idx)
361
+
362
+ with lmdb_env.begin(write=False) as txn:
363
+ key = "{:008d}".format(idx).encode("ascii")
364
+ sample = txn.get(key)
365
+ sample = pickle.loads(sample)
366
+
367
+ tar_pose, in_audio, in_facial, in_shape, in_word, emo, sem, vid, trans, trans_v, audio_name = sample
368
+
369
+ # Convert data to tensors with appropriate types
370
+ processed_data = self._convert_to_tensors(
371
+ tar_pose, in_audio, in_facial, in_shape, in_word,
372
+ emo, sem, vid, trans, trans_v
373
+ )
374
+
375
+ processed_data['audio_name'] = audio_name
376
+ return processed_data
377
+
378
+ def _convert_to_tensors(self, tar_pose, in_audio, in_facial, in_shape, in_word,
379
+ emo, sem, vid, trans, trans_v):
380
+ """Convert numpy arrays to tensors with appropriate types."""
381
+ data = {
382
+ 'emo': torch.from_numpy(emo).int(),
383
+ 'sem': torch.from_numpy(sem).float(),
384
+ 'audio_onset': torch.from_numpy(in_audio).float(),
385
+ 'word': torch.from_numpy(in_word).int()
386
+ }
387
+
388
+ if self.loader_type == "test":
389
+ data.update({
390
+ 'pose': torch.from_numpy(tar_pose).float(),
391
+ 'trans': torch.from_numpy(trans).float(),
392
+ 'trans_v': torch.from_numpy(trans_v).float(),
393
+ 'facial': torch.from_numpy(in_facial).float(),
394
+ 'id': torch.from_numpy(vid).float(),
395
+ 'beta': torch.from_numpy(in_shape).float()
396
+ })
397
+ else:
398
+ data.update({
399
+ 'pose': torch.from_numpy(tar_pose).reshape((tar_pose.shape[0], -1)).float(),
400
+ 'trans': torch.from_numpy(trans).reshape((trans.shape[0], -1)).float(),
401
+ 'trans_v': torch.from_numpy(trans_v).reshape((trans_v.shape[0], -1)).float(),
402
+ 'facial': torch.from_numpy(in_facial).reshape((in_facial.shape[0], -1)).float(),
403
+ 'id': torch.from_numpy(vid).reshape((vid.shape[0], -1)).float(),
404
+ 'beta': torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
405
+ })
406
+
407
+ return data
408
+
409
+ def regenerate_cache_if_corrupted(self):
410
+ """Regenerate cache if the pickle file is corrupted."""
411
+ mapping_path = os.path.join(self.preloaded_dir, "sample_db_mapping.pkl")
412
+
413
+ if os.path.exists(mapping_path):
414
+ try:
415
+ # Try to load the file to check if it's corrupted
416
+ with open(mapping_path, 'rb') as f:
417
+ test_data = pickle.load(f)
418
+ return False # File is not corrupted
419
+ except (EOFError, pickle.UnpicklingError):
420
+ print(f"Detected corrupted pickle file: {mapping_path}")
421
+ print("Regenerating cache...")
422
+
423
+ # Remove corrupted file
424
+ os.remove(mapping_path)
425
+
426
+ # Regenerate cache
427
+ self.build_cache(self.preloaded_dir)
428
+ return True
429
+
430
+ return False
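Corrupted mapping pickles are usually the result of a writer being interrupted mid-dump; writing to a temporary file and renaming it atomically avoids the problem at the source. A hedged sketch of that pattern (the helper name is made up and is not called anywhere in this file):

import os
import pickle
import tempfile

def atomic_pickle_dump(obj, path):
    # Write to a temp file in the same directory, then atomically replace the target.
    dir_name = os.path.dirname(path) or "."
    fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
        os.replace(tmp_path, path)  # atomic on POSIX and Windows
    except BaseException:
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise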
dataloaders/beat_sep_single.py ADDED
@@ -0,0 +1,693 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import textgrid as tg
8
+ import pandas as pd
9
+ import torch
10
+ import glob
11
+ import json
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ #import pyarrow
18
+ import pickle
19
+ import librosa
20
+ import smplx
21
+
22
+ from .build_vocab import Vocab
23
+ from models.utils.wav2vec import Wav2Vec2Model
24
+ from .data_tools import joints_list
25
+ from .utils import rotation_conversions as rc
26
+ from .utils import other_tools
27
+
28
+ import torch.nn.functional as F
29
+
30
+
31
+ class _FallbackLangModel:
32
+ """Minimal vocabulary that grows on demand for demo/test mode."""
33
+
34
+ def __init__(self) -> None:
35
+ self.PAD_token = 0
36
+ self.UNK_token = 1
37
+ self._word_to_idx = {"<pad>": self.PAD_token, "<unk>": self.UNK_token}
38
+ self.word_embedding_weights = np.zeros((2, 300), dtype=np.float32)
39
+
40
+ def get_word_index(self, word: str) -> int:
41
+ if word is None:
42
+ return self.UNK_token
43
+ cleaned = word.strip().lower()
44
+ if not cleaned:
45
+ return self.PAD_token
46
+ return self._word_to_idx["<unk>"]
47
+
48
+
49
+ class CustomDataset(Dataset):
50
+ def __init__(self, args, loader_type, augmentation=None, kwargs=None, build_cache=True):
51
+ self.audio_file_path = args.audio_file_path
52
+ self.textgrid_file_path = args.textgrid_file_path
53
+ self.default_pose_file = "./demo/examples/2_scott_0_1_1.npz"
54
+
55
+ self.args = args
56
+ self.loader_type = loader_type
57
+
58
+ self.rank = 0
59
+ self.ori_stride = self.args.stride
60
+ self.ori_length = self.args.pose_length
61
+ self.alignment = [0,0] # for trinity
62
+
63
+ self.ori_joint_list = joints_list[self.args.ori_joints]
64
+ self.tar_joint_list = joints_list[self.args.tar_joints]
65
+ if 'smplx' in self.args.pose_rep:
66
+ self.joint_mask = np.zeros(len(list(self.ori_joint_list.keys()))*3)
67
+ self.joints = len(list(self.tar_joint_list.keys()))
68
+ for joint_name in self.tar_joint_list:
69
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
70
+ else:
71
+ self.joints = len(list(self.ori_joint_list.keys()))+1
72
+ self.joint_mask = np.zeros(self.joints*3)
73
+ for joint_name in self.tar_joint_list:
74
+ if joint_name == "Hips":
75
+ self.joint_mask[3:6] = 1
76
+ else:
77
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
78
+ # select trainable joints
79
+ self.smplx = smplx.create(
80
+ self.args.data_path_1+"smplx_models/",
81
+ model_type='smplx',
82
+ gender='NEUTRAL_2020',
83
+ use_face_contour=False,
84
+ num_betas=300,
85
+ num_expression_coeffs=100,
86
+ ext='npz',
87
+ use_pca=False,
88
+ ).eval()
89
+
90
+ if loader_type == 'test':
91
+ # In demo/test mode, skip dataset CSV and use provided paths
92
+ self.selected_file = pd.DataFrame([{
93
+ 'id': 'demo_0',
94
+ 'audio_path': self.args.audio_file_path or './demo/examples/2_scott_0_1_1.wav',
95
+ 'textgrid_path': self.args.textgrid_file_path or None,
96
+ 'pose_path': self.default_pose_file,
97
+ }])
98
+ else:
99
+ split_rule = pd.read_csv(args.data_path+"train_test_split.csv")
100
+ self.selected_file = split_rule.loc[(split_rule['type'] == loader_type) & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
101
+ if args.additional_data and loader_type == 'train':
102
+ split_b = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
103
+ self.selected_file = pd.concat([self.selected_file, split_b])
104
+ if self.selected_file.empty:
105
+ logger.warning(f"{loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
106
+ self.selected_file = split_rule.loc[(split_rule['type'] == 'train') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
107
+ self.selected_file = self.selected_file.iloc[0:8]
108
+ self.data_dir = args.data_path
109
+
110
+ if loader_type == "test":
111
+ self.args.multi_length_training = [1.0]
112
+ self.max_length = int(args.pose_length * self.args.multi_length_training[-1])
113
+ self.max_audio_pre_len = math.floor(args.pose_length / args.pose_fps * self.args.audio_sr)
114
+ if self.max_audio_pre_len > self.args.test_length*self.args.audio_sr:
115
+ self.max_audio_pre_len = self.args.test_length*self.args.audio_sr
116
+
117
+ if args.word_rep is not None:
118
+ vocab_path = f"{args.data_path}weights/vocab.pkl"
119
+ if loader_type == 'test':
120
+ logger.info("Instantiating fallback vocabulary for test loader")
121
+ self.lang_model = _FallbackLangModel()
122
+ elif os.path.exists(vocab_path):
123
+ with open(vocab_path, 'rb') as f:
124
+ self.lang_model = pickle.load(f)
125
+ else:
126
+ logger.warning(f"vocab.pkl not found at {vocab_path}, using fallback vocabulary")
127
+ self.lang_model = _FallbackLangModel()
128
+ else:
129
+ self.lang_model = None
130
+
131
+ preloaded_dir = self.args.tmp_dir+'/' + loader_type + f"/{args.pose_rep}_cache"
132
+
133
+ if self.args.beat_align and loader_type != 'test':
134
+ if not os.path.exists(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy"):
135
+ self.calculate_mean_velocity(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
136
+ self.avg_vel = np.load(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
137
+ else:
138
+ self.avg_vel = None
139
+
140
+ if build_cache and self.rank == 0:
141
+ self.build_cache(preloaded_dir)
142
+ self.lmdb_env = lmdb.open(preloaded_dir, readonly=True, lock=False)
143
+ with self.lmdb_env.begin() as txn:
144
+ self.n_samples = txn.stat()["entries"]
145
+
146
+
147
+
148
+
149
+ def calculate_mean_velocity(self, save_path):
150
+ # Stub for demo mode: write zero velocity to avoid heavy computation
151
+ avg_vel = np.zeros(55)
152
+ np.save(save_path, avg_vel)
153
+
154
+
155
+ def build_cache(self, preloaded_dir):
156
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
157
+ logger.info("Reading data '{}'...".format(self.data_dir))
158
+ logger.info("Creating the dataset cache...")
159
+ if self.args.new_cache:
160
+ if os.path.exists(preloaded_dir):
161
+ shutil.rmtree(preloaded_dir)
162
+ if os.path.exists(preloaded_dir):
163
+ logger.info("Found the cache {}".format(preloaded_dir))
164
+ elif self.loader_type == "test":
165
+ self.cache_generation(
166
+ preloaded_dir, True,
167
+ 0, 0,
168
+ is_test=True)
169
+ else:
170
+ self.cache_generation(
171
+ preloaded_dir, self.args.disable_filtering,
172
+ self.args.clean_first_seconds, self.args.clean_final_seconds,
173
+ is_test=False)
174
+
175
+ def __len__(self):
176
+ return self.n_samples
177
+
178
+
179
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
180
+ # if "wav2vec2" in self.args.audio_rep:
181
+ # self.wav2vec_model = Wav2Vec2Model.from_pretrained(f"{self.args.data_path_1}/hub/transformer/wav2vec2-base-960h")
182
+ # self.wav2vec_model.feature_extractor._freeze_parameters()
183
+ # self.wav2vec_model = self.wav2vec_model.cuda()
184
+ # self.wav2vec_model.eval()
185
+
186
+ self.n_out_samples = 0
187
+ # create db for samples
188
+ if not os.path.exists(out_lmdb_dir): os.makedirs(out_lmdb_dir)
189
+ dst_lmdb_env = lmdb.open(out_lmdb_dir, map_size= int(1024 ** 3 * 500))# 500G
190
+ n_filtered_out = defaultdict(int)
191
+
192
+
193
+ #f_name = file_name["id"]
194
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
195
+ pose_file = self.default_pose_file
196
+ pose_each_file = []
197
+ trans_each_file = []
198
+ trans_v_each_file = []
199
+ shape_each_file = []
200
+ audio_each_file = []
201
+ facial_each_file = []
202
+ word_each_file = []
203
+ emo_each_file = []
204
+ sem_each_file = []
205
+ vid_each_file = []
206
+ id_pose = "tmp" #1_wayne_0_1_1
207
+
208
+ logger.info(colored(f"# ---- Building cache for Pose {id_pose} ---- #", "blue"))
209
+ if "smplx" in self.args.pose_rep:
210
+ pose_data = np.load(pose_file, allow_pickle=True)
211
+ assert 30%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 30'
212
+ stride = int(30/self.args.pose_fps)
213
+ pose_each_file = pose_data["poses"][::stride]
214
+ trans_each_file = pose_data["trans"][::stride]
215
+ trans_each_file[:,0] = trans_each_file[:,0] - trans_each_file[0,0]
216
+ trans_each_file[:,2] = trans_each_file[:,2] - trans_each_file[0,2]
217
+ trans_v_each_file = np.zeros_like(trans_each_file)
218
+ trans_v_each_file[1:,0] = trans_each_file[1:,0] - trans_each_file[:-1,0]
219
+ trans_v_each_file[0,0] = trans_v_each_file[1,0]
220
+ trans_v_each_file[1:,2] = trans_each_file[1:,2] - trans_each_file[:-1,2]
221
+ trans_v_each_file[0,2] = trans_v_each_file[1,2]
222
+ trans_v_each_file[:,1] = trans_each_file[:,1]
223
+ shape_each_file = np.repeat(pose_data["betas"].reshape(1, 300), pose_each_file.shape[0], axis=0)
224
+
225
+ assert self.args.pose_fps == 30, "should 30"
226
+ m_data = np.load(pose_file, allow_pickle=True)
227
+ betas, poses, trans, exps = m_data["betas"], m_data["poses"], m_data["trans"], m_data["expressions"]
228
+ n, c = poses.shape[0], poses.shape[1]
229
+ betas = betas.reshape(1, 300)
230
+ betas = np.tile(betas, (n, 1))
231
+ betas = torch.from_numpy(betas).float()
232
+ poses = torch.from_numpy(poses.reshape(n, c)).float()
233
+ exps = torch.from_numpy(exps.reshape(n, 100)).float()
234
+ trans = torch.from_numpy(trans.reshape(n, 3)).float()
235
+            max_length = 128  # why is a max_length needed here?
236
+ s, r = n//max_length, n%max_length
237
+ #print(n, s, r)
238
+ all_tensor = []
239
+ for i in range(s):
240
+ with torch.no_grad():
241
+ joints = self.smplx(
242
+ betas=betas[i*max_length:(i+1)*max_length],
243
+ transl=trans[i*max_length:(i+1)*max_length],
244
+ expression=exps[i*max_length:(i+1)*max_length],
245
+ jaw_pose=poses[i*max_length:(i+1)*max_length, 66:69],
246
+ global_orient=poses[i*max_length:(i+1)*max_length,:3],
247
+ body_pose=poses[i*max_length:(i+1)*max_length,3:21*3+3],
248
+ left_hand_pose=poses[i*max_length:(i+1)*max_length,25*3:40*3],
249
+ right_hand_pose=poses[i*max_length:(i+1)*max_length,40*3:55*3],
250
+ return_verts=True,
251
+ return_joints=True,
252
+ leye_pose=poses[i*max_length:(i+1)*max_length, 69:72],
253
+ reye_pose=poses[i*max_length:(i+1)*max_length, 72:75],
254
+ )['joints'][:, (7,8,10,11), :].reshape(max_length, 4, 3).cpu()
255
+ all_tensor.append(joints)
256
+ if r != 0:
257
+ with torch.no_grad():
258
+ joints = self.smplx(
259
+ betas=betas[s*max_length:s*max_length+r],
260
+ transl=trans[s*max_length:s*max_length+r],
261
+ expression=exps[s*max_length:s*max_length+r],
262
+ jaw_pose=poses[s*max_length:s*max_length+r, 66:69],
263
+ global_orient=poses[s*max_length:s*max_length+r,:3],
264
+ body_pose=poses[s*max_length:s*max_length+r,3:21*3+3],
265
+ left_hand_pose=poses[s*max_length:s*max_length+r,25*3:40*3],
266
+ right_hand_pose=poses[s*max_length:s*max_length+r,40*3:55*3],
267
+ return_verts=True,
268
+ return_joints=True,
269
+ leye_pose=poses[s*max_length:s*max_length+r, 69:72],
270
+ reye_pose=poses[s*max_length:s*max_length+r, 72:75],
271
+ )['joints'][:, (7,8,10,11), :].reshape(r, 4, 3).cpu()
272
+ all_tensor.append(joints)
273
+ joints = torch.cat(all_tensor, axis=0) # all, 4, 3
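+ # the 4 tracked joints are the left/right ankles and feet (SMPL-X joint indices 7, 8, 10, 11), used below for foot-contact detection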
274
+ # print(joints.shape)
275
+ feetv = torch.zeros(joints.shape[1], joints.shape[0])
276
+ joints = joints.permute(1, 0, 2)
277
+ #print(joints.shape, feetv.shape)
278
+ feetv[:, :-1] = (joints[:, 1:] - joints[:, :-1]).norm(dim=-1)
279
+ #print(feetv.shape)
280
+ contacts = (feetv < 0.01).numpy().astype(float)
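+ # a foot joint whose per-frame displacement is below 0.01 (roughly 1 cm at the SMPL-X metre scale, 30 fps) is labelled as in ground contact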
281
+ # print(contacts.shape, contacts)
282
+ contacts = contacts.transpose(1, 0)
283
+ pose_each_file = pose_each_file * self.joint_mask
284
+ pose_each_file = pose_each_file[:, self.joint_mask.astype(bool)]
285
+ pose_each_file = np.concatenate([pose_each_file, contacts], axis=1)
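+ # final per-frame pose feature: masked axis-angle joint rotations concatenated with the 4 foot-contact flags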
286
+ # print(pose_each_file.shape)
287
+
288
+
289
+ if self.args.facial_rep is not None:
290
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
291
+ facial_each_file = pose_data["expressions"][::stride]
292
+ if self.args.facial_norm:
293
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
294
+
295
+ if self.args.id_rep is not None:
296
+ vid_each_file = np.repeat(np.array(int(999)-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
297
+
298
+ if self.args.audio_rep is not None:
299
+ logger.info(f"# ---- Building cache for Audio {id_pose} and Pose {id_pose} ---- #")
300
+ audio_file = self.audio_file_path
301
+ if not os.path.exists(audio_file):
302
+ logger.warning(f"# ---- file not found for Audio {id_pose}, skip all files with the same id ---- #")
303
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
304
+
305
+ audio_save_path = audio_file.replace("wave16k", "onset_amplitude").replace(".wav", ".npy")
306
+
307
+ if self.args.audio_rep == "onset+amplitude":
308
+ audio_each_file, sr = librosa.load(audio_file)
309
+ audio_each_file = librosa.resample(audio_each_file, orig_sr=sr, target_sr=self.args.audio_sr)
310
+ from numpy.lib import stride_tricks
311
+ frame_length = 1024
312
+ # hop_length = 512
313
+ shape = (audio_each_file.shape[-1] - frame_length + 1, frame_length)
314
+ strides = (audio_each_file.strides[-1], audio_each_file.strides[-1])
315
+ rolling_view = stride_tricks.as_strided(audio_each_file, shape=shape, strides=strides)
316
+ amplitude_envelope = np.max(np.abs(rolling_view), axis=1)
317
+ # pad the last frame_length-1 samples
318
+ amplitude_envelope = np.pad(amplitude_envelope, (0, frame_length-1), mode='constant', constant_values=amplitude_envelope[-1])
319
+ audio_onset_f = librosa.onset.onset_detect(y=audio_each_file, sr=self.args.audio_sr, units='frames')
320
+ onset_array = np.zeros(len(audio_each_file), dtype=float)
321
+ onset_array[audio_onset_f] = 1.0
322
+ # print(amplitude_envelope.shape, audio_each_file.shape, onset_array.shape)
323
+ audio_each_file = np.concatenate([amplitude_envelope.reshape(-1, 1), onset_array.reshape(-1, 1)], axis=1)
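+ # resulting audio feature has shape (n_audio_samples, 2): amplitude envelope and a binary onset flag, still at the audio sample rate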
324
+
325
+
326
+ elif self.args.audio_rep == "mfcc":
327
+ audio_each_file = librosa.feature.melspectrogram(y=audio_each_file, sr=self.args.audio_sr, n_mels=128, hop_length=int(self.args.audio_sr/self.args.audio_fps))
328
+ audio_each_file = audio_each_file.transpose(1, 0)
329
+ # print(audio_each_file.shape, pose_each_file.shape)
330
+ if self.args.audio_norm and self.args.audio_rep == "wave16k":
331
+ audio_each_file = (audio_each_file - self.mean_audio) / self.std_audio
332
+
333
+ time_offset = 0
334
+ if self.args.word_rep is not None:
335
+ logger.info(f"# ---- Building cache for Word {id_pose} and Pose {id_pose} ---- #")
336
+ word_file = self.textgrid_file_path
337
+ if not os.path.exists(word_file):
338
+ logger.warning(f"# ---- file not found for Word {id_pose}, skip all files with the same id ---- #")
339
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
340
+ word_save_path = f"{self.data_dir}{self.args.t_pre_encoder}/{id_pose}.npy"
341
+
342
+ tgrid = tg.TextGrid.fromFile(word_file)
343
+
344
+ for i in range(pose_each_file.shape[0]):
345
+ found_flag = False
346
+ current_time = i/self.args.pose_fps + time_offset
347
+ j_last = 0
348
+ for j, word in enumerate(tgrid[0]):
349
+ word_n, word_s, word_e = word.mark, word.minTime, word.maxTime
350
+ if word_s<=current_time and current_time<=word_e:
351
+ if word_n == " ":
352
+ word_each_file.append(self.lang_model.PAD_token)
353
+ else:
354
+ word_each_file.append(self.lang_model.get_word_index(word_n))
355
+ found_flag = True
356
+ j_last = j
357
+ break
358
+ else: continue
359
+ if not found_flag:
360
+ word_each_file.append(self.lang_model.UNK_token)
361
+ word_each_file = np.array(word_each_file)
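+ # one vocabulary index per pose frame, aligned via the TextGrid intervals: PAD for silence marks, UNK when no interval covers the frame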
362
+
363
+
364
+
365
+ if self.args.emo_rep is not None:
366
+ logger.info(f"# ---- Building cache for Emo {id_pose} and Pose {id_pose} ---- #")
367
+ rtype, start = int(id_pose.split('_')[3]), int(id_pose.split('_')[3])
368
+ if rtype == 0 or rtype == 2 or rtype == 4 or rtype == 6:
369
+ if start >= 1 and start <= 64:
370
+ score = 0
371
+ elif start >= 65 and start <= 72:
372
+ score = 1
373
+ elif start >= 73 and start <= 80:
374
+ score = 2
375
+ elif start >= 81 and start <= 86:
376
+ score = 3
377
+ elif start >= 87 and start <= 94:
378
+ score = 4
379
+ elif start >= 95 and start <= 102:
380
+ score = 5
381
+ elif start >= 103 and start <= 110:
382
+ score = 6
383
+ elif start >= 111 and start <= 118:
384
+ score = 7
385
+ else: pass
386
+ else:
387
+ # you may denote as unknown in the future
388
+ score = 0
389
+ emo_each_file = np.repeat(np.array(score).reshape(1, 1), pose_each_file.shape[0], axis=0)
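+ # a single clip-level emotion label (0-7) broadcast to every frame of the sequence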
390
+ #print(emo_each_file)
391
+
392
+ if self.args.sem_rep is not None:
393
+ logger.info(f"# ---- Building cache for Sem {id_pose} and Pose {id_pose} ---- #")
394
+ sem_file = f"{self.data_dir}{self.args.sem_rep}/{id_pose}.txt"
395
+ sem_all = pd.read_csv(sem_file,
396
+ sep='\t',
397
+ names=["name", "start_time", "end_time", "duration", "score", "keywords"])
398
+ # we adopt motion-level semantic score here.
399
+ for i in range(pose_each_file.shape[0]):
400
+ found_flag = False
401
+ for j, (start, end, score) in enumerate(zip(sem_all['start_time'],sem_all['end_time'], sem_all['score'])):
402
+ current_time = i/self.args.pose_fps + time_offset
403
+ if start<=current_time and current_time<=end:
404
+ sem_each_file.append(score)
405
+ found_flag=True
406
+ break
407
+ else: continue
408
+ if not found_flag: sem_each_file.append(0.)
409
+ sem_each_file = np.array(sem_each_file)
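+ # per-frame semantic relevance score taken from the annotation intervals; frames not covered by any interval get 0.0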
410
+ #print(sem_each_file)
411
+
412
+ filtered_result = self._sample_from_clip(
413
+ dst_lmdb_env,
414
+ audio_each_file, pose_each_file, trans_each_file, trans_v_each_file,shape_each_file, facial_each_file, word_each_file,
415
+ vid_each_file, emo_each_file, sem_each_file,
416
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
417
+ )
418
+ for type in filtered_result.keys():
419
+ n_filtered_out[type] += filtered_result[type]
420
+
421
+
422
+
423
+
424
+ #### ---------for_end------------ ####
425
+ with dst_lmdb_env.begin() as txn:
426
+ logger.info(colored(f"no. of samples: {txn.stat()['entries']}", "cyan"))
427
+ n_total_filtered = 0
428
+ for type, n_filtered in n_filtered_out.items():
429
+ logger.info("{}: {}".format(type, n_filtered))
430
+ n_total_filtered += n_filtered
431
+ logger.info(colored("no. of excluded samples: {} ({:.1f}%)".format(
432
+ n_total_filtered, 100 * n_total_filtered / (txn.stat()["entries"] + n_total_filtered)), "cyan"))
433
+ dst_lmdb_env.sync()
434
+ dst_lmdb_env.close()
435
+
436
+ def _sample_from_clip(
437
+ self, dst_lmdb_env, audio_each_file, pose_each_file, trans_each_file, trans_v_each_file,shape_each_file, facial_each_file, word_each_file,
438
+ vid_each_file, emo_each_file, sem_each_file,
439
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
440
+ ):
441
+ """
442
+ for data cleaning, we ignore the data for first and final n s
443
+ for test, we return all data
444
+ """
445
+ # audio_start = int(self.alignment[0] * self.args.audio_fps)
446
+ # pose_start = int(self.alignment[1] * self.args.pose_fps)
447
+ #logger.info(f"before: {audio_each_file.shape} {pose_each_file.shape}")
448
+ # audio_each_file = audio_each_file[audio_start:]
449
+ # pose_each_file = pose_each_file[pose_start:]
450
+ # trans_each_file =
451
+ #logger.info(f"after alignment: {audio_each_file.shape} {pose_each_file.shape}")
452
+ #print(pose_each_file.shape)
453
+ round_seconds_skeleton = pose_each_file.shape[0] // self.args.pose_fps # assume 1500 frames / 15 fps = 100 s
454
+ #print(round_seconds_skeleton)
455
+ if audio_each_file is not None:
456
+ if self.args.audio_rep != "wave16k":
457
+ round_seconds_audio = len(audio_each_file) // self.args.audio_fps # assume 16,000,00 / 16,000 = 100 s
458
+ elif self.args.audio_rep == "mfcc":
459
+ round_seconds_audio = audio_each_file.shape[0] // self.args.audio_fps
460
+ else:
461
+ round_seconds_audio = audio_each_file.shape[0] // self.args.audio_sr
462
+ if facial_each_file is not None:
463
+ round_seconds_facial = facial_each_file.shape[0] // self.args.pose_fps
464
+ logger.info(f"audio: {round_seconds_audio}s, pose: {round_seconds_skeleton}s, facial: {round_seconds_facial}s")
465
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
466
+ max_round = max(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
467
+ if round_seconds_skeleton != max_round:
468
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
469
+ else:
470
+ logger.info(f"pose: {round_seconds_skeleton}s, audio: {round_seconds_audio}s")
471
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton)
472
+ max_round = max(round_seconds_audio, round_seconds_skeleton)
473
+ if round_seconds_skeleton != max_round:
474
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
475
+
476
+ clip_s_t, clip_e_t = clean_first_seconds, round_seconds_skeleton - clean_final_seconds # assume [10, 90]s
477
+ clip_s_f_audio, clip_e_f_audio = self.args.audio_fps * clip_s_t, clip_e_t * self.args.audio_fps # [160,000,90*160,000]
478
+ clip_s_f_pose, clip_e_f_pose = clip_s_t * self.args.pose_fps, clip_e_t * self.args.pose_fps # [150,90*15]
479
+
480
+
481
+ for ratio in self.args.multi_length_training:
482
+ if is_test:# stride = length for test
483
+ cut_length = clip_e_f_pose - clip_s_f_pose
484
+ self.args.stride = cut_length
485
+ self.max_length = cut_length
486
+ else:
487
+ self.args.stride = int(ratio*self.ori_stride)
488
+ cut_length = int(self.ori_length*ratio)
489
+
490
+ num_subdivision = math.floor((clip_e_f_pose - clip_s_f_pose - cut_length) / self.args.stride) + 1
491
+ logger.info(f"pose from frame {clip_s_f_pose} to {clip_e_f_pose}, length {cut_length}")
492
+ logger.info(f"{num_subdivision} clips is expected with stride {self.args.stride}")
493
+
494
+ if audio_each_file is not None:
495
+ audio_short_length = math.floor(cut_length / self.args.pose_fps * self.args.audio_fps)
496
+ """
497
+ for audio sr = 16000, fps = 15, pose_length = 34,
498
+ audio short length = 36266.7 -> 36266
499
+ this error is fine.
500
+ """
501
+ logger.info(f"audio from frame {clip_s_f_audio} to {clip_e_f_audio}, length {audio_short_length}")
502
+
503
+ n_filtered_out = defaultdict(int)
504
+ sample_pose_list = []
505
+ sample_audio_list = []
506
+ sample_facial_list = []
507
+ sample_shape_list = []
508
+ sample_word_list = []
509
+ sample_emo_list = []
510
+ sample_sem_list = []
511
+ sample_vid_list = []
512
+ sample_trans_list = []
513
+ sample_trans_v_list = []
514
+
515
+ for i in range(num_subdivision): # cut into around 2s chip, (self npose)
516
+ start_idx = clip_s_f_pose + i * self.args.stride
517
+ fin_idx = start_idx + cut_length
518
+ sample_pose = pose_each_file[start_idx:fin_idx]
519
+
520
+ sample_trans = trans_each_file[start_idx:fin_idx]
521
+ sample_trans_v = trans_v_each_file[start_idx:fin_idx]
522
+ sample_shape = shape_each_file[start_idx:fin_idx]
523
+ # print(sample_pose.shape)
524
+ if self.args.audio_rep is not None:
525
+ audio_start = clip_s_f_audio + math.floor(i * self.args.stride * self.args.audio_fps / self.args.pose_fps)
526
+ audio_end = audio_start + audio_short_length
527
+ sample_audio = audio_each_file[audio_start:audio_end]
528
+ else:
529
+ sample_audio = np.array([-1])
530
+ sample_facial = facial_each_file[start_idx:fin_idx] if self.args.facial_rep is not None else np.array([-1])
531
+ sample_word = word_each_file[start_idx:fin_idx] if self.args.word_rep is not None else np.array([-1])
532
+ sample_emo = emo_each_file[start_idx:fin_idx] if self.args.emo_rep is not None else np.array([-1])
533
+ sample_sem = sem_each_file[start_idx:fin_idx] if self.args.sem_rep is not None else np.array([-1])
534
+ sample_vid = vid_each_file[start_idx:fin_idx] if self.args.id_rep is not None else np.array([-1])
535
+
536
+ if sample_pose.any() != None:
537
+ # filtering motion skeleton data
538
+ sample_pose, filtering_message = MotionPreprocessor(sample_pose).get()
539
+ is_correct_motion = (sample_pose is not None)
540
+ if is_correct_motion or disable_filtering:
541
+ sample_pose_list.append(sample_pose)
542
+ sample_audio_list.append(sample_audio)
543
+ sample_facial_list.append(sample_facial)
544
+ sample_shape_list.append(sample_shape)
545
+ sample_word_list.append(sample_word)
546
+ sample_vid_list.append(sample_vid)
547
+ sample_emo_list.append(sample_emo)
548
+ sample_sem_list.append(sample_sem)
549
+ sample_trans_list.append(sample_trans)
550
+ sample_trans_v_list.append(sample_trans_v)
551
+ else:
552
+ n_filtered_out[filtering_message] += 1
553
+
554
+ if len(sample_pose_list) > 0:
555
+ with dst_lmdb_env.begin(write=True) as txn:
556
+ for pose, audio, facial, shape, word, vid, emo, sem, trans,trans_v in zip(
557
+ sample_pose_list,
558
+ sample_audio_list,
559
+ sample_facial_list,
560
+ sample_shape_list,
561
+ sample_word_list,
562
+ sample_vid_list,
563
+ sample_emo_list,
564
+ sample_sem_list,
565
+ sample_trans_list,
566
+ sample_trans_v_list,):
567
+ k = "{:005}".format(self.n_out_samples).encode("ascii")
568
+ v = [pose, audio, facial, shape, word, emo, sem, vid, trans,trans_v]
569
+ v = pickle.dumps(v,5)
570
+ txn.put(k, v)
571
+ self.n_out_samples += 1
572
+ return n_filtered_out
573
+
574
+ def __getitem__(self, idx):
575
+ with self.lmdb_env.begin(write=False) as txn:
576
+ key = "{:005}".format(idx).encode("ascii")
577
+ sample = txn.get(key)
578
+ sample = pickle.loads(sample)
579
+ tar_pose, in_audio, in_facial, in_shape, in_word, emo, sem, vid, trans,trans_v = sample
580
+ #print(in_shape)
581
+ #vid = torch.from_numpy(vid).int()
582
+ emo = torch.from_numpy(emo).int()
583
+ sem = torch.from_numpy(sem).float()
584
+ in_audio = torch.from_numpy(in_audio).float()
585
+ in_word = torch.from_numpy(in_word).float() if self.args.word_cache else torch.from_numpy(in_word).int()
586
+ if self.loader_type == "test":
587
+ tar_pose = torch.from_numpy(tar_pose).float()
588
+ trans = torch.from_numpy(trans).float()
589
+ trans_v = torch.from_numpy(trans_v).float()
590
+ in_facial = torch.from_numpy(in_facial).float()
591
+ vid = torch.from_numpy(vid).float()
592
+ in_shape = torch.from_numpy(in_shape).float()
593
+ else:
594
+ in_shape = torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
595
+ trans = torch.from_numpy(trans).reshape((trans.shape[0], -1)).float()
596
+ trans_v = torch.from_numpy(trans_v).reshape((trans_v.shape[0], -1)).float()
597
+ vid = torch.from_numpy(vid).reshape((vid.shape[0], -1)).float()
598
+ tar_pose = torch.from_numpy(tar_pose).reshape((tar_pose.shape[0], -1)).float()
599
+ in_facial = torch.from_numpy(in_facial).reshape((in_facial.shape[0], -1)).float()
600
+ return {"pose":tar_pose, "audio":in_audio, "facial":in_facial, "beta": in_shape, "word":in_word, "id":vid, "emo":emo, "sem":sem, "trans":trans,"trans_v":trans_v}
601
+
602
+
603
+ class MotionPreprocessor:
604
+ def __init__(self, skeletons):
605
+ self.skeletons = skeletons
606
+ #self.mean_pose = mean_pose
607
+ self.filtering_message = "PASS"
608
+
609
+ def get(self):
610
+ assert (self.skeletons is not None)
611
+
612
+ # filtering
613
+ if self.skeletons is not None:
614
+ if self.check_pose_diff():
615
+ self.skeletons = []
616
+ self.filtering_message = "pose"
617
+ # elif self.check_spine_angle():
618
+ # self.skeletons = []
619
+ # self.filtering_message = "spine angle"
620
+ # elif self.check_static_motion():
621
+ # self.skeletons = []
622
+ # self.filtering_message = "motion"
623
+
624
+ # if self.skeletons is not None:
625
+ # self.skeletons = self.skeletons.tolist()
626
+ # for i, frame in enumerate(self.skeletons):
627
+ # assert not np.isnan(self.skeletons[i]).any() # missing joints
628
+
629
+ return self.skeletons, self.filtering_message
630
+
631
+ def check_static_motion(self, verbose=True):
632
+ def get_variance(skeleton, joint_idx):
633
+ wrist_pos = skeleton[:, joint_idx]
634
+ variance = np.sum(np.var(wrist_pos, axis=0))
635
+ return variance
636
+
637
+ left_arm_var = get_variance(self.skeletons, 6)
638
+ right_arm_var = get_variance(self.skeletons, 9)
639
+
640
+ th = 0.0014 # exclude 13110
641
+ # th = 0.002 # exclude 16905
642
+ if left_arm_var < th and right_arm_var < th:
643
+ if verbose:
644
+ print("skip - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
645
+ return True
646
+ else:
647
+ if verbose:
648
+ print("pass - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
649
+ return False
650
+
651
+
652
+ def check_pose_diff(self, verbose=False):
653
+ # diff = np.abs(self.skeletons - self.mean_pose) # 186*1
654
+ # diff = np.mean(diff)
655
+
656
+ # # th = 0.017
657
+ # th = 0.02 #0.02 # exclude 3594
658
+ # if diff < th:
659
+ # if verbose:
660
+ # print("skip - check_pose_diff {:.5f}".format(diff))
661
+ # return True
662
+ # # th = 3.5 #0.02 # exclude 3594
663
+ # # if 3.5 < diff < 5:
664
+ # # if verbose:
665
+ # # print("skip - check_pose_diff {:.5f}".format(diff))
666
+ # # return True
667
+ # else:
668
+ # if verbose:
669
+ # print("pass - check_pose_diff {:.5f}".format(diff))
670
+ return False
671
+
672
+
673
+ def check_spine_angle(self, verbose=True):
674
+ def angle_between(v1, v2):
675
+ v1_u = v1 / np.linalg.norm(v1)
676
+ v2_u = v2 / np.linalg.norm(v2)
677
+ return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))
678
+
679
+ angles = []
680
+ for i in range(self.skeletons.shape[0]):
681
+ spine_vec = self.skeletons[i, 1] - self.skeletons[i, 0]
682
+ angle = angle_between(spine_vec, [0, -1, 0])
683
+ angles.append(angle)
684
+
685
+ if np.rad2deg(max(angles)) > 30 or np.rad2deg(np.mean(angles)) > 20: # exclude 4495
686
+ # if np.rad2deg(max(angles)) > 20: # exclude 8270
687
+ if verbose:
688
+ print("skip - check_spine_angle {:.5f}, {:.5f}".format(max(angles), np.mean(angles)))
689
+ return True
690
+ else:
691
+ if verbose:
692
+ print("pass - check_spine_angle {:.5f}".format(max(angles)))
693
+ return False
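A minimal read-back sketch (assuming a hypothetical cache directory; the key format and value layout mirror the __getitem__ and _sample_from_clip code above):

import lmdb
import pickle

env = lmdb.open("datasets/cache/train/smplx_cache", readonly=True, lock=False)  # hypothetical cache path
with env.begin(write=False) as txn:
    raw = txn.get("{:005}".format(0).encode("ascii"))  # same zero-padded key scheme as __getitem__
# stored order matches the list written in _sample_from_clip
pose, audio, facial, shape, word, emo, sem, vid, trans, trans_v = pickle.loads(raw)
print(pose.shape, audio.shape, facial.shape)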
dataloaders/beat_smplx2020.py ADDED
@@ -0,0 +1,763 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import textgrid as tg
8
+ import pandas as pd
9
+ import torch
10
+ import glob
11
+ import json
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ import pyarrow
18
+ import librosa
19
+ import smplx
20
+
21
+ from .build_vocab import Vocab
22
+ from .utils.audio_features import Wav2Vec2Model
23
+ from .data_tools import joints_list
24
+ from .utils import rotation_conversions as rc
25
+ from .utils import other_tools
26
+
27
+ class CustomDataset(Dataset):
28
+ def __init__(self, args, loader_type, augmentation=None, kwargs=None, build_cache=True):
29
+ self.args = args
30
+ self.loader_type = loader_type
31
+
32
+ self.rank = dist.get_rank()
33
+ self.ori_stride = self.args.stride
34
+ self.ori_length = self.args.pose_length
35
+ self.alignment = [0,0] # for trinity
36
+
37
+ self.ori_joint_list = joints_list[self.args.ori_joints]
38
+ self.tar_joint_list = joints_list[self.args.tar_joints]
39
+ if 'smplx' in self.args.pose_rep:
40
+ self.joint_mask = np.zeros(len(list(self.ori_joint_list.keys()))*3)
41
+ self.joints = len(list(self.ori_joint_list.keys()))
42
+ for joint_name in self.tar_joint_list:
43
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
44
+ else:
45
+ self.joints = len(list(self.ori_joint_list.keys()))+1
46
+ self.joint_mask = np.zeros(self.joints*3)
47
+ for joint_name in self.tar_joint_list:
48
+ if joint_name == "Hips":
49
+ self.joint_mask[3:6] = 1
50
+ else:
51
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
52
+ # select trainable joints
53
+
54
+ split_rule = pd.read_csv(args.data_path+"train_test_split.csv")
55
+ self.selected_file = split_rule.loc[(split_rule['type'] == loader_type) & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
56
+ if args.additional_data and loader_type == 'train':
57
+ split_b = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
58
+ #self.selected_file = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
59
+ self.selected_file = pd.concat([self.selected_file, split_b])
60
+ if self.selected_file.empty:
61
+ logger.warning(f"{loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
62
+ self.selected_file = split_rule.loc[(split_rule['type'] == 'train') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
63
+ self.selected_file = self.selected_file.iloc[0:8]
64
+ self.data_dir = args.data_path
65
+
66
+ if loader_type == "test":
67
+ self.args.multi_length_training = [1.0]
68
+ self.max_length = int(args.pose_length * self.args.multi_length_training[-1])
69
+ self.max_audio_pre_len = math.floor(args.pose_length / args.pose_fps * self.args.audio_sr)
70
+ if self.max_audio_pre_len > self.args.test_length*self.args.audio_sr:
71
+ self.max_audio_pre_len = self.args.test_length*self.args.audio_sr
72
+
73
+ if args.word_rep is not None:
74
+ with open(f"{args.data_path}weights/vocab.pkl", 'rb') as f:
75
+ self.lang_model = pickle.load(f)
76
+
77
+ preloaded_dir = self.args.root_path + self.args.cache_path + loader_type + f"/{args.pose_rep}_cache"
78
+ # if args.pose_norm:
79
+ # # careful for rotation vectors
80
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy"):
81
+ # self.calculate_mean_pose()
82
+ # self.mean_pose = np.load(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy")
83
+ # self.std_pose = np.load(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_std.npy")
84
+ # if args.audio_norm:
85
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/bvh_mean.npy"):
86
+ # self.calculate_mean_audio()
87
+ # self.mean_audio = np.load(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/npy_mean.npy")
88
+ # self.std_audio = np.load(args.data_path+args.mean_pose_path+f"{args.audio_rep.split('_')[0]}/npy_std.npy")
89
+ # if args.facial_norm:
90
+ # if not os.path.exists(args.data_path+args.mean_pose_path+f"{args.pose_rep.split('_')[0]}/bvh_mean.npy"):
91
+ # self.calculate_mean_face()
92
+ # self.mean_facial = np.load(args.data_path+args.mean_pose_path+f"{args.facial_rep}/json_mean.npy")
93
+ # self.std_facial = np.load(args.data_path+args.mean_pose_path+f"{args.facial_rep}/json_std.npy")
94
+ if self.args.beat_align:
95
+ if not os.path.exists(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy"):
96
+ self.calculate_mean_velocity(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
97
+ self.avg_vel = np.load(args.data_path+f"weights/mean_vel_{args.pose_rep}.npy")
98
+
99
+ if build_cache and self.rank == 0:
100
+ self.build_cache(preloaded_dir)
101
+ self.lmdb_env = lmdb.open(preloaded_dir, readonly=True, lock=False)
102
+ with self.lmdb_env.begin() as txn:
103
+ self.n_samples = txn.stat()["entries"]
104
+
105
+
106
+ def calculate_mean_velocity(self, save_path):
107
+ self.smplx = smplx.create(
108
+ self.args.data_path_1+"smplx_models/",
109
+ model_type='smplx',
110
+ gender='NEUTRAL_2020',
111
+ use_face_contour=False,
112
+ num_betas=300,
113
+ num_expression_coeffs=100,
114
+ ext='npz',
115
+ use_pca=False,
116
+ ).cuda().eval()
117
+ dir_p = self.data_dir + self.args.pose_rep + "/"
118
+ all_list = []
119
+ from tqdm import tqdm
120
+ for tar in tqdm(os.listdir(dir_p)):
121
+ if tar.endswith(".npz"):
122
+ m_data = np.load(dir_p+tar, allow_pickle=True)
123
+ betas, poses, trans, exps = m_data["betas"], m_data["poses"], m_data["trans"], m_data["expressions"]
124
+ n, c = poses.shape[0], poses.shape[1]
125
+ betas = betas.reshape(1, 300)
126
+ betas = np.tile(betas, (n, 1))
127
+ betas = torch.from_numpy(betas).cuda().float()
128
+ poses = torch.from_numpy(poses.reshape(n, c)).cuda().float()
129
+ exps = torch.from_numpy(exps.reshape(n, 100)).cuda().float()
130
+ trans = torch.from_numpy(trans.reshape(n, 3)).cuda().float()
131
+ max_length = 128
132
+ s, r = n//max_length, n%max_length
133
+ #print(n, s, r)
134
+ all_tensor = []
135
+ for i in range(s):
136
+ with torch.no_grad():
137
+ joints = self.smplx(
138
+ betas=betas[i*max_length:(i+1)*max_length],
139
+ transl=trans[i*max_length:(i+1)*max_length],
140
+ expression=exps[i*max_length:(i+1)*max_length],
141
+ jaw_pose=poses[i*max_length:(i+1)*max_length, 66:69],
142
+ global_orient=poses[i*max_length:(i+1)*max_length,:3],
143
+ body_pose=poses[i*max_length:(i+1)*max_length,3:21*3+3],
144
+ left_hand_pose=poses[i*max_length:(i+1)*max_length,25*3:40*3],
145
+ right_hand_pose=poses[i*max_length:(i+1)*max_length,40*3:55*3],
146
+ return_verts=True,
147
+ return_joints=True,
148
+ leye_pose=poses[i*max_length:(i+1)*max_length, 69:72],
149
+ reye_pose=poses[i*max_length:(i+1)*max_length, 72:75],
150
+ )['joints'][:, :55, :].reshape(max_length, 55*3)
151
+ all_tensor.append(joints)
152
+ if r != 0:
153
+ with torch.no_grad():
154
+ joints = self.smplx(
155
+ betas=betas[s*max_length:s*max_length+r],
156
+ transl=trans[s*max_length:s*max_length+r],
157
+ expression=exps[s*max_length:s*max_length+r],
158
+ jaw_pose=poses[s*max_length:s*max_length+r, 66:69],
159
+ global_orient=poses[s*max_length:s*max_length+r,:3],
160
+ body_pose=poses[s*max_length:s*max_length+r,3:21*3+3],
161
+ left_hand_pose=poses[s*max_length:s*max_length+r,25*3:40*3],
162
+ right_hand_pose=poses[s*max_length:s*max_length+r,40*3:55*3],
163
+ return_verts=True,
164
+ return_joints=True,
165
+ leye_pose=poses[s*max_length:s*max_length+r, 69:72],
166
+ reye_pose=poses[s*max_length:s*max_length+r, 72:75],
167
+ )['joints'][:, :55, :].reshape(r, 55*3)
168
+ all_tensor.append(joints)
169
+ joints = torch.cat(all_tensor, axis=0)
170
+ joints = joints.permute(1, 0)
171
+ dt = 1/30
172
+ # first steps is forward diff (t+1 - t) / dt
173
+ init_vel = (joints[:, 1:2] - joints[:, :1]) / dt
174
+ # middle steps are second order (t+1 - t-1) / 2dt
175
+ middle_vel = (joints[:, 2:] - joints[:, 0:-2]) / (2 * dt)
176
+ # last step is backward diff (t - t-1) / dt
177
+ final_vel = (joints[:, -1:] - joints[:, -2:-1]) / dt
178
+ #print(joints.shape, init_vel.shape, middle_vel.shape, final_vel.shape)
179
+ vel_seq = torch.cat([init_vel, middle_vel, final_vel], dim=1).permute(1, 0).reshape(n, 55, 3)
180
+ #print(vel_seq.shape)
181
+ #.permute(1, 0).reshape(n, 55, 3)
182
+ vel_seq_np = vel_seq.cpu().numpy()
183
+ vel_joints_np = np.linalg.norm(vel_seq_np, axis=2) # n * 55
184
+ all_list.append(vel_joints_np)
185
+ avg_vel = np.mean(np.concatenate(all_list, axis=0),axis=0) # 55
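+ # avg_vel: mean speed of each of the 55 SMPL-X joints over all frames and files; saved once and reloaded when args.beat_align is set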
186
+ np.save(save_path, avg_vel)
187
+
188
+
189
+ def build_cache(self, preloaded_dir):
190
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
191
+ logger.info("Reading data '{}'...".format(self.data_dir))
192
+ logger.info("Creating the dataset cache...")
193
+ if self.args.new_cache:
194
+ if os.path.exists(preloaded_dir):
195
+ shutil.rmtree(preloaded_dir)
196
+ if os.path.exists(preloaded_dir):
197
+ logger.info("Found the cache {}".format(preloaded_dir))
198
+ elif self.loader_type == "test":
199
+ self.cache_generation(
200
+ preloaded_dir, True,
201
+ 0, 0,
202
+ is_test=True)
203
+ else:
204
+ self.cache_generation(
205
+ preloaded_dir, self.args.disable_filtering,
206
+ self.args.clean_first_seconds, self.args.clean_final_seconds,
207
+ is_test=False)
208
+
209
+ def __len__(self):
210
+ return self.n_samples
211
+
212
+
213
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
214
+ # if "wav2vec2" in self.args.audio_rep:
215
+ # self.wav2vec_model = Wav2Vec2Model.from_pretrained(f"{self.args.data_path_1}/hub/transformer/wav2vec2-base-960h")
216
+ # self.wav2vec_model.feature_extractor._freeze_parameters()
217
+ # self.wav2vec_model = self.wav2vec_model.cuda()
218
+ # self.wav2vec_model.eval()
219
+
220
+ self.n_out_samples = 0
221
+ # create db for samples
222
+ if not os.path.exists(out_lmdb_dir): os.makedirs(out_lmdb_dir)
223
+ dst_lmdb_env = lmdb.open(out_lmdb_dir, map_size= int(1024 ** 3 * 50))# 50G
224
+ n_filtered_out = defaultdict(int)
225
+
226
+ for index, file_name in self.selected_file.iterrows():
227
+ f_name = file_name["id"]
228
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
229
+ pose_file = self.data_dir + self.args.pose_rep + "/" + f_name + ext
230
+ pose_each_file = []
231
+ trans_each_file = []
232
+ shape_each_file = []
233
+ audio_each_file = []
234
+ facial_each_file = []
235
+ word_each_file = []
236
+ emo_each_file = []
237
+ sem_each_file = []
238
+ vid_each_file = []
239
+ id_pose = f_name #1_wayne_0_1_1
240
+
241
+ logger.info(colored(f"# ---- Building cache for Pose {id_pose} ---- #", "blue"))
242
+ if "smplx" in self.args.pose_rep:
243
+ pose_data = np.load(pose_file, allow_pickle=True)
244
+ assert 30%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 30'
245
+ stride = int(30/self.args.pose_fps)
246
+ pose_each_file = pose_data["poses"][::stride] * self.joint_mask
247
+ trans_each_file = pose_data["trans"][::stride]
248
+ shape_each_file = np.repeat(pose_data["betas"].reshape(1, 300), pose_each_file.shape[0], axis=0)
249
+ if self.args.facial_rep is not None:
250
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
251
+ facial_each_file = pose_data["expressions"][::stride]
252
+ if self.args.facial_norm:
253
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
254
+
255
+ else:
256
+ assert 120%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 120'
257
+ stride = int(120/self.args.pose_fps)
258
+ with open(pose_file, "r") as pose_data:
259
+ for j, line in enumerate(pose_data.readlines()):
260
+ if j < 431: continue
261
+ if j%stride != 0:continue
262
+ data = np.fromstring(line, dtype=float, sep=" ")
263
+ rot_data = rc.euler_angles_to_matrix(torch.from_numpy(np.deg2rad(data)).reshape(-1, self.joints,3), "XYZ")
264
+ rot_data = rc.matrix_to_axis_angle(rot_data).reshape(-1, self.joints*3)
265
+ rot_data = rot_data.numpy() * self.joint_mask
266
+
267
+ pose_each_file.append(rot_data)
268
+ trans_each_file.append(data[:3])
269
+
270
+ pose_each_file = np.array(pose_each_file)
271
+ # print(pose_each_file.shape)
272
+ trans_each_file = np.array(trans_each_file)
273
+ shape_each_file = np.repeat(np.array(-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
274
+ if self.args.facial_rep is not None:
275
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
276
+ facial_file = pose_file.replace(self.args.pose_rep, self.args.facial_rep).replace("bvh", "json")
277
+ assert 60%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 120'
278
+ stride = int(60/self.args.pose_fps)
279
+ if not os.path.exists(facial_file):
280
+ logger.warning(f"# ---- file not found for Facial {id_pose}, skip all files with the same id ---- #")
281
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
282
+ continue
283
+ with open(facial_file, 'r') as facial_data_file:
284
+ facial_data = json.load(facial_data_file)
285
+ for j, frame_data in enumerate(facial_data['frames']):
286
+ if j%stride != 0:continue
287
+ facial_each_file.append(frame_data['weights'])
288
+ facial_each_file = np.array(facial_each_file)
289
+ if self.args.facial_norm:
290
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
291
+
292
+ if self.args.id_rep is not None:
293
+ vid_each_file = np.repeat(np.array(int(f_name.split("_")[0])-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
294
+
295
+ if self.args.audio_rep is not None:
296
+ logger.info(f"# ---- Building cache for Audio {id_pose} and Pose {id_pose} ---- #")
297
+ audio_file = pose_file.replace(self.args.pose_rep, 'wave16k').replace(ext, ".wav")
298
+ if not os.path.exists(audio_file):
299
+ logger.warning(f"# ---- file not found for Audio {id_pose}, skip all files with the same id ---- #")
300
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
301
+ continue
302
+ audio_each_file, sr = librosa.load(audio_file)
303
+ audio_each_file = librosa.resample(audio_each_file, orig_sr=sr, target_sr=self.args.audio_sr)
304
+ if self.args.audio_rep == "onset+amplitude":
305
+ from numpy.lib import stride_tricks
306
+ frame_length = 1024
307
+ # hop_length = 512
308
+ shape = (audio_each_file.shape[-1] - frame_length + 1, frame_length)
309
+ strides = (audio_each_file.strides[-1], audio_each_file.strides[-1])
310
+ rolling_view = stride_tricks.as_strided(audio_each_file, shape=shape, strides=strides)
311
+ amplitude_envelope = np.max(np.abs(rolling_view), axis=1)
312
+ # pad the last frame_length-1 samples
313
+ amplitude_envelope = np.pad(amplitude_envelope, (0, frame_length-1), mode='constant', constant_values=amplitude_envelope[-1])
314
+ audio_onset_f = librosa.onset.onset_detect(y=audio_each_file, sr=self.args.audio_sr, units='frames')
315
+ onset_array = np.zeros(len(audio_each_file), dtype=float)
316
+ onset_array[audio_onset_f] = 1.0
317
+ # print(amplitude_envelope.shape, audio_each_file.shape, onset_array.shape)
318
+ audio_each_file = np.concatenate([amplitude_envelope.reshape(-1, 1), onset_array.reshape(-1, 1)], axis=1)
319
+ elif self.args.audio_rep == "mfcc":
320
+ audio_each_file = librosa.feature.mfcc(y=audio_each_file, sr=self.args.audio_sr, n_mfcc=13, hop_length=int(self.args.audio_sr/self.args.audio_fps))
321
+
322
+ if self.args.audio_norm and self.args.audio_rep == "wave16k":
323
+ audio_each_file = (audio_each_file - self.mean_audio) / self.std_audio
324
+
325
+ time_offset = 0
326
+ if self.args.word_rep is not None:
327
+ logger.info(f"# ---- Building cache for Word {id_pose} and Pose {id_pose} ---- #")
328
+ word_file = f"{self.data_dir}{self.args.word_rep}/{id_pose}.TextGrid"
329
+ if not os.path.exists(word_file):
330
+ logger.warning(f"# ---- file not found for Word {id_pose}, skip all files with the same id ---- #")
331
+ self.selected_file = self.selected_file.drop(self.selected_file[self.selected_file['id'] == id_pose].index)
332
+ continue
333
+ tgrid = tg.TextGrid.fromFile(word_file)
334
+ if self.args.t_pre_encoder == "bert":
335
+ from transformers import AutoTokenizer, BertModel
336
+ tokenizer = AutoTokenizer.from_pretrained(self.args.data_path_1 + "hub/bert-base-uncased", local_files_only=True)
337
+ model = BertModel.from_pretrained(self.args.data_path_1 + "hub/bert-base-uncased", local_files_only=True).eval()
338
+ list_word = []
339
+ all_hidden = []
340
+ max_len = 400
341
+ last = 0
342
+ word_token_mapping = []
343
+ first = True
344
+ for i, word in enumerate(tgrid[0]):
345
+ last = i
346
+ if (i%max_len != 0) or (i==0):
347
+ if word.mark == "":
348
+ list_word.append(".")
349
+ else:
350
+ list_word.append(word.mark)
351
+ else:
352
+ max_counter = max_len
353
+ str_word = ' '.join(map(str, list_word))
354
+ if first:
355
+ global_len = 0
356
+ end = -1
357
+ offset_word = []
358
+ for k, wordvalue in enumerate(list_word):
359
+ start = end+1
360
+ end = start+len(wordvalue)
361
+ offset_word.append((start, end))
362
+ #print(offset_word)
363
+ token_scan = tokenizer.encode_plus(str_word, return_offsets_mapping=True)['offset_mapping']
364
+ #print(token_scan)
365
+ for start, end in offset_word:
366
+ sub_mapping = []
367
+ for i, (start_t, end_t) in enumerate(token_scan[1:-1]):
368
+ if int(start) <= int(start_t) and int(end_t) <= int(end):
369
+ #print(i+global_len)
370
+ sub_mapping.append(i+global_len)
371
+ word_token_mapping.append(sub_mapping)
372
+ #print(len(word_token_mapping))
373
+ global_len = word_token_mapping[-1][-1] + 1
374
+ list_word = []
375
+ if word.mark == "":
376
+ list_word.append(".")
377
+ else:
378
+ list_word.append(word.mark)
379
+
380
+ with torch.no_grad():
381
+ inputs = tokenizer(str_word, return_tensors="pt")
382
+ outputs = model(**inputs)
383
+ last_hidden_states = outputs.last_hidden_state.reshape(-1, 768).cpu().numpy()[1:-1, :]
384
+ all_hidden.append(last_hidden_states)
385
+
386
+ #list_word = list_word[:10]
387
+ if list_word == []:
388
+ pass
389
+ else:
390
+ if first:
391
+ global_len = 0
392
+ str_word = ' '.join(map(str, list_word))
393
+ end = -1
394
+ offset_word = []
395
+ for k, wordvalue in enumerate(list_word):
396
+ start = end+1
397
+ end = start+len(wordvalue)
398
+ offset_word.append((start, end))
399
+ #print(offset_word)
400
+ token_scan = tokenizer.encode_plus(str_word, return_offsets_mapping=True)['offset_mapping']
401
+ #print(token_scan)
402
+ for start, end in offset_word:
403
+ sub_mapping = []
404
+ for i, (start_t, end_t) in enumerate(token_scan[1:-1]):
405
+ if int(start) <= int(start_t) and int(end_t) <= int(end):
406
+ sub_mapping.append(i+global_len)
407
+ #print(sub_mapping)
408
+ word_token_mapping.append(sub_mapping)
409
+ #print(len(word_token_mapping))
410
+ with torch.no_grad():
411
+ inputs = tokenizer(str_word, return_tensors="pt")
412
+ outputs = model(**inputs)
413
+ last_hidden_states = outputs.last_hidden_state.reshape(-1, 768).cpu().numpy()[1:-1, :]
414
+ all_hidden.append(last_hidden_states)
415
+ last_hidden_states = np.concatenate(all_hidden, axis=0)
416
+
417
+ for i in range(pose_each_file.shape[0]):
418
+ found_flag = False
419
+ current_time = i/self.args.pose_fps + time_offset
420
+ j_last = 0
421
+ for j, word in enumerate(tgrid[0]):
422
+ word_n, word_s, word_e = word.mark, word.minTime, word.maxTime
423
+ if word_s<=current_time and current_time<=word_e:
424
+ if self.args.word_cache and self.args.t_pre_encoder == 'bert':
425
+ mapping_index = word_token_mapping[j]
426
+ #print(mapping_index, word_s, word_e)
427
+ s_t = np.linspace(word_s, word_e, len(mapping_index)+1)
428
+ #print(s_t)
429
+ for tt, t_sep in enumerate(s_t[1:]):
430
+ if current_time <= t_sep:
431
+ #if len(mapping_index) > 1: print(mapping_index[tt])
432
+ word_each_file.append(last_hidden_states[mapping_index[tt]])
433
+ break
434
+ else:
435
+ if word_n == " ":
436
+ word_each_file.append(self.lang_model.PAD_token)
437
+ else:
438
+ word_each_file.append(self.lang_model.get_word_index(word_n))
439
+ found_flag = True
440
+ j_last = j
441
+ break
442
+ else: continue
443
+ if not found_flag:
444
+ if self.args.word_cache and self.args.t_pre_encoder == 'bert':
445
+ word_each_file.append(last_hidden_states[j_last])
446
+ else:
447
+ word_each_file.append(self.lang_model.UNK_token)
448
+ word_each_file = np.array(word_each_file)
449
+ #print(word_each_file.shape)
450
+
451
+ if self.args.emo_rep is not None:
452
+ logger.info(f"# ---- Building cache for Emo {id_pose} and Pose {id_pose} ---- #")
453
+ rtype, start = int(id_pose.split('_')[3]), int(id_pose.split('_')[3])
454
+ if rtype == 0 or rtype == 2 or rtype == 4 or rtype == 6:
455
+ if start >= 1 and start <= 64:
456
+ score = 0
457
+ elif start >= 65 and start <= 72:
458
+ score = 1
459
+ elif start >= 73 and start <= 80:
460
+ score = 2
461
+ elif start >= 81 and start <= 86:
462
+ score = 3
463
+ elif start >= 87 and start <= 94:
464
+ score = 4
465
+ elif start >= 95 and start <= 102:
466
+ score = 5
467
+ elif start >= 103 and start <= 110:
468
+ score = 6
469
+ elif start >= 111 and start <= 118:
470
+ score = 7
471
+ else: pass
472
+ else:
473
+ # you may denote as unknown in the future
474
+ score = 0
475
+ emo_each_file = np.repeat(np.array(score).reshape(1, 1), pose_each_file.shape[0], axis=0)
476
+ #print(emo_each_file)
477
+
478
+ if self.args.sem_rep is not None:
479
+ logger.info(f"# ---- Building cache for Sem {id_pose} and Pose {id_pose} ---- #")
480
+ sem_file = f"{self.data_dir}{self.args.sem_rep}/{id_pose}.txt"
481
+ sem_all = pd.read_csv(sem_file,
482
+ sep='\t',
483
+ names=["name", "start_time", "end_time", "duration", "score", "keywords"])
484
+ # we adopt motion-level semantic score here.
485
+ for i in range(pose_each_file.shape[0]):
486
+ found_flag = False
487
+ for j, (start, end, score) in enumerate(zip(sem_all['start_time'],sem_all['end_time'], sem_all['score'])):
488
+ current_time = i/self.args.pose_fps + time_offset
489
+ if start<=current_time and current_time<=end:
490
+ sem_each_file.append(score)
491
+ found_flag=True
492
+ break
493
+ else: continue
494
+ if not found_flag: sem_each_file.append(0.)
495
+ sem_each_file = np.array(sem_each_file)
496
+ #print(sem_each_file)
497
+
498
+ filtered_result = self._sample_from_clip(
499
+ dst_lmdb_env,
500
+ audio_each_file, pose_each_file, trans_each_file, shape_each_file, facial_each_file, word_each_file,
501
+ vid_each_file, emo_each_file, sem_each_file,
502
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
503
+ )
504
+ for type in filtered_result.keys():
505
+ n_filtered_out[type] += filtered_result[type]
506
+
507
+ with dst_lmdb_env.begin() as txn:
508
+ logger.info(colored(f"no. of samples: {txn.stat()['entries']}", "cyan"))
509
+ n_total_filtered = 0
510
+ for type, n_filtered in n_filtered_out.items():
511
+ logger.info("{}: {}".format(type, n_filtered))
512
+ n_total_filtered += n_filtered
513
+ logger.info(colored("no. of excluded samples: {} ({:.1f}%)".format(
514
+ n_total_filtered, 100 * n_total_filtered / (txn.stat()["entries"] + n_total_filtered)), "cyan"))
515
+ dst_lmdb_env.sync()
516
+ dst_lmdb_env.close()
517
+
518
+ def _sample_from_clip(
519
+ self, dst_lmdb_env, audio_each_file, pose_each_file, trans_each_file, shape_each_file, facial_each_file, word_each_file,
520
+ vid_each_file, emo_each_file, sem_each_file,
521
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
522
+ ):
523
+ """
524
+ for data cleaning, we ignore the data for first and final n s
525
+ for test, we return all data
526
+ """
527
+ # audio_start = int(self.alignment[0] * self.args.audio_fps)
528
+ # pose_start = int(self.alignment[1] * self.args.pose_fps)
529
+ #logger.info(f"before: {audio_each_file.shape} {pose_each_file.shape}")
530
+ # audio_each_file = audio_each_file[audio_start:]
531
+ # pose_each_file = pose_each_file[pose_start:]
532
+ # trans_each_file =
533
+ #logger.info(f"after alignment: {audio_each_file.shape} {pose_each_file.shape}")
534
+ #print(pose_each_file.shape)
535
+ round_seconds_skeleton = pose_each_file.shape[0] // self.args.pose_fps # assume 1500 frames / 15 fps = 100 s
536
+ #print(round_seconds_skeleton)
537
+ if audio_each_file != []:
538
+ round_seconds_audio = len(audio_each_file) // self.args.audio_fps # assume 16,000,00 / 16,000 = 100 s
539
+ if facial_each_file != []:
540
+ round_seconds_facial = facial_each_file.shape[0] // self.args.pose_fps
541
+ logger.info(f"audio: {round_seconds_audio}s, pose: {round_seconds_skeleton}s, facial: {round_seconds_facial}s")
542
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
543
+ max_round = max(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
544
+ if round_seconds_skeleton != max_round:
545
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
546
+ else:
547
+ logger.info(f"pose: {round_seconds_skeleton}s, audio: {round_seconds_audio}s")
548
+ round_seconds_skeleton = min(round_seconds_audio, round_seconds_skeleton)
549
+ max_round = max(round_seconds_audio, round_seconds_skeleton)
550
+ if round_seconds_skeleton != max_round:
551
+ logger.warning(f"reduce to {round_seconds_skeleton}s, ignore {max_round-round_seconds_skeleton}s")
552
+
553
+ clip_s_t, clip_e_t = clean_first_seconds, round_seconds_skeleton - clean_final_seconds # assume [10, 90]s
554
+ clip_s_f_audio, clip_e_f_audio = self.args.audio_fps * clip_s_t, clip_e_t * self.args.audio_fps # [160,000,90*160,000]
555
+ clip_s_f_pose, clip_e_f_pose = clip_s_t * self.args.pose_fps, clip_e_t * self.args.pose_fps # [150,90*15]
556
+
557
+
558
+ for ratio in self.args.multi_length_training:
559
+ if is_test:# stride = length for test
560
+ cut_length = clip_e_f_pose - clip_s_f_pose
561
+ self.args.stride = cut_length
562
+ self.max_length = cut_length
563
+ else:
564
+ self.args.stride = int(ratio*self.ori_stride)
565
+ cut_length = int(self.ori_length*ratio)
566
+
567
+ num_subdivision = math.floor((clip_e_f_pose - clip_s_f_pose - cut_length) / self.args.stride) + 1
568
+ logger.info(f"pose from frame {clip_s_f_pose} to {clip_e_f_pose}, length {cut_length}")
569
+ logger.info(f"{num_subdivision} clips is expected with stride {self.args.stride}")
570
+
571
+ if audio_each_file != []:
572
+ audio_short_length = math.floor(cut_length / self.args.pose_fps * self.args.audio_fps)
573
+ """
574
+ for audio sr = 16000, fps = 15, pose_length = 34,
575
+ audio short length = 36266.7 -> 36266
576
+ this error is fine.
577
+ """
578
+ logger.info(f"audio from frame {clip_s_f_audio} to {clip_e_f_audio}, length {audio_short_length}")
579
+
580
+ n_filtered_out = defaultdict(int)
581
+ sample_pose_list = []
582
+ sample_audio_list = []
583
+ sample_facial_list = []
584
+ sample_shape_list = []
585
+ sample_word_list = []
586
+ sample_emo_list = []
587
+ sample_sem_list = []
588
+ sample_vid_list = []
589
+ sample_trans_list = []
590
+
591
+ for i in range(num_subdivision): # cut into around 2s chip, (self npose)
592
+ start_idx = clip_s_f_pose + i * self.args.stride
593
+ fin_idx = start_idx + cut_length
594
+ sample_pose = pose_each_file[start_idx:fin_idx]
595
+ sample_trans = trans_each_file[start_idx:fin_idx]
596
+ sample_shape = shape_each_file[start_idx:fin_idx]
597
+ # print(sample_pose.shape)
598
+ if self.args.audio_rep is not None:
599
+ audio_start = clip_s_f_audio + math.floor(i * self.args.stride * self.args.audio_fps / self.args.pose_fps)
600
+ audio_end = audio_start + audio_short_length
601
+ sample_audio = audio_each_file[audio_start:audio_end]
602
+ else:
603
+ sample_audio = np.array([-1])
604
+ sample_facial = facial_each_file[start_idx:fin_idx] if self.args.facial_rep is not None else np.array([-1])
605
+ sample_word = word_each_file[start_idx:fin_idx] if self.args.word_rep is not None else np.array([-1])
606
+ sample_emo = emo_each_file[start_idx:fin_idx] if self.args.emo_rep is not None else np.array([-1])
607
+ sample_sem = sem_each_file[start_idx:fin_idx] if self.args.sem_rep is not None else np.array([-1])
608
+ sample_vid = vid_each_file[start_idx:fin_idx] if self.args.id_rep is not None else np.array([-1])
609
+
610
+ if sample_pose.any() != None:
611
+ # filtering motion skeleton data
612
+ sample_pose, filtering_message = MotionPreprocessor(sample_pose).get()
613
+ is_correct_motion = (sample_pose != [])
614
+ if is_correct_motion or disable_filtering:
615
+ sample_pose_list.append(sample_pose)
616
+ sample_audio_list.append(sample_audio)
617
+ sample_facial_list.append(sample_facial)
618
+ sample_shape_list.append(sample_shape)
619
+ sample_word_list.append(sample_word)
620
+ sample_vid_list.append(sample_vid)
621
+ sample_emo_list.append(sample_emo)
622
+ sample_sem_list.append(sample_sem)
623
+ sample_trans_list.append(sample_trans)
624
+ else:
625
+ n_filtered_out[filtering_message] += 1
626
+
627
+ if len(sample_pose_list) > 0:
628
+ with dst_lmdb_env.begin(write=True) as txn:
629
+ for pose, audio, facial, shape, word, vid, emo, sem, trans in zip(
630
+ sample_pose_list,
631
+ sample_audio_list,
632
+ sample_facial_list,
633
+ sample_shape_list,
634
+ sample_word_list,
635
+ sample_vid_list,
636
+ sample_emo_list,
637
+ sample_sem_list,
638
+ sample_trans_list,):
639
+ k = "{:005}".format(self.n_out_samples).encode("ascii")
640
+ v = [pose, audio, facial, shape, word, emo, sem, vid, trans]
641
+ v = pyarrow.serialize(v).to_buffer()
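+ # NOTE: pyarrow.serialize/deserialize is deprecated in recent pyarrow releases; the newer cache writers in this upload use pickle.dumps/pickle.loads instead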
642
+ txn.put(k, v)
643
+ self.n_out_samples += 1
644
+ return n_filtered_out
645
+
646
+ def __getitem__(self, idx):
647
+ with self.lmdb_env.begin(write=False) as txn:
648
+ key = "{:005}".format(idx).encode("ascii")
649
+ sample = txn.get(key)
650
+ sample = pyarrow.deserialize(sample)
651
+ tar_pose, in_audio, in_facial, in_shape, in_word, emo, sem, vid, trans = sample
652
+ #print(in_shape)
653
+ #vid = torch.from_numpy(vid).int()
654
+ emo = torch.from_numpy(emo).int()
655
+ sem = torch.from_numpy(sem).float()
656
+ in_audio = torch.from_numpy(in_audio).float()
657
+ in_word = torch.from_numpy(in_word).float() if self.args.word_cache else torch.from_numpy(in_word).int()
658
+ if self.loader_type == "test":
659
+ tar_pose = torch.from_numpy(tar_pose).float()
660
+ trans = torch.from_numpy(trans).float()
661
+ in_facial = torch.from_numpy(in_facial).float()
662
+ vid = torch.from_numpy(vid).float()
663
+ in_shape = torch.from_numpy(in_shape).float()
664
+ else:
665
+ in_shape = torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
666
+ trans = torch.from_numpy(trans).reshape((trans.shape[0], -1)).float()
667
+ vid = torch.from_numpy(vid).reshape((vid.shape[0], -1)).float()
668
+ tar_pose = torch.from_numpy(tar_pose).reshape((tar_pose.shape[0], -1)).float()
669
+ in_facial = torch.from_numpy(in_facial).reshape((in_facial.shape[0], -1)).float()
670
+ return {"pose":tar_pose, "audio":in_audio, "facial":in_facial, "beta": in_shape, "word":in_word, "id":vid, "emo":emo, "sem":sem, "trans":trans}
671
+
672
+
673
+ class MotionPreprocessor:
674
+ def __init__(self, skeletons):
675
+ self.skeletons = skeletons
676
+ #self.mean_pose = mean_pose
677
+ self.filtering_message = "PASS"
678
+
679
+ def get(self):
680
+ assert (self.skeletons is not None)
681
+
682
+ # filtering
683
+ if self.skeletons != []:
684
+ if self.check_pose_diff():
685
+ self.skeletons = []
686
+ self.filtering_message = "pose"
687
+ # elif self.check_spine_angle():
688
+ # self.skeletons = []
689
+ # self.filtering_message = "spine angle"
690
+ # elif self.check_static_motion():
691
+ # self.skeletons = []
692
+ # self.filtering_message = "motion"
693
+
694
+ # if self.skeletons != []:
695
+ # self.skeletons = self.skeletons.tolist()
696
+ # for i, frame in enumerate(self.skeletons):
697
+ # assert not np.isnan(self.skeletons[i]).any() # missing joints
698
+
699
+ return self.skeletons, self.filtering_message
700
+
701
+ def check_static_motion(self, verbose=True):
702
+ def get_variance(skeleton, joint_idx):
703
+ wrist_pos = skeleton[:, joint_idx]
704
+ variance = np.sum(np.var(wrist_pos, axis=0))
705
+ return variance
706
+
707
+ left_arm_var = get_variance(self.skeletons, 6)
708
+ right_arm_var = get_variance(self.skeletons, 9)
709
+
710
+ th = 0.0014 # exclude 13110
711
+ # th = 0.002 # exclude 16905
712
+ if left_arm_var < th and right_arm_var < th:
713
+ if verbose:
714
+ print("skip - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
715
+ return True
716
+ else:
717
+ if verbose:
718
+ print("pass - check_static_motion left var {}, right var {}".format(left_arm_var, right_arm_var))
719
+ return False
720
+
721
+
722
+ def check_pose_diff(self, verbose=False):
723
+ # diff = np.abs(self.skeletons - self.mean_pose) # 186*1
724
+ # diff = np.mean(diff)
725
+
726
+ # # th = 0.017
727
+ # th = 0.02 #0.02 # exclude 3594
728
+ # if diff < th:
729
+ # if verbose:
730
+ # print("skip - check_pose_diff {:.5f}".format(diff))
731
+ # return True
732
+ # # th = 3.5 #0.02 # exclude 3594
733
+ # # if 3.5 < diff < 5:
734
+ # # if verbose:
735
+ # # print("skip - check_pose_diff {:.5f}".format(diff))
736
+ # # return True
737
+ # else:
738
+ # if verbose:
739
+ # print("pass - check_pose_diff {:.5f}".format(diff))
740
+ return False
741
+
742
+
743
+ def check_spine_angle(self, verbose=True):
744
+ def angle_between(v1, v2):
745
+ v1_u = v1 / np.linalg.norm(v1)
746
+ v2_u = v2 / np.linalg.norm(v2)
747
+ return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))
748
+
749
+ angles = []
750
+ for i in range(self.skeletons.shape[0]):
751
+ spine_vec = self.skeletons[i, 1] - self.skeletons[i, 0]
752
+ angle = angle_between(spine_vec, [0, -1, 0])
753
+ angles.append(angle)
754
+
755
+ if np.rad2deg(max(angles)) > 30 or np.rad2deg(np.mean(angles)) > 20: # exclude 4495
756
+ # if np.rad2deg(max(angles)) > 20: # exclude 8270
757
+ if verbose:
758
+ print("skip - check_spine_angle {:.5f}, {:.5f}".format(max(angles), np.mean(angles)))
759
+ return True
760
+ else:
761
+ if verbose:
762
+ print("pass - check_spine_angle {:.5f}".format(max(angles)))
763
+ return False
dataloaders/build_vocab.py ADDED
@@ -0,0 +1,199 @@
1
+ import numpy as np
2
+ import glob
3
+ import os
4
+ import pickle
5
+ import lmdb
6
+ #import pyarrow
7
+ import fasttext
8
+ from loguru import logger
9
+ from scipy import linalg
10
+
11
+
12
+ class Vocab:
13
+ PAD_token = 0
14
+ SOS_token = 1
15
+ EOS_token = 2
16
+ UNK_token = 3
17
+
18
+ def __init__(self, name, insert_default_tokens=True):
19
+ self.name = name
20
+ self.trimmed = False
21
+ self.word_embedding_weights = None
22
+ self.reset_dictionary(insert_default_tokens)
23
+
24
+ def reset_dictionary(self, insert_default_tokens=True):
25
+ self.word2index = {}
26
+ self.word2count = {}
27
+ if insert_default_tokens:
28
+ self.index2word = {self.PAD_token: "<PAD>", self.SOS_token: "<SOS>",
29
+ self.EOS_token: "<EOS>", self.UNK_token: "<UNK>"}
30
+ else:
31
+ self.index2word = {self.UNK_token: "<UNK>"}
32
+ self.n_words = len(self.index2word) # count default tokens
33
+
34
+ def index_word(self, word):
35
+ if word not in self.word2index:
36
+ self.word2index[word] = self.n_words
37
+ self.word2count[word] = 1
38
+ self.index2word[self.n_words] = word
39
+ self.n_words += 1
40
+ else:
41
+ self.word2count[word] += 1
42
+
43
+ def add_vocab(self, other_vocab):
44
+ for word, _ in other_vocab.word2count.items():
45
+ self.index_word(word)
46
+
47
+ # remove words below a certain count threshold
48
+ def trim(self, min_count):
49
+ if self.trimmed:
50
+ return
51
+ self.trimmed = True
52
+
53
+ keep_words = []
54
+
55
+ for k, v in self.word2count.items():
56
+ if v >= min_count:
57
+ keep_words.append(k)
58
+
59
+ print(' word trimming, kept %s / %s = %.4f' % (
60
+ len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
61
+ ))
62
+
63
+ # reinitialize dictionary
64
+ self.reset_dictionary()
65
+ for word in keep_words:
66
+ self.index_word(word)
67
+
68
+ def get_word_index(self, word):
69
+ if word in self.word2index:
70
+ return self.word2index[word]
71
+ else:
72
+ return self.UNK_token
73
+
74
+ def load_word_vectors(self, pretrained_path, embedding_dim=300):
75
+ print(" loading word vectors from '{}'...".format(pretrained_path))
76
+
77
+ # initialize embeddings to random values for special words
78
+ init_sd = 1 / np.sqrt(embedding_dim)
79
+ weights = np.random.normal(0, scale=init_sd, size=[self.n_words, embedding_dim])
80
+ weights = weights.astype(np.float32)
81
+
82
+ # read word vectors
83
+ word_model = fasttext.load_model(pretrained_path)
84
+ for word, id in self.word2index.items():
85
+ vec = word_model.get_word_vector(word)
86
+ weights[id] = vec
87
+ self.word_embedding_weights = weights
88
+
89
+ def __get_embedding_weight(self, pretrained_path, embedding_dim=300):
90
+ """ function modified from http://ronny.rest/blog/post_2017_08_04_glove/ """
91
+ print("Loading word embedding '{}'...".format(pretrained_path))
92
+ cache_path = pretrained_path
93
+ weights = None
94
+
95
+ # use cached file if it exists
96
+ if os.path.exists(cache_path): #
97
+ with open(cache_path, 'rb') as f:
98
+ print(' using cached result from {}'.format(cache_path))
99
+ weights = pickle.load(f)
100
+ if weights.shape != (self.n_words, embedding_dim):
101
+ logger.warning(' failed to load word embedding weights. reinitializing...')
102
+ weights = None
103
+
104
+ if weights is None:
105
+ # initialize embeddings to random values for special and OOV words
106
+ init_sd = 1 / np.sqrt(embedding_dim)
107
+ weights = np.random.normal(0, scale=init_sd, size=[self.n_words, embedding_dim])
108
+ weights = weights.astype(np.float32)
109
+
110
+ with open(pretrained_path, encoding="utf-8", mode="r") as textFile:
111
+ num_embedded_words = 0
112
+ for line_raw in textFile:
113
+ # extract the word, and embeddings vector
114
+ line = line_raw.split()
115
+ try:
116
+ word, vector = (line[0], np.array(line[1:], dtype=np.float32))
117
+ # if word == 'love': # debugging
118
+ # print(word, vector)
119
+
120
+ # if it is in our vocab, then update the corresponding weights
121
+ id = self.word2index.get(word, None)
122
+ if id is not None:
123
+ weights[id] = vector
124
+ num_embedded_words += 1
125
+ except ValueError:
126
+ print(' parsing error at {}...'.format(line_raw[:50]))
127
+ continue
128
+ print(' {} / {} word vectors are found in the embedding'.format(num_embedded_words, len(self.word2index)))
129
+
130
+ with open(cache_path, 'wb') as f:
131
+ pickle.dump(weights, f)
132
+ return weights
133
+
134
+
135
+ def build_vocab(name, data_path, cache_path, word_vec_path=None, feat_dim=None):
136
+ print(' building a language model...')
137
+ #if not os.path.exists(cache_path):
138
+ lang_model = Vocab(name)
139
+ print(' indexing words from {}'.format(data_path))
140
+ index_words_from_textgrid(lang_model, data_path)
141
+
142
+ if word_vec_path is not None:
143
+ lang_model.load_word_vectors(word_vec_path, feat_dim)
144
+ else:
145
+ print(' loaded from {}'.format(cache_path))
146
+ with open(cache_path, 'rb') as f:
147
+ lang_model = pickle.load(f)
148
+ if word_vec_path is None:
149
+ lang_model.word_embedding_weights = None
150
+ elif lang_model.word_embedding_weights.shape[0] != lang_model.n_words:
151
+ logger.warning(' failed to load word embedding weights. check this')
152
+ assert False
153
+
154
+ with open(cache_path, 'wb') as f:
155
+ pickle.dump(lang_model, f)
156
+
157
+
158
+ return lang_model
159
+
160
+
161
+ def index_words(lang_model, data_path):
162
+ #index words form text
163
+ with open(data_path, "r") as f:
164
+ for line in f.readlines():
165
+ line = line.replace(",", " ")
166
+ line = line.replace(".", " ")
167
+ line = line.replace("?", " ")
168
+ line = line.replace("!", " ")
169
+ for word in line.split():
170
+ lang_model.index_word(word)
171
+ print(' indexed %d words' % lang_model.n_words)
172
+
173
+ def index_words_from_textgrid(lang_model, data_path):
174
+ import textgrid as tg
175
+ from tqdm import tqdm
176
+ #trainvaltest=os.listdir(data_path)
177
+ # for loadtype in trainvaltest:
178
+ # if "." in loadtype: continue #ignore .ipynb_checkpoints
179
+ texts = os.listdir(data_path+"/textgrid/")
180
+ #print(texts)
181
+ for textfile in tqdm(texts):
182
+ tgrid = tg.TextGrid.fromFile(data_path+"/textgrid/"+textfile)
183
+ for word in tgrid[0]:
184
+ word_n, word_s, word_e = word.mark, word.minTime, word.maxTime
185
+ word_n = word_n.replace(",", " ")
186
+ word_n = word_n.replace(".", " ")
187
+ word_n = word_n.replace("?", " ")
188
+ word_n = word_n.replace("!", " ")
189
+ #print(word_n)
190
+ lang_model.index_word(word_n)
191
+ print(' indexed %d words' % lang_model.n_words)
192
+ print(lang_model.word2index, lang_model.word2count)
193
+
194
+ if __name__ == "__main__":
195
+ # 11195 for all, 5793 for 4 speakers
196
+ # build_vocab("beat_english_15_141", "/home/ma-user/work/datasets/beat_cache/beat_english_15_141/", "/home/ma-user/work/datasets/beat_cache/beat_english_15_141/vocab.pkl", "/home/ma-user/work/datasets/cc.en.300.bin", 300)
197
+ build_vocab("beat_chinese_v1.0.0", "/data/datasets/beat_chinese_v1.0.0/", "/data/datasets/beat_chinese_v1.0.0/weights/vocab.pkl", "/home/ma-user/work/cc.zh.300.bin", 300)
198
+
199
+
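
A short usage sketch for the Vocab class added above may be useful; it exercises only the methods defined in this file, and the import path assumes the repository root is on PYTHONPATH:

```python
from dataloaders.build_vocab import Vocab

vocab = Vocab("toy")                      # starts with <PAD>, <SOS>, <EOS>, <UNK>
for w in ["hello", "world", "hello", "gesture"]:
    vocab.index_word(w)

print(vocab.n_words)                      # 7 = 4 special tokens + 3 distinct words
print(vocab.get_word_index("hello"))      # 4, the first non-special index
print(vocab.get_word_index("missing"))    # 3, falls back to UNK_token

vocab.trim(min_count=2)                   # drops "world" and "gesture"
print(vocab.get_word_index("world"))      # 3 again: trimmed words map to <UNK>
```
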
dataloaders/data_tools.py ADDED
@@ -0,0 +1,1756 @@
1
+ import numpy as np
2
+ import glob
3
+ import os
4
+ import pickle
5
+ import lmdb
6
+ #import pyarrow
7
+ import fasttext
8
+ from loguru import logger
9
+ from scipy import linalg
+ import pandas as pd  # used by FIDCalculator below (cal_vol, _joint_selector)
10
+ from .pymo.parsers import BVHParser
11
+ from .pymo.viz_tools import *
12
+ from .pymo.preprocessing import *
13
+
14
+
15
+
16
+
17
+ # pose version fpsxx_trinity/japanese_joints(_xxx)
18
+ joints_list = {
19
+ "trinity_joints":{
20
+ 'Hips': [6,6],
21
+ 'Spine': [3,9],
22
+ 'Spine1': [3,12],
23
+ 'Spine2': [3,15],
24
+ 'Spine3': [3,18],
25
+ 'Neck': [3,21],
26
+ 'Neck1': [3,24],
27
+ 'Head': [3,27],
28
+ 'RShoulder': [3,30],
29
+ 'RArm': [3,33],
30
+ 'RArm1': [3,36],
31
+ 'RHand': [3,39],
32
+ 'RHandT1': [3,42],
33
+ 'RHandT2': [3,45],
34
+ 'RHandT3': [3,48],
35
+ 'RHandI1': [3,51],
36
+ 'RHandI2': [3,54],
37
+ 'RHandI3': [3,57],
38
+ 'RHandM1': [3,60],
39
+ 'RHandM2': [3,63],
40
+ 'RHandM3': [3,66],
41
+ 'RHandR1': [3,69],
42
+ 'RHandR2': [3,72],
43
+ 'RHandR3': [3,75],
44
+ 'RHandP1': [3,78],
45
+ 'RHandP2': [3,81],
46
+ 'RHandP3': [3,84],
47
+ 'LShoulder': [3,87],
48
+ 'LArm': [3,90],
49
+ 'LArm1': [3,93],
50
+ 'LHand': [3,96],
51
+ 'LHandT1': [3,99],
52
+ 'LHandT2': [3,102],
53
+ 'LHandT3': [3,105],
54
+ 'LHandI1': [3,108],
55
+ 'LHandI2': [3,111],
56
+ 'LHandI3': [3,114],
57
+ 'LHandM1': [3,117],
58
+ 'LHandM2': [3,120],
59
+ 'LHandM3': [3,123],
60
+ 'LHandR1': [3,126],
61
+ 'LHandR2': [3,129],
62
+ 'LHandR3': [3,132],
63
+ 'LHandP1': [3,135],
64
+ 'LHandP2': [3,138],
65
+ 'LHandP3': [3,141],
66
+ 'RUpLeg': [3,144],
67
+ 'RLeg': [3,147],
68
+ 'RFoot': [3,150],
69
+ 'RFootF': [3,153],
70
+ 'RToeBase': [3,156],
71
+ 'LUpLeg': [3,159],
72
+ 'LLeg': [3,162],
73
+ 'LFoot': [3,165],
74
+ 'LFootF': [3,168],
75
+ 'LToeBase': [3,171],},
76
+ "trinity_joints_123":{
77
+ 'Spine': 3 ,
78
+ 'Neck': 3 ,
79
+ 'Neck1': 3 ,
80
+ 'RShoulder': 3 ,
81
+ 'RArm': 3 ,
82
+ 'RArm1': 3 ,
83
+ 'RHand': 3 ,
84
+ 'RHandT1': 3 ,
85
+ 'RHandT2': 3 ,
86
+ 'RHandT3': 3 ,
87
+ 'RHandI1': 3 ,
88
+ 'RHandI2': 3 ,
89
+ 'RHandI3': 3 ,
90
+ 'RHandM1': 3 ,
91
+ 'RHandM2': 3 ,
92
+ 'RHandM3': 3 ,
93
+ 'RHandR1': 3 ,
94
+ 'RHandR2': 3 ,
95
+ 'RHandR3': 3 ,
96
+ 'RHandP1': 3 ,
97
+ 'RHandP2': 3 ,
98
+ 'RHandP3': 3 ,
99
+ 'LShoulder': 3 ,
100
+ 'LArm': 3 ,
101
+ 'LArm1': 3 ,
102
+ 'LHand': 3 ,
103
+ 'LHandT1': 3 ,
104
+ 'LHandT2': 3 ,
105
+ 'LHandT3': 3 ,
106
+ 'LHandI1': 3 ,
107
+ 'LHandI2': 3 ,
108
+ 'LHandI3': 3 ,
109
+ 'LHandM1': 3 ,
110
+ 'LHandM2': 3 ,
111
+ 'LHandM3': 3 ,
112
+ 'LHandR1': 3 ,
113
+ 'LHandR2': 3 ,
114
+ 'LHandR3': 3 ,
115
+ 'LHandP1': 3 ,
116
+ 'LHandP2': 3 ,
117
+ 'LHandP3': 3 ,},
118
+ "trinity_joints_168":{
119
+ 'Hips': 3 ,
120
+ 'Spine': 3 ,
121
+ 'Spine1': 3 ,
122
+ 'Spine2': 3 ,
123
+ 'Spine3': 3 ,
124
+ 'Neck': 3 ,
125
+ 'Neck1': 3 ,
126
+ 'Head': 3 ,
127
+ 'RShoulder': 3 ,
128
+ 'RArm': 3 ,
129
+ 'RArm1': 3 ,
130
+ 'RHand': 3 ,
131
+ 'RHandT1': 3 ,
132
+ 'RHandT2': 3 ,
133
+ 'RHandT3': 3 ,
134
+ 'RHandI1': 3 ,
135
+ 'RHandI2': 3 ,
136
+ 'RHandI3': 3 ,
137
+ 'RHandM1': 3 ,
138
+ 'RHandM2': 3 ,
139
+ 'RHandM3': 3 ,
140
+ 'RHandR1': 3 ,
141
+ 'RHandR2': 3 ,
142
+ 'RHandR3': 3 ,
143
+ 'RHandP1': 3 ,
144
+ 'RHandP2': 3 ,
145
+ 'RHandP3': 3 ,
146
+ 'LShoulder': 3 ,
147
+ 'LArm': 3 ,
148
+ 'LArm1': 3 ,
149
+ 'LHand': 3 ,
150
+ 'LHandT1': 3 ,
151
+ 'LHandT2': 3 ,
152
+ 'LHandT3': 3 ,
153
+ 'LHandI1': 3 ,
154
+ 'LHandI2': 3 ,
155
+ 'LHandI3': 3 ,
156
+ 'LHandM1': 3 ,
157
+ 'LHandM2': 3 ,
158
+ 'LHandM3': 3 ,
159
+ 'LHandR1': 3 ,
160
+ 'LHandR2': 3 ,
161
+ 'LHandR3': 3 ,
162
+ 'LHandP1': 3 ,
163
+ 'LHandP2': 3 ,
164
+ 'LHandP3': 3 ,
165
+ 'RUpLeg': 3 ,
166
+ 'RLeg': 3 ,
167
+ 'RFoot': 3 ,
168
+ 'RFootF': 3 ,
169
+ 'RToeBase': 3 ,
170
+ 'LUpLeg': 3 ,
171
+ 'LLeg': 3 ,
172
+ 'LFoot': 3 ,
173
+ 'LFootF': 3 ,
174
+ 'LToeBase': 3 ,},
175
+ "trinity_joints_138":{
176
+ "Hips": 3 ,
177
+ 'Spine': 3 ,
178
+ 'Spine1': 3 ,
179
+ 'Spine2': 3 ,
180
+ 'Spine3': 3 ,
181
+ 'Neck': 3 ,
182
+ 'Neck1': 3 ,
183
+ 'Head': 3 ,
184
+ 'RShoulder': 3 ,
185
+ 'RArm': 3 ,
186
+ 'RArm1': 3 ,
187
+ 'RHand': 3 ,
188
+ 'RHandT1': 3 ,
189
+ 'RHandT2': 3 ,
190
+ 'RHandT3': 3 ,
191
+ 'RHandI1': 3 ,
192
+ 'RHandI2': 3 ,
193
+ 'RHandI3': 3 ,
194
+ 'RHandM1': 3 ,
195
+ 'RHandM2': 3 ,
196
+ 'RHandM3': 3 ,
197
+ 'RHandR1': 3 ,
198
+ 'RHandR2': 3 ,
199
+ 'RHandR3': 3 ,
200
+ 'RHandP1': 3 ,
201
+ 'RHandP2': 3 ,
202
+ 'RHandP3': 3 ,
203
+ 'LShoulder': 3 ,
204
+ 'LArm': 3 ,
205
+ 'LArm1': 3 ,
206
+ 'LHand': 3 ,
207
+ 'LHandT1': 3 ,
208
+ 'LHandT2': 3 ,
209
+ 'LHandT3': 3 ,
210
+ 'LHandI1': 3 ,
211
+ 'LHandI2': 3 ,
212
+ 'LHandI3': 3 ,
213
+ 'LHandM1': 3 ,
214
+ 'LHandM2': 3 ,
215
+ 'LHandM3': 3 ,
216
+ 'LHandR1': 3 ,
217
+ 'LHandR2': 3 ,
218
+ 'LHandR3': 3 ,
219
+ 'LHandP1': 3 ,
220
+ 'LHandP2': 3 ,
221
+ 'LHandP3': 3 ,},
222
+ "beat_smplx_joints": {
223
+ 'pelvis': [3,3],
224
+ 'left_hip': [3,6],
225
+ 'right_hip': [3,9],
226
+ 'spine1': [3,12],
227
+ 'left_knee': [3,15],
228
+ 'right_knee': [3,18],
229
+ 'spine2': [3,21],
230
+ 'left_ankle': [3,24],
231
+ 'right_ankle': [3,27],
232
+
233
+ 'spine3': [3,30],
234
+ 'left_foot': [3,33],
235
+ 'right_foot': [3,36],
236
+ 'neck': [3,39],
237
+ 'left_collar': [3,42],
238
+ 'right_collar': [3,45],
239
+ 'head': [3,48],
240
+ 'left_shoulder': [3,51],
241
+
242
+ 'right_shoulder': [3,54],
243
+ 'left_elbow': [3,57],
244
+ 'right_elbow': [3,60],
245
+ 'left_wrist': [3,63],
246
+ 'right_wrist': [3,66],
247
+
248
+ 'jaw': [3,69],
249
+ 'left_eye_smplhf': [3,72],
250
+ 'right_eye_smplhf': [3,75],
251
+ 'left_index1': [3,78],
252
+ 'left_index2': [3,81],
253
+
254
+ 'left_index3': [3,84],
255
+ 'left_middle1': [3,87],
256
+ 'left_middle2': [3,90],
257
+ 'left_middle3': [3,93],
258
+ 'left_pinky1': [3,96],
259
+
260
+ 'left_pinky2': [3,99],
261
+ 'left_pinky3': [3,102],
262
+ 'left_ring1': [3,105],
263
+ 'left_ring2': [3,108],
264
+
265
+ 'left_ring3': [3,111],
266
+ 'left_thumb1': [3,114],
267
+ 'left_thumb2': [3,117],
268
+ 'left_thumb3': [3,120],
269
+ 'right_index1': [3,123],
270
+ 'right_index2': [3,126],
271
+ 'right_index3': [3,129],
272
+ 'right_middle1': [3,132],
273
+
274
+ 'right_middle2': [3,135],
275
+ 'right_middle3': [3,138],
276
+ 'right_pinky1': [3,141],
277
+ 'right_pinky2': [3,144],
278
+ 'right_pinky3': [3,147],
279
+
280
+ 'right_ring1': [3,150],
281
+ 'right_ring2': [3,153],
282
+ 'right_ring3': [3,156],
283
+ 'right_thumb1': [3,159],
284
+ 'right_thumb2': [3,162],
285
+ 'right_thumb3': [3,165],
286
+
287
+ # 'nose': [3,168],
288
+ # 'right_eye': [3,171],
289
+ # 'left_eye': [3,174],
290
+ # 'right_ear': [3,177],
291
+
292
+ # 'left_ear': [3,180],
293
+ # 'left_big_toe': [3,183],
294
+ # 'left_small_toe': [3,186],
295
+ # 'left_heel': [3,189],
296
+
297
+ # 'right_big_toe': [3,192],
298
+ # 'right_small_toe': [3,195],
299
+ # 'right_heel': [3,198],
300
+ # 'left_thumb': [3,201],
301
+ # 'left_index': [3,204],
302
+ # 'left_middle': [3,207],
303
+
304
+ # 'left_ring': [3,210],
305
+ # 'left_pinky': [3,213],
306
+ # 'right_thumb': [3,216],
307
+ # 'right_index': [3,219],
308
+ # 'right_middle': [3,222],
309
+ # 'right_ring': [3,225],
310
+
311
+ # 'right_pinky': [3,228],
312
+ # 'right_eye_brow1': [3,231],
313
+ # 'right_eye_brow2': [3,234],
314
+ # 'right_eye_brow3': [3,237],
315
+
316
+ # 'right_eye_brow4': [3,240],
317
+ # 'right_eye_brow5': [3,243],
318
+ # 'left_eye_brow5': [3,246],
319
+ # 'left_eye_brow4': [3,249],
320
+
321
+ # 'left_eye_brow3': [3,252],
322
+ # 'left_eye_brow2': [3,255],
323
+ # 'left_eye_brow1': [3,258],
324
+ # 'nose1': [3,261],
325
+ # 'nose2': [3,264],
326
+ # 'nose3': [3,267],
327
+
328
+ # 'nose4': [3,270],
329
+ # 'right_nose_2': [3,273],
330
+ # 'right_nose_1': [3,276],
331
+ # 'nose_middle': [3,279],
332
+ # 'left_nose_1': [3,282],
333
+ # 'left_nose_2': [3,285],
334
+
335
+ # 'right_eye1': [3,288],
336
+ # 'right_eye2': [3,291],
337
+ # 'right_eye3': [3,294],
338
+ # 'right_eye4': [3,297],
339
+
340
+ # 'right_eye5': [3,300],
341
+ # 'right_eye6': [3,303],
342
+ # 'left_eye4': [3,306],
343
+ # 'left_eye3': [3,309],
344
+
345
+ # 'left_eye2': [3,312],
346
+ # 'left_eye1': [3,315],
347
+ # 'left_eye6': [3,318],
348
+ # 'left_eye5': [3,321],
349
+ # 'right_mouth_1': [3,324],
350
+ # 'right_mouth_2': [3,327],
351
+ # 'right_mouth_3': [3,330],
352
+ # 'mouth_top': [3,333],
353
+ # 'left_mouth_3': [3,336],
354
+ # 'left_mouth_2': [3,339],
355
+ # 'left_mouth_1': [3,342],
356
+ # 'left_mouth_5': [3,345],
357
+ # 'left_mouth_4': [3,348],
358
+ # 'mouth_bottom': [3,351],
359
+ # 'right_mouth_4': [3,354],
360
+ # 'right_mouth_5': [3,357],
361
+ # 'right_lip_1': [3,360],
362
+ # 'right_lip_2': [3,363],
363
+ # 'lip_top': [3,366],
364
+ # 'left_lip_2': [3,369],
365
+
366
+ # 'left_lip_1': [3,372],
367
+ # 'left_lip_3': [3,375],
368
+ # 'lip_bottom': [3,378],
369
+ # 'right_lip_3': [3,381],
370
+ # 'right_contour_1': [3,384],
371
+ # 'right_contour_2': [3,387],
372
+ # 'right_contour_3': [3,390],
373
+ # 'right_contour_4': [3,393],
374
+ # 'right_contour_5': [3,396],
375
+ # 'right_contour_6': [3,399],
376
+ # 'right_contour_7': [3,402],
377
+ # 'right_contour_8': [3,405],
378
+ # 'contour_middle': [3,408],
379
+ # 'left_contour_8': [3,411],
380
+ # 'left_contour_7': [3,414],
381
+ # 'left_contour_6': [3,417],
382
+ # 'left_contour_5': [3,420],
383
+ # 'left_contour_4': [3,423],
384
+ # 'left_contour_3': [3,426],
385
+ # 'left_contour_2': [3,429],
386
+ # 'left_contour_1': [3,432],
387
+ },
388
+
389
+ "beat_smplx_no_eyes": {
390
+ "pelvis":3,
391
+ "left_hip":3,
392
+ "right_hip":3,
393
+ "spine1":3,
394
+ "left_knee":3,
395
+ "right_knee":3,
396
+ "spine2":3,
397
+ "left_ankle":3,
398
+ "right_ankle":3,
399
+ "spine3":3,
400
+ "left_foot":3,
401
+ "right_foot":3,
402
+ "neck":3,
403
+ "left_collar":3,
404
+ "right_collar":3,
405
+ "head":3,
406
+ "left_shoulder":3,
407
+ "right_shoulder":3,
408
+ "left_elbow":3,
409
+ "right_elbow":3,
410
+ "left_wrist":3,
411
+ "right_wrist":3,
412
+ "jaw":3,
413
+ # "left_eye_smplhf":3,
414
+ # "right_eye_smplhf":3,
415
+ "left_index1":3,
416
+ "left_index2":3,
417
+ "left_index3":3,
418
+ "left_middle1":3,
419
+ "left_middle2":3,
420
+ "left_middle3":3,
421
+ "left_pinky1":3,
422
+ "left_pinky2":3,
423
+ "left_pinky3":3,
424
+ "left_ring1":3,
425
+ "left_ring2":3,
426
+ "left_ring3":3,
427
+ "left_thumb1":3,
428
+ "left_thumb2":3,
429
+ "left_thumb3":3,
430
+ "right_index1":3,
431
+ "right_index2":3,
432
+ "right_index3":3,
433
+ "right_middle1":3,
434
+ "right_middle2":3,
435
+ "right_middle3":3,
436
+ "right_pinky1":3,
437
+ "right_pinky2":3,
438
+ "right_pinky3":3,
439
+ "right_ring1":3,
440
+ "right_ring2":3,
441
+ "right_ring3":3,
442
+ "right_thumb1":3,
443
+ "right_thumb2":3,
444
+ "right_thumb3":3,
445
+ },
446
+
447
+ "beat_smplx_full": {
448
+ "pelvis":3,
449
+ "left_hip":3,
450
+ "right_hip":3,
451
+ "spine1":3,
452
+ "left_knee":3,
453
+ "right_knee":3,
454
+ "spine2":3,
455
+ "left_ankle":3,
456
+ "right_ankle":3,
457
+ "spine3":3,
458
+ "left_foot":3,
459
+ "right_foot":3,
460
+ "neck":3,
461
+ "left_collar":3,
462
+ "right_collar":3,
463
+ "head":3,
464
+ "left_shoulder":3,
465
+ "right_shoulder":3,
466
+ "left_elbow":3,
467
+ "right_elbow":3,
468
+ "left_wrist":3,
469
+ "right_wrist":3,
470
+ "jaw":3,
471
+ "left_eye_smplhf":3,
472
+ "right_eye_smplhf":3,
473
+ "left_index1":3,
474
+ "left_index2":3,
475
+ "left_index3":3,
476
+ "left_middle1":3,
477
+ "left_middle2":3,
478
+ "left_middle3":3,
479
+ "left_pinky1":3,
480
+ "left_pinky2":3,
481
+ "left_pinky3":3,
482
+ "left_ring1":3,
483
+ "left_ring2":3,
484
+ "left_ring3":3,
485
+ "left_thumb1":3,
486
+ "left_thumb2":3,
487
+ "left_thumb3":3,
488
+ "right_index1":3,
489
+ "right_index2":3,
490
+ "right_index3":3,
491
+ "right_middle1":3,
492
+ "right_middle2":3,
493
+ "right_middle3":3,
494
+ "right_pinky1":3,
495
+ "right_pinky2":3,
496
+ "right_pinky3":3,
497
+ "right_ring1":3,
498
+ "right_ring2":3,
499
+ "right_ring3":3,
500
+ "right_thumb1":3,
501
+ "right_thumb2":3,
502
+ "right_thumb3":3,
503
+ },
504
+
505
+ "beat_smplx_upall": {
506
+ # "pelvis":3,
507
+ # "left_hip":3,
508
+ # "right_hip":3,
509
+ "spine1":3,
510
+ # "left_knee":3,
511
+ # "right_knee":3,
512
+ "spine2":3,
513
+ # "left_ankle":3,
514
+ # "right_ankle":3,
515
+ "spine3":3,
516
+ # "left_foot":3,
517
+ # "right_foot":3,
518
+ "neck":3,
519
+ "left_collar":3,
520
+ "right_collar":3,
521
+ "head":3,
522
+ "left_shoulder":3,
523
+ "right_shoulder":3,
524
+ "left_elbow":3,
525
+ "right_elbow":3,
526
+ "left_wrist":3,
527
+ "right_wrist":3,
528
+ # "jaw":3,
529
+ # "left_eye_smplhf":3,
530
+ # "right_eye_smplhf":3,
531
+ "left_index1":3,
532
+ "left_index2":3,
533
+ "left_index3":3,
534
+ "left_middle1":3,
535
+ "left_middle2":3,
536
+ "left_middle3":3,
537
+ "left_pinky1":3,
538
+ "left_pinky2":3,
539
+ "left_pinky3":3,
540
+ "left_ring1":3,
541
+ "left_ring2":3,
542
+ "left_ring3":3,
543
+ "left_thumb1":3,
544
+ "left_thumb2":3,
545
+ "left_thumb3":3,
546
+ "right_index1":3,
547
+ "right_index2":3,
548
+ "right_index3":3,
549
+ "right_middle1":3,
550
+ "right_middle2":3,
551
+ "right_middle3":3,
552
+ "right_pinky1":3,
553
+ "right_pinky2":3,
554
+ "right_pinky3":3,
555
+ "right_ring1":3,
556
+ "right_ring2":3,
557
+ "right_ring3":3,
558
+ "right_thumb1":3,
559
+ "right_thumb2":3,
560
+ "right_thumb3":3,
561
+ },
562
+
563
+ "beat_smplx_upper": {
564
+ #"pelvis":3,
565
+ # "left_hip":3,
566
+ # "right_hip":3,
567
+ "spine1":3,
568
+ # "left_knee":3,
569
+ # "right_knee":3,
570
+ "spine2":3,
571
+ # "left_ankle":3,
572
+ # "right_ankle":3,
573
+ "spine3":3,
574
+ # "left_foot":3,
575
+ # "right_foot":3,
576
+ "neck":3,
577
+ "left_collar":3,
578
+ "right_collar":3,
579
+ "head":3,
580
+ "left_shoulder":3,
581
+ "right_shoulder":3,
582
+ "left_elbow":3,
583
+ "right_elbow":3,
584
+ "left_wrist":3,
585
+ "right_wrist":3,
586
+ # "jaw":3,
587
+ # "left_eye_smplhf":3,
588
+ # "right_eye_smplhf":3,
589
+ # "left_index1":3,
590
+ # "left_index2":3,
591
+ # "left_index3":3,
592
+ # "left_middle1":3,
593
+ # "left_middle2":3,
594
+ # "left_middle3":3,
595
+ # "left_pinky1":3,
596
+ # "left_pinky2":3,
597
+ # "left_pinky3":3,
598
+ # "left_ring1":3,
599
+ # "left_ring2":3,
600
+ # "left_ring3":3,
601
+ # "left_thumb1":3,
602
+ # "left_thumb2":3,
603
+ # "left_thumb3":3,
604
+ # "right_index1":3,
605
+ # "right_index2":3,
606
+ # "right_index3":3,
607
+ # "right_middle1":3,
608
+ # "right_middle2":3,
609
+ # "right_middle3":3,
610
+ # "right_pinky1":3,
611
+ # "right_pinky2":3,
612
+ # "right_pinky3":3,
613
+ # "right_ring1":3,
614
+ # "right_ring2":3,
615
+ # "right_ring3":3,
616
+ # "right_thumb1":3,
617
+ # "right_thumb2":3,
618
+ # "right_thumb3":3,
619
+ },
620
+
621
+ "beat_smplx_hands": {
622
+ #"pelvis":3,
623
+ # "left_hip":3,
624
+ # "right_hip":3,
625
+ # "spine1":3,
626
+ # "left_knee":3,
627
+ # "right_knee":3,
628
+ # "spine2":3,
629
+ # "left_ankle":3,
630
+ # "right_ankle":3,
631
+ # "spine3":3,
632
+ # "left_foot":3,
633
+ # "right_foot":3,
634
+ # "neck":3,
635
+ # "left_collar":3,
636
+ # "right_collar":3,
637
+ # "head":3,
638
+ # "left_shoulder":3,
639
+ # "right_shoulder":3,
640
+ # "left_elbow":3,
641
+ # "right_elbow":3,
642
+ # "left_wrist":3,
643
+ # "right_wrist":3,
644
+ # "jaw":3,
645
+ # "left_eye_smplhf":3,
646
+ # "right_eye_smplhf":3,
647
+ "left_index1":3,
648
+ "left_index2":3,
649
+ "left_index3":3,
650
+ "left_middle1":3,
651
+ "left_middle2":3,
652
+ "left_middle3":3,
653
+ "left_pinky1":3,
654
+ "left_pinky2":3,
655
+ "left_pinky3":3,
656
+ "left_ring1":3,
657
+ "left_ring2":3,
658
+ "left_ring3":3,
659
+ "left_thumb1":3,
660
+ "left_thumb2":3,
661
+ "left_thumb3":3,
662
+ "right_index1":3,
663
+ "right_index2":3,
664
+ "right_index3":3,
665
+ "right_middle1":3,
666
+ "right_middle2":3,
667
+ "right_middle3":3,
668
+ "right_pinky1":3,
669
+ "right_pinky2":3,
670
+ "right_pinky3":3,
671
+ "right_ring1":3,
672
+ "right_ring2":3,
673
+ "right_ring3":3,
674
+ "right_thumb1":3,
675
+ "right_thumb2":3,
676
+ "right_thumb3":3,
677
+ },
678
+
679
+ "beat_smplx_lower": {
680
+ "pelvis":3,
681
+ "left_hip":3,
682
+ "right_hip":3,
683
+ # "spine1":3,
684
+ "left_knee":3,
685
+ "right_knee":3,
686
+ # "spine2":3,
687
+ "left_ankle":3,
688
+ "right_ankle":3,
689
+ # "spine3":3,
690
+ "left_foot":3,
691
+ "right_foot":3,
692
+ # "neck":3,
693
+ # "left_collar":3,
694
+ # "right_collar":3,
695
+ # "head":3,
696
+ # "left_shoulder":3,
697
+ # "right_shoulder":3,
698
+ # "left_elbow":3,
699
+ # "right_elbow":3,
700
+ # "left_wrist":3,
701
+ # "right_wrist":3,
702
+ # "jaw":3,
703
+ # "left_eye_smplhf":3,
704
+ # "right_eye_smplhf":3,
705
+ # "left_index1":3,
706
+ # "left_index2":3,
707
+ # "left_index3":3,
708
+ # "left_middle1":3,
709
+ # "left_middle2":3,
710
+ # "left_middle3":3,
711
+ # "left_pinky1":3,
712
+ # "left_pinky2":3,
713
+ # "left_pinky3":3,
714
+ # "left_ring1":3,
715
+ # "left_ring2":3,
716
+ # "left_ring3":3,
717
+ # "left_thumb1":3,
718
+ # "left_thumb2":3,
719
+ # "left_thumb3":3,
720
+ # "right_index1":3,
721
+ # "right_index2":3,
722
+ # "right_index3":3,
723
+ # "right_middle1":3,
724
+ # "right_middle2":3,
725
+ # "right_middle3":3,
726
+ # "right_pinky1":3,
727
+ # "right_pinky2":3,
728
+ # "right_pinky3":3,
729
+ # "right_ring1":3,
730
+ # "right_ring2":3,
731
+ # "right_ring3":3,
732
+ # "right_thumb1":3,
733
+ # "right_thumb2":3,
734
+ # "right_thumb3":3,
735
+ },
736
+
737
+ "beat_smplx_face": {
738
+ # "pelvis":3,
739
+ # "left_hip":3,
740
+ # "right_hip":3,
741
+ # # "spine1":3,
742
+ # "left_knee":3,
743
+ # "right_knee":3,
744
+ # # "spine2":3,
745
+ # "left_ankle":3,
746
+ # "right_ankle":3,
747
+ # # "spine3":3,
748
+ # "left_foot":3,
749
+ # "right_foot":3,
750
+ # "neck":3,
751
+ # "left_collar":3,
752
+ # "right_collar":3,
753
+ # "head":3,
754
+ # "left_shoulder":3,
755
+ # "right_shoulder":3,
756
+ # "left_elbow":3,
757
+ # "right_elbow":3,
758
+ # "left_wrist":3,
759
+ # "right_wrist":3,
760
+ "jaw":3,
761
+ # "left_eye_smplhf":3,
762
+ # "right_eye_smplhf":3,
763
+ # "left_index1":3,
764
+ # "left_index2":3,
765
+ # "left_index3":3,
766
+ # "left_middle1":3,
767
+ # "left_middle2":3,
768
+ # "left_middle3":3,
769
+ # "left_pinky1":3,
770
+ # "left_pinky2":3,
771
+ # "left_pinky3":3,
772
+ # "left_ring1":3,
773
+ # "left_ring2":3,
774
+ # "left_ring3":3,
775
+ # "left_thumb1":3,
776
+ # "left_thumb2":3,
777
+ # "left_thumb3":3,
778
+ # "right_index1":3,
779
+ # "right_index2":3,
780
+ # "right_index3":3,
781
+ # "right_middle1":3,
782
+ # "right_middle2":3,
783
+ # "right_middle3":3,
784
+ # "right_pinky1":3,
785
+ # "right_pinky2":3,
786
+ # "right_pinky3":3,
787
+ # "right_ring1":3,
788
+ # "right_ring2":3,
789
+ # "right_ring3":3,
790
+ # "right_thumb1":3,
791
+ # "right_thumb2":3,
792
+ # "right_thumb3":3,
793
+ },
794
+
795
+ "beat_joints": {
796
+ 'Hips': [6,6],
797
+ 'Spine': [3,9],
798
+ 'Spine1': [3,12],
799
+ 'Spine2': [3,15],
800
+ 'Spine3': [3,18],
801
+ 'Neck': [3,21],
802
+ 'Neck1': [3,24],
803
+ 'Head': [3,27],
804
+ 'HeadEnd': [3,30],
805
+
806
+ 'RShoulder': [3,33],
807
+ 'RArm': [3,36],
808
+ 'RArm1': [3,39],
809
+ 'RHand': [3,42],
810
+ 'RHandM1': [3,45],
811
+ 'RHandM2': [3,48],
812
+ 'RHandM3': [3,51],
813
+ 'RHandM4': [3,54],
814
+
815
+ 'RHandR': [3,57],
816
+ 'RHandR1': [3,60],
817
+ 'RHandR2': [3,63],
818
+ 'RHandR3': [3,66],
819
+ 'RHandR4': [3,69],
820
+
821
+ 'RHandP': [3,72],
822
+ 'RHandP1': [3,75],
823
+ 'RHandP2': [3,78],
824
+ 'RHandP3': [3,81],
825
+ 'RHandP4': [3,84],
826
+
827
+ 'RHandI': [3,87],
828
+ 'RHandI1': [3,90],
829
+ 'RHandI2': [3,93],
830
+ 'RHandI3': [3,96],
831
+ 'RHandI4': [3,99],
832
+
833
+ 'RHandT1': [3,102],
834
+ 'RHandT2': [3,105],
835
+ 'RHandT3': [3,108],
836
+ 'RHandT4': [3,111],
837
+
838
+ 'LShoulder': [3,114],
839
+ 'LArm': [3,117],
840
+ 'LArm1': [3,120],
841
+ 'LHand': [3,123],
842
+ 'LHandM1': [3,126],
843
+ 'LHandM2': [3,129],
844
+ 'LHandM3': [3,132],
845
+ 'LHandM4': [3,135],
846
+
847
+ 'LHandR': [3,138],
848
+ 'LHandR1': [3,141],
849
+ 'LHandR2': [3,144],
850
+ 'LHandR3': [3,147],
851
+ 'LHandR4': [3,150],
852
+
853
+ 'LHandP': [3,153],
854
+ 'LHandP1': [3,156],
855
+ 'LHandP2': [3,159],
856
+ 'LHandP3': [3,162],
857
+ 'LHandP4': [3,165],
858
+
859
+ 'LHandI': [3,168],
860
+ 'LHandI1': [3,171],
861
+ 'LHandI2': [3,174],
862
+ 'LHandI3': [3,177],
863
+ 'LHandI4': [3,180],
864
+
865
+ 'LHandT1': [3,183],
866
+ 'LHandT2': [3,186],
867
+ 'LHandT3': [3,189],
868
+ 'LHandT4': [3,192],
869
+
870
+ 'RUpLeg': [3,195],
871
+ 'RLeg': [3,198],
872
+ 'RFoot': [3,201],
873
+ 'RFootF': [3,204],
874
+ 'RToeBase': [3,207],
875
+ 'RToeBaseEnd': [3,210],
876
+
877
+ 'LUpLeg': [3,213],
878
+ 'LLeg': [3,216],
879
+ 'LFoot': [3,219],
880
+ 'LFootF': [3,222],
881
+ 'LToeBase': [3,225],
882
+ 'LToeBaseEnd': [3,228],},
883
+
884
+ "beat_full":{
885
+ 'Hips': 3,
886
+ 'Spine': 3 ,
887
+ 'Spine1': 3 ,
888
+ 'Spine2': 3 ,
889
+ 'Spine3': 3 ,
890
+ 'Neck': 3 ,
891
+ 'Neck1': 3 ,
892
+ 'Head' : 3,
893
+ 'HeadEnd' : 3,
894
+ 'RShoulder': 3 ,
895
+ 'RArm': 3 ,
896
+ 'RArm1': 3 ,
897
+ 'RHand': 3 ,
898
+ 'RHandM1': 3 ,
899
+ 'RHandM2': 3 ,
900
+ 'RHandM3': 3 ,
901
+ 'RHandM4': 3 ,
902
+ 'RHandR': 3 ,
903
+ 'RHandR1': 3 ,
904
+ 'RHandR2': 3 ,
905
+ 'RHandR3': 3 ,
906
+ 'RHandR4': 3 ,
907
+ 'RHandP': 3 ,
908
+ 'RHandP1': 3 ,
909
+ 'RHandP2': 3 ,
910
+ 'RHandP3': 3 ,
911
+ 'RHandP4': 3 ,
912
+ 'RHandI': 3 ,
913
+ 'RHandI1': 3 ,
914
+ 'RHandI2': 3 ,
915
+ 'RHandI3': 3 ,
916
+ 'RHandI4': 3 ,
917
+ 'RHandT1': 3 ,
918
+ 'RHandT2': 3 ,
919
+ 'RHandT3': 3 ,
920
+ 'RHandT4': 3 ,
921
+ 'LShoulder': 3 ,
922
+ 'LArm': 3 ,
923
+ 'LArm1': 3 ,
924
+ 'LHand': 3 ,
925
+ 'LHandM1': 3 ,
926
+ 'LHandM2': 3 ,
927
+ 'LHandM3': 3 ,
928
+ 'LHandM4': 3 ,
929
+ 'LHandR': 3 ,
930
+ 'LHandR1': 3 ,
931
+ 'LHandR2': 3 ,
932
+ 'LHandR3': 3 ,
933
+ 'LHandR4': 3 ,
934
+ 'LHandP': 3 ,
935
+ 'LHandP1': 3 ,
936
+ 'LHandP2': 3 ,
937
+ 'LHandP3': 3 ,
938
+ 'LHandP4': 3 ,
939
+ 'LHandI': 3 ,
940
+ 'LHandI1': 3 ,
941
+ 'LHandI2': 3 ,
942
+ 'LHandI3': 3 ,
943
+ 'LHandI4': 3 ,
944
+ 'LHandT1': 3 ,
945
+ 'LHandT2': 3 ,
946
+ 'LHandT3': 3 ,
947
+ 'LHandT4': 3 ,
948
+ 'RUpLeg': 3,
949
+ 'RLeg': 3,
950
+ 'RFoot': 3,
951
+ 'RFootF': 3,
952
+ 'RToeBase': 3,
953
+ 'RToeBaseEnd': 3,
954
+ 'LUpLeg': 3,
955
+ 'LLeg': 3,
956
+ 'LFoot': 3,
957
+ 'LFootF': 3,
958
+ 'LToeBase': 3,
959
+ 'LToeBaseEnd': 3,
960
+ },
961
+
962
+ "japanese_joints":{
963
+ 'Hips': [6,6],
964
+ 'Spine': [6,12],
965
+ 'Spine1': [6,18],
966
+ 'Spine2': [6,24],
967
+ 'Spine3': [6,30],
968
+ 'Neck': [6,36],
969
+ 'Neck1': [6,42],
970
+ 'Head': [6,48],
971
+ 'RShoulder': [6,54],
972
+ 'RArm': [6,60],
973
+ 'RArm1': [6,66],
974
+ 'RHand': [6,72],
975
+ 'RHandM1': [6,78],
976
+ 'RHandM2': [6,84],
977
+ 'RHandM3': [6,90],
978
+ 'RHandR': [6,96],
979
+ 'RHandR1': [6,102],
980
+ 'RHandR2': [6,108],
981
+ 'RHandR3': [6,114],
982
+ 'RHandP': [6,120],
983
+ 'RHandP1': [6,126],
984
+ 'RHandP2': [6,132],
985
+ 'RHandP3': [6,138],
986
+ 'RHandI': [6,144],
987
+ 'RHandI1': [6,150],
988
+ 'RHandI2': [6,156],
989
+ 'RHandI3': [6,162],
990
+ 'RHandT1': [6,168],
991
+ 'RHandT2': [6,174],
992
+ 'RHandT3': [6,180],
993
+ 'LShoulder': [6,186],
994
+ 'LArm': [6,192],
995
+ 'LArm1': [6,198],
996
+ 'LHand': [6,204],
997
+ 'LHandM1': [6,210],
998
+ 'LHandM2': [6,216],
999
+ 'LHandM3': [6,222],
1000
+ 'LHandR': [6,228],
1001
+ 'LHandR1': [6,234],
1002
+ 'LHandR2': [6,240],
1003
+ 'LHandR3': [6,246],
1004
+ 'LHandP': [6,252],
1005
+ 'LHandP1': [6,258],
1006
+ 'LHandP2': [6,264],
1007
+ 'LHandP3': [6,270],
1008
+ 'LHandI': [6,276],
1009
+ 'LHandI1': [6,282],
1010
+ 'LHandI2': [6,288],
1011
+ 'LHandI3': [6,294],
1012
+ 'LHandT1': [6,300],
1013
+ 'LHandT2': [6,306],
1014
+ 'LHandT3': [6,312],
1015
+ 'RUpLeg': [6,318],
1016
+ 'RLeg': [6,324],
1017
+ 'RFoot': [6,330],
1018
+ 'RFootF': [6,336],
1019
+ 'RToeBase': [6,342],
1020
+ 'LUpLeg': [6,348],
1021
+ 'LLeg': [6,354],
1022
+ 'LFoot': [6,360],
1023
+ 'LFootF': [6,366],
1024
+ 'LToeBase': [6,372],},
1025
+
1026
+ "yostar":{
1027
+ 'Hips': [6,6],
1028
+ 'Spine': [3,9],
1029
+ 'Spine1': [3,12],
1030
+ 'Bone040': [3,15],
1031
+ 'Bone041': [3,18],
1032
+
1033
+ 'Bone034': [3,21],
1034
+ 'Bone035': [3,24],
1035
+ 'Bone036': [3,27],
1036
+ 'Bone037': [3,30],
1037
+ 'Bone038': [3,33],
1038
+ 'Bone039': [3,36],
1039
+
1040
+ 'RibbonL1': [3,39],
1041
+ 'RibbonL1_end': [3,42],
1042
+
1043
+ 'Chest': [3,45],
1044
+ 'L_eri': [3,48],
1045
+ 'R_eri': [3,51],
1046
+ 'Neck': [3,54],
1047
+ 'Head': [3,57],
1048
+ 'Head_end': [3,60],
1049
+
1050
+ 'RBackHair_1': [3,63],
1051
+ 'RBackHair_2': [3,66],
1052
+ 'RBackHair_3': [3,69],
1053
+ 'RBackHair_4': [3,72],
1054
+ 'RBackHair_end': [3,75],
1055
+
1056
+ 'RFrontHair': [3,78],
1057
+ 'CFrontHair_1': [3,81],
1058
+ 'CFrontHair_2': [3,84],
1059
+ 'CFrontHair_3': [3,87],
1060
+ 'CFrontHair_emd': [3,90],
1061
+
1062
+ 'LFrontHair_1': [3,93],
1063
+ 'LFrontHair_2': [3,96],
1064
+ 'LFrontHair_3': [3,99],
1065
+
1066
+ 'LBackHair_1': [3,102],
1067
+ 'LBackHair_2': [3,105],
1068
+ 'LBackHair_3': [3,108],
1069
+ 'LBackHair_4': [3,111],
1070
+ 'LBackHair_end': [3,114],
1071
+
1072
+ 'LSideHair_1': [3,117],
1073
+ 'LSideHair_2': [3,120],
1074
+ 'LSideHair_3': [3,123],
1075
+ 'LSideHair_4': [3,126],
1076
+ 'LSideHair_5': [3,129],
1077
+ 'LSideHair_6': [3,132],
1078
+ 'LSideHair_7': [3,135],
1079
+ 'LSideHair_end': [3,138],
1080
+
1081
+ 'CBackHair_1': [3,141],
1082
+ 'CBackHair_2': [3,144],
1083
+ 'CBackHair_3': [3,147],
1084
+ 'CBackHair_4': [3,150],
1085
+ 'CBackHair_end': [3,153],
1086
+
1087
+ 'RSideHair_1': [3,156],
1088
+ 'RSideHair_2': [3,159],
1089
+ 'RSideHair_3': [3,162],
1090
+ 'RSideHair_4': [3,165],
1091
+
1092
+ 'RibbonR_1': [3,168],
1093
+ 'RibbonR_2': [3,171],
1094
+ 'RibbonR_3': [3,174],
1095
+
1096
+ 'RibbonL_1': [3,177],
1097
+ 'RibbonL_2': [3,180],
1098
+ 'RibbonL_3': [3,183],
1099
+
1100
+ 'LeftEye': [3,186],
1101
+ 'LeftEye_end': [3,189],
1102
+ 'RightEye': [3,192],
1103
+ 'RightEye_end': [3,195],
1104
+
1105
+ 'LeftShoulder': [3,198],
1106
+ 'LeftArm': [3,201],
1107
+ 'LeftForearm': [3,204],
1108
+ 'LeftHand': [3,207],
1109
+ 'LeftHandThumb1': [3,210],
1110
+ 'LeftHandThumb2': [3,213],
1111
+ 'LeftHandThumb3': [3,216],
1112
+ 'LeftHandThumb_end': [3,219],
1113
+
1114
+ 'LeftHandIndex1': [3,222],
1115
+ 'LeftHandIndex2': [3,225],
1116
+ 'LeftHandIndex3': [3,228],
1117
+ 'LeftHandIndex_end': [3,231],
1118
+
1119
+ 'LeftHandMiddle1': [3,234],
1120
+ 'LeftHandMiddle2': [3,237],
1121
+ 'LeftHandMiddle3': [3,240],
1122
+ 'LeftHandMiddle_end': [3,243],
1123
+
1124
+ 'LeftHandRing1': [3,246],
1125
+ 'LeftHandRing2': [3,249],
1126
+ 'LeftHandRing3': [3,252],
1127
+ 'LeftHandRing_end': [3,255],
1128
+
1129
+ 'LeftHandPinky1': [3,258],
1130
+ 'LeftHandPinky2': [3,261],
1131
+ 'LeftHandPinky3': [3,264],
1132
+ 'LeftHandPinky_end': [3,267],
1133
+
1134
+ 'RightShoulder': [3,270],
1135
+ 'RightArm': [3,273],
1136
+ 'RightForearm': [3,276],
1137
+ 'RightHand': [3,279],
1138
+ 'RightHandThumb1': [3,282],
1139
+ 'RightHandThumb2': [3,285],
1140
+ 'RightHandThumb3': [3,288],
1141
+ 'RightHandThumb_end': [3,291],
1142
+
1143
+ 'RightHandIndex1': [3,294],
1144
+ 'RightHandIndex2': [3,297],
1145
+ 'RightHandIndex3': [3,300],
1146
+ 'RightHandIndex_end': [3,303],
1147
+
1148
+ 'RightHandMiddle1': [3,306],
1149
+ 'RightHandMiddle2': [3,309],
1150
+ 'RightHandMiddle3': [3,312],
1151
+ 'RightHandMiddle_end': [3,315],
1152
+
1153
+ 'RightHandRing1': [3,318],
1154
+ 'RightHandRing2': [3,321],
1155
+ 'RightHandRing3': [3,324],
1156
+ 'RightHandRing_end': [3,327],
1157
+
1158
+ 'RightHandPinky1': [3,330],
1159
+ 'RightHandPinky2': [3,333],
1160
+ 'RightHandPinky3': [3,336],
1161
+ 'RightHandPinky_end': [3,339],
1162
+
1163
+ 'RibbonR1': [3,342],
1164
+ 'RibbonR1_end': [3,345],
1165
+ 'RibbonR2': [3,348],
1166
+ 'RibbonR2_end': [3,351],
1167
+ 'RibbonL2': [3,354],
1168
+ 'RibbonL2_end': [3,357],
1169
+
1170
+ 'LeftUpLeg': [3,360],
1171
+ 'LeftLeg': [3,363],
1172
+ 'LeftFoot': [3,366],
1173
+ 'LeftToe': [3,369],
1174
+ 'LeftToe_end': [3,372],
1175
+
1176
+ 'RightUpLeg': [3,375],
1177
+ 'RightLEg': [3,378],
1178
+ 'RightFoot': [3,381],
1179
+ 'RightToe': [3,384],
1180
+ 'RightToe_end': [3,387],
1181
+
1182
+ 'bone_skirtF00': [3, 390],
1183
+ 'bone_skirtF01': [3, 393],
1184
+ 'bone_skirtF02': [3, 396],
1185
+ 'bone_skirtF03': [3, 399],
1186
+ 'Bone020': [3, 402],
1187
+ 'Bone026': [3, 405],
1188
+
1189
+ 'bone_skirtF_R_00': [3, 408],
1190
+ 'bone_skirtF_R_01': [3, 411],
1191
+ 'bone_skirtF_R_02': [3, 414],
1192
+ 'bone_skirtF_R_03': [3, 417],
1193
+ 'Bone019': [3, 420],
1194
+ 'Bone028': [3, 423],
1195
+
1196
+ 'bone_skirtR00': [3, 426],
1197
+ 'bone_skirtR01': [3, 429],
1198
+ 'bone_skirtR02': [3, 432],
1199
+ 'bone_skirtR03': [3, 435],
1200
+ 'Bone018': [3, 438],
1201
+ 'Bone029': [3, 441],
1202
+
1203
+ 'bone_skirtF_L_00': [3, 444],
1204
+ 'bone_skirtF_L_01': [3, 447],
1205
+ 'bone_skirtF_L_02': [3, 450],
1206
+ 'bone_skirtF_L_03': [3, 453],
1207
+ 'Bone021': [3, 456],
1208
+ 'Bone027': [3, 459],
1209
+
1210
+ 'bone_skirtL00': [3, 462],
1211
+ 'bone_skirtL01': [3, 465],
1212
+ 'bone_skirtL02': [3, 468],
1213
+ 'bone_skirtL03': [3, 471],
1214
+ 'Bone022': [3, 474],
1215
+ 'Bone033': [3, 477],
1216
+
1217
+ 'bone_skirtB_L_00': [3, 480],
1218
+ 'bone_skirtB_L_01': [3, 483],
1219
+ 'bone_skirtB_L_02': [3, 486],
1220
+ 'bone_skirtB_L_03': [3, 489],
1221
+ 'Bone023': [3, 492],
1222
+ 'Bone032': [3, 495],
1223
+
1224
+ 'bone_skirtB00': [3, 498],
1225
+ 'bone_skirtB01': [3, 501],
1226
+ 'bone_skirtB02': [3, 504],
1227
+ 'bone_skirtB03': [3, 507],
1228
+ 'Bone024': [3, 510],
1229
+ 'Bone031': [3, 513],
1230
+
1231
+ 'bone_skirtB_R_00': [3, 516],
1232
+ 'bone_skirtB_R_01': [3, 519],
1233
+ 'bone_skirtB_R_02': [3, 521],
1234
+ 'bone_skirtB_R_03': [3, 524],
1235
+ 'Bone025': [3, 527],
1236
+ 'Bone030': [3, 530],
1237
+ },
1238
+
1239
+ "yostar_fullbody_213":{
1240
+ 'Hips': 3 ,
1241
+ 'Spine': 3 ,
1242
+ 'Spine1': 3 ,
1243
+ 'Chest': 3 ,
1244
+ 'L_eri': 3 ,
1245
+ 'R_eri': 3 ,
1246
+ 'Neck': 3 ,
1247
+ 'Head': 3 ,
1248
+ 'Head_end': 3 ,
1249
+
1250
+ 'LeftEye': 3,
1251
+ 'LeftEye_end': 3,
1252
+ 'RightEye': 3,
1253
+ 'RightEye_end': 3,
1254
+
1255
+ 'LeftShoulder': 3,
1256
+ 'LeftArm': 3,
1257
+ 'LeftForearm': 3,
1258
+ 'LeftHand': 3,
1259
+ 'LeftHandThumb1': 3,
1260
+ 'LeftHandThumb2': 3,
1261
+ 'LeftHandThumb3': 3,
1262
+ 'LeftHandThumb_end': 3,
1263
+
1264
+ 'LeftHandIndex1': 3,
1265
+ 'LeftHandIndex2': 3,
1266
+ 'LeftHandIndex3': 3,
1267
+ 'LeftHandIndex_end': 3,
1268
+
1269
+ 'LeftHandMiddle1': 3,
1270
+ 'LeftHandMiddle2': 3,
1271
+ 'LeftHandMiddle3': 3,
1272
+ 'LeftHandMiddle_end': 3,
1273
+
1274
+ 'LeftHandRing1': 3,
1275
+ 'LeftHandRing2': 3,
1276
+ 'LeftHandRing3': 3,
1277
+ 'LeftHandRing_end': 3,
1278
+
1279
+ 'LeftHandPinky1': 3,
1280
+ 'LeftHandPinky2': 3,
1281
+ 'LeftHandPinky3': 3,
1282
+ 'LeftHandPinky_end':3,
1283
+
1284
+ 'RightShoulder': 3,
1285
+ 'RightArm': 3,
1286
+ 'RightForearm': 3,
1287
+ 'RightHand': 3,
1288
+ 'RightHandThumb1': 3,
1289
+ 'RightHandThumb2': 3,
1290
+ 'RightHandThumb3': 3,
1291
+ 'RightHandThumb_end': 3,
1292
+
1293
+ 'RightHandIndex1': 3,
1294
+ 'RightHandIndex2': 3,
1295
+ 'RightHandIndex3': 3,
1296
+ 'RightHandIndex_end': 3,
1297
+
1298
+ 'RightHandMiddle1': 3,
1299
+ 'RightHandMiddle2': 3,
1300
+ 'RightHandMiddle3': 3,
1301
+ 'RightHandMiddle_end': 3,
1302
+
1303
+ 'RightHandRing1': 3,
1304
+ 'RightHandRing2': 3,
1305
+ 'RightHandRing3': 3,
1306
+ 'RightHandRing_end': 3,
1307
+
1308
+ 'RightHandPinky1': 3,
1309
+ 'RightHandPinky2': 3,
1310
+ 'RightHandPinky3': 3,
1311
+ 'RightHandPinky_end': 3,
1312
+
1313
+ 'LeftUpLeg': 3,
1314
+ 'LeftLeg': 3,
1315
+ 'LeftFoot': 3,
1316
+ 'LeftToe': 3,
1317
+ 'LeftToe_end': 3,
1318
+
1319
+ 'RightUpLeg': 3,
1320
+ 'RightLEg': 3,
1321
+ 'RightFoot': 3,
1322
+ 'RightToe': 3,
1323
+ 'RightToe_end': 3,
1324
+ },
1325
+ "yostar_mainbody_48": {
1326
+ #'Hips': 3 ,
1327
+ 'Spine': 3 ,
1328
+ 'Spine1': 3 ,
1329
+ 'Chest': 3 ,
1330
+ 'L_eri': 3 ,
1331
+ 'R_eri': 3 ,
1332
+ 'Neck': 3 ,
1333
+ 'Head': 3 ,
1334
+ 'Head_end': 3 ,
1335
+
1336
+ 'LeftShoulder': 3,
1337
+ 'LeftArm': 3,
1338
+ 'LeftForearm': 3,
1339
+ 'LeftHand': 3,
1340
+
1341
+ 'RightShoulder': 3,
1342
+ 'RightArm': 3,
1343
+ 'RightForearm': 3,
1344
+ 'RightHand': 3,
1345
+ },
1346
+ "yostar_mainbody_69": {
1347
+ 'Hips': 3 ,
1348
+ 'Spine': 3 ,
1349
+ 'Spine1': 3 ,
1350
+ 'Chest': 3 ,
1351
+ 'L_eri': 3 ,
1352
+ 'R_eri': 3 ,
1353
+ 'Neck': 3 ,
1354
+ 'Head': 3 ,
1355
+ 'Head_end': 3 ,
1356
+
1357
+ 'LeftShoulder': 3,
1358
+ 'LeftArm': 3,
1359
+ 'LeftForearm': 3,
1360
+ 'LeftHand': 3,
1361
+
1362
+ 'RightShoulder': 3,
1363
+ 'RightArm': 3,
1364
+ 'RightForearm': 3,
1365
+ 'RightHand': 3,
1366
+
1367
+ 'LeftUpLeg': 3,
1368
+ 'LeftLeg': 3,
1369
+ 'LeftFoot': 3,
1370
+
1371
+ 'RightUpLeg': 3,
1372
+ 'RightLEg': 3,
1373
+ 'RightFoot': 3,
1374
+ },
1375
+
1376
+ "yostar_upbody_168": {
1377
+ #'Hips': 3 ,
1378
+ 'Spine': 3 ,
1379
+ 'Spine1': 3 ,
1380
+ 'Chest': 3 ,
1381
+ 'L_eri': 3 ,
1382
+ 'R_eri': 3 ,
1383
+ 'Neck': 3 ,
1384
+ 'Head': 3 ,
1385
+ 'Head_end': 3 ,
1386
+
1387
+ 'LeftShoulder': 3,
1388
+ 'LeftArm': 3,
1389
+ 'LeftForearm': 3,
1390
+ 'LeftHand': 3,
1391
+ 'LeftHandThumb1': 3,
1392
+ 'LeftHandThumb2': 3,
1393
+ 'LeftHandThumb3': 3,
1394
+ 'LeftHandThumb_end': 3,
1395
+
1396
+ 'LeftHandIndex1': 3,
1397
+ 'LeftHandIndex2': 3,
1398
+ 'LeftHandIndex3': 3,
1399
+ 'LeftHandIndex_end': 3,
1400
+
1401
+ 'LeftHandMiddle1': 3,
1402
+ 'LeftHandMiddle2': 3,
1403
+ 'LeftHandMiddle3': 3,
1404
+ 'LeftHandMiddle_end': 3,
1405
+
1406
+ 'LeftHandRing1': 3,
1407
+ 'LeftHandRing2': 3,
1408
+ 'LeftHandRing3': 3,
1409
+ 'LeftHandRing_end': 3,
1410
+
1411
+ 'LeftHandPinky1': 3,
1412
+ 'LeftHandPinky2': 3,
1413
+ 'LeftHandPinky3': 3,
1414
+ 'LeftHandPinky_end':3,
1415
+
1416
+ 'RightShoulder': 3,
1417
+ 'RightArm': 3,
1418
+ 'RightForearm': 3,
1419
+ 'RightHand': 3,
1420
+ 'RightHandThumb1': 3,
1421
+ 'RightHandThumb2': 3,
1422
+ 'RightHandThumb3': 3,
1423
+ 'RightHandThumb_end': 3,
1424
+
1425
+ 'RightHandIndex1': 3,
1426
+ 'RightHandIndex2': 3,
1427
+ 'RightHandIndex3': 3,
1428
+ 'RightHandIndex_end': 3,
1429
+
1430
+ 'RightHandMiddle1': 3,
1431
+ 'RightHandMiddle2': 3,
1432
+ 'RightHandMiddle3': 3,
1433
+ 'RightHandMiddle_end': 3,
1434
+
1435
+ 'RightHandRing1': 3,
1436
+ 'RightHandRing2': 3,
1437
+ 'RightHandRing3': 3,
1438
+ 'RightHandRing_end': 3,
1439
+
1440
+ 'RightHandPinky1': 3,
1441
+ 'RightHandPinky2': 3,
1442
+ 'RightHandPinky3': 3,
1443
+ 'RightHandPinky_end': 3,
1444
+ },
1445
+ "spine_neck_141":{
1446
+ 'Spine': 3 ,
1447
+ 'Neck': 3 ,
1448
+ 'Neck1': 3 ,
1449
+ 'RShoulder': 3 ,
1450
+ 'RArm': 3 ,
1451
+ 'RArm1': 3 ,
1452
+ 'RHand': 3 ,
1453
+ 'RHandM1': 3 ,
1454
+ 'RHandM2': 3 ,
1455
+ 'RHandM3': 3 ,
1456
+ 'RHandR': 3 ,
1457
+ 'RHandR1': 3 ,
1458
+ 'RHandR2': 3 ,
1459
+ 'RHandR3': 3 ,
1460
+ 'RHandP': 3 ,
1461
+ 'RHandP1': 3 ,
1462
+ 'RHandP2': 3 ,
1463
+ 'RHandP3': 3 ,
1464
+ 'RHandI': 3 ,
1465
+ 'RHandI1': 3 ,
1466
+ 'RHandI2': 3 ,
1467
+ 'RHandI3': 3 ,
1468
+ 'RHandT1': 3 ,
1469
+ 'RHandT2': 3 ,
1470
+ 'RHandT3': 3 ,
1471
+ 'LShoulder': 3 ,
1472
+ 'LArm': 3 ,
1473
+ 'LArm1': 3 ,
1474
+ 'LHand': 3 ,
1475
+ 'LHandM1': 3 ,
1476
+ 'LHandM2': 3 ,
1477
+ 'LHandM3': 3 ,
1478
+ 'LHandR': 3 ,
1479
+ 'LHandR1': 3 ,
1480
+ 'LHandR2': 3 ,
1481
+ 'LHandR3': 3 ,
1482
+ 'LHandP': 3 ,
1483
+ 'LHandP1': 3 ,
1484
+ 'LHandP2': 3 ,
1485
+ 'LHandP3': 3 ,
1486
+ 'LHandI': 3 ,
1487
+ 'LHandI1': 3 ,
1488
+ 'LHandI2': 3 ,
1489
+ 'LHandI3': 3 ,
1490
+ 'LHandT1': 3 ,
1491
+ 'LHandT2': 3 ,
1492
+ 'LHandT3': 3 ,},
1493
+ }
1494
+
1495
+
1496
+ class FIDCalculator(object):
1497
+ '''
1498
+ todo
1499
+ '''
1500
+ def __init__(self):
1501
+ self.gt_rot = None # pandas dataframe for n frames * joints * 6
1502
+ self.gt_pos = None # n frames * (joints + 13) * 3
1503
+ self.op_rot = None # pandas dataframe for n frames * joints * 6
1504
+ self.op_pos = None # n frames * (joints + 13) * 3
1505
+
1506
+
1507
+ def load(self, path, load_type, save_pos=False):
1508
+ '''
1509
+ select gt or op for load_type
1510
+ '''
1511
+ parser = BVHParser()
1512
+ parsed_data = parser.parse(path)
1513
+ if load_type == 'gt':
1514
+ self.gt_rot = parsed_data.values
1515
+ elif load_type == 'op':
1516
+ self.op_rot = parsed_data.values
1517
+ else: print('error, select gt or op for load_type')
1518
+
1519
+ if save_pos:
1520
+ mp = MocapParameterizer('position')
1521
+ positions = mp.fit_transform([parsed_data])
1522
+ if load_type == 'gt':
1523
+ self.gt_pos = positions[0].values
1524
+ elif load_type == 'op':
1525
+ self.op_pos = positions[0].values
1526
+ else: print('error, select gt or op for load_type')
1527
+
1528
+
1529
+ def _joint_selector(self, selected_joints, ori_data):
1530
+ selected_data = pd.DataFrame(columns=[])
1531
+
1532
+ for joint_name in selected_joints:
1533
+ selected_data[joint_name] = ori_data[joint_name]
1534
+ return selected_data.to_numpy()
1535
+
1536
+
1537
+ def cal_vol(self, dtype):
1538
+ if dtype == 'pos':
1539
+ gt = self.gt_pos
1540
+ op = self.op_pos
1541
+ else:
1542
+ gt = self.gt_rot
1543
+ op = self.op_rot
1544
+
1545
+ gt_v = gt.to_numpy()[1:, :] - gt.to_numpy()[0:-1, :]
1546
+ op_v = op.to_numpy()[1:, :] - op.to_numpy()[0:-1, :]
1547
+ if dtype == 'pos':
1548
+ self.gt_vol_pos = pd.DataFrame(gt_v, columns = gt.columns.tolist())
1549
+ self.op_vol_pos = pd.DataFrame(op_v, columns = gt.columns.tolist())
1550
+ else:
1551
+ self.gt_vol_rot = pd.DataFrame(gt_v, columns = gt.columns.tolist())
1552
+ self.op_vol_rot = pd.DataFrame(op_v, columns = gt.columns.tolist())
1553
+
1554
+
1555
+ @staticmethod
1556
+ def frechet_distance(samples_A, samples_B):
1557
+ A_mu = np.mean(samples_A, axis=0)
1558
+ A_sigma = np.cov(samples_A, rowvar=False)
1559
+ B_mu = np.mean(samples_B, axis=0)
1560
+ B_sigma = np.cov(samples_B, rowvar=False)
1561
+ try:
1562
+ frechet_dist = FIDCalculator.calculate_frechet_distance(A_mu, A_sigma, B_mu, B_sigma)
1563
+ except ValueError:
1564
+ frechet_dist = 1e+10
1565
+ return frechet_dist
1566
+
1567
+
1568
+ @staticmethod
1569
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
1570
+ """ from https://github.com/mseitzer/pytorch-fid/blob/master/fid_score.py """
1571
+ """Numpy implementation of the Frechet Distance.
1572
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
1573
+ and X_2 ~ N(mu_2, C_2) is
1574
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
1575
+ Stable version by Dougal J. Sutherland.
1576
+ Params:
1577
+ -- mu1 : Numpy array containing the activations of a layer of the
1578
+ inception net (like returned by the function 'get_predictions')
1579
+ for generated samples.
1580
+ -- mu2 : The sample mean over activations, precalculated on an
1581
+ representative data set.
1582
+ -- sigma1: The covariance matrix over activations for generated samples.
1583
+ -- sigma2: The covariance matrix over activations, precalculated on an
1584
+ representative data set.
1585
+ Returns:
1586
+ -- : The Frechet Distance.
1587
+ """
1588
+
1589
+ mu1 = np.atleast_1d(mu1)
1590
+ mu2 = np.atleast_1d(mu2)
1591
+ #print(mu1[0], mu2[0])
1592
+ sigma1 = np.atleast_2d(sigma1)
1593
+ sigma2 = np.atleast_2d(sigma2)
1594
+ #print(sigma1[0], sigma2[0])
1595
+ assert mu1.shape == mu2.shape, \
1596
+ 'Training and test mean vectors have different lengths'
1597
+ assert sigma1.shape == sigma2.shape, \
1598
+ 'Training and test covariances have different dimensions'
1599
+
1600
+ diff = mu1 - mu2
1601
+
1602
+ # Product might be almost singular
1603
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
1604
+ #print(diff, covmean[0])
1605
+ if not np.isfinite(covmean).all():
1606
+ msg = ('fid calculation produces singular product; '
1607
+ 'adding %s to diagonal of cov estimates') % eps
1608
+ print(msg)
1609
+ offset = np.eye(sigma1.shape[0]) * eps
1610
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
1611
+
1612
+ # Numerical error might give slight imaginary component
1613
+ if np.iscomplexobj(covmean):
1614
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
1615
+ m = np.max(np.abs(covmean.imag))
1616
+ raise ValueError('Imaginary component {}'.format(m))
1617
+ covmean = covmean.real
1618
+
1619
+ tr_covmean = np.trace(covmean)
1620
+
1621
+ return (diff.dot(diff) + np.trace(sigma1) +
1622
+ np.trace(sigma2) - 2 * tr_covmean)
1623
+
1624
+
1625
+ def calculate_fid(self, cal_type, joint_type, high_level_opt):
1626
+
1627
+ if cal_type == 'pos':
1628
+ if self.gt_pos.shape != self.op_pos.shape:
1629
+ min_val = min(self.gt_pos.shape[0],self.op_pos.shape[0])
1630
+ gt = self.gt_pos[:min_val]
1631
+ op = self.op_pos[:min_val]
1632
+ else:
1633
+ gt = self.gt_pos
1634
+ op = self.op_pos
1635
+ full_body = gt.columns.tolist()
1636
+ elif cal_type == 'rot':
1637
+ if self.gt_rot.shape != self.op_rot.shape:
1638
+ min_val = min(self.gt_rot.shape[0],self.op_rot.shape[0])
1639
+ gt = self.gt_rot[:min_val]
1640
+ op = self.op_rot[:min_val]
1641
+ else:
1642
+ gt = self.gt_rot
1643
+ op = self.op_rot
1644
+ full_body_with_offset = gt.columns.tolist()
1645
+ full_body = [o for o in full_body_with_offset if ('position' not in o)]
1646
+ elif cal_type == 'pos_vol':
1647
+ assert self.gt_vol_pos.shape == self.op_vol_pos.shape
1648
+ gt = self.gt_vol_pos
1649
+ op = self.op_vol_pos
1650
+ full_body_with_offset = gt.columns.tolist()
1651
+ full_body = gt.columns.tolist()
1652
+ elif cal_type == 'rot_vol':
1653
+ assert self.gt_vol_rot.shape == self.op_vol_rot.shape
1654
+ gt = self.gt_vol_rot
1655
+ op = self.op_vol_rot
1656
+ full_body_with_offset = gt.columns.tolist()
1657
+ full_body = [o for o in full_body_with_offset if ('position' not in o)]
1658
+ #print(f'full_body contains {len(full_body)//3} joints')
1659
+
1660
+ if joint_type == 'full_upper_body':
1661
+ selected_body = [o for o in full_body if ('Leg' not in o) and ('Foot' not in o) and ('Toe' not in o)]
1662
+ elif joint_type == 'upper_body':
1663
+ selected_body = [o for o in full_body if ('Hand' not in o) and ('Leg' not in o) and ('Foot' not in o) and ('Toe' not in o)]
1664
+ elif joint_type == 'fingers':
1665
+ selected_body = [o for o in full_body if ('Hand' in o)]
1666
+ elif joint_type == 'indivdual':
1667
+ pass
1668
+ else: print('error, please select a correct joint type')
1669
+ #print(f'calculate fid for {len(selected_body)//3} joints')
1670
+
1671
+ gt = self._joint_selector(selected_body, gt)
1672
+ op = self._joint_selector(selected_body, op)
1673
+
1674
+ if high_level_opt == 'fid':
1675
+ fid = FIDCalculator.frechet_distance(gt, op)
1676
+ return fid
1677
+ elif high_level_opt == 'var':
1678
+ var_gt = gt.var()
1679
+ var_op = op.var()
1680
+ return var_gt, var_op
1681
+ elif high_level_opt == 'mean':
1682
+ mean_gt = gt.mean()
1683
+ mean_op = op.mean()
1684
+ return mean_gt, mean_op
1685
+ else: return 0
1686
+
1687
+
1688
+ def result2target_vis(pose_version, res_bvhlist, save_path, demo_name, verbose=True):
1689
+ if "trinity" in pose_version:
1690
+ ori_list = joints_list[pose_version[6:-4]]
1691
+ target_list = joints_list[pose_version[6:]]
1692
+ file_content_length = 336
1693
+ elif "beat" in pose_version or "spine_neck_141" in pose_version:
1694
+ ori_list = joints_list["beat_joints"]
1695
+ target_list = joints_list["spine_neck_141"]
1696
+ file_content_length = 431
1697
+ elif "yostar" in pose_version:
1698
+ ori_list = joints_list["yostar"]
1699
+ target_list = joints_list[pose_version]
1700
+ file_content_length = 1056
1701
+ else:
1702
+ ori_list = joints_list["japanese_joints"]
1703
+ target_list = joints_list[pose_version]
1704
+ file_content_length = 366
1705
+
1706
+ bvh_files_dirs = sorted(glob.glob(f'{res_bvhlist}*.bvh'), key=str)
1707
+ #test_seq_list = os.list_dir(demo_name).sort()
1708
+
1709
+ counter = 0
1710
+ if not os.path.exists(save_path):
1711
+ os.makedirs(save_path)
1712
+ for i, bvh_file_dir in enumerate(bvh_files_dirs):
1713
+ short_name = bvh_file_dir.split("/")[-1][11:]
1714
+ #print(short_name)
1715
+ wirte_file = open(os.path.join(save_path, f'res_{short_name}'),'w+')
1716
+ with open(f"{demo_name}{short_name}",'r') as pose_data_pre:
1717
+ pose_data_pre_file = pose_data_pre.readlines()
1718
+ for j, line in enumerate(pose_data_pre_file[0:file_content_length]):
1719
+ wirte_file.write(line)
1720
+ offset_data = pose_data_pre_file[file_content_length]
1721
+ offset_data = np.fromstring(offset_data, dtype=float, sep=' ')
1722
+ wirte_file.close()
1723
+
1724
+ wirte_file = open(os.path.join(save_path, f'res_{short_name}'),'r')
1725
+ ori_lines = wirte_file.readlines()
1726
+ with open(bvh_file_dir, 'r') as pose_data:
1727
+ pose_data_file = pose_data.readlines()
1728
+ ori_lines[file_content_length-2] = 'Frames: ' + str(len(pose_data_file)-1) + '\n'
1729
+ wirte_file.close()
1730
+
1731
+ wirte_file = open(os.path.join(save_path, f'res_{short_name}'),'w+')
1732
+ wirte_file.writelines(i for i in ori_lines[:file_content_length])
1733
+ wirte_file.close()
1734
+
1735
+ with open(os.path.join(save_path, f'res_{short_name}'),'a+') as wirte_file:
1736
+ with open(bvh_file_dir, 'r') as pose_data:
1737
+ data_each_file = []
1738
+ pose_data_file = pose_data.readlines()
1739
+ for j, line in enumerate(pose_data_file):
1740
+ if not j:
1741
+ pass
1742
+ else:
1743
+ data = np.fromstring(line, dtype=float, sep=' ')
1744
+ data_rotation = offset_data.copy()
1745
+ for iii, (k, v) in enumerate(target_list.items()): # here is 147 rotations by 3
1746
+ #print(data_rotation[ori_list[k][1]-v:ori_list[k][1]], data[iii*3:iii*3+3])
1747
+ data_rotation[ori_list[k][1]-v:ori_list[k][1]] = data[iii*3:iii*3+3]
1748
+ data_each_file.append(data_rotation)
1749
+
1750
+ for line_data in data_each_file:
1751
+ line_data = np.array2string(line_data, max_line_width=np.inf, precision=6, suppress_small=False, separator=' ')
1752
+ wirte_file.write(line_data[1:-2]+'\n')
1753
+
1754
+ counter += 1
1755
+ if verbose:
1756
+ logger.info(f'data_shape: {data_rotation.shape}, process: {counter} / {len(bvh_files_dirs)}')
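
Since data_tools.py ends with the FID utilities, a small numeric check of the Frechet formula in the docstring, d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)), may help. This is a standalone re-implementation of the same equation rather than a call into FIDCalculator (so it avoids the module's BVH and fasttext imports); the sample sizes and the 0.5 mean shift are arbitrary illustration values:

```python
import numpy as np
from scipy import linalg

def frechet_distance(samples_a, samples_b):
    # same formula as FIDCalculator.calculate_frechet_distance, without the stability guards
    mu1, mu2 = samples_a.mean(axis=0), samples_b.mean(axis=0)
    c1 = np.cov(samples_a, rowvar=False)
    c2 = np.cov(samples_b, rowvar=False)
    covmean = linalg.sqrtm(c1 @ c2).real
    diff = mu1 - mu2
    return diff @ diff + np.trace(c1) + np.trace(c2) - 2.0 * np.trace(covmean)

rng = np.random.default_rng(0)
a = rng.normal(0.0, 1.0, size=(2000, 4))   # reference samples
b = rng.normal(0.5, 1.0, size=(2000, 4))   # same covariance, mean shifted by 0.5 per dim

print(frechet_distance(a, a))   # ~0: identical sample sets
print(frechet_distance(a, b))   # ~1: ||mu1 - mu2||^2 = 4 * 0.5^2, covariance terms nearly cancel
```
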
dataloaders/mix_sep.py ADDED
@@ -0,0 +1,301 @@
1
+ import os
2
+ import pickle
3
+ import math
4
+ import shutil
5
+ import numpy as np
6
+ import lmdb as lmdb
7
+ import textgrid as tg
8
+ import pandas as pd
9
+ import torch
10
+ import glob
11
+ import json
12
+ from termcolor import colored
13
+ from loguru import logger
14
+ from collections import defaultdict
15
+ from torch.utils.data import Dataset
16
+ import torch.distributed as dist
17
+ #import pyarrow
18
+ import pickle
19
+ import librosa
20
+ import smplx
21
+ import glob
22
+
23
+ from .build_vocab import Vocab
24
+ from .utils.audio_features import Wav2Vec2Model
25
+ from .data_tools import joints_list
26
+ from .utils import rotation_conversions as rc
27
+ from .utils import other_tools
28
+
29
+
30
+ class CustomDataset(Dataset):
31
+ def __init__(self, args, loader_type, augmentation=None, kwargs=None, build_cache=True):
32
+ self.args = args
33
+ self.loader_type = loader_type
34
+
35
+ self.rank = 0
36
+ self.ori_stride = self.args.stride
37
+ self.ori_length = self.args.pose_length
38
+
39
+ self.ori_joint_list = joints_list[self.args.ori_joints]
40
+ self.tar_joint_list = joints_list[self.args.tar_joints]
41
+ if 'smplx' in self.args.pose_rep:
42
+ self.joint_mask = np.zeros(len(list(self.ori_joint_list.keys()))*3)
43
+ self.joints = len(list(self.tar_joint_list.keys()))
44
+ for joint_name in self.tar_joint_list:
45
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
46
+ else:
47
+ self.joints = len(list(self.ori_joint_list.keys()))+1
48
+ self.joint_mask = np.zeros(self.joints*3)
49
+ for joint_name in self.tar_joint_list:
50
+ if joint_name == "Hips":
51
+ self.joint_mask[3:6] = 1
52
+ else:
53
+ self.joint_mask[self.ori_joint_list[joint_name][1] - self.ori_joint_list[joint_name][0]:self.ori_joint_list[joint_name][1]] = 1
54
+ # select trainable joints
55
+
56
+ split_rule = pd.read_csv(args.data_path+"train_test_split.csv")
57
+ self.selected_file = split_rule.loc[(split_rule['type'] == loader_type) & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
58
+ if args.additional_data and loader_type == 'train':
59
+ split_b = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
60
+ #self.selected_file = split_rule.loc[(split_rule['type'] == 'additional') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
61
+ self.selected_file = pd.concat([self.selected_file, split_b])
62
+ if self.selected_file.empty:
63
+ logger.warning(f"{loader_type} is empty for speaker {self.args.training_speakers}, use train set 0-8 instead")
64
+ self.selected_file = split_rule.loc[(split_rule['type'] == 'train') & (split_rule['id'].str.split("_").str[0].astype(int).isin(self.args.training_speakers))]
65
+ self.selected_file = self.selected_file.iloc[0:8]
66
+ self.data_dir = args.data_path
67
+ self.beatx_during_time = 0
68
+
69
+ if loader_type == "test":
70
+ self.args.multi_length_training = [1.0]
71
+ self.max_length = int(args.pose_length * self.args.multi_length_training[-1])
72
+ self.max_audio_pre_len = math.floor(args.pose_length / args.pose_fps * self.args.audio_sr)
73
+ if self.max_audio_pre_len > self.args.test_length*self.args.audio_sr:
74
+ self.max_audio_pre_len = self.args.test_length*self.args.audio_sr
75
+ preloaded_dir = self.args.root_path + self.args.cache_path + loader_type + f"/{args.pose_rep}_cache"
76
+
77
+
78
+ if build_cache and self.rank == 0:
79
+ self.build_cache(preloaded_dir)
80
+ self.lmdb_env = lmdb.open(preloaded_dir, readonly=True, lock=False)
81
+ with self.lmdb_env.begin() as txn:
82
+ self.n_samples = txn.stat()["entries"]
83
+
84
+ self.norm = True
85
+ self.mean = np.load('./mean_std/beatx_2_330_mean.npy')
86
+ self.std = np.load('./mean_std/beatx_2_330_std.npy')
87
+
88
+ self.trans_mean = np.load('./mean_std/beatx_2_trans_mean.npy')
89
+ self.trans_std = np.load('./mean_std/beatx_2_trans_std.npy')
90
+
91
+ def build_cache(self, preloaded_dir):
92
+ logger.info(f"Audio bit rate: {self.args.audio_fps}")
93
+ logger.info("Reading data '{}'...".format(self.data_dir))
94
+ logger.info("Creating the dataset cache...")
95
+ if self.args.new_cache:
96
+ if os.path.exists(preloaded_dir):
97
+ shutil.rmtree(preloaded_dir)
98
+ if os.path.exists(preloaded_dir):
99
+ logger.info("Found the cache {}".format(preloaded_dir))
100
+ elif self.loader_type == "test":
101
+ self.cache_generation(
102
+ preloaded_dir, True,
103
+ 0, 0,
104
+ is_test=True)
105
+ else:
106
+ self.cache_generation(
107
+ preloaded_dir, self.args.disable_filtering,
108
+ self.args.clean_first_seconds, self.args.clean_final_seconds,
109
+ is_test=False)
110
+ logger.info(f"BEATX during time is {self.beatx_during_time}s !")
111
+
112
+
113
+ def __len__(self):
114
+ return self.n_samples
115
+
116
+
117
+ def cache_generation(self, out_lmdb_dir, disable_filtering, clean_first_seconds, clean_final_seconds, is_test=False):
118
+ self.n_out_samples = 0
119
+ # create db for samples
120
+ if not os.path.exists(out_lmdb_dir): os.makedirs(out_lmdb_dir)
121
+ dst_lmdb_env = lmdb.open(out_lmdb_dir, map_size= int(1024 ** 3 * 50))# 50G
122
+ n_filtered_out = defaultdict(int)
123
+
124
+ for index, file_name in self.selected_file.iterrows():
125
+ f_name = file_name["id"]
126
+ ext = ".npz" if "smplx" in self.args.pose_rep else ".bvh"
127
+ pose_file = self.data_dir + self.args.pose_rep + "/" + f_name + ext
128
+ pose_each_file = []
129
+ trans_each_file = []
130
+ trans_v_each_file = []
131
+ shape_each_file = []
132
+ audio_each_file = []
133
+ facial_each_file = []
134
+ word_each_file = []
135
+ emo_each_file = []
136
+ sem_each_file = []
137
+ vid_each_file = []
138
+ id_pose = f_name #1_wayne_0_1_1
139
+
140
+ logger.info(colored(f"# ---- Building cache for Pose {id_pose} ---- #", "blue"))
141
+ if "smplx" in self.args.pose_rep:
142
+ pose_data = np.load(pose_file, allow_pickle=True)
143
+ assert 30%self.args.pose_fps == 0, 'pose_fps should be an aliquot part of 30'
144
+ stride = int(30/self.args.pose_fps)
145
+ pose_each_file = pose_data["poses"][::stride] * self.joint_mask
146
+ pose_each_file = pose_each_file[:, self.joint_mask.astype(bool)]
147
+
148
+ self.beatx_during_time += pose_each_file.shape[0]/30
149
+ trans_each_file = pose_data["trans"][::stride]
150
+ trans_each_file[:,0] = trans_each_file[:,0] - trans_each_file[0,0]
151
+ trans_each_file[:,2] = trans_each_file[:,2] - trans_each_file[0,2]
152
+ trans_v_each_file = np.zeros_like(trans_each_file)
153
+ trans_v_each_file[1:,0] = trans_each_file[1:,0] - trans_each_file[:-1,0]
154
+ trans_v_each_file[0,0] = trans_v_each_file[1,0]
155
+ trans_v_each_file[1:,2] = trans_each_file[1:,2] - trans_each_file[:-1,2]
156
+ trans_v_each_file[0,2] = trans_v_each_file[1,2]
157
+ trans_v_each_file[:,1] = trans_each_file[:,1]
158
+
159
+
160
+ shape_each_file = np.repeat(pose_data["betas"].reshape(1, 300), pose_each_file.shape[0], axis=0)
161
+ if self.args.facial_rep is not None:
162
+ logger.info(f"# ---- Building cache for Facial {id_pose} and Pose {id_pose} ---- #")
163
+ facial_each_file = pose_data["expressions"][::stride]
164
+ if self.args.facial_norm:
165
+ facial_each_file = (facial_each_file - self.mean_facial) / self.std_facial
166
+
167
+ if self.args.id_rep is not None:
168
+ vid_each_file = np.repeat(np.array(int(f_name.split("_")[0])-1).reshape(1, 1), pose_each_file.shape[0], axis=0)
169
+
170
+ filtered_result = self._sample_from_clip(
171
+ dst_lmdb_env,
172
+ pose_each_file, trans_each_file,trans_v_each_file, shape_each_file, facial_each_file,
173
+ vid_each_file,
174
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
175
+ )
176
+ for type in filtered_result.keys():
177
+ n_filtered_out[type] += filtered_result[type]
178
+
179
+ with dst_lmdb_env.begin() as txn:
180
+ logger.info(colored(f"no. of samples: {txn.stat()['entries']}", "cyan"))
181
+ n_total_filtered = 0
182
+ for type, n_filtered in n_filtered_out.items():
183
+ logger.info("{}: {}".format(type, n_filtered))
184
+ n_total_filtered += n_filtered
185
+ logger.info(colored("no. of excluded samples: {} ({:.1f}%)".format(
186
+ n_total_filtered, 100 * n_total_filtered / (txn.stat()["entries"] + n_total_filtered)), "cyan"))
187
+ dst_lmdb_env.sync()
188
+ dst_lmdb_env.close()
189
+
190
+ def _sample_from_clip(
191
+ self, dst_lmdb_env, pose_each_file, trans_each_file, trans_v_each_file, shape_each_file, facial_each_file,
192
+ vid_each_file,
193
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
194
+ ):
195
+ """
196
+ for data cleaning, we ignore the data for the first and final n seconds
197
+ for test, we return all data
198
+ """
199
+
200
+ round_seconds_skeleton = pose_each_file.shape[0] // self.args.pose_fps # assume 1500 frames / 15 fps = 100 s
201
+ #print(round_seconds_skeleton)
202
+
203
+ clip_s_t, clip_e_t = clean_first_seconds, round_seconds_skeleton - clean_final_seconds # assume [10, 90]s
204
+ clip_s_f_audio, clip_e_f_audio = self.args.audio_fps * clip_s_t, clip_e_t * self.args.audio_fps # [160,000,90*160,000]
205
+ clip_s_f_pose, clip_e_f_pose = clip_s_t * self.args.pose_fps, clip_e_t * self.args.pose_fps # [150,90*15]
206
+
207
+
208
+ for ratio in self.args.multi_length_training:
209
+ if is_test:# stride = length for test
210
+ cut_length = clip_e_f_pose - clip_s_f_pose
211
+ self.args.stride = cut_length
212
+ self.max_length = cut_length
213
+ else:
214
+ self.args.stride = int(ratio*self.ori_stride)
215
+ cut_length = int(self.ori_length*ratio)
216
+
217
+ num_subdivision = math.floor((clip_e_f_pose - clip_s_f_pose - cut_length) / self.args.stride) + 1
218
+ logger.info(f"pose from frame {clip_s_f_pose} to {clip_e_f_pose}, length {cut_length}")
219
+ logger.info(f"{num_subdivision} clips is expected with stride {self.args.stride}")
220
+
221
+
222
+ n_filtered_out = defaultdict(int)
223
+ sample_pose_list = []
224
+ sample_face_list = []
225
+ sample_shape_list = []
226
+ sample_vid_list = []
227
+ sample_trans_list = []
228
+ sample_trans_v_list = []
229
+
230
+ for i in range(num_subdivision): # cut into clips of cut_length frames (~2 s each)
231
+ start_idx = clip_s_f_pose + i * self.args.stride
232
+ fin_idx = start_idx + cut_length
233
+ sample_pose = pose_each_file[start_idx:fin_idx]
234
+ sample_trans = trans_each_file[start_idx:fin_idx]
235
+ sample_trans_v = trans_v_each_file[start_idx:fin_idx]
236
+ sample_shape = shape_each_file[start_idx:fin_idx]
237
+ sample_face = facial_each_file[start_idx:fin_idx]
238
+ # print(sample_pose.shape)
239
+ sample_vid = vid_each_file[start_idx:fin_idx] if self.args.id_rep is not None else np.array([-1])
240
+
241
+ if sample_pose is not None:
242
+ sample_pose_list.append(sample_pose)
243
+
244
+ sample_shape_list.append(sample_shape)
245
+
246
+ sample_vid_list.append(sample_vid)
247
+ sample_face_list.append(sample_face)
248
+
249
+
250
+ sample_trans_list.append(sample_trans)
251
+ sample_trans_v_list.append(sample_trans_v)
252
+
253
+ if len(sample_pose_list) > 0:
254
+ with dst_lmdb_env.begin(write=True) as txn:
255
+ for pose, shape, face, vid, trans,trans_v in zip(
256
+ sample_pose_list,
257
+ sample_shape_list,
258
+ sample_face_list,
259
+ sample_vid_list,
260
+ sample_trans_list,
261
+ sample_trans_v_list,
262
+ ):
263
+ k = "{:005}".format(self.n_out_samples).encode("ascii")
264
+ v = [pose , shape, face, vid, trans,trans_v]
265
+ v = pickle.dumps(v,5)
266
+ txn.put(k, v)
267
+ self.n_out_samples += 1
268
+ return n_filtered_out
269
+
270
+ def __getitem__(self, idx):
271
+ with self.lmdb_env.begin(write=False) as txn:
272
+ key = "{:005}".format(idx).encode("ascii")
273
+ sample = txn.get(key)
274
+ sample = pickle.loads(sample)
275
+ tar_pose, in_shape, tar_face, vid, trans,trans_v = sample
276
+ tar_pose = torch.from_numpy(tar_pose).float()
277
+ tar_face = torch.from_numpy(tar_face).float()
278
+ tar_pose = rc.axis_angle_to_matrix(tar_pose.reshape(-1, 55, 3))
279
+ tar_pose = rc.matrix_to_rotation_6d(tar_pose).reshape(-1, 55*6)
280
+
281
+ if self.norm:
282
+ tar_pose = (tar_pose - self.mean) / self.std
283
+ trans_v = (trans_v-self.trans_mean)/self.trans_std
284
+
285
+ if self.loader_type == "test":
286
+ tar_pose = tar_pose.float()
287
+ trans = torch.from_numpy(trans).float()
288
+ trans_v = torch.from_numpy(trans_v).float()
289
+ vid = torch.from_numpy(vid).float()
290
+ in_shape = torch.from_numpy(in_shape).float()
291
+ tar_pose = torch.cat([tar_pose, trans_v], dim=1)
292
+ tar_pose = torch.cat([tar_pose, tar_face], dim=1)
293
+ else:
294
+ in_shape = torch.from_numpy(in_shape).reshape((in_shape.shape[0], -1)).float()
295
+ trans = torch.from_numpy(trans).reshape((trans.shape[0], -1)).float()
296
+ trans_v = torch.from_numpy(trans_v).reshape((trans_v.shape[0], -1)).float()
297
+ vid = torch.from_numpy(vid).reshape((vid.shape[0], -1)).float()
298
+ tar_pose = tar_pose.reshape((tar_pose.shape[0], -1)).float()
299
+ tar_pose = torch.cat([tar_pose, trans_v], dim=1)
300
+ tar_pose = torch.cat([tar_pose, tar_face], dim=1)
301
+ return tar_pose
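A rough usage sketch for this dataset follows. It is not part of the upload: the argument values are placeholders that have to match the local BEAT2 layout and the project's YAML configs, the `mean_std/*.npy` files must exist, and each returned frame concatenates 6D rotations, translation velocity, and facial expressions.

from types import SimpleNamespace
from torch.utils.data import DataLoader
from dataloaders.mix_sep import CustomDataset

# Placeholder arguments; the real values come from the YAML configs.
args = SimpleNamespace(
    data_path="./datasets/BEAT2/beat_english_v2.0.0/", root_path="./",
    cache_path="datasets/beat_cache/", new_cache=False,
    pose_rep="smplxflame_30", ori_joints="beat_smplx_joints", tar_joints="beat_smplx_full",
    pose_fps=30, pose_length=128, stride=20, test_length=128,
    audio_sr=16000, audio_fps=16000,
    facial_rep="smplxflame_30", facial_norm=False, id_rep="onehot",
    training_speakers=[2], additional_data=False, multi_length_training=[1.0],
    disable_filtering=False, clean_first_seconds=0, clean_final_seconds=0,
)

train_set = CustomDataset(args, "train")                       # builds or reuses the LMDB cache
loader = DataLoader(train_set, batch_size=32, shuffle=True, drop_last=True)
for tar_pose in loader:                                        # (B, T, rot6d + trans_v + expressions)
    print(tar_pose.shape)
    break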
dataloaders/pymo/Quaternions.py ADDED
@@ -0,0 +1,468 @@
1
+ import numpy as np
2
+
3
+ class Quaternions:
4
+ """
5
+ Quaternions is a wrapper around a numpy ndarray
6
+ that allows it to act as if it were an ndarray of
7
+ a quaternion data type.
8
+
9
+ Therefore addition, subtraction, multiplication,
10
+ division, negation, absolute, are all defined
11
+ in terms of quaternion operations such as quaternion
12
+ multiplication.
13
+
14
+ This allows for much neater code and many routines
15
+ which conceptually do the same thing to be written
16
+ in the same way for point data and for rotation data.
17
+
18
+ The Quaternions class has been designed such that it
19
+ should support broadcasting and slicing in all of the
20
+ usual ways.
21
+ """
22
+
23
+ def __init__(self, qs):
24
+ if isinstance(qs, np.ndarray):
25
+
26
+ if len(qs.shape) == 1: qs = np.array([qs])
27
+ self.qs = qs
28
+ return
29
+
30
+ if isinstance(qs, Quaternions):
31
+ self.qs = qs.qs
32
+ return
33
+
34
+ raise TypeError('Quaternions must be constructed from iterable, numpy array, or Quaternions, not %s' % type(qs))
35
+
36
+ def __str__(self): return "Quaternions("+ str(self.qs) + ")"
37
+ def __repr__(self): return "Quaternions("+ repr(self.qs) + ")"
38
+
39
+ """ Helper Methods for Broadcasting and Data extraction """
40
+
41
+ @classmethod
42
+ def _broadcast(cls, sqs, oqs, scalar=False):
43
+
44
+ if isinstance(oqs, float): return sqs, oqs * np.ones(sqs.shape[:-1])
45
+
46
+ ss = np.array(sqs.shape) if not scalar else np.array(sqs.shape[:-1])
47
+ os = np.array(oqs.shape)
48
+
49
+ if len(ss) != len(os):
50
+ raise TypeError('Quaternions cannot broadcast together shapes %s and %s' % (sqs.shape, oqs.shape))
51
+
52
+ if np.all(ss == os): return sqs, oqs
53
+
54
+ if not np.all((ss == os) | (os == np.ones(len(os))) | (ss == np.ones(len(ss)))):
55
+ raise TypeError('Quaternions cannot broadcast together shapes %s and %s' % (sqs.shape, oqs.shape))
56
+
57
+ sqsn, oqsn = sqs.copy(), oqs.copy()
58
+
59
+ for a in np.where(ss == 1)[0]: sqsn = sqsn.repeat(os[a], axis=a)
60
+ for a in np.where(os == 1)[0]: oqsn = oqsn.repeat(ss[a], axis=a)
61
+
62
+ return sqsn, oqsn
63
+
64
+ """ Adding Quaterions is just Defined as Multiplication """
65
+
66
+ def __add__(self, other): return self * other
67
+ def __sub__(self, other): return self / other
68
+
69
+ """ Quaterion Multiplication """
70
+
71
+ def __mul__(self, other):
72
+ """
73
+ Quaternion multiplication has three main methods.
74
+
75
+ When multiplying a Quaternions array by Quaternions
76
+ normal quaternion multiplication is performed.
77
+
78
+ When multiplying a Quaternions array by a vector
79
+ array of the same shape, where the last axis is 3,
80
+ it is assumed to be a Quaternion by 3D-Vector
81
+ multiplication and the 3D-Vectors are rotated
82
+ in space by the Quaternions.
83
+
84
+ When multiplying a Quaternions array by a scalar
85
+ or vector of different shape it is assumed to be
86
+ a Quaternions by Scalars multiplication and the
87
+ Quaternions are scaled using Slerp and the identity
88
+ quaternions.
89
+ """
90
+
91
+ """ If Quaternions type do Quaternions * Quaternions """
92
+ if isinstance(other, Quaternions):
93
+
94
+ sqs, oqs = Quaternions._broadcast(self.qs, other.qs)
95
+
96
+ q0 = sqs[...,0]; q1 = sqs[...,1];
97
+ q2 = sqs[...,2]; q3 = sqs[...,3];
98
+ r0 = oqs[...,0]; r1 = oqs[...,1];
99
+ r2 = oqs[...,2]; r3 = oqs[...,3];
100
+
101
+ qs = np.empty(sqs.shape)
102
+ qs[...,0] = r0 * q0 - r1 * q1 - r2 * q2 - r3 * q3
103
+ qs[...,1] = r0 * q1 + r1 * q0 - r2 * q3 + r3 * q2
104
+ qs[...,2] = r0 * q2 + r1 * q3 + r2 * q0 - r3 * q1
105
+ qs[...,3] = r0 * q3 - r1 * q2 + r2 * q1 + r3 * q0
106
+
107
+ return Quaternions(qs)
108
+
109
+ """ If array type do Quaternions * Vectors """
110
+ if isinstance(other, np.ndarray) and other.shape[-1] == 3:
111
+ vs = Quaternions(np.concatenate([np.zeros(other.shape[:-1] + (1,)), other], axis=-1))
112
+ return (self * (vs * -self)).imaginaries
113
+
114
+ """ If float do Quaternions * Scalars """
115
+ if isinstance(other, np.ndarray) or isinstance(other, float):
116
+ return Quaternions.slerp(Quaternions.id_like(self), self, other)
117
+
118
+ raise TypeError('Cannot multiply/add Quaternions with type %s' % str(type(other)))
119
+
120
+ def __div__(self, other):
121
+ """
122
+ When a Quaternion type is supplied, division is defined
123
+ as multiplication by the inverse of that Quaternion.
124
+
125
+ When a scalar or vector is supplied it is defined
126
+ as multiplication of one over the supplied value.
127
+ Essentially a scaling.
128
+ """
129
+
130
+ if isinstance(other, Quaternions): return self * (-other)
131
+ if isinstance(other, np.ndarray): return self * (1.0 / other)
132
+ if isinstance(other, float): return self * (1.0 / other)
133
+ raise TypeError('Cannot divide/subtract Quaternions with type %s' % str(type(other)))
+
+ __truediv__ = __div__ # Python 3 maps the / operator to __truediv__
134
+
135
+ def __eq__(self, other): return self.qs == other.qs
136
+ def __ne__(self, other): return self.qs != other.qs
137
+
138
+ def __neg__(self):
139
+ """ Invert Quaternions """
140
+ return Quaternions(self.qs * np.array([[1, -1, -1, -1]]))
141
+
142
+ def __abs__(self):
143
+ """ Unify Quaternions To Single Pole """
144
+ qabs = self.normalized().copy()
145
+ top = np.sum(( qabs.qs) * np.array([1,0,0,0]), axis=-1)
146
+ bot = np.sum((-qabs.qs) * np.array([1,0,0,0]), axis=-1)
147
+ qabs.qs[top < bot] = -qabs.qs[top < bot]
148
+ return qabs
149
+
150
+ def __iter__(self): return iter(self.qs)
151
+ def __len__(self): return len(self.qs)
152
+
153
+ def __getitem__(self, k): return Quaternions(self.qs[k])
154
+ def __setitem__(self, k, v): self.qs[k] = v.qs
155
+
156
+ @property
157
+ def lengths(self):
158
+ return np.sum(self.qs**2.0, axis=-1)**0.5
159
+
160
+ @property
161
+ def reals(self):
162
+ return self.qs[...,0]
163
+
164
+ @property
165
+ def imaginaries(self):
166
+ return self.qs[...,1:4]
167
+
168
+ @property
169
+ def shape(self): return self.qs.shape[:-1]
170
+
171
+ def repeat(self, n, **kwargs):
172
+ return Quaternions(self.qs.repeat(n, **kwargs))
173
+
174
+ def normalized(self):
175
+ return Quaternions(self.qs / self.lengths[...,np.newaxis])
176
+
177
+ def log(self):
178
+ norm = abs(self.normalized())
179
+ imgs = norm.imaginaries
180
+ lens = np.sqrt(np.sum(imgs**2, axis=-1))
181
+ lens = np.arctan2(lens, norm.reals) / (lens + 1e-10)
182
+ return imgs * lens[...,np.newaxis]
183
+
184
+ def constrained(self, axis):
185
+
186
+ rl = self.reals
187
+ im = np.sum(axis * self.imaginaries, axis=-1)
188
+
189
+ t1 = -2 * np.arctan2(rl, im) + np.pi
190
+ t2 = -2 * np.arctan2(rl, im) - np.pi
191
+
192
+ top = Quaternions.exp(axis[np.newaxis] * (t1[:,np.newaxis] / 2.0))
193
+ bot = Quaternions.exp(axis[np.newaxis] * (t2[:,np.newaxis] / 2.0))
194
+ img = self.dot(top) > self.dot(bot)
195
+
196
+ ret = top.copy()
197
+ ret[ img] = top[ img]
198
+ ret[~img] = bot[~img]
199
+ return ret
200
+
201
+ def constrained_x(self): return self.constrained(np.array([1,0,0]))
202
+ def constrained_y(self): return self.constrained(np.array([0,1,0]))
203
+ def constrained_z(self): return self.constrained(np.array([0,0,1]))
204
+
205
+ def dot(self, q): return np.sum(self.qs * q.qs, axis=-1)
206
+
207
+ def copy(self): return Quaternions(np.copy(self.qs))
208
+
209
+ def reshape(self, s):
210
+ self.qs = self.qs.reshape(s)
211
+ return self
212
+
213
+ def interpolate(self, ws):
214
+ return Quaternions.exp(np.average(abs(self).log(), axis=0, weights=ws))
215
+
216
+ def euler(self, order='xyz'):
217
+
218
+ q = self.normalized().qs
219
+ q0 = q[...,0]
220
+ q1 = q[...,1]
221
+ q2 = q[...,2]
222
+ q3 = q[...,3]
223
+ es = np.zeros(self.shape + (3,))
224
+
225
+ if order == 'xyz':
226
+ es[...,0] = np.arctan2(2 * (q0 * q1 + q2 * q3), 1 - 2 * (q1 * q1 + q2 * q2))
227
+ es[...,1] = np.arcsin((2 * (q0 * q2 - q3 * q1)).clip(-1,1))
228
+ es[...,2] = np.arctan2(2 * (q0 * q3 + q1 * q2), 1 - 2 * (q2 * q2 + q3 * q3))
229
+ elif order == 'yzx':
230
+ es[...,0] = np.arctan2(2 * (q1 * q0 - q2 * q3), -q1 * q1 + q2 * q2 - q3 * q3 + q0 * q0)
231
+ es[...,1] = np.arctan2(2 * (q2 * q0 - q1 * q3), q1 * q1 - q2 * q2 - q3 * q3 + q0 * q0)
232
+ es[...,2] = np.arcsin((2 * (q1 * q2 + q3 * q0)).clip(-1,1))
233
+ else:
234
+ raise NotImplementedError('Cannot convert from ordering %s' % order)
235
+
236
+ """
237
+
238
+ # These conversion don't appear to work correctly for Maya.
239
+ # http://bediyap.com/programming/convert-quaternion-to-euler-rotations/
240
+
241
+ if order == 'xyz':
242
+ es[...,0] = np.arctan2(2 * (q0 * q3 - q1 * q2), q0 * q0 + q1 * q1 - q2 * q2 - q3 * q3)
243
+ es[...,1] = np.arcsin((2 * (q1 * q3 + q0 * q2)).clip(-1,1))
244
+ es[...,2] = np.arctan2(2 * (q0 * q1 - q2 * q3), q0 * q0 - q1 * q1 - q2 * q2 + q3 * q3)
245
+ elif order == 'yzx':
246
+ es[...,0] = np.arctan2(2 * (q0 * q1 - q2 * q3), q0 * q0 - q1 * q1 + q2 * q2 - q3 * q3)
247
+ es[...,1] = np.arcsin((2 * (q1 * q2 + q0 * q3)).clip(-1,1))
248
+ es[...,2] = np.arctan2(2 * (q0 * q2 - q1 * q3), q0 * q0 + q1 * q1 - q2 * q2 - q3 * q3)
249
+ elif order == 'zxy':
250
+ es[...,0] = np.arctan2(2 * (q0 * q2 - q1 * q3), q0 * q0 - q1 * q1 - q2 * q2 + q3 * q3)
251
+ es[...,1] = np.arcsin((2 * (q0 * q1 + q2 * q3)).clip(-1,1))
252
+ es[...,2] = np.arctan2(2 * (q0 * q3 - q1 * q2), q0 * q0 - q1 * q1 + q2 * q2 - q3 * q3)
253
+ elif order == 'xzy':
254
+ es[...,0] = np.arctan2(2 * (q0 * q2 + q1 * q3), q0 * q0 + q1 * q1 - q2 * q2 - q3 * q3)
255
+ es[...,1] = np.arcsin((2 * (q0 * q3 - q1 * q2)).clip(-1,1))
256
+ es[...,2] = np.arctan2(2 * (q0 * q1 + q2 * q3), q0 * q0 - q1 * q1 + q2 * q2 - q3 * q3)
257
+ elif order == 'yxz':
258
+ es[...,0] = np.arctan2(2 * (q1 * q2 + q0 * q3), q0 * q0 - q1 * q1 + q2 * q2 - q3 * q3)
259
+ es[...,1] = np.arcsin((2 * (q0 * q1 - q2 * q3)).clip(-1,1))
260
+ es[...,2] = np.arctan2(2 * (q1 * q3 + q0 * q2), q0 * q0 - q1 * q1 - q2 * q2 + q3 * q3)
261
+ elif order == 'zyx':
262
+ es[...,0] = np.arctan2(2 * (q0 * q1 + q2 * q3), q0 * q0 - q1 * q1 - q2 * q2 + q3 * q3)
263
+ es[...,1] = np.arcsin((2 * (q0 * q2 - q1 * q3)).clip(-1,1))
264
+ es[...,2] = np.arctan2(2 * (q0 * q3 + q1 * q2), q0 * q0 + q1 * q1 - q2 * q2 - q3 * q3)
265
+ else:
266
+ raise KeyError('Unknown ordering %s' % order)
267
+
268
+ """
269
+
270
+ # https://github.com/ehsan/ogre/blob/master/OgreMain/src/OgreMatrix3.cpp
271
+ # Use this class and convert from matrix
272
+
273
+ return es
274
+
275
+
276
+ def average(self):
277
+
278
+ if len(self.shape) == 1:
279
+
280
+ import numpy.core.umath_tests as ut
281
+ system = ut.matrix_multiply(self.qs[:,:,np.newaxis], self.qs[:,np.newaxis,:]).sum(axis=0)
282
+ w, v = np.linalg.eigh(system)
283
+ qiT_dot_qref = (self.qs[:,:,np.newaxis] * v[np.newaxis,:,:]).sum(axis=1)
284
+ return Quaternions(v[:,np.argmin((1.-qiT_dot_qref**2).sum(axis=0))])
285
+
286
+ else:
287
+
288
+ raise NotImplementedError('Cannot average multi-dimensionsal Quaternions')
289
+
290
+ def angle_axis(self):
291
+
292
+ norm = self.normalized()
293
+ s = np.sqrt(1 - (norm.reals**2.0))
294
+ s[s == 0] = 0.001
295
+
296
+ angles = 2.0 * np.arccos(norm.reals)
297
+ axis = norm.imaginaries / s[...,np.newaxis]
298
+
299
+ return angles, axis
300
+
301
+
302
+ def transforms(self):
303
+
304
+ qw = self.qs[...,0]
305
+ qx = self.qs[...,1]
306
+ qy = self.qs[...,2]
307
+ qz = self.qs[...,3]
308
+
309
+ x2 = qx + qx; y2 = qy + qy; z2 = qz + qz;
310
+ xx = qx * x2; yy = qy * y2; wx = qw * x2;
311
+ xy = qx * y2; yz = qy * z2; wy = qw * y2;
312
+ xz = qx * z2; zz = qz * z2; wz = qw * z2;
313
+
314
+ m = np.empty(self.shape + (3,3))
315
+ m[...,0,0] = 1.0 - (yy + zz)
316
+ m[...,0,1] = xy - wz
317
+ m[...,0,2] = xz + wy
318
+ m[...,1,0] = xy + wz
319
+ m[...,1,1] = 1.0 - (xx + zz)
320
+ m[...,1,2] = yz - wx
321
+ m[...,2,0] = xz - wy
322
+ m[...,2,1] = yz + wx
323
+ m[...,2,2] = 1.0 - (xx + yy)
324
+
325
+ return m
326
+
327
+ def ravel(self):
328
+ return self.qs.ravel()
329
+
330
+ @classmethod
331
+ def id(cls, n):
332
+
333
+ if isinstance(n, tuple):
334
+ qs = np.zeros(n + (4,))
335
+ qs[...,0] = 1.0
336
+ return Quaternions(qs)
337
+
338
+ if isinstance(n, int): # Python 3 has no separate 'long' type
339
+ qs = np.zeros((n,4))
340
+ qs[:,0] = 1.0
341
+ return Quaternions(qs)
342
+
343
+ raise TypeError('Cannot Construct Quaternion from %s type' % str(type(n)))
344
+
345
+ @classmethod
346
+ def id_like(cls, a):
347
+ qs = np.zeros(a.shape + (4,))
348
+ qs[...,0] = 1.0
349
+ return Quaternions(qs)
350
+
351
+ @classmethod
352
+ def exp(cls, ws):
353
+
354
+ ts = np.sum(ws**2.0, axis=-1)**0.5
355
+ ts[ts == 0] = 0.001
356
+ ls = np.sin(ts) / ts
357
+
358
+ qs = np.empty(ws.shape[:-1] + (4,))
359
+ qs[...,0] = np.cos(ts)
360
+ qs[...,1] = ws[...,0] * ls
361
+ qs[...,2] = ws[...,1] * ls
362
+ qs[...,3] = ws[...,2] * ls
363
+
364
+ return Quaternions(qs).normalized()
365
+
366
+ @classmethod
367
+ def slerp(cls, q0s, q1s, a):
368
+
369
+ fst, snd = cls._broadcast(q0s.qs, q1s.qs)
370
+ fst, a = cls._broadcast(fst, a, scalar=True)
371
+ snd, a = cls._broadcast(snd, a, scalar=True)
372
+
373
+ len = np.sum(fst * snd, axis=-1)
374
+
375
+ neg = len < 0.0
376
+ len[neg] = -len[neg]
377
+ snd[neg] = -snd[neg]
378
+
379
+ amount0 = np.zeros(a.shape)
380
+ amount1 = np.zeros(a.shape)
381
+
382
+ linear = (1.0 - len) < 0.01
383
+ omegas = np.arccos(len[~linear])
384
+ sinoms = np.sin(omegas)
385
+
386
+ amount0[ linear] = 1.0 - a[linear]
387
+ amount1[ linear] = a[linear]
388
+ amount0[~linear] = np.sin((1.0 - a[~linear]) * omegas) / sinoms
389
+ amount1[~linear] = np.sin( a[~linear] * omegas) / sinoms
390
+
391
+ return Quaternions(
392
+ amount0[...,np.newaxis] * fst +
393
+ amount1[...,np.newaxis] * snd)
394
+
395
+ @classmethod
396
+ def between(cls, v0s, v1s):
397
+ a = np.cross(v0s, v1s)
398
+ w = np.sqrt((v0s**2).sum(axis=-1) * (v1s**2).sum(axis=-1)) + (v0s * v1s).sum(axis=-1)
399
+ return Quaternions(np.concatenate([w[...,np.newaxis], a], axis=-1)).normalized()
400
+
401
+ @classmethod
402
+ def from_angle_axis(cls, angles, axis):
403
+ axis = axis / (np.sqrt(np.sum(axis**2, axis=-1)) + 1e-10)[...,np.newaxis]
404
+ sines = np.sin(angles / 2.0)[...,np.newaxis]
405
+ cosines = np.cos(angles / 2.0)[...,np.newaxis]
406
+ return Quaternions(np.concatenate([cosines, axis * sines], axis=-1))
407
+
408
+ @classmethod
409
+ def from_euler(cls, es, order='xyz', world=False):
410
+
411
+ axis = {
412
+ 'x' : np.array([1,0,0]),
413
+ 'y' : np.array([0,1,0]),
414
+ 'z' : np.array([0,0,1]),
415
+ }
416
+
417
+ q0s = Quaternions.from_angle_axis(es[...,0], axis[order[0]])
418
+ q1s = Quaternions.from_angle_axis(es[...,1], axis[order[1]])
419
+ q2s = Quaternions.from_angle_axis(es[...,2], axis[order[2]])
420
+
421
+ return (q2s * (q1s * q0s)) if world else (q0s * (q1s * q2s))
422
+
423
+ @classmethod
424
+ def from_transforms(cls, ts):
425
+
426
+ d0, d1, d2 = ts[...,0,0], ts[...,1,1], ts[...,2,2]
427
+
428
+ q0 = ( d0 + d1 + d2 + 1.0) / 4.0
429
+ q1 = ( d0 - d1 - d2 + 1.0) / 4.0
430
+ q2 = (-d0 + d1 - d2 + 1.0) / 4.0
431
+ q3 = (-d0 - d1 + d2 + 1.0) / 4.0
432
+
433
+ q0 = np.sqrt(q0.clip(0,None))
434
+ q1 = np.sqrt(q1.clip(0,None))
435
+ q2 = np.sqrt(q2.clip(0,None))
436
+ q3 = np.sqrt(q3.clip(0,None))
437
+
438
+ c0 = (q0 >= q1) & (q0 >= q2) & (q0 >= q3)
439
+ c1 = (q1 >= q0) & (q1 >= q2) & (q1 >= q3)
440
+ c2 = (q2 >= q0) & (q2 >= q1) & (q2 >= q3)
441
+ c3 = (q3 >= q0) & (q3 >= q1) & (q3 >= q2)
442
+
443
+ q1[c0] *= np.sign(ts[c0,2,1] - ts[c0,1,2])
444
+ q2[c0] *= np.sign(ts[c0,0,2] - ts[c0,2,0])
445
+ q3[c0] *= np.sign(ts[c0,1,0] - ts[c0,0,1])
446
+
447
+ q0[c1] *= np.sign(ts[c1,2,1] - ts[c1,1,2])
448
+ q2[c1] *= np.sign(ts[c1,1,0] + ts[c1,0,1])
449
+ q3[c1] *= np.sign(ts[c1,0,2] + ts[c1,2,0])
450
+
451
+ q0[c2] *= np.sign(ts[c2,0,2] - ts[c2,2,0])
452
+ q1[c2] *= np.sign(ts[c2,1,0] + ts[c2,0,1])
453
+ q3[c2] *= np.sign(ts[c2,2,1] + ts[c2,1,2])
454
+
455
+ q0[c3] *= np.sign(ts[c3,1,0] - ts[c3,0,1])
456
+ q1[c3] *= np.sign(ts[c3,2,0] + ts[c3,0,2])
457
+ q2[c3] *= np.sign(ts[c3,2,1] + ts[c3,1,2])
458
+
459
+ qs = np.empty(ts.shape[:-2] + (4,))
460
+ qs[...,0] = q0
461
+ qs[...,1] = q1
462
+ qs[...,2] = q2
463
+ qs[...,3] = q3
464
+
465
+ return cls(qs)
466
+
467
+
468
+
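A short sketch of the three multiplication modes described in `__mul__` (quaternion composition, vector rotation, and slerp-based scaling); it assumes the repository root is on `PYTHONPATH`.

import numpy as np
from dataloaders.pymo.Quaternions import Quaternions

eulers = np.radians([[90.0,  0.0, 0.0],
                     [ 0.0, 45.0, 0.0]])
q = Quaternions.from_euler(eulers, order='xyz', world=False)

print((q * q).euler())                       # Quaternions * Quaternions: composed rotation
print(q * np.array([[0.0, 1.0, 0.0],
                    [0.0, 1.0, 0.0]]))       # Quaternions * 3-vectors: rotate the vectors
print((q * 0.5).euler())                     # Quaternions * scalar: slerp towards identity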
dataloaders/pymo/__init__.py ADDED
File without changes
dataloaders/pymo/__pycache__/Quaternions.cpython-312.pyc ADDED
Binary file (28.3 kB). View file
 
dataloaders/pymo/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (173 Bytes). View file
 
dataloaders/pymo/__pycache__/data.cpython-312.pyc ADDED
Binary file (3.34 kB). View file
 
dataloaders/pymo/__pycache__/parsers.cpython-312.pyc ADDED
Binary file (13.1 kB). View file
 
dataloaders/pymo/__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (33.8 kB). View file
 
dataloaders/pymo/__pycache__/rotation_tools.cpython-312.pyc ADDED
Binary file (8.27 kB). View file
 
dataloaders/pymo/__pycache__/viz_tools.cpython-312.pyc ADDED
Binary file (10.9 kB). View file
 
dataloaders/pymo/data.py ADDED
@@ -0,0 +1,53 @@
1
+ import numpy as np
2
+
3
+ class Joint():
4
+ def __init__(self, name, parent=None, children=None):
5
+ self.name = name
6
+ self.parent = parent
7
+ self.children = children
8
+
9
+ class MocapData():
10
+ def __init__(self):
11
+ self.skeleton = {}
12
+ self.values = None
13
+ self.channel_names = []
14
+ self.framerate = 0.0
15
+ self.root_name = ''
16
+
17
+ def traverse(self, j=None):
18
+ stack = [self.root_name]
19
+ while stack:
20
+ joint = stack.pop()
21
+ yield joint
22
+ for c in self.skeleton[joint]['children']:
23
+ stack.append(c)
24
+
25
+ def clone(self):
26
+ import copy
27
+ new_data = MocapData()
28
+ new_data.skeleton = copy.copy(self.skeleton)
29
+ new_data.values = copy.copy(self.values)
30
+ new_data.channel_names = copy.copy(self.channel_names)
31
+ new_data.root_name = copy.copy(self.root_name)
32
+ new_data.framerate = copy.copy(self.framerate)
33
+ return new_data
34
+
35
+ def get_all_channels(self):
36
+ '''Returns all of the channels parsed from the file as a 2D numpy array'''
37
+
38
+ frames = [f[1] for f in self.values]
39
+ return np.asarray([[channel[2] for channel in frame] for frame in frames])
40
+
41
+ def get_skeleton_tree(self):
42
+ tree = []
43
+ root_key = [j for j in self.skeleton if self.skeleton[j]['parent']==None][0]
44
+
45
+ root_joint = Joint(root_key)
46
+
47
+ def get_empty_channels(self):
48
+ #TODO
49
+ pass
50
+
51
+ def get_constant_channels(self):
52
+ #TODO
53
+ pass
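For reference, a tiny sketch of how `MocapData.traverse()` walks the skeleton dictionary produced by the BVH parser; the two-joint skeleton is invented for illustration.

from dataloaders.pymo.data import MocapData

m = MocapData()
m.root_name = 'Hips'
m.skeleton = {
    'Hips':  {'parent': None,   'channels': [], 'offsets': [0, 0, 0],  'order': 'ZXY', 'children': ['Spine']},
    'Spine': {'parent': 'Hips', 'channels': [], 'offsets': [0, 10, 0], 'order': 'ZXY', 'children': []},
}
print(list(m.traverse()))   # ['Hips', 'Spine'] -- depth-first walk from the root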
dataloaders/pymo/features.py ADDED
@@ -0,0 +1,43 @@
1
+ '''
2
+ A set of mocap feature extraction functions
3
+
4
+ Created by Omid Alemi | Nov 17 2017
5
+
6
+ '''
7
+ import numpy as np
8
+ import pandas as pd
9
+ import peakutils
10
+ import matplotlib.pyplot as plt
11
+
12
+ def get_foot_contact_idxs(signal, t=0.02, min_dist=120):
13
+ up_idxs = peakutils.indexes(signal, thres=t/max(signal), min_dist=min_dist)
14
+ down_idxs = peakutils.indexes(-signal, thres=t/min(signal), min_dist=min_dist)
15
+
16
+ return [up_idxs, down_idxs]
17
+
18
+
19
+ def create_foot_contact_signal(mocap_track, col_name, start=1, t=0.02, min_dist=120):
20
+ signal = mocap_track.values[col_name].values
21
+ idxs = get_foot_contact_idxs(signal, t, min_dist)
22
+
23
+ step_signal = []
24
+
25
+ c = start
26
+ for f in range(len(signal)):
27
+ if f in idxs[1]:
28
+ c = 0
29
+ elif f in idxs[0]:
30
+ c = 1
31
+
32
+ step_signal.append(c)
33
+
34
+ return step_signal
35
+
36
+ def plot_foot_up_down(mocap_track, col_name, t=0.02, min_dist=120):
37
+
38
+ signal = mocap_track.values[col_name].values
39
+ idxs = get_foot_contact_idxs(signal, t, min_dist)
40
+
41
+ plt.plot(mocap_track.values.index, signal)
42
+ plt.plot(mocap_track.values.index[idxs[0]], signal[idxs[0]], 'ro')
43
+ plt.plot(mocap_track.values.index[idxs[1]], signal[idxs[1]], 'go')
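A hedged sketch of `get_foot_contact_idxs` on a synthetic heel-height curve; it only illustrates how the up/down indices are produced, not real BEAT data, and assumes `peakutils` and `matplotlib` are installed since they are imported at module level.

import numpy as np
from dataloaders.pymo.features import get_foot_contact_idxs

frames = np.arange(1200)
heel_y = np.sin(2 * np.pi * frames / 300.0)     # fake periodic foot lift, period 300 frames

up_idxs, down_idxs = get_foot_contact_idxs(heel_y, t=0.02, min_dist=120)
print(up_idxs)     # frames where the foot is highest
print(down_idxs)   # frames where the foot is lowest (treated as contact)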
dataloaders/pymo/parsers.py ADDED
@@ -0,0 +1,274 @@
1
+ '''
2
+ BVH Parser Class
3
+
4
+ By Omid Alemi
5
+ Created: June 12, 2017
6
+
7
+ Based on: https://gist.github.com/johnfredcee/2007503
8
+
9
+ '''
10
+ import re
11
+ from unicodedata import name
12
+ import numpy as np
13
+ from .data import Joint, MocapData
14
+
15
+ class BVHScanner():
16
+ '''
17
+ A wrapper class for re.Scanner
18
+ '''
19
+ def __init__(self):
20
+
21
+ def identifier(scanner, token):
22
+ return 'IDENT', token
23
+
24
+ def operator(scanner, token):
25
+ return 'OPERATOR', token
26
+
27
+ def digit(scanner, token):
28
+ return 'DIGIT', token
29
+
30
+ def open_brace(scanner, token):
31
+ return 'OPEN_BRACE', token
32
+
33
+ def close_brace(scanner, token):
34
+ return 'CLOSE_BRACE', token
35
+
36
+ self.scanner = re.Scanner([
37
+ (r'[a-zA-Z_]\w*', identifier),
38
+ #(r'-*[0-9]+(\.[0-9]+)?', digit), # won't work for .34
39
+ #(r'[-+]?[0-9]*\.?[0-9]+', digit), # won't work for 4.56e-2
40
+ #(r'[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?', digit),
41
+ (r'-*[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?', digit),
42
+ (r'}', close_brace),
43
+ (r'}', close_brace),
44
+ (r'{', open_brace),
45
+ (r':', None),
46
+ (r'\s+', None)
47
+ ])
48
+
49
+ def scan(self, stuff):
50
+ return self.scanner.scan(stuff)
51
+
52
+
53
+
54
+ class BVHParser():
55
+ '''
56
+ A class to parse a BVH file.
57
+
58
+ Extracts the skeleton and channel values
59
+ '''
60
+ def __init__(self, filename=None):
61
+ self.reset()
62
+
63
+ def reset(self):
64
+ self._skeleton = {}
65
+ self.bone_context = []
66
+ self._motion_channels = []
67
+ self._motions = []
68
+ self.current_token = 0
69
+ self.framerate = 0.0
70
+ self.root_name = ''
71
+
72
+ self.scanner = BVHScanner()
73
+
74
+ self.data = MocapData()
75
+
76
+
77
+ def parse(self, filename, start=0, stop=-1):
78
+ self.reset()
79
+ self.correct_row_num = 0
80
+ with open(filename, 'r') as f:
81
+ for line in f.readlines():
82
+ self.correct_row_num += 1
83
+
84
+ with open(filename, 'r') as bvh_file:
85
+ raw_contents = bvh_file.read()
86
+ tokens, remainder = self.scanner.scan(raw_contents)
87
+
88
+ self._parse_hierarchy(tokens)
89
+ self.current_token = self.current_token + 1
90
+ self._parse_motion(tokens, start, stop)
91
+
92
+ self.data.skeleton = self._skeleton
93
+ self.data.channel_names = self._motion_channels
94
+ self.data.values = self._to_DataFrame()
95
+ self.data.root_name = self.root_name
96
+ self.data.framerate = self.framerate
97
+
98
+ return self.data
99
+
100
+ def _to_DataFrame(self):
101
+ '''Returns all of the channels parsed from the file as a pandas DataFrame'''
102
+
103
+ import pandas as pd
104
+ time_index = pd.to_timedelta([f[0] for f in self._motions], unit='s')
105
+ frames = [f[1] for f in self._motions]
106
+ channels = np.asarray([[channel[2] for channel in frame] for frame in frames])
107
+ column_names = ['%s_%s'%(c[0], c[1]) for c in self._motion_channels]
108
+
109
+ return pd.DataFrame(data=channels, index=time_index, columns=column_names)
110
+
111
+
112
+ def _new_bone(self, parent, name):
113
+ bone = {'parent': parent, 'channels': [], 'offsets': [], 'order': '','children': []}
114
+ return bone
115
+
116
+ def _push_bone_context(self,name):
117
+ self.bone_context.append(name)
118
+
119
+ def _get_bone_context(self):
120
+ return self.bone_context[len(self.bone_context)-1]
121
+
122
+ def _pop_bone_context(self):
123
+ self.bone_context = self.bone_context[:-1]
124
+ return self.bone_context[len(self.bone_context)-1]
125
+
126
+ def _read_offset(self, bvh, token_index):
127
+ if bvh[token_index] != ('IDENT', 'OFFSET'):
128
+ return None, None
129
+ token_index = token_index + 1
130
+ offsets = [0.0] * 3
131
+ for i in range(3):
132
+ offsets[i] = float(bvh[token_index][1])
133
+ token_index = token_index + 1
134
+ return offsets, token_index
135
+
136
+ def _read_channels(self, bvh, token_index):
137
+ if bvh[token_index] != ('IDENT', 'CHANNELS'):
138
+ return None, None
139
+ token_index = token_index + 1
140
+ channel_count = int(bvh[token_index][1])
141
+ token_index = token_index + 1
142
+ channels = [""] * channel_count
143
+ order = ""
144
+ for i in range(channel_count):
145
+ channels[i] = bvh[token_index][1]
146
+ token_index = token_index + 1
147
+ if(channels[i] == "Xrotation" or channels[i]== "Yrotation" or channels[i]== "Zrotation"):
148
+ order += channels[i][0]
149
+ else :
150
+ order = ""
151
+ return channels, token_index, order
152
+
153
+ def _parse_joint(self, bvh, token_index):
154
+ end_site = False
155
+ joint_id = bvh[token_index][1]
156
+ token_index = token_index + 1
157
+ joint_name = bvh[token_index][1]
158
+ token_index = token_index + 1
159
+
160
+ parent_name = self._get_bone_context()
161
+
162
+ if (joint_id == "End"):
163
+ joint_name = parent_name+ '_Nub'
164
+ end_site = True
165
+ joint = self._new_bone(parent_name, joint_name)
166
+ if bvh[token_index][0] != 'OPEN_BRACE':
167
+ print('Was expecting brace, got ', bvh[token_index])
168
+ return None
169
+ token_index = token_index + 1
170
+ offsets, token_index = self._read_offset(bvh, token_index)
171
+ joint['offsets'] = offsets
172
+ if not end_site:
173
+ channels, token_index, order = self._read_channels(bvh, token_index)
174
+ joint['channels'] = channels
175
+ joint['order'] = order
176
+ for channel in channels:
177
+ self._motion_channels.append((joint_name, channel))
178
+
179
+ self._skeleton[joint_name] = joint
180
+ self._skeleton[parent_name]['children'].append(joint_name)
181
+
182
+ while (bvh[token_index][0] == 'IDENT' and bvh[token_index][1] == 'JOINT') or (bvh[token_index][0] == 'IDENT' and bvh[token_index][1] == 'End'):
183
+ self._push_bone_context(joint_name)
184
+ token_index = self._parse_joint(bvh, token_index)
185
+ self._pop_bone_context()
186
+
187
+ if bvh[token_index][0] == 'CLOSE_BRACE':
188
+ return token_index + 1
189
+
190
+ print('Unexpected token ', bvh[token_index])
191
+
192
+ def _parse_hierarchy(self, bvh):
193
+ self.current_token = 0
194
+ if bvh[self.current_token] != ('IDENT', 'HIERARCHY'):
195
+ return None
196
+ self.current_token = self.current_token + 1
197
+ if bvh[self.current_token] != ('IDENT', 'ROOT'):
198
+ return None
199
+ self.current_token = self.current_token + 1
200
+ if bvh[self.current_token][0] != 'IDENT':
201
+ return None
202
+
203
+ root_name = bvh[self.current_token][1]
204
+ root_bone = self._new_bone(None, root_name)
205
+ self.current_token = self.current_token + 2 #skipping open brace
206
+ offsets, self.current_token = self._read_offset(bvh, self.current_token)
207
+ channels, self.current_token, order = self._read_channels(bvh, self.current_token)
208
+ root_bone['offsets'] = offsets
209
+ root_bone['channels'] = channels
210
+ root_bone['order'] = order
211
+ self._skeleton[root_name] = root_bone
212
+ self._push_bone_context(root_name)
213
+
214
+ for channel in channels:
215
+ self._motion_channels.append((root_name, channel))
216
+
217
+ while bvh[self.current_token][1] == 'JOINT':
218
+ self.current_token = self._parse_joint(bvh, self.current_token)
219
+
220
+ self.root_name = root_name
221
+
222
+ def _parse_motion(self, bvh, start, stop):
223
+ if bvh[self.current_token][0] != 'IDENT':
224
+ print('Unexpected text')
225
+ return None
226
+ if bvh[self.current_token][1] != 'MOTION':
227
+ print('No motion section')
228
+ return None
229
+ self.current_token = self.current_token + 1
230
+ if bvh[self.current_token][1] != 'Frames':
231
+ return None
232
+ self.current_token = self.current_token + 1
233
+ frame_count = int(bvh[self.current_token][1])
234
+
235
+ if stop<0 or stop>frame_count:
236
+ stop = min(frame_count, self.correct_row_num-431)
237
+
238
+ assert(start>=0)
239
+ assert(start<stop)
240
+
241
+ self.current_token = self.current_token + 1
242
+ if bvh[self.current_token][1] != 'Frame':
243
+ return None
244
+ self.current_token = self.current_token + 1
245
+ if bvh[self.current_token][1] != 'Time':
246
+ return None
247
+ self.current_token = self.current_token + 1
248
+ frame_rate = float(bvh[self.current_token][1])
249
+
250
+ self.framerate = frame_rate
251
+
252
+ self.current_token = self.current_token + 1
253
+
254
+ frame_time = 0.0
255
+ self._motions = [()] * (stop-start)
256
+ idx=0
257
+ for i in range(stop):
258
+ #print(i)
259
+ channel_values = []
260
+
261
+ for channel in self._motion_channels:
262
+ #print(channel)
263
+ channel_values.append((channel[0], channel[1], float(bvh[self.current_token][1])))
264
+ self.current_token = self.current_token + 1
265
+
266
+ if i>=start:
267
+ self._motions[idx] = (frame_time, channel_values)
268
+ frame_time = frame_time + frame_rate
269
+ idx+=1
270
+
271
+
272
+ if __name__ == "__main__":
273
+ p = BVHParser()
274
+ data = [p.parse("../../../datasets/beat_full/2/2_scott_0_1_1.bvh")]
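The `__main__` block above already shows a bare parse call; the sketch below just illustrates what the returned `MocapData` exposes. The .bvh path is a placeholder.

from dataloaders.pymo.parsers import BVHParser

mocap = BVHParser().parse("path/to/2_scott_0_1_1.bvh")   # placeholder path

print(mocap.root_name, mocap.framerate)      # root joint name and the parsed frame time (s/frame)
print(len(mocap.skeleton))                   # joints with their offsets / channels / children
print(mocap.values.shape)                    # (n_frames, n_channels) pandas DataFrame
print(list(mocap.values.columns[:6]))        # e.g. '<root>_Xposition', '<root>_Yposition', ...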
dataloaders/pymo/preprocessing.py ADDED
@@ -0,0 +1,726 @@
1
+ '''
2
+ Preprocessing Transformers based on scikit-learn's API
3
+
4
+ By Omid Alemi
5
+ Created on June 12, 2017
6
+ '''
7
+ import copy
8
+ import pandas as pd
9
+ import numpy as np
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from .Quaternions import Quaternions
12
+ from .rotation_tools import Rotation
13
+
14
+ class MocapParameterizer(BaseEstimator, TransformerMixin):
15
+ def __init__(self, param_type = 'euler'):
16
+ '''
17
+
18
+ param_type = {'euler', 'quat', 'expmap', 'position'}
19
+ '''
20
+ self.param_type = param_type
21
+
22
+ def fit(self, X, y=None):
23
+ return self
24
+
25
+ def transform(self, X, y=None):
26
+ if self.param_type == 'euler':
27
+ return X
28
+ elif self.param_type == 'expmap':
29
+ return self._to_expmap(X)
30
+ elif self.param_type == 'quat':
31
+ return X
32
+ elif self.param_type == 'position':
33
+ return self._to_pos(X)
34
+ else:
35
+ raise UnsupportedParamError('Unsupported param: %s. Valid param types are: euler, quat, expmap, position' % self.param_type)
36
+ # return X
37
+
38
+ def inverse_transform(self, X, copy=None):
39
+ if self.param_type == 'euler':
40
+ return X
41
+ elif self.param_type == 'expmap':
42
+ return self._expmap_to_euler(X)
43
+ elif self.param_type == 'quat':
44
+ raise UnsupportedParamError('quat2euler is not supported')
45
+ elif self.param_type == 'position':
46
+ print('positions 2 eulers is not supported')
47
+ return X
48
+ else:
49
+ raise UnsupportedParamError('Unsupported param: %s. Valid param types are: euler, quat, expmap, position' % self.param_type)
50
+
51
+ def _to_pos(self, X):
52
+ '''Converts joints rotations in Euler angles to joint positions'''
53
+
54
+ Q = []
55
+ for track in X:
56
+ channels = []
57
+ titles = []
58
+ euler_df = track.values
59
+
60
+ # Create a new DataFrame to store the joint positions
61
+ pos_df = pd.DataFrame(index=euler_df.index)
62
+
63
+ # Copy the root rotations into the new DataFrame
64
+ # rxp = '%s_Xrotation'%track.root_name
65
+ # ryp = '%s_Yrotation'%track.root_name
66
+ # rzp = '%s_Zrotation'%track.root_name
67
+ # pos_df[rxp] = pd.Series(data=euler_df[rxp], index=pos_df.index)
68
+ # pos_df[ryp] = pd.Series(data=euler_df[ryp], index=pos_df.index)
69
+ # pos_df[rzp] = pd.Series(data=euler_df[rzp], index=pos_df.index)
70
+
71
+ # List the columns that contain rotation channels
72
+ rot_cols = [c for c in euler_df.columns if ('rotation' in c)]
73
+
74
+ # List the columns that contain position channels
75
+ pos_cols = [c for c in euler_df.columns if ('position' in c)]
76
+
77
+ # List the joints that are not end sites, i.e., have channels
78
+ joints = (joint for joint in track.skeleton)
79
+
80
+ tree_data = {}
81
+
82
+ for joint in track.traverse():
83
+ parent = track.skeleton[joint]['parent']
84
+ rot_order = track.skeleton[joint]['order']
85
+ #print("rot_order:" + joint + " :" + rot_order)
86
+
87
+ # Get the rotation columns that belong to this joint
88
+ rc = euler_df[[c for c in rot_cols if joint in c]]
89
+
90
+ # Get the position columns that belong to this joint
91
+ pc = euler_df[[c for c in pos_cols if joint in c]]
92
+
93
+ # Make sure the columns are organized in xyz order
94
+ if rc.shape[1] < 3:
95
+ euler_values = np.zeros((euler_df.shape[0], 3))
96
+ rot_order = "XYZ"
97
+ else:
98
+ euler_values = np.pi/180.0*np.transpose(np.array([track.values['%s_%srotation'%(joint, rot_order[0])], track.values['%s_%srotation'%(joint, rot_order[1])], track.values['%s_%srotation'%(joint, rot_order[2])]]))
99
+
100
+ if pc.shape[1] < 3:
101
+ pos_values = np.asarray([[0,0,0] for f in pc.iterrows()])
102
+ else:
103
+ pos_values =np.asarray([[f[1]['%s_Xposition'%joint],
104
+ f[1]['%s_Yposition'%joint],
105
+ f[1]['%s_Zposition'%joint]] for f in pc.iterrows()])
106
+
107
+ quats = Quaternions.from_euler(np.asarray(euler_values), order=rot_order.lower(), world=False)
108
+
109
+ tree_data[joint]=[
110
+ [], # to store the rotation matrix
111
+ [] # to store the calculated position
112
+ ]
113
+ if track.root_name == joint:
114
+ tree_data[joint][0] = quats#rotmats
115
+ # tree_data[joint][1] = np.add(pos_values, track.skeleton[joint]['offsets'])
116
+ tree_data[joint][1] = pos_values
117
+ else:
118
+ # for every frame i, multiply this joint's rotmat to the rotmat of its parent
119
+ tree_data[joint][0] = tree_data[parent][0]*quats# np.matmul(rotmats, tree_data[parent][0])
120
+
121
+ # add the position channel to the offset and store it in k, for every frame i
122
+ k = pos_values + np.asarray(track.skeleton[joint]['offsets'])
123
+
124
+ # multiply k to the rotmat of the parent for every frame i
125
+ q = tree_data[parent][0]*k #np.matmul(k.reshape(k.shape[0],1,3), tree_data[parent][0])
126
+
127
+ # add q to the position of the parent, for every frame i
128
+ tree_data[joint][1] = tree_data[parent][1] + q #q.reshape(k.shape[0],3) + tree_data[parent][1]
129
+
130
+ # Create the corresponding columns in the new DataFrame
131
+ pos_df['%s_Xposition'%joint] = pd.Series(data=[e[0] for e in tree_data[joint][1]], index=pos_df.index)
132
+ pos_df['%s_Yposition'%joint] = pd.Series(data=[e[1] for e in tree_data[joint][1]], index=pos_df.index)
133
+ pos_df['%s_Zposition'%joint] = pd.Series(data=[e[2] for e in tree_data[joint][1]], index=pos_df.index)
134
+
135
+
136
+ new_track = track.clone()
137
+ new_track.values = pos_df
138
+ Q.append(new_track)
139
+ return Q
140
+
141
+
142
+ def _to_expmap(self, X):
143
+ '''Converts Euler angles to Exponential Maps'''
144
+
145
+ Q = []
146
+ for track in X:
147
+ channels = []
148
+ titles = []
149
+ euler_df = track.values
150
+
151
+ # Create a new DataFrame to store the exponential map rep
152
+ exp_df = pd.DataFrame(index=euler_df.index)
153
+
154
+ # Copy the root positions into the new DataFrame
155
+ rxp = '%s_Xposition'%track.root_name
156
+ ryp = '%s_Yposition'%track.root_name
157
+ rzp = '%s_Zposition'%track.root_name
158
+ exp_df[rxp] = pd.Series(data=euler_df[rxp], index=exp_df.index)
159
+ exp_df[ryp] = pd.Series(data=euler_df[ryp], index=exp_df.index)
160
+ exp_df[rzp] = pd.Series(data=euler_df[rzp], index=exp_df.index)
161
+
162
+ # List the columns that contain rotation channels
163
+ rots = [c for c in euler_df.columns if ('rotation' in c and 'Nub' not in c)]
164
+
165
+ # List the joints that are not end sites, i.e., have channels
166
+ joints = (joint for joint in track.skeleton if 'Nub' not in joint)
167
+
168
+ for joint in joints:
169
+ r = euler_df[[c for c in rots if joint in c]] # Get the columns that belong to this joint
170
+ euler = [[f[1]['%s_Xrotation'%joint], f[1]['%s_Yrotation'%joint], f[1]['%s_Zrotation'%joint]] for f in r.iterrows()] # Make sure the columns are organized in xyz order
171
+ exps = [Rotation(f, 'euler', from_deg=True).to_expmap() for f in euler] # Convert the eulers to exp maps
172
+
173
+ # Create the corresponding columns in the new DataFrame
174
+
175
+ exp_df['%s_alpha'%joint] = pd.Series(data=[e[0] for e in exps], index=exp_df.index)
176
+ exp_df['%s_beta'%joint] = pd.Series(data=[e[1] for e in exps], index=exp_df.index)
177
+ exp_df['%s_gamma'%joint] = pd.Series(data=[e[2] for e in exps], index=exp_df.index)
178
+
179
+ new_track = track.clone()
180
+ new_track.values = exp_df
181
+ Q.append(new_track)
182
+
183
+ return Q
184
+
185
+ def _expmap_to_euler(self, X):
186
+ Q = []
187
+ for track in X:
188
+ channels = []
189
+ titles = []
190
+ exp_df = track.values
191
+
192
+ # Create a new DataFrame to store the Euler angle rep
193
+ euler_df = pd.DataFrame(index=exp_df.index)
194
+
195
+ # Copy the root positions into the new DataFrame
196
+ rxp = '%s_Xposition'%track.root_name
197
+ ryp = '%s_Yposition'%track.root_name
198
+ rzp = '%s_Zposition'%track.root_name
199
+ euler_df[rxp] = pd.Series(data=exp_df[rxp], index=euler_df.index)
200
+ euler_df[ryp] = pd.Series(data=exp_df[ryp], index=euler_df.index)
201
+ euler_df[rzp] = pd.Series(data=exp_df[rzp], index=euler_df.index)
202
+
203
+ # List the columns that contain rotation channels
204
+ exp_params = [c for c in exp_df.columns if ( any(p in c for p in ['alpha', 'beta','gamma']) and 'Nub' not in c)]
205
+
206
+ # List the joints that are not end sites, i.e., have channels
207
+ joints = (joint for joint in track.skeleton if 'Nub' not in joint)
208
+
209
+ for joint in joints:
210
+ r = exp_df[[c for c in exp_params if joint in c]] # Get the columns that belong to this joint
211
+ expmap = [[f[1]['%s_alpha'%joint], f[1]['%s_beta'%joint], f[1]['%s_gamma'%joint]] for f in r.iterrows()] # Make sure the columns are organized in xyz order
212
+ euler_rots = [Rotation(f, 'expmap').to_euler(True)[0] for f in expmap] # Convert the exp maps to eulers
213
+
214
+ # Create the corresponding columns in the new DataFrame
215
+
216
+ euler_df['%s_Xrotation'%joint] = pd.Series(data=[e[0] for e in euler_rots], index=euler_df.index)
217
+ euler_df['%s_Yrotation'%joint] = pd.Series(data=[e[1] for e in euler_rots], index=euler_df.index)
218
+ euler_df['%s_Zrotation'%joint] = pd.Series(data=[e[2] for e in euler_rots], index=euler_df.index)
219
+
220
+ new_track = track.clone()
221
+ new_track.values = euler_df
222
+ Q.append(new_track)
223
+
224
+ return Q
225
+
226
+
227
+ class JointSelector(BaseEstimator, TransformerMixin):
228
+ '''
229
+ Allows for filtering the mocap data to include only the selected joints
230
+ '''
231
+ def __init__(self, joints, include_root=False):
232
+ self.joints = joints
233
+ self.include_root = include_root
234
+
235
+ def fit(self, X, y=None):
236
+ return self
237
+
238
+ def transform(self, X, y=None):
239
+ selected_joints = []
240
+ selected_channels = []
241
+
242
+ if self.include_root:
243
+ selected_joints.append(X[0].root_name)
244
+
245
+ selected_joints.extend(self.joints)
246
+
247
+ for joint_name in selected_joints:
248
+ selected_channels.extend([o for o in X[0].values.columns if joint_name in o])
249
+
250
+ Q = []
251
+
252
+
253
+ for track in X:
254
+ t2 = track.clone()
255
+
256
+ for key in track.skeleton.keys():
257
+ if key not in selected_joints:
258
+ t2.skeleton.pop(key)
259
+ t2.values = track.values[selected_channels]
260
+
261
+ Q.append(t2)
262
+
263
+
264
+ return Q
265
+
266
+
267
+ class Numpyfier(BaseEstimator, TransformerMixin):
268
+ '''
269
+ Just converts the values in a MocapData object into a numpy array
270
+ Useful for the final stage of a pipeline before training
271
+ '''
272
+ def __init__(self):
273
+ pass
274
+
275
+ def fit(self, X, y=None):
276
+ self.org_mocap_ = X[0].clone()
277
+ self.org_mocap_.values.drop(self.org_mocap_.values.index, inplace=True)
278
+
279
+ return self
280
+
281
+ def transform(self, X, y=None):
282
+ Q = []
283
+
284
+ for track in X:
285
+ Q.append(track.values.values)
286
+
287
+ return np.array(Q)
288
+
289
+ def inverse_transform(self, X, copy=None):
290
+ Q = []
291
+
292
+ for track in X:
293
+
294
+ new_mocap = self.org_mocap_.clone()
295
+ time_index = pd.to_timedelta([f for f in range(track.shape[0])], unit='s')
296
+
297
+ new_df = pd.DataFrame(data=track, index=time_index, columns=self.org_mocap_.values.columns)
298
+
299
+ new_mocap.values = new_df
300
+
301
+
302
+ Q.append(new_mocap)
303
+
304
+ return Q
305
+
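The transformers above follow the scikit-learn estimator API, so they can be chained; a hedged sketch (the joint names and the .bvh path are placeholders):

from sklearn.pipeline import Pipeline
from dataloaders.pymo.parsers import BVHParser
from dataloaders.pymo.preprocessing import MocapParameterizer, JointSelector, Numpyfier

parsed = [BVHParser().parse("path/to/clip.bvh")]         # placeholder path

pipe = Pipeline([
    ('param', MocapParameterizer('position')),           # Euler channels -> joint positions
    ('joints', JointSelector(['Spine', 'Neck', 'Head'], include_root=True)),  # placeholder joints
    ('np', Numpyfier()),                                  # MocapData -> (n_clips, n_frames, n_channels)
])

features = pipe.fit_transform(parsed)
print(features.shape)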
306
+ class RootTransformer(BaseEstimator, TransformerMixin):
307
+ def __init__(self, method):
308
+ """
309
+ Accepted methods:
310
+ abdolute_translation_deltas
311
+ pos_rot_deltas
312
+ """
313
+ self.method = method
314
+
315
+ def fit(self, X, y=None):
316
+ return self
317
+
318
+ def transform(self, X, y=None):
319
+ Q = []
320
+
321
+ for track in X:
322
+ if self.method == 'abdolute_translation_deltas':
323
+ new_df = track.values.copy()
324
+ xpcol = '%s_Xposition'%track.root_name
325
+ ypcol = '%s_Yposition'%track.root_name
326
+ zpcol = '%s_Zposition'%track.root_name
327
+
328
+
329
+ dxpcol = '%s_dXposition'%track.root_name
330
+ dzpcol = '%s_dZposition'%track.root_name
331
+
332
+ dx = track.values[xpcol].diff()
333
+ dz = track.values[zpcol].diff()
334
+
335
+ dx[0] = 0
336
+ dz[0] = 0
337
+
338
+ new_df.drop([xpcol, zpcol], axis=1, inplace=True)
339
+
340
+ new_df[dxpcol] = dx
341
+ new_df[dzpcol] = dz
342
+
343
+ new_track = track.clone()
344
+ new_track.values = new_df
345
+ # end of abdolute_translation_deltas
346
+
347
+ elif self.method == 'pos_rot_deltas':
348
+ new_track = track.clone()
349
+
350
+ # Absolute columns
351
+ xp_col = '%s_Xposition'%track.root_name
352
+ yp_col = '%s_Yposition'%track.root_name
353
+ zp_col = '%s_Zposition'%track.root_name
354
+
355
+ xr_col = '%s_Xrotation'%track.root_name
356
+ yr_col = '%s_Yrotation'%track.root_name
357
+ zr_col = '%s_Zrotation'%track.root_name
358
+
359
+ # Delta columns
360
+ dxp_col = '%s_dXposition'%track.root_name
361
+ dzp_col = '%s_dZposition'%track.root_name
362
+
363
+ dxr_col = '%s_dXrotation'%track.root_name
364
+ dyr_col = '%s_dYrotation'%track.root_name
365
+ dzr_col = '%s_dZrotation'%track.root_name
366
+
367
+
368
+ new_df = track.values.copy()
369
+
370
+ root_pos_x_diff = pd.Series(data=track.values[xp_col].diff(), index=new_df.index)
371
+ root_pos_z_diff = pd.Series(data=track.values[zp_col].diff(), index=new_df.index)
372
+
373
+ root_rot_y_diff = pd.Series(data=track.values[yr_col].diff(), index=new_df.index)
374
+ root_rot_x_diff = pd.Series(data=track.values[xr_col].diff(), index=new_df.index)
375
+ root_rot_z_diff = pd.Series(data=track.values[zr_col].diff(), index=new_df.index)
376
+
377
+
378
+ root_pos_x_diff[0] = 0
379
+ root_pos_z_diff[0] = 0
380
+
381
+ root_rot_y_diff[0] = 0
382
+ root_rot_x_diff[0] = 0
383
+ root_rot_z_diff[0] = 0
384
+
385
+ new_df.drop([xr_col, yr_col, zr_col, xp_col, zp_col], axis=1, inplace=True)
386
+
387
+ new_df[dxp_col] = root_pos_x_diff
388
+ new_df[dzp_col] = root_pos_z_diff
389
+
390
+ new_df[dxr_col] = root_rot_x_diff
391
+ new_df[dyr_col] = root_rot_y_diff
392
+ new_df[dzr_col] = root_rot_z_diff
393
+
394
+ new_track.values = new_df
395
+
396
+ Q.append(new_track)
397
+
398
+ return Q
399
+
400
+ def inverse_transform(self, X, copy=None, start_pos=None):
401
+ Q = []
402
+
403
+ #TODO: simplify this implementation
404
+
405
+ startx = 0
406
+ startz = 0
407
+
408
+ if start_pos is not None:
409
+ startx, startz = start_pos
410
+
411
+ for track in X:
412
+ new_track = track.clone()
413
+ if self.method == 'abdolute_translation_deltas':
414
+ new_df = new_track.values
415
+ xpcol = '%s_Xposition'%track.root_name
416
+ ypcol = '%s_Yposition'%track.root_name
417
+ zpcol = '%s_Zposition'%track.root_name
418
+
419
+
420
+ dxpcol = '%s_dXposition'%track.root_name
421
+ dzpcol = '%s_dZposition'%track.root_name
422
+
423
+ dx = track.values[dxpcol].values
424
+ dz = track.values[dzpcol].values
425
+
426
+ recx = [startx]
427
+ recz = [startz]
428
+
429
+ for i in range(dx.shape[0]-1):
430
+ recx.append(recx[i]+dx[i+1])
431
+ recz.append(recz[i]+dz[i+1])
432
+
433
+ # recx = [recx[i]+dx[i+1] for i in range(dx.shape[0]-1)]
434
+ # recz = [recz[i]+dz[i+1] for i in range(dz.shape[0]-1)]
435
+ # recx = dx[:-1] + dx[1:]
436
+ # recz = dz[:-1] + dz[1:]
437
+
438
+ new_df[xpcol] = pd.Series(data=recx, index=new_df.index)
439
+ new_df[zpcol] = pd.Series(data=recz, index=new_df.index)
440
+
441
+ new_df.drop([dxpcol, dzpcol], axis=1, inplace=True)
442
+
443
+ new_track.values = new_df
444
+ # end of abdolute_translation_deltas
445
+
446
+ elif self.method == 'pos_rot_deltas':
447
+ new_track = track.clone()
448
+
449
+ # Absolute columns
450
+ xp_col = '%s_Xposition'%track.root_name
451
+ yp_col = '%s_Yposition'%track.root_name
452
+ zp_col = '%s_Zposition'%track.root_name
453
+
454
+ xr_col = '%s_Xrotation'%track.root_name
455
+ yr_col = '%s_Yrotation'%track.root_name
456
+ zr_col = '%s_Zrotation'%track.root_name
457
+
458
+ # Delta columns
459
+ dxp_col = '%s_dXposition'%track.root_name
460
+ dzp_col = '%s_dZposition'%track.root_name
461
+
462
+ dxr_col = '%s_dXrotation'%track.root_name
463
+ dyr_col = '%s_dYrotation'%track.root_name
464
+ dzr_col = '%s_dZrotation'%track.root_name
465
+
466
+
467
+ new_df = track.values.copy()
468
+
469
+ dx = track.values[dxp_col].values
470
+ dz = track.values[dzp_col].values
471
+
472
+ drx = track.values[dxr_col].values
473
+ dry = track.values[dyr_col].values
474
+ drz = track.values[dzr_col].values
475
+
476
+ rec_xp = [startx]
477
+ rec_zp = [startz]
478
+
479
+ rec_xr = [0]
480
+ rec_yr = [0]
481
+ rec_zr = [0]
482
+
483
+
484
+ for i in range(dx.shape[0]-1):
485
+ rec_xp.append(rec_xp[i]+dx[i+1])
486
+ rec_zp.append(rec_zp[i]+dz[i+1])
487
+
488
+ rec_xr.append(rec_xr[i]+drx[i+1])
489
+ rec_yr.append(rec_yr[i]+dry[i+1])
490
+ rec_zr.append(rec_zr[i]+drz[i+1])
491
+
492
+
493
+ new_df[xp_col] = pd.Series(data=rec_xp, index=new_df.index)
494
+ new_df[zp_col] = pd.Series(data=rec_zp, index=new_df.index)
495
+
496
+ new_df[xr_col] = pd.Series(data=rec_xr, index=new_df.index)
497
+ new_df[yr_col] = pd.Series(data=rec_yr, index=new_df.index)
498
+ new_df[zr_col] = pd.Series(data=rec_zr, index=new_df.index)
499
+
500
+ new_df.drop([dxr_col, dyr_col, dzr_col, dxp_col, dzp_col], axis=1, inplace=True)
501
+
502
+
503
+ new_track.values = new_df
504
+
505
+ Q.append(new_track)
506
+
507
+ return Q
508
+
509
+
510
+ class RootCentricPositionNormalizer(BaseEstimator, TransformerMixin):
511
+ def __init__(self):
512
+ pass
513
+
514
+ def fit(self, X, y=None):
515
+ return self
516
+
517
+ def transform(self, X, y=None):
518
+ Q = []
519
+
520
+ for track in X:
521
+ new_track = track.clone()
522
+
523
+ rxp = '%s_Xposition'%track.root_name
524
+ ryp = '%s_Yposition'%track.root_name
525
+ rzp = '%s_Zposition'%track.root_name
526
+
527
+ projected_root_pos = track.values[[rxp, ryp, rzp]]
528
+
529
+ projected_root_pos.loc[:,ryp] = 0 # we want the root's projection on the floor plane as the ref
530
+
531
+ new_df = pd.DataFrame(index=track.values.index)
532
+
533
+ all_but_root = [joint for joint in track.skeleton if track.root_name not in joint]
534
+ # all_but_root = [joint for joint in track.skeleton]
535
+ for joint in all_but_root:
536
+ new_df['%s_Xposition'%joint] = pd.Series(data=track.values['%s_Xposition'%joint]-projected_root_pos[rxp], index=new_df.index)
537
+ new_df['%s_Yposition'%joint] = pd.Series(data=track.values['%s_Yposition'%joint]-projected_root_pos[ryp], index=new_df.index)
538
+ new_df['%s_Zposition'%joint] = pd.Series(data=track.values['%s_Zposition'%joint]-projected_root_pos[rzp], index=new_df.index)
539
+
540
+
541
+ # keep the root as it is now
542
+ new_df[rxp] = track.values[rxp]
543
+ new_df[ryp] = track.values[ryp]
544
+ new_df[rzp] = track.values[rzp]
545
+
546
+ new_track.values = new_df
547
+
548
+ Q.append(new_track)
549
+
550
+ return Q
551
+
552
+ def inverse_transform(self, X, copy=None):
553
+ Q = []
554
+
555
+ for track in X:
556
+ new_track = track.clone()
557
+
558
+ rxp = '%s_Xposition'%track.root_name
559
+ ryp = '%s_Yposition'%track.root_name
560
+ rzp = '%s_Zposition'%track.root_name
561
+
562
+ projected_root_pos = track.values[[rxp, ryp, rzp]]
563
+
564
+ projected_root_pos.loc[:,ryp] = 0 # we want the root's projection on the floor plane as the ref
565
+
566
+ new_df = pd.DataFrame(index=track.values.index)
567
+
568
+ for joint in track.skeleton:
569
+ new_df['%s_Xposition'%joint] = pd.Series(data=track.values['%s_Xposition'%joint]+projected_root_pos[rxp], index=new_df.index)
570
+ new_df['%s_Yposition'%joint] = pd.Series(data=track.values['%s_Yposition'%joint]+projected_root_pos[ryp], index=new_df.index)
571
+ new_df['%s_Zposition'%joint] = pd.Series(data=track.values['%s_Zposition'%joint]+projected_root_pos[rzp], index=new_df.index)
572
+
573
+
574
+ new_track.values = new_df
575
+
576
+ Q.append(new_track)
577
+
578
+ return Q
579
+
580
+
581
+ class Flattener(BaseEstimator, TransformerMixin):
582
+ def __init__(self):
583
+ pass
584
+
585
+ def fit(self, X, y=None):
586
+ return self
587
+
588
+ def transform(self, X, y=None):
589
+ return np.concatenate(X, axis=0)
590
+
591
+ class ConstantsRemover(BaseEstimator, TransformerMixin):
592
+ '''
593
+ For now it just looks at the first track
594
+ '''
595
+
596
+ def __init__(self, eps = 10e-10):
597
+ self.eps = eps
598
+
599
+
600
+ def fit(self, X, y=None):
601
+ stds = X[0].values.std()
602
+ cols = X[0].values.columns.values
603
+ self.const_dims_ = [c for c in cols if (stds[c] < self.eps).any()]
604
+ self.const_values_ = {c:X[0].values[c].values[0] for c in cols if (stds[c] < self.eps).any()}
605
+ return self
606
+
607
+ def transform(self, X, y=None):
608
+ Q = []
609
+
610
+
611
+ for track in X:
612
+ t2 = track.clone()
613
+ #for key in t2.skeleton.keys():
614
+ # if key in self.ConstDims_:
615
+ # t2.skeleton.pop(key)
616
+ t2.values = track.values[track.values.columns.difference(self.const_dims_)]
617
+ Q.append(t2)
618
+
619
+ return Q
620
+
621
+ def inverse_transform(self, X, copy=None):
622
+ Q = []
623
+
624
+ for track in X:
625
+ t2 = track.clone()
626
+ for d in self.const_dims_:
627
+ t2.values[d] = self.const_values_[d]
628
+ Q.append(t2)
629
+
630
+ return Q
631
+
632
+ class ListStandardScaler(BaseEstimator, TransformerMixin):
633
+ def __init__(self, is_DataFrame=False):
634
+ self.is_DataFrame = is_DataFrame
635
+
636
+ def fit(self, X, y=None):
637
+ if self.is_DataFrame:
638
+ X_train_flat = np.concatenate([m.values for m in X], axis=0)
639
+ else:
640
+ X_train_flat = np.concatenate([m for m in X], axis=0)
641
+
642
+ self.data_mean_ = np.mean(X_train_flat, axis=0)
643
+ self.data_std_ = np.std(X_train_flat, axis=0)
644
+
645
+ return self
646
+
647
+ def transform(self, X, y=None):
648
+ Q = []
649
+
650
+ for track in X:
651
+ if self.is_DataFrame:
652
+ normalized_track = track.copy()
653
+ normalized_track.values = (track.values - self.data_mean_) / self.data_std_
654
+ else:
655
+ normalized_track = (track - self.data_mean_) / self.data_std_
656
+
657
+ Q.append(normalized_track)
658
+
659
+ if self.is_DataFrame:
660
+ return Q
661
+ else:
662
+ return np.array(Q)
663
+
664
+ def inverse_transform(self, X, copy=None):
665
+ Q = []
666
+
667
+ for track in X:
668
+
669
+ if self.is_DataFrame:
670
+ unnormalized_track = track.copy()
671
+ unnormalized_track.values = (track.values * self.data_std_) + self.data_mean_
672
+ else:
673
+ unnormalized_track = (track * self.data_std_) + self.data_mean_
674
+
675
+ Q.append(unnormalized_track)
676
+
677
+ if self.is_DataFrame:
678
+ return Q
679
+ else:
680
+ return np.array(Q)
681
+
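Usage note (not part of the diff): ListStandardScaler is the one transformer above that also accepts plain numpy tracks (is_DataFrame=False), so a minimal self-contained round-trip check looks like the sketch below; the random, equal-length tracks are purely illustrative.

    import numpy as np
    tracks = [np.random.randn(100, 6) * 3.0 + 1.0 for _ in range(2)]  # two dummy tracks of equal length
    scaler = ListStandardScaler(is_DataFrame=False)
    normed = scaler.fit(tracks).transform(tracks)            # stacked array of z-scored tracks
    restored = scaler.inverse_transform(normed)
    assert np.allclose(restored[0], tracks[0], atol=1e-6)    # round trip recovers the original values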
682
+ class DownSampler(BaseEstimator, TransformerMixin):
683
+ def __init__(self, rate):
684
+ self.rate = rate
685
+
686
+
687
+ def fit(self, X, y=None):
688
+
689
+ return self
690
+
691
+ def transform(self, X, y=None):
692
+ Q = []
693
+
694
+ for track in X:
695
+ #print(track.values.size)
696
+ #new_track = track.clone()
697
+ #new_track.values = track.values[0:-1:self.rate]
698
+ #print(new_track.values.size)
699
+ new_track = track[0:-1:self.rate]
700
+ Q.append(new_track)
701
+
702
+ return Q
703
+
704
+ def inverse_transform(self, X, copy=None):
705
+ return X
706
+
707
+
708
+ #TODO: JointsSelector (x)
709
+ #TODO: SegmentMaker
710
+ #TODO: DynamicFeaturesAdder
711
+ #TODO: ShapeFeaturesAdder
712
+ #TODO: DataFrameNumpier (x)
713
+
714
+ class TemplateTransform(BaseEstimator, TransformerMixin):
715
+ def __init__(self):
716
+ pass
717
+
718
+ def fit(self, X, y=None):
719
+ return self
720
+
721
+ def transform(self, X, y=None):
722
+ return X
723
+
724
+ class UnsupportedParamError(Exception):
725
+ def __init__(self, message):
726
+ self.message = message
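Usage note (not part of the diff): these transformers subclass BaseEstimator/TransformerMixin, so the natural way to chain them is an sklearn Pipeline over parsed MocapData tracks. The sketch below is hedged: BVHParser, its module path, the file name, and the joint names are assumptions for illustration, not part of this file.

    from sklearn.pipeline import Pipeline
    from dataloaders.pymo.parsers import BVHParser    # parser assumed to live in parsers.py of this upload

    track = BVHParser().parse('example.bvh')          # placeholder BVH file
    pipe = Pipeline([
        ('select', JointSelector(['Spine', 'Head'], include_root=True)),  # hypothetical joint names
        ('root', RootTransformer('pos_rot_deltas')),
        ('const', ConstantsRemover()),
        ('np', Numpyfier()),
    ])
    X = pipe.fit_transform([track])                   # numpy array shaped (tracks, frames, features)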
dataloaders/pymo/rotation_tools.py ADDED
@@ -0,0 +1,153 @@
1
+ '''
2
+ Tools for Manipulating and Converting 3D Rotations
3
+
4
+ By Omid Alemi
5
+ Created: June 12, 2017
6
+
7
+ Adapted from that matlab file...
8
+ '''
9
+
10
+ import math
11
+ import numpy as np
12
+
13
+ def deg2rad(x):
14
+ return x/180*math.pi
15
+
16
+
17
+ def rad2deg(x):
18
+ return x/math.pi*180
19
+
20
+ class Rotation():
21
+ def __init__(self,rot, param_type, rotation_order, **params):
22
+ self.rotmat = []
23
+ self.rotation_order = rotation_order
24
+ if param_type == 'euler':
25
+ self._from_euler(rot[0],rot[1],rot[2], params)
26
+ elif param_type == 'expmap':
27
+ self._from_expmap(rot[0], rot[1], rot[2], params)
28
+
29
+ def _from_euler(self, alpha, beta, gamma, params):
30
+ '''Expecting degrees'''
31
+
32
+ if params['from_deg']==True:
33
+ alpha = deg2rad(alpha)
34
+ beta = deg2rad(beta)
35
+ gamma = deg2rad(gamma)
36
+
37
+ ca = math.cos(alpha)
38
+ cb = math.cos(beta)
39
+ cg = math.cos(gamma)
40
+ sa = math.sin(alpha)
41
+ sb = math.sin(beta)
42
+ sg = math.sin(gamma)
43
+
44
+ Rx = np.asarray([[1, 0, 0],
45
+ [0, ca, sa],
46
+ [0, -sa, ca]
47
+ ])
48
+
49
+ Ry = np.asarray([[cb, 0, -sb],
50
+ [0, 1, 0],
51
+ [sb, 0, cb]])
52
+
53
+ Rz = np.asarray([[cg, sg, 0],
54
+ [-sg, cg, 0],
55
+ [0, 0, 1]])
56
+
57
+ self.rotmat = np.eye(3)
58
+
59
+ ############################ inner product rotation matrix in order defined at BVH file #########################
60
+ for axis in self.rotation_order :
61
+ if axis == 'X' :
62
+ self.rotmat = np.matmul(Rx, self.rotmat)
63
+ elif axis == 'Y':
64
+ self.rotmat = np.matmul(Ry, self.rotmat)
65
+ else :
66
+ self.rotmat = np.matmul(Rz, self.rotmat)
67
+ ################################################################################################################
68
+
69
+ def _from_expmap(self, alpha, beta, gamma, params):
70
+ if (alpha == 0 and beta == 0 and gamma == 0):
71
+ self.rotmat = np.eye(3)
72
+ return
73
+
74
+ #TODO: Check exp map params
75
+
76
+ theta = np.linalg.norm([alpha, beta, gamma])
77
+
78
+ expmap = [alpha, beta, gamma] / theta
79
+
80
+ x = expmap[0]
81
+ y = expmap[1]
82
+ z = expmap[2]
83
+
84
+ s = math.sin(theta/2)
85
+ c = math.cos(theta/2)
86
+
87
+ self.rotmat = np.asarray([
88
+ [2*(x**2-1)*s**2+1, 2*x*y*s**2-2*z*c*s, 2*x*z*s**2+2*y*c*s],
89
+ [2*x*y*s**2+2*z*c*s, 2*(y**2-1)*s**2+1, 2*y*z*s**2-2*x*c*s],
90
+ [2*x*z*s**2-2*y*c*s, 2*y*z*s**2+2*x*c*s , 2*(z**2-1)*s**2+1]
91
+ ])
92
+
93
+
94
+
95
+ def get_euler_axis(self):
96
+ R = self.rotmat
97
+ theta = math.acos((self.rotmat.trace() - 1) / 2)
98
+ axis = np.asarray([R[2,1] - R[1,2], R[0,2] - R[2,0], R[1,0] - R[0,1]])
99
+ axis = axis/(2*math.sin(theta))
100
+ return theta, axis
101
+
102
+ def to_expmap(self):
103
+ theta, axis = self.get_euler_axis()
104
+ rot_arr = theta * axis
105
+ if np.isnan(rot_arr).any():
106
+ rot_arr = [0, 0, 0]
107
+ return rot_arr
108
+
109
+ def to_euler(self, use_deg=False):
110
+ eulers = np.zeros((2, 3))
111
+
112
+ if np.absolute(np.absolute(self.rotmat[2, 0]) - 1) < 1e-12:
113
+ #GIMBAL LOCK!
114
+ print('Gimbal')
115
+ if np.absolute(self.rotmat[2, 0] - 1) < 1e-12:
116
+ eulers[:,0] = math.atan2(-self.rotmat[0,1], -self.rotmat[0,2])
117
+ eulers[:,1] = -math.pi/2
118
+ else:
119
+ eulers[:,0] = math.atan2(self.rotmat[0,1], -self.rotmat[0,2])
120
+ eulers[:,1] = math.pi/2
121
+
122
+ return eulers
123
+
124
+ theta = - math.asin(self.rotmat[2,0])
125
+ theta2 = math.pi - theta
126
+
127
+ # psi1, psi2
128
+ eulers[0,0] = math.atan2(self.rotmat[2,1]/math.cos(theta), self.rotmat[2,2]/math.cos(theta))
129
+ eulers[1,0] = math.atan2(self.rotmat[2,1]/math.cos(theta2), self.rotmat[2,2]/math.cos(theta2))
130
+
131
+ # theta1, theta2
132
+ eulers[0,1] = theta
133
+ eulers[1,1] = theta2
134
+
135
+ # phi1, phi2
136
+ eulers[0,2] = math.atan2(self.rotmat[1,0]/math.cos(theta), self.rotmat[0,0]/math.cos(theta))
137
+ eulers[1,2] = math.atan2(self.rotmat[1,0]/math.cos(theta2), self.rotmat[0,0]/math.cos(theta2))
138
+
139
+ if use_deg:
140
+ eulers = rad2deg(eulers)
141
+
142
+ return eulers
143
+
144
+ def to_quat(self):
145
+ #TODO
146
+ pass
147
+
148
+ def __str__(self):
149
+ return "Rotation Matrix: \n " + self.rotmat.__str__()
150
+
151
+
152
+
153
+
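Usage sketch (illustrative only): build a Rotation from Euler angles in a BVH-style rotation order, then read back the exponential-map and Euler parameterizations.

    rot = Rotation([90.0, 0.0, 0.0], 'euler', 'XYZ', from_deg=True)  # 90 degrees about X
    print(rot.rotmat)          # 3x3 matrix in this module's convention
    print(rot.to_expmap())     # axis-angle vector, norm ~ pi/2
    print(rot.to_euler(True))  # the two Euler solutions, in degrees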
dataloaders/pymo/rotation_tools.py! ADDED
@@ -0,0 +1,69 @@
1
+ '''
2
+ Tools for Manipulating and Converting 3D Rotations
3
+
4
+ By Omid Alemi
5
+ Created: June 12, 2017
6
+
7
+ Adapted from that matlab file...
8
+ '''
9
+
10
+ import math
11
+ import numpy as np
12
+
13
+ def deg2rad(x):
14
+ return x/180*math.pi
15
+
16
+ class Rotation():
17
+ def __init__(self,rot, param_type, **params):
18
+ self.rotmat = []
19
+ if param_type == 'euler':
20
+ self._from_euler(rot[0],rot[1],rot[2], params)
21
+
22
+ def _from_euler(self, alpha, beta, gamma, params):
23
+ '''Expecting degrees'''
24
+
25
+ if params['from_deg']==True:
26
+ alpha = deg2rad(alpha)
27
+ beta = deg2rad(beta)
28
+ gamma = deg2rad(gamma)
29
+
30
+ Rx = np.asarray([[1, 0, 0],
31
+ [0, math.cos(alpha), -math.sin(alpha)],
32
+ [0, math.sin(alpha), math.cos(alpha)]
33
+ ])
34
+
35
+ Ry = np.asarray([[math.cos(beta), 0, math.sin(beta)],
36
+ [0, 1, 0],
37
+ [-math.sin(beta), 0, math.cos(beta)]])
38
+
39
+ Rz = np.asarray([[math.cos(gamma), -math.sin(gamma), 0],
40
+ [math.sin(gamma), math.cos(gamma), 0],
41
+ [0, 0, 1]])
42
+
43
+ self.rotmat = np.matmul(np.matmul(Rz, Ry), Rx).T
44
+
45
+ def get_euler_axis(self):
46
+ R = self.rotmat
47
+ theta = math.acos((self.rotmat.trace() - 1) / 2)
48
+ axis = np.asarray([R[2,1] - R[1,2], R[0,2] - R[2,0], R[1,0] - R[0,1]])
49
+ axis = axis/(2*math.sin(theta))
50
+ return theta, axis
51
+
52
+ def to_expmap(self):
53
+ theta, axis = self.get_euler_axis()
54
+ rot_arr = theta * axis
55
+ if np.isnan(rot_arr).any():
56
+ rot_arr = [0, 0, 0]
57
+ return rot_arr
58
+
59
+ def to_euler(self):
60
+ #TODO
61
+ pass
62
+
63
+ def to_quat(self):
64
+ #TODO
65
+ pass
66
+
67
+
68
+
69
+
dataloaders/pymo/viz_tools.py ADDED
@@ -0,0 +1,236 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ import IPython
5
+ import os
+ from .writers import BVHWriter # BVHWriter is used by nb_play_mocap_fromurl below but was not imported
6
+
7
+ def save_fig(fig_id, tight_layout=True):
8
+ if tight_layout:
9
+ plt.tight_layout()
10
+ plt.savefig(fig_id + '.png', format='png', dpi=300)
11
+
12
+
13
+ def draw_stickfigure(mocap_track, frame, data=None, joints=None, draw_names=False, ax=None, figsize=(8,8)):
14
+ if ax is None:
15
+ fig = plt.figure(figsize=figsize)
16
+ ax = fig.add_subplot(111)
17
+
18
+ if joints is None:
19
+ joints_to_draw = mocap_track.skeleton.keys()
20
+ else:
21
+ joints_to_draw = joints
22
+
23
+ if data is None:
24
+ df = mocap_track.values
25
+ else:
26
+ df = data
27
+
28
+ for joint in joints_to_draw:
29
+ ax.scatter(x=df['%s_Xposition'%joint][frame],
30
+ y=df['%s_Yposition'%joint][frame],
31
+ alpha=0.6, c='b', marker='o')
32
+
33
+ parent_x = df['%s_Xposition'%joint][frame]
34
+ parent_y = df['%s_Yposition'%joint][frame]
35
+
36
+ children_to_draw = [c for c in mocap_track.skeleton[joint]['children'] if c in joints_to_draw]
37
+
38
+ for c in children_to_draw:
39
+ child_x = df['%s_Xposition'%c][frame]
40
+ child_y = df['%s_Yposition'%c][frame]
41
+ ax.plot([parent_x, child_x], [parent_y, child_y], 'k-', lw=2)
42
+
43
+ if draw_names:
44
+ ax.annotate(joint,
45
+ (df['%s_Xposition'%joint][frame] + 0.1,
46
+ df['%s_Yposition'%joint][frame] + 0.1))
47
+
48
+ return ax
49
+
50
+ def draw_stickfigure3d(mocap_track, frame, data=None, joints=None, draw_names=False, ax=None, figsize=(8,8)):
51
+ from mpl_toolkits.mplot3d import Axes3D
52
+
53
+ if ax is None:
54
+ fig = plt.figure(figsize=figsize)
55
+ ax = fig.add_subplot(111, projection='3d')
56
+
57
+ if joints is None:
58
+ joints_to_draw = mocap_track.skeleton.keys()
59
+ else:
60
+ joints_to_draw = joints
61
+
62
+ if data is None:
63
+ df = mocap_track.values
64
+ else:
65
+ df = data
66
+
67
+ for joint in joints_to_draw:
68
+ parent_x = df['%s_Xposition'%joint][frame]
69
+ parent_y = df['%s_Zposition'%joint][frame]
70
+ parent_z = df['%s_Yposition'%joint][frame]
71
+ # ^ In mocaps, Y is the up-right axis
72
+
73
+ ax.scatter(xs=parent_x,
74
+ ys=parent_y,
75
+ zs=parent_z,
76
+ alpha=0.6, c='b', marker='o')
77
+
78
+
79
+ children_to_draw = [c for c in mocap_track.skeleton[joint]['children'] if c in joints_to_draw]
80
+
81
+ for c in children_to_draw:
82
+ child_x = df['%s_Xposition'%c][frame]
83
+ child_y = df['%s_Zposition'%c][frame]
84
+ child_z = df['%s_Yposition'%c][frame]
85
+ # ^ In mocaps, Y is the up-right axis
86
+
87
+ ax.plot([parent_x, child_x], [parent_y, child_y], [parent_z, child_z], 'k-', lw=2, c='black')
88
+
89
+ if draw_names:
90
+ ax.text(x=parent_x + 0.1,
91
+ y=parent_y + 0.1,
92
+ z=parent_z + 0.1,
93
+ s=joint,
94
+ color=(0, 0, 0, 0.9)) # matplotlib expects an RGBA tuple, not a CSS 'rgba()' string
95
+
96
+ return ax
97
+
98
+
99
+ def sketch_move(mocap_track, data=None, ax=None, figsize=(16,8)):
100
+ if ax is None:
101
+ fig = plt.figure(figsize=figsize)
102
+ ax = fig.add_subplot(111)
103
+
104
+ if data is None:
105
+ data = mocap_track.values
106
+
107
+ for frame in range(0, data.shape[0], 4):
108
+ # draw_stickfigure(mocap_track, f, data=data, ax=ax)
109
+
110
+ for joint in mocap_track.skeleton.keys():
111
+ children_to_draw = [c for c in mocap_track.skeleton[joint]['children']]
112
+
113
+ parent_x = data['%s_Xposition'%joint][frame]
114
+ parent_y = data['%s_Yposition'%joint][frame]
115
+
116
+ frame_alpha = frame/data.shape[0]
117
+
118
+ for c in children_to_draw:
119
+ child_x = data['%s_Xposition'%c][frame]
120
+ child_y = data['%s_Yposition'%c][frame]
121
+
122
+ ax.plot([parent_x, child_x], [parent_y, child_y], '-', lw=1, color='gray', alpha=frame_alpha)
123
+
124
+
125
+
126
+ def viz_cnn_filter(feature_to_viz, mocap_track, data, gap=25):
127
+ fig = plt.figure(figsize=(16,4))
128
+ ax = plt.subplot2grid((1,8),(0,0))
129
+ ax.imshow(feature_to_viz.T, aspect='auto', interpolation='nearest')
130
+
131
+ ax = plt.subplot2grid((1,8),(0,1), colspan=7)
132
+ for frame in range(feature_to_viz.shape[0]):
133
+ frame_alpha = 0.2#frame/data.shape[0] * 2 + 0.2
134
+
135
+ for joint_i, joint in enumerate(mocap_track.skeleton.keys()):
136
+ children_to_draw = [c for c in mocap_track.skeleton[joint]['children']]
137
+
138
+ parent_x = data['%s_Xposition'%joint][frame] + frame * gap
139
+ parent_y = data['%s_Yposition'%joint][frame]
140
+
141
+ ax.scatter(x=parent_x,
142
+ y=parent_y,
143
+ alpha=0.6,
144
+ cmap='RdBu',
145
+ c=feature_to_viz[frame][joint_i] * 10000,
146
+ marker='o',
147
+ s = abs(feature_to_viz[frame][joint_i] * 10000))
148
+ plt.axis('off')
149
+ for c in children_to_draw:
150
+ child_x = data['%s_Xposition'%c][frame] + frame * gap
151
+ child_y = data['%s_Yposition'%c][frame]
152
+
153
+ ax.plot([parent_x, child_x], [parent_y, child_y], '-', lw=1, color='gray', alpha=frame_alpha)
154
+
155
+
156
+ def print_skel(X):
157
+ stack = [X.root_name]
158
+ tab=0
159
+ while stack:
160
+ joint = stack.pop()
161
+ tab = len(stack)
162
+ print('%s- %s (%s)'%('| '*tab, joint, X.skeleton[joint]['parent']))
163
+ for c in X.skeleton[joint]['children']:
164
+ stack.append(c)
165
+
166
+
167
+ def nb_play_mocap_fromurl(mocap, mf, frame_time=1/30, scale=1, base_url='http://titan:8385'):
168
+ if mf == 'bvh':
169
+ bw = BVHWriter()
170
+ with open('test.bvh', 'w') as ofile:
171
+ bw.write(mocap, ofile)
172
+
173
+ filepath = '../notebooks/test.bvh'
174
+ elif mf == 'pos':
175
+ c = list(mocap.values.columns)
176
+
177
+ for cc in c:
178
+ if 'rotation' in cc:
179
+ c.remove(cc)
180
+ mocap.values.to_csv('test.csv', index=False, columns=c)
181
+
182
+ filepath = '../notebooks/test.csv'
183
+ else:
184
+ return
185
+
186
+ url = '%s/mocapplayer/player.html?data_url=%s&scale=%f&cz=200&order=xzyi&frame_time=%f'%(base_url, filepath, scale, frame_time)
187
+ iframe = '<iframe src=' + url + ' width="100%" height=500></iframe>'
188
+ link = '<a href=%s target="_blank">New Window</a>'%url
189
+ return IPython.display.HTML(iframe+link)
190
+
191
+ def nb_play_mocap(mocap, mf, meta=None, frame_time=1/30, scale=1, camera_z=500, base_url=None):
192
+ data_template = 'var dataBuffer = `$$DATA$$`;'
193
+ data_template += 'var metadata = $$META$$;'
194
+ data_template += 'start(dataBuffer, metadata, $$CZ$$, $$SCALE$$, $$FRAMETIME$$);'
195
+ dir_path = os.path.dirname(os.path.realpath(__file__))
196
+
197
+
198
+ if base_url is None:
199
+ base_url = os.path.join(dir_path, 'mocapplayer/playBuffer.html')
200
+
201
+ # print(dir_path)
202
+
203
+ if mf == 'bvh':
204
+ pass
205
+ elif mf == 'pos':
206
+ cols = list(mocap.values.columns)
207
+ for c in cols:
208
+ if 'rotation' in c:
209
+ cols.remove(c)
210
+
211
+ data_csv = mocap.values.to_csv(index=False, columns=cols)
212
+
213
+ if meta is not None:
214
+ lines = [','.join(item) for item in meta.astype('str')]
215
+ meta_csv = '[' + ','.join('[%s]'%l for l in lines) +']'
216
+ else:
217
+ meta_csv = '[]'
218
+
219
+ data_assigned = data_template.replace('$$DATA$$', data_csv)
220
+ data_assigned = data_assigned.replace('$$META$$', meta_csv)
221
+ data_assigned = data_assigned.replace('$$CZ$$', str(camera_z))
222
+ data_assigned = data_assigned.replace('$$SCALE$$', str(scale))
223
+ data_assigned = data_assigned.replace('$$FRAMETIME$$', str(frame_time))
224
+
225
+ else:
226
+ return
227
+
228
+
229
+
230
+ with open(os.path.join(dir_path, 'mocapplayer/data.js'), 'w') as oFile:
231
+ oFile.write(data_assigned)
232
+
233
+ url = '%s?&cz=200&order=xzyi&frame_time=%f&scale=%f'%(base_url, frame_time, scale)
234
+ iframe = '<iframe frameborder="0" src=' + url + ' width="100%" height=500></iframe>'
235
+ link = '<a href=%s target="_blank">New Window</a>'%url
236
+ return IPython.display.HTML(iframe+link)
dataloaders/pymo/writers.py ADDED
@@ -0,0 +1,55 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+
4
+ class BVHWriter():
5
+ def __init__(self):
6
+ pass
7
+
8
+ def write(self, X, ofile):
9
+
10
+ # Writing the skeleton info
11
+ ofile.write('HIERARCHY\n')
12
+
13
+ self.motions_ = []
14
+ self._printJoint(X, X.root_name, 0, ofile)
15
+
16
+ # Writing the motion header
17
+ ofile.write('MOTION\n')
18
+ ofile.write('Frames: %d\n'%X.values.shape[0])
19
+ ofile.write('Frame Time: %f\n'%X.framerate)
20
+
21
+ # Writing the data
22
+ self.motions_ = np.asarray(self.motions_).T
23
+ lines = [" ".join(item) for item in self.motions_.astype(str)]
24
+ ofile.write("".join("%s\n"%l for l in lines))
25
+
26
+ def _printJoint(self, X, joint, tab, ofile):
27
+
28
+ if X.skeleton[joint]['parent'] == None:
29
+ ofile.write('ROOT %s\n'%joint)
30
+ elif len(X.skeleton[joint]['children']) > 0:
31
+ ofile.write('%sJOINT %s\n'%('\t'*(tab), joint))
32
+ else:
33
+ ofile.write('%sEnd site\n'%('\t'*(tab)))
34
+
35
+ ofile.write('%s{\n'%('\t'*(tab)))
36
+
37
+ ofile.write('%sOFFSET %3.5f %3.5f %3.5f\n'%('\t'*(tab+1),
38
+ X.skeleton[joint]['offsets'][0],
39
+ X.skeleton[joint]['offsets'][1],
40
+ X.skeleton[joint]['offsets'][2]))
41
+ channels = X.skeleton[joint]['channels']
42
+ n_channels = len(channels)
43
+
44
+ if n_channels > 0:
45
+ for ch in channels:
46
+ self.motions_.append(np.asarray(X.values['%s_%s'%(joint, ch)].values))
47
+
48
+ if len(X.skeleton[joint]['children']) > 0:
49
+ ch_str = ''.join(' %s'*n_channels%tuple(channels))
50
+ ofile.write('%sCHANNELS %d%s\n' %('\t'*(tab+1), n_channels, ch_str))
51
+
52
+ for c in X.skeleton[joint]['children']:
53
+ self._printJoint(X, c, tab+1, ofile)
54
+
55
+ ofile.write('%s}\n'%('\t'*(tab)))
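Round-trip sketch (hedged: the parser class name, module paths, and file names are assumptions; the parser itself lives elsewhere in this upload):

    from dataloaders.pymo.parsers import BVHParser
    from dataloaders.pymo.writers import BVHWriter

    parsed = BVHParser().parse('input.bvh')       # placeholder input file
    with open('roundtrip.bvh', 'w') as f:
        BVHWriter().write(parsed, f)              # re-serialize the skeleton and motion frames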
dataloaders/utils/__pycache__/audio_features.cpython-312.pyc ADDED
Binary file (4.64 kB). View file
 
dataloaders/utils/__pycache__/other_tools.cpython-312.pyc ADDED
Binary file (37.2 kB). View file
 
dataloaders/utils/__pycache__/rotation_conversions.cpython-312.pyc ADDED
Binary file (22.2 kB). View file
 
dataloaders/utils/audio_features.py ADDED
@@ -0,0 +1,80 @@
1
+ """modified from https://github.com/yesheng-THU/GFGE/blob/main/data_processing/audio_features.py"""
2
+ import numpy as np
3
+ import librosa
4
+ import math
5
+ import os
6
+ import scipy.io.wavfile as wav
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import copy
11
+ from tqdm import tqdm
12
+ from typing import Optional, Tuple
13
+ from numpy.lib import stride_tricks
14
+ from loguru import logger
15
+
16
+ # Import Wav2Vec2Model to make it available for other modules
17
+ import sys
18
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
19
+ from models.utils.wav2vec import Wav2Vec2Model
20
+
21
+
22
+
23
+ def process_audio_data(audio_file, args, data, f_name, selected_file):
24
+ """Process audio data with support for different representations."""
25
+ logger.info(f"# ---- Building cache for Audio {f_name} ---- #")
26
+
27
+ if not os.path.exists(audio_file):
28
+ logger.warning(f"# ---- file not found for Audio {f_name}, skip all files with the same id ---- #")
29
+ selected_file.drop(selected_file[selected_file['id'] == f_name].index, inplace=True)
30
+ return None
31
+
32
+ audio_save_path = audio_file.replace("wave16k", "onset_amplitude").replace(".wav", ".npy")
33
+
34
+ if args.audio_rep == "onset+amplitude" and os.path.exists(audio_save_path):
35
+ data['audio'] = np.load(audio_save_path)
36
+ logger.warning(f"# ---- file found cache for Audio {f_name} ---- #")
37
+
38
+ elif args.audio_rep == "onset+amplitude":
39
+ data['audio'] = calculate_onset_amplitude(audio_file, args.audio_sr, audio_save_path)
40
+
41
+ elif args.audio_rep == "mfcc":
42
+ audio_data, _ = librosa.load(audio_file)
43
+ data['audio'] = librosa.feature.melspectrogram(
44
+ y=audio_data,
45
+ sr=args.audio_sr,
46
+ n_mels=128,
47
+ hop_length=int(args.audio_sr/args.audio_fps)
48
+ ).transpose(1, 0)
49
+
50
+ if args.audio_norm and args.audio_rep == "wave16k":
51
+ data['audio'] = (data['audio'] - args.mean_audio) / args.std_audio
52
+
53
+ return data
54
+
55
+ def calculate_onset_amplitude(audio_file, audio_sr, save_path):
56
+ """Calculate onset and amplitude features from audio file."""
57
+ audio_data, sr = librosa.load(audio_file)
58
+ audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=audio_sr)
59
+
60
+ # Calculate amplitude envelope
61
+ frame_length = 1024
62
+ shape = (audio_data.shape[-1] - frame_length + 1, frame_length)
63
+ strides = (audio_data.strides[-1], audio_data.strides[-1])
64
+ rolling_view = stride_tricks.as_strided(audio_data, shape=shape, strides=strides)
65
+ amplitude_envelope = np.max(np.abs(rolling_view), axis=1)
66
+ amplitude_envelope = np.pad(amplitude_envelope, (0, frame_length-1), mode='constant', constant_values=amplitude_envelope[-1])
67
+
68
+ # Calculate onset
69
+ audio_onset_f = librosa.onset.onset_detect(y=audio_data, sr=audio_sr, units='frames')
70
+ onset_array = np.zeros(len(audio_data), dtype=float)
71
+ onset_array[audio_onset_f] = 1.0
72
+
73
+ # Combine features
74
+ features = np.concatenate([amplitude_envelope.reshape(-1, 1), onset_array.reshape(-1, 1)], axis=1)
75
+
76
+ # Save features
77
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
78
+ np.save(save_path, features)
79
+
80
+ return features
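Usage sketch for the helper above (the paths are placeholders; 16000 Hz mirrors the wave16k naming used in process_audio_data):

    feats = calculate_onset_amplitude("path/to/speech.wav", 16000, "path/to/cache/speech.npy")
    print(feats.shape)   # (num_audio_samples, 2): amplitude envelope plus binary onset flags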
dataloaders/utils/data_sample.py ADDED
@@ -0,0 +1,175 @@
1
+ import math
2
+ import numpy as np
3
+ from collections import defaultdict
4
+ from loguru import logger
5
+
6
+ def sample_from_clip(
7
+ lmdb_manager, audio_file, audio_each_file, pose_each_file, trans_each_file,
8
+ trans_v_each_file, shape_each_file, facial_each_file, word_each_file,
9
+ vid_each_file, emo_each_file, sem_each_file, args, ori_stride, ori_length,
10
+ disable_filtering, clean_first_seconds, clean_final_seconds, is_test,
11
+ n_out_samples):
12
+ """Sample clips from the data according to specified parameters."""
13
+
14
+ round_seconds_skeleton = pose_each_file.shape[0] // args.pose_fps
15
+
16
+ # Calculate timing information
17
+ timing_info = calculate_timing_info(
18
+ audio_each_file, facial_each_file, round_seconds_skeleton,
19
+ args.audio_fps, args.pose_fps, args.audio_sr, args.audio_rep
20
+ )
21
+
22
+ round_seconds_skeleton = timing_info['final_seconds']
23
+
24
+ # Calculate clip boundaries
25
+ clip_info = calculate_clip_boundaries(
26
+ round_seconds_skeleton, clean_first_seconds, clean_final_seconds,
27
+ args.audio_fps, args.pose_fps
28
+ )
29
+
30
+ n_filtered_out = defaultdict(int)
31
+
32
+ # Process each training length ratio
33
+ for ratio in args.multi_length_training:
34
+ processed_data = process_data_with_ratio(
35
+ ori_stride, ori_length, ratio, clip_info, args, is_test,
36
+ audio_each_file, pose_each_file, trans_each_file, trans_v_each_file,
37
+ shape_each_file, facial_each_file, word_each_file, vid_each_file,
38
+ emo_each_file, sem_each_file, audio_file,
39
+ lmdb_manager, n_out_samples
40
+ )
41
+
42
+ for type_key, count in processed_data['filtered_counts'].items():
43
+ n_filtered_out[type_key] += count
44
+
45
+ n_out_samples = processed_data['n_out_samples']
46
+
47
+ return n_filtered_out, n_out_samples
48
+
49
+ def calculate_timing_info(audio_data, facial_data, round_seconds_skeleton,
50
+ audio_fps, pose_fps, audio_sr, audio_rep):
51
+ """Calculate timing information for the data."""
52
+ if audio_data is not None:
53
+ if audio_rep != "wave16k":
54
+ round_seconds_audio = len(audio_data) // audio_fps
55
+ elif audio_rep == "mfcc":
56
+ round_seconds_audio = audio_data.shape[0] // audio_fps
57
+ else:
58
+ round_seconds_audio = audio_data.shape[0] // audio_sr
59
+
60
+ if facial_data is not None:
61
+ round_seconds_facial = facial_data.shape[0] // pose_fps
62
+ logger.info(f"audio: {round_seconds_audio}s, pose: {round_seconds_skeleton}s, facial: {round_seconds_facial}s")
63
+ final_seconds = min(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
64
+ max_round = max(round_seconds_audio, round_seconds_skeleton, round_seconds_facial)
65
+ if final_seconds != max_round:
66
+ logger.warning(f"reduce to {final_seconds}s, ignore {max_round-final_seconds}s")
67
+ else:
68
+ logger.info(f"pose: {round_seconds_skeleton}s, audio: {round_seconds_audio}s")
69
+ final_seconds = min(round_seconds_audio, round_seconds_skeleton)
70
+ max_round = max(round_seconds_audio, round_seconds_skeleton)
71
+ if final_seconds != max_round:
72
+ logger.warning(f"reduce to {final_seconds}s, ignore {max_round-final_seconds}s")
73
+ else:
74
+ final_seconds = round_seconds_skeleton
75
+
76
+ return {
77
+ 'final_seconds': final_seconds
78
+ }
79
+
80
+ def calculate_clip_boundaries(round_seconds, clean_first_seconds, clean_final_seconds,
81
+ audio_fps, pose_fps):
82
+ """Calculate the boundaries for clip sampling."""
83
+ clip_s_t = clean_first_seconds
84
+ clip_e_t = round_seconds - clean_final_seconds
85
+
86
+ return {
87
+ 'clip_s_t': clip_s_t,
88
+ 'clip_e_t': clip_e_t,
89
+ 'clip_s_f_audio': audio_fps * clip_s_t,
90
+ 'clip_e_f_audio': clip_e_t * audio_fps,
91
+ 'clip_s_f_pose': clip_s_t * pose_fps,
92
+ 'clip_e_f_pose': clip_e_t * pose_fps
93
+ }
94
+
95
+ def process_data_with_ratio(ori_stride, ori_length, ratio, clip_info, args, is_test,
96
+ audio_data, pose_data, trans_data, trans_v_data,
97
+ shape_data, facial_data, word_data, vid_data,
98
+ emo_data, sem_data, audio_file,
99
+ lmdb_manager, n_out_samples):
100
+ """Process data with a specific training length ratio."""
101
+
102
+ if is_test and not args.test_clip:
103
+ cut_length = clip_info['clip_e_f_pose'] - clip_info['clip_s_f_pose']
104
+ args.stride = cut_length
105
+ max_length = cut_length
106
+ else:
107
+ args.stride = int(ratio * ori_stride)
108
+ cut_length = int(ori_length * ratio)
109
+
110
+ num_subdivision = math.floor(
111
+ (clip_info['clip_e_f_pose'] - clip_info['clip_s_f_pose'] - cut_length) / args.stride
112
+ ) + 1
113
+
114
+ logger.info(f"pose from frame {clip_info['clip_s_f_pose']} to {clip_info['clip_e_f_pose']}, length {cut_length}")
115
+ logger.info(f"{num_subdivision} clips is expected with stride {args.stride}")
116
+
117
+ if audio_data is not None:
118
+ audio_short_length = math.floor(cut_length / args.pose_fps * args.audio_fps)
119
+ logger.info(f"audio from frame {clip_info['clip_s_f_audio']} to {clip_info['clip_e_f_audio']}, length {audio_short_length}")
120
+
121
+ # Process subdivisions
122
+ filtered_counts = defaultdict(int)
123
+ for i in range(num_subdivision):
124
+ sample_data = extract_sample_data(
125
+ i, clip_info, cut_length, args,
126
+ audio_data, pose_data, trans_data, trans_v_data,
127
+ shape_data, facial_data, word_data, vid_data,
128
+ emo_data, sem_data, audio_file,
129
+ audio_short_length
130
+ )
131
+
132
+ if sample_data['pose'].any() is not None:
133
+ lmdb_manager.add_sample([
134
+ sample_data['pose'], sample_data['audio'], sample_data['facial'],
135
+ sample_data['shape'], sample_data['word'], sample_data['emo'],
136
+ sample_data['sem'], sample_data['vid'], sample_data['trans'],
137
+ sample_data['trans_v'], sample_data['audio_name']
138
+ ])
139
+ n_out_samples += 1
140
+
141
+ return {
142
+ 'filtered_counts': filtered_counts,
143
+ 'n_out_samples': n_out_samples
144
+ }
145
+
146
+ def extract_sample_data(idx, clip_info, cut_length, args,
147
+ audio_data, pose_data, trans_data, trans_v_data,
148
+ shape_data, facial_data, word_data, vid_data,
149
+ emo_data, sem_data, audio_file,
150
+ audio_short_length):
151
+ """Extract a single sample from the data."""
152
+ start_idx = clip_info['clip_s_f_pose'] + idx * args.stride
153
+ fin_idx = start_idx + cut_length
154
+
155
+ sample_data = {
156
+ 'pose': pose_data[start_idx:fin_idx],
157
+ 'trans': trans_data[start_idx:fin_idx],
158
+ 'trans_v': trans_v_data[start_idx:fin_idx],
159
+ 'shape': shape_data[start_idx:fin_idx],
160
+ 'facial': facial_data[start_idx:fin_idx] if args.facial_rep is not None else np.array([-1]),
161
+ 'word': word_data[start_idx:fin_idx] if args.word_rep is not None else np.array([-1]),
162
+ 'emo': emo_data[start_idx:fin_idx] if args.emo_rep is not None else np.array([-1]),
163
+ 'sem': sem_data[start_idx:fin_idx] if args.sem_rep is not None else np.array([-1]),
164
+ 'vid': vid_data[start_idx:fin_idx] if args.id_rep is not None else np.array([-1]),
165
+ 'audio_name': audio_file
166
+ }
167
+
168
+ if audio_data is not None:
169
+ audio_start = clip_info['clip_s_f_audio'] + math.floor(idx * args.stride * args.audio_fps / args.pose_fps)
170
+ audio_end = audio_start + audio_short_length
171
+ sample_data['audio'] = audio_data[audio_start:audio_end]
172
+ else:
173
+ sample_data['audio'] = np.array([-1])
174
+
175
+ return sample_data
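Worked example of the windowing above (numbers are illustrative): with pose_fps = 30, a usable range from clip_s_f_pose = 0 to clip_e_f_pose = 1800 frames (60 s), cut_length = 64 and stride = 20, process_data_with_ratio produces floor((1800 - 0 - 64) / 20) + 1 = 87 overlapping windows; each window's audio span is floor(64 / 30 * audio_fps) audio frames, starting at clip_s_f_audio + floor(i * 20 * audio_fps / 30) for the i-th window.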
dataloaders/utils/mis_features.py ADDED
@@ -0,0 +1,64 @@
1
+ # mis_features.py: semantic and emotion feature helpers
2
+ import pandas as pd
3
+ import numpy as np
4
+ from loguru import logger
5
+ import os
6
+
7
+ def process_semantic_data(sem_file, args, data, f_name):
8
+ """Process semantic representation data."""
9
+ logger.info(f"# ---- Building cache for Semantic {f_name} ---- #")
10
+
11
+ if not os.path.exists(sem_file):
12
+ logger.warning(f"# ---- file not found for Semantic {f_name} ---- #")
13
+ return None
14
+
15
+ sem_all = pd.read_csv(sem_file,
16
+ sep='\t',
17
+ names=["name", "start_time", "end_time", "duration", "score", "keywords"])
18
+
19
+ sem_data = []
20
+ for i in range(data['pose'].shape[0]):
21
+ current_time = i/args.pose_fps
22
+ found_score = False
23
+
24
+ for _, row in sem_all.iterrows():
25
+ if row['start_time'] <= current_time <= row['end_time']:
26
+ sem_data.append(row['score'])
27
+ found_score = True
28
+ break
29
+
30
+ if not found_score:
31
+ sem_data.append(0.0)
32
+
33
+ data['sem'] = np.array(sem_data)
34
+ return data
35
+
36
+ def process_emotion_data(f_name, data, args):
37
+ """Process emotion representation data."""
38
+ logger.info(f"# ---- Building cache for Emotion {f_name} ---- #")
39
+
40
+ rtype, start = int(f_name.split('_')[3]), int(f_name.split('_')[3])
41
+ if rtype in [0, 2, 4, 6]:
42
+ if 1 <= start <= 64:
43
+ score = 0
44
+ elif 65 <= start <= 72:
45
+ score = 1
46
+ elif 73 <= start <= 80:
47
+ score = 2
48
+ elif 81 <= start <= 86:
49
+ score = 3
50
+ elif 87 <= start <= 94:
51
+ score = 4
52
+ elif 95 <= start <= 102:
53
+ score = 5
54
+ elif 103 <= start <= 110:
55
+ score = 6
56
+ elif 111 <= start <= 118:
57
+ score = 7
58
+ else:
59
+ score = 0
60
+ else:
61
+ score = 0
62
+
63
+ data['emo'] = np.repeat(np.array(score).reshape(1, 1), data['pose'].shape[0], axis=0)
64
+ return data
dataloaders/utils/motion_rep_transfer.py ADDED
@@ -0,0 +1,236 @@
1
+ import smplx
2
+ import torch
3
+ import numpy as np
4
+ from . import rotation_conversions as rc
5
+ import os
6
+ import wget
7
+
8
+ download_path = "./datasets/hub"
9
+ smplx_model_dir = os.path.join(download_path, "smplx_models", "smplx")
10
+ if not os.path.exists(smplx_model_dir):
11
+ smplx_model_file_path = os.path.join(smplx_model_dir, "SMPLX_NEUTRAL_2020.npz")
12
+ os.makedirs(smplx_model_dir, exist_ok=True)
13
+ if not os.path.exists(smplx_model_file_path):
14
+ print(f"Downloading {smplx_model_file_path}")
15
+ wget.download(
16
+ "https://huggingface.co/spaces/H-Liu1997/EMAGE/resolve/main/EMAGE/smplx_models/smplx/SMPLX_NEUTRAL_2020.npz",
17
+ smplx_model_file_path,
18
+ )
19
+
20
+ smplx_model = smplx.create(
21
+ "./datasets/hub/smplx_models/",
22
+ model_type='smplx',
23
+ gender='NEUTRAL_2020',
24
+ use_face_contour=False,
25
+ num_betas=300,
26
+ num_expression_coeffs=100,
27
+ ext='npz',
28
+ use_pca=False,
29
+ ).eval()
30
+
31
+ def get_motion_rep_tensor(motion_tensor, pose_fps=30, device="cuda", betas=None):
32
+ global smplx_model
33
+ smplx_model = smplx_model.to(device)
34
+ bs, n, _ = motion_tensor.shape
35
+ motion_tensor = motion_tensor.float().to(device)
36
+ motion_tensor_reshaped = motion_tensor.reshape(bs * n, 165)
37
+ betas = torch.zeros(n, 300, device=device) if betas is None else betas.to(device).unsqueeze(0).repeat(n, 1)
38
+ output = smplx_model(
39
+ betas=torch.zeros(bs * n, 300, device=device),
40
+ transl=torch.zeros(bs * n, 3, device=device),
41
+ expression=torch.zeros(bs * n, 100, device=device),
42
+ jaw_pose=torch.zeros(bs * n, 3, device=device),
43
+ global_orient=torch.zeros(bs * n, 3, device=device),
44
+ body_pose=motion_tensor_reshaped[:, 3:21 * 3 + 3],
45
+ left_hand_pose=motion_tensor_reshaped[:, 25 * 3:40 * 3],
46
+ right_hand_pose=motion_tensor_reshaped[:, 40 * 3:55 * 3],
47
+ return_joints=True,
48
+ leye_pose=torch.zeros(bs * n, 3, device=device),
49
+ reye_pose=torch.zeros(bs * n, 3, device=device),
50
+ )
51
+ joints = output['joints'].reshape(bs, n, 127, 3)[:, :, :55, :]
52
+ dt = 1 / pose_fps
53
+ init_vel = (joints[:, 1:2] - joints[:, 0:1]) / dt
54
+ middle_vel = (joints[:, 2:] - joints[:, :-2]) / (2 * dt)
55
+ final_vel = (joints[:, -1:] - joints[:, -2:-1]) / dt
56
+ vel = torch.cat([init_vel, middle_vel, final_vel], dim=1)
57
+ position = joints
58
+ rot_matrices = rc.axis_angle_to_matrix(motion_tensor.reshape(bs, n, 55, 3))
59
+ rot6d = rc.matrix_to_rotation_6d(rot_matrices).reshape(bs, n, 55, 6)
60
+ init_vel_ang = (motion_tensor[:, 1:2] - motion_tensor[:, 0:1]) / dt
61
+ middle_vel_ang = (motion_tensor[:, 2:] - motion_tensor[:, :-2]) / (2 * dt)
62
+ final_vel_ang = (motion_tensor[:, -1:] - motion_tensor[:, -2:-1]) / dt
63
+ angular_velocity = torch.cat([init_vel_ang, middle_vel_ang, final_vel_ang], dim=1).reshape(bs, n, 55, 3)
64
+ rep15d = torch.cat([position, vel, rot6d, angular_velocity], dim=3).reshape(bs, n, 55 * 15)
65
+ return {
66
+ "position": position,
67
+ "velocity": vel,
68
+ "rotation": rot6d,
69
+ "axis_angle": motion_tensor,
70
+ "angular_velocity": angular_velocity,
71
+ "rep15d": rep15d,
72
+ }
73
+
74
+ def get_motion_rep_numpy(poses_np, pose_fps=30, device="cuda", expressions=None, expression_only=False, betas=None):
75
+ # motion["poses"] is expected to be numpy array of shape (n, 165)
76
+ # (n, 55*3), axis-angle for 55 joints
77
+ global smplx_model
78
+ smplx_model = smplx_model.to(device)
79
+ n = poses_np.shape[0]
80
+
81
+ # Convert numpy to torch tensor for SMPL-X forward pass
82
+ poses_ts = torch.from_numpy(poses_np).float().to(device).unsqueeze(0) # (1, n, 165)
83
+ poses_ts_reshaped = poses_ts.reshape(-1, 165) # (n, 165)
84
+ betas = torch.zeros(n, 300, device=device) if betas is None else torch.from_numpy(betas).to(device).unsqueeze(0).repeat(n, 1)
85
+ if expressions is not None and expression_only:
86
+ # print("xx")
87
+ expressions = torch.from_numpy(expressions).float().to(device)
88
+ output = smplx_model(
89
+ betas=betas,
90
+ transl=torch.zeros(n, 3, device=device),
91
+ expression=expressions,
92
+ jaw_pose=poses_ts_reshaped[:, 22 * 3:23 * 3],
93
+ global_orient=torch.zeros(n, 3, device=device),
94
+ body_pose=torch.zeros(n, 21*3, device=device),
95
+ left_hand_pose=torch.zeros(n, 15*3, device=device),
96
+ right_hand_pose=torch.zeros(n, 15*3, device=device),
97
+ return_joints=True,
98
+ leye_pose=torch.zeros(n, 3, device=device),
99
+ reye_pose=torch.zeros(n, 3, device=device),
100
+ )
101
+ joints = output["vertices"].detach().cpu().numpy().reshape(n, -1)
102
+ return {"vertices": joints}
103
+
104
+ # Run smplx model to get joints
105
+ output = smplx_model(
106
+ betas=betas,
107
+ transl=torch.zeros(n, 3, device=device),
108
+ expression=torch.zeros(n, 100, device=device),
109
+ jaw_pose=torch.zeros(n, 3, device=device),
110
+ global_orient=torch.zeros(n, 3, device=device),
111
+ body_pose=poses_ts_reshaped[:, 3:21 * 3 + 3],
112
+ left_hand_pose=poses_ts_reshaped[:, 25 * 3:40 * 3],
113
+ right_hand_pose=poses_ts_reshaped[:, 40 * 3:55 * 3],
114
+ return_joints=True,
115
+ leye_pose=torch.zeros(n, 3, device=device),
116
+ reye_pose=torch.zeros(n, 3, device=device),
117
+ )
118
+ joints = output["joints"].detach().cpu().numpy().reshape(n, 127, 3)[:, :55, :]
119
+
120
+ dt = 1 / pose_fps
121
+ # Compute linear velocity
122
+ init_vel = (joints[1:2] - joints[0:1]) / dt
123
+ middle_vel = (joints[2:] - joints[:-2]) / (2 * dt)
124
+ final_vel = (joints[-1:] - joints[-2:-1]) / dt
125
+ vel = np.concatenate([init_vel, middle_vel, final_vel], axis=0)
126
+
127
+ position = joints
128
+
129
+ # Compute rotation 6D from axis-angle
130
+ poses_ts_reshaped_aa = poses_ts.reshape(1, n, 55, 3)
131
+ rot_matrices = rc.axis_angle_to_matrix(poses_ts_reshaped_aa)[0] # (n, 55, 3, 3)
132
+ rot6d = rc.matrix_to_rotation_6d(rot_matrices).reshape(n, 55, 6).cpu().numpy()
133
+
134
+ # Compute angular velocity
135
+ init_vel_ang = (poses_np[1:2] - poses_np[0:1]) / dt
136
+ middle_vel_ang = (poses_np[2:] - poses_np[:-2]) / (2 * dt)
137
+ final_vel_ang = (poses_np[-1:] - poses_np[-2:-1]) / dt
138
+ angular_velocity = np.concatenate([init_vel_ang, middle_vel_ang, final_vel_ang], axis=0).reshape(n, 55, 3)
139
+
140
+ # rep15d: position(55*3), vel(55*3), rot6d(55*6), angular_velocity(55*3) => total 55*(3+3+6+3)=55*15
141
+ rep15d = np.concatenate([position, vel, rot6d, angular_velocity], axis=2).reshape(n, 55 * 15)
142
+
143
+ return {
144
+ "position": position,
145
+ "velocity": vel,
146
+ "rotation": rot6d,
147
+ "axis_angle": poses_np,
148
+ "angular_velocity": angular_velocity,
149
+ "rep15d": rep15d,
150
+ }
151
+
152
+ def process_smplx_motion(pose_file, smplx_model, pose_fps, facial_rep=None):
153
+ """Process SMPLX pose and facial data together."""
154
+ pose_data = np.load(pose_file, allow_pickle=True)
155
+ stride = int(30/pose_fps)
156
+
157
+ # Extract pose and facial data with same stride
158
+ pose_frames = pose_data["poses"][::stride]
159
+ facial_frames = pose_data["expressions"][::stride] if facial_rep is not None else None
160
+
161
+ # Process translations
162
+ trans = pose_data["trans"][::stride]
163
+ trans[:,0] = trans[:,0] - trans[0,0]
164
+ trans[:,2] = trans[:,2] - trans[0,2]
165
+
166
+ # Calculate translation velocities
167
+ trans_v = np.zeros_like(trans)
168
+ trans_v[1:,0] = trans[1:,0] - trans[:-1,0]
169
+ trans_v[0,0] = trans_v[1,0]
170
+ trans_v[1:,2] = trans[1:,2] - trans[:-1,2]
171
+ trans_v[0,2] = trans_v[1,2]
172
+ trans_v[:,1] = trans[:,1]
173
+
174
+ # Process shape data
175
+ shape = np.repeat(pose_data["betas"].reshape(1, 300), pose_frames.shape[0], axis=0)
176
+
177
+ # # Calculate contacts
178
+ # contacts = calculate_foot_contacts(pose_data, smplx_model)
179
+
180
+ # if contacts is not None:
181
+ # pose_data = np.concatenate([pose_data, contacts], axis=1)
182
+
183
+ return {
184
+ 'pose': pose_frames,
185
+ 'trans': trans,
186
+ 'trans_v': trans_v,
187
+ 'shape': shape,
188
+ 'facial': facial_frames if facial_frames is not None else np.array([-1])
189
+ }
190
+
191
+ def calculate_foot_contacts(pose_data, smplx_model):
192
+ """Calculate foot contacts from pose data."""
193
+ max_length = 128
194
+ all_tensor = []
195
+ n = pose_data["poses"].shape[0]
196
+
197
+ # Process in batches
198
+ for i in range(n // max_length):
199
+ joints = process_joints_batch(pose_data, i, max_length, smplx_model)
200
+ all_tensor.append(joints)
201
+
202
+ # Process remaining frames
203
+ if n % max_length != 0:
204
+ r = n % max_length
205
+ joints = process_joints_batch(pose_data, n // max_length, r, smplx_model, remainder=True)
206
+ all_tensor.append(joints)
207
+
208
+ # Calculate velocities and contacts
209
+ joints = torch.cat(all_tensor, axis=0)
210
+ feetv = torch.zeros(joints.shape[1], joints.shape[0])
211
+ joints = joints.permute(1, 0, 2)
212
+ feetv[:, :-1] = (joints[:, 1:] - joints[:, :-1]).norm(dim=-1)
213
+ contacts = (feetv < 0.01).numpy().astype(float)
214
+
215
+ return contacts.transpose(1, 0)
216
+
217
+ def process_joints_batch(pose_data, batch_idx, batch_size, smplx_model, remainder=False):
218
+ """Process a batch of joints for contact calculation."""
219
+ start_idx = batch_idx * batch_size
220
+ end_idx = start_idx + batch_size
221
+
222
+ with torch.no_grad():
223
+ return smplx_model(
224
+ betas=torch.from_numpy(pose_data["betas"]).cuda().float().repeat(batch_size, 1),
225
+ transl=torch.from_numpy(pose_data["trans"][start_idx:end_idx]).cuda().float(),
226
+ expression=torch.from_numpy(pose_data["expressions"][start_idx:end_idx]).cuda().float(),
227
+ jaw_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 66:69]).cuda().float(),
228
+ global_orient=torch.from_numpy(pose_data["poses"][start_idx:end_idx, :3]).cuda().float(),
229
+ body_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 3:21*3+3]).cuda().float(),
230
+ left_hand_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 25*3:40*3]).cuda().float(),
231
+ right_hand_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 40*3:55*3]).cuda().float(),
232
+ leye_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 69:72]).cuda().float(),
233
+ reye_pose=torch.from_numpy(pose_data["poses"][start_idx:end_idx, 72:75]).cuda().float(),
234
+ return_verts=True,
235
+ return_joints=True
236
+ )['joints'][:, (7,8,10,11), :].reshape(batch_size, 4, 3).cpu()
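Shape-level sketch for get_motion_rep_numpy (assumes the SMPL-X model download at module import succeeded; the zero pose array is purely illustrative, real inputs are the (n, 165) axis-angle arrays loaded from the .npz files handled by process_smplx_motion above):

    import numpy as np
    poses = np.zeros((16, 165), dtype=np.float32)   # 16 frames, 55 joints * 3 axis-angle values
    rep = get_motion_rep_numpy(poses, pose_fps=30, device="cpu")
    print(rep["rep15d"].shape)                      # (16, 825): 55 joints * (3 pos + 3 vel + 6 rot6d + 3 ang. vel)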
dataloaders/utils/other_tools.py ADDED
@@ -0,0 +1,748 @@
1
+ import os
2
+ import numpy as np
3
+ import random
4
+ import torch
5
+ import shutil
6
+ import csv
7
+ import pprint
8
+ import pandas as pd
9
+ from loguru import logger
10
+ from collections import OrderedDict
11
+ import matplotlib.pyplot as plt
12
+ import pickle
13
+ import time
14
+ import lmdb
15
+ import numpy as np
16
+
17
+ def adjust_array(x, k):
18
+ len_x = len(x)
19
+ len_k = len(k)
20
+
21
+ # If x is shorter than k, pad with zeros
22
+ if len_x < len_k:
23
+ return np.pad(x, (0, len_k - len_x), 'constant')
24
+
25
+ # If x is longer than k, truncate x
26
+ elif len_x > len_k:
27
+ return x[:len_k]
28
+
29
+ # If both are of same length
30
+ else:
31
+ return x
32
+
33
+ def onset_to_frame(onset_times, audio_length, fps):
34
+ # Calculate total number of frames for the given audio length
35
+ total_frames = int(audio_length * fps)
36
+
37
+ # Create an array of zeros of shape (total_frames,)
38
+ frame_array = np.zeros(total_frames, dtype=np.int32)
39
+
40
+ # For each onset time, calculate the frame number and set it to 1
41
+ for onset in onset_times:
42
+ frame_num = int(onset * fps)
43
+ # Check if the frame number is within the array bounds
44
+ if 0 <= frame_num < total_frames:
45
+ frame_array[frame_num] = 1
46
+
47
+ return frame_array
48
+
49
+ def smooth_animations(animation1, animation2, blend_frames):
50
+ """
51
+ Smoothly transition between two animation clips using linear interpolation.
52
+
53
+ Parameters:
54
+ - animation1: The first animation clip, a numpy array of shape [n, k].
55
+ - animation2: The second animation clip, a numpy array of shape [n, k].
56
+ - blend_frames: Number of frames over which to blend the two animations.
57
+
58
+ Returns:
59
+ - A smoothly blended animation clip of shape [2n, k].
60
+ """
61
+
62
+ # Ensure blend_frames doesn't exceed the length of either animation
63
+ blend_frames = min(blend_frames, len(animation1), len(animation2))
64
+
65
+ # Extract overlapping sections
66
+ overlap_a1 = animation1[-blend_frames:-blend_frames+1, :]
67
+ overlap_a2 = animation2[blend_frames-1:blend_frames, :]
68
+
69
+ # Create blend weights for linear interpolation
70
+ alpha = np.linspace(0, 1, 2 * blend_frames).reshape(-1, 1)
71
+
72
+ # Linearly interpolate between overlapping sections
73
+ blended_overlap = overlap_a1 * (1 - alpha) + overlap_a2 * alpha
74
+
75
+ # Extend the animations to form the result with 2n frames
76
+ if blend_frames == len(animation1) and blend_frames == len(animation2):
77
+ result = blended_overlap
78
+ else:
79
+ before_blend = animation1[:-blend_frames]
80
+ after_blend = animation2[blend_frames:]
81
+ result = np.vstack((before_blend, blended_overlap, after_blend))
82
+ return result
83
+
84
+
85
+ def interpolate_sequence(quaternions):
86
+ bs, n, j, _ = quaternions.shape
87
+ new_n = 2 * n
88
+ new_quaternions = torch.zeros((bs, new_n, j, 4), device=quaternions.device, dtype=quaternions.dtype)
89
+
90
+ for i in range(n):
91
+ q1 = quaternions[:, i, :, :]
92
+ new_quaternions[:, 2*i, :, :] = q1
93
+
94
+ if i < n - 1:
95
+ q2 = quaternions[:, i + 1, :, :]
96
+ new_quaternions[:, 2*i + 1, :, :] = slerp(q1, q2, 0.5)
97
+ else:
98
+ # For the last point, duplicate the value
99
+ new_quaternions[:, 2*i + 1, :, :] = q1
100
+
101
+ return new_quaternions
102
+
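A quick shape check for the 2x upsampler above, assuming unit quaternions as input; the sizes are arbitrary:

import torch

quats = torch.randn(1, 8, 55, 4)
quats = quats / torch.norm(quats, dim=-1, keepdim=True)   # normalise to unit quaternions
upsampled = interpolate_sequence(quats)
print(upsampled.shape)   # (1, 16, 55, 4): every odd-indexed frame is a slerp midpoint (the last duplicates the final frame)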
103
+ def quaternion_multiply(q1, q2):
104
+ w1, x1, y1, z1 = q1
105
+ w2, x2, y2, z2 = q2
106
+ w = w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2
107
+ x = w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2
108
+ y = w1 * y2 + y1 * w2 + z1 * x2 - x1 * z2
109
+ z = w1 * z2 + z1 * w2 + x1 * y2 - y1 * x2
110
+ return w, x, y, z
111
+
112
+ def quaternion_conjugate(q):
113
+ w, x, y, z = q
114
+ return (w, -x, -y, -z)
115
+
116
+ def slerp(q1, q2, t):
117
+ dot = torch.sum(q1 * q2, dim=-1, keepdim=True)
118
+
119
+ flip = (dot < 0).float()
120
+ q2 = (1 - flip * 2) * q2
121
+ dot = dot * (1 - flip * 2)
122
+
123
+ DOT_THRESHOLD = 0.9995
124
+ mask = (dot > DOT_THRESHOLD).float()
125
+
126
+ theta_0 = torch.acos(torch.clamp(dot, -1.0, 1.0))  # clamp guards acos against floating-point drift
127
+ theta = theta_0 * t
128
+
129
+ q3 = q2 - q1 * dot
130
+ q3 = q3 / (torch.norm(q3, dim=-1, keepdim=True) + 1e-8)  # epsilon avoids a 0/0 NaN when q1 and q2 coincide
131
+
132
+ interpolated = (torch.cos(theta) * q1 + torch.sin(theta) * q3)
133
+
134
+ return mask * (q1 + t * (q2 - q1)) + (1 - mask) * interpolated
135
+
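A small sanity check for slerp; the quaternions below are hand-picked (w, x, y, z) values, not data from the repository:

import torch

q1 = torch.tensor([[1.0, 0.0, 0.0, 0.0]])            # identity rotation
q2 = torch.tensor([[0.7071, 0.7071, 0.0, 0.0]])      # 90 degrees about x
q_half = slerp(q1, q2, 0.5)
print(q_half)                                         # ~[0.9239, 0.3827, 0, 0], i.e. 45 degrees about x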
136
+ def estimate_linear_velocity(data_seq, dt):
137
+ '''
138
+ Given some batched data sequences of T timesteps in the shape (B, T, ...), estimates
139
+ the velocity for the middle T-2 steps using a second order central difference scheme.
140
+ The first and last frames are estimated with forward and backward first-order
141
+ differences, respectively.
142
+ - dt : time step between consecutive frames
143
+ '''
144
+ # first steps is forward diff (t+1 - t) / dt
145
+ init_vel = (data_seq[:, 1:2] - data_seq[:, :1]) / dt
146
+ # middle steps are second order (t+1 - t-1) / 2dt
147
+ middle_vel = (data_seq[:, 2:] - data_seq[:, 0:-2]) / (2 * dt)
148
+ # last step is backward diff (t - t-1) / dt
149
+ final_vel = (data_seq[:, -1:] - data_seq[:, -2:-1]) / dt
150
+
151
+ vel_seq = torch.cat([init_vel, middle_vel, final_vel], dim=1)
152
+ return vel_seq
153
+
154
+
155
+ def estimate_angular_velocity(rot_seq, dt):
156
+ '''
157
+ Given a batch of sequences of T rotation matrices, estimates angular velocity at T-2 steps.
158
+ Input sequence should be of shape (B, T, ..., 3, 3)
159
+ '''
160
+ # see https://en.wikipedia.org/wiki/Angular_velocity#Calculation_from_the_orientation_matrix
161
+ dRdt = estimate_linear_velocity(rot_seq, dt)
162
+ R = rot_seq
163
+ RT = R.transpose(-1, -2)
164
+ # compute skew-symmetric angular velocity tensor
165
+ w_mat = torch.matmul(dRdt, RT)
166
+ # pull out the angular velocity vector by averaging the two antisymmetric (skew) entries
167
+ w_x = (-w_mat[..., 1, 2] + w_mat[..., 2, 1]) / 2.0
168
+ w_y = (w_mat[..., 0, 2] - w_mat[..., 2, 0]) / 2.0
169
+ w_z = (-w_mat[..., 0, 1] + w_mat[..., 1, 0]) / 2.0
170
+ w = torch.stack([w_x, w_y, w_z], axis=-1)
171
+ return w
172
+
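A quick consistency check for the two velocity estimators above; the batch size, joint count, and frame rate are placeholders:

import torch

B, T, J = 2, 10, 55
positions = torch.randn(B, T, J, 3)                  # joint positions
rotations = torch.eye(3).repeat(B, T, J, 1, 1)       # constant orientation -> zero angular velocity
lin_vel = estimate_linear_velocity(positions, dt=1.0 / 30)
ang_vel = estimate_angular_velocity(rotations, dt=1.0 / 30)
print(lin_vel.shape, ang_vel.shape)                  # (2, 10, 55, 3) (2, 10, 55, 3)
print(ang_vel.abs().max())                           # 0 for a constant rotation sequence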
173
+ import matplotlib.image as mpimg
174
+ from io import BytesIO
175
+
176
+ def image_from_bytes(image_bytes):
177
+ return mpimg.imread(BytesIO(image_bytes), format='PNG')
178
+
179
+
180
+
181
+ def process_frame(i, vertices_all, vertices1_all, faces, output_dir, use_matplotlib, filenames, camera_params, camera_params1):
182
+ import matplotlib
183
+ matplotlib.use('Agg')
184
+ import matplotlib.pyplot as plt
185
+ import trimesh
186
+ from pyvirtualdisplay import Display  # noqa: F401  (the virtual display itself is started in render_one_sequence)
187
+
188
+ vertices = vertices_all[i]
189
+ vertices1 = vertices1_all[i]
190
+ filename = f"{output_dir}frame_{i}.png"
191
+ filenames.append(filename)
192
+ if i%100 == 0:
193
+ print('processed', i, 'frames')
194
+ #time_s = time.time()
195
+ #print(vertices.shape)
196
+ if use_matplotlib:
197
+ fig = plt.figure(figsize=(20, 10))
198
+ ax = fig.add_subplot(121, projection="3d")
199
+ fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
200
+ #ax.view_init(elev=0, azim=90)
201
+ x = vertices[:, 0]
202
+ y = vertices[:, 1]
203
+ z = vertices[:, 2]
204
+ ax.scatter(x, y, z, s=0.5)
205
+ ax.set_xlim([-1.0, 1.0])
206
+ ax.set_ylim([-0.5, 1.5])  # height
207
+ ax.set_zlim([-0, 2])#depth
208
+ ax.set_box_aspect((1,1,1))
209
+ else:
210
+ mesh = trimesh.Trimesh(vertices, faces)
211
+ scene = mesh.scene()
212
+ scene.camera.fov = camera_params['fov']
213
+ scene.camera.resolution = camera_params['resolution']
214
+ scene.camera.z_near = camera_params['z_near']
215
+ scene.camera.z_far = camera_params['z_far']
216
+ scene.graph[scene.camera.name] = camera_params['transform']
217
+ fig, ax =plt.subplots(1,2, figsize=(16, 6))
218
+ image = scene.save_image(resolution=[640, 480], visible=False)
219
+ im0 = ax[0].imshow(image_from_bytes(image))
220
+ ax[0].axis('off')
221
+
222
+ if use_matplotlib:
223
+ ax2 = fig.add_subplot(122, projection="3d")
224
+ ax2.set_box_aspect((1,1,1))
225
+ fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
226
+ x1 = vertices1[:, 0]
227
+ y1 = vertices1[:, 1]
228
+ z1 = vertices1[:, 2]
229
+ ax2.scatter(x1, y1, z1, s=0.5)
230
+ ax2.set_xlim([-1.0, 1.0])
231
+ ax2.set_ylim([-0.5, 1.5])  # height
232
+ ax2.set_zlim([-0, 2])
233
+ plt.savefig(filename, bbox_inches='tight')
234
+ plt.close(fig)
235
+ else:
236
+ mesh1 = trimesh.Trimesh(vertices1, faces)
237
+ scene1 = mesh1.scene()
238
+ scene1.camera.fov = camera_params1['fov']
239
+ scene1.camera.resolution = camera_params1['resolution']
240
+ scene1.camera.z_near = camera_params1['z_near']
241
+ scene1.camera.z_far = camera_params1['z_far']
242
+ scene1.graph[scene1.camera.name] = camera_params1['transform']
243
+ image1 = scene1.save_image(resolution=[640, 480], visible=False)
244
+ im1 = ax[1].imshow(image_from_bytes(image1))
245
+ ax[1].axis('off')
246
+ plt.savefig(filename, bbox_inches='tight')
247
+ plt.close(fig)
248
+
249
+ def generate_images(frames, vertices_all, vertices1_all, faces, output_dir, use_matplotlib, filenames):
250
+ import multiprocessing
251
+ import trimesh
252
+ num_cores = multiprocessing.cpu_count() # This will get the number of cores on your machine.
253
+ mesh = trimesh.Trimesh(vertices_all[0], faces)
254
+ scene = mesh.scene()
255
+ camera_params = {
256
+ 'fov': scene.camera.fov,
257
+ 'resolution': scene.camera.resolution,
258
+ 'focal': scene.camera.focal,
259
+ 'z_near': scene.camera.z_near,
260
+ "z_far": scene.camera.z_far,
261
+ 'transform': scene.graph[scene.camera.name][0]
262
+ }
263
+ mesh1 = trimesh.Trimesh(vertices1_all[0], faces)
264
+ scene1 = mesh1.scene()
265
+ camera_params1 = {
266
+ 'fov': scene1.camera.fov,
267
+ 'resolution': scene1.camera.resolution,
268
+ 'focal': scene1.camera.focal,
269
+ 'z_near': scene1.camera.z_near,
270
+ "z_far": scene1.camera.z_far,
271
+ 'transform': scene1.graph[scene1.camera.name][0]
272
+ }
273
+ # Use a Pool to manage the processes
274
+ # print(num_cores)
275
+ progress = multiprocessing.Value('i', 0)
276
+ lock = multiprocessing.Lock()
277
+ with multiprocessing.Pool(num_cores) as pool:
278
+ pool.starmap(process_frame, [(i, vertices_all, vertices1_all, faces, output_dir, use_matplotlib, filenames, camera_params, camera_params1) for i in range(frames)])
279
+
280
+ def render_one_sequence(
281
+ res_npz_path,
282
+ gt_npz_path,
283
+ output_dir,
284
+ audio_path,
285
+ model_folder="/data/datasets/smplx_models/",
286
+ model_type='smplx',
287
+ gender='NEUTRAL_2020',
288
+ ext='npz',
289
+ num_betas=300,
290
+ num_expression_coeffs=100,
291
+ use_face_contour=False,
292
+ use_matplotlib=False,
293
+ args=None):
294
+ import smplx
295
+ import matplotlib.pyplot as plt
296
+ import imageio
297
+ from tqdm import tqdm
298
+ import os
299
+ import numpy as np
300
+ import torch
301
+ import moviepy.editor as mp
302
+ import librosa
303
+
304
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
305
+ model = smplx.create(
306
+ model_folder,
307
+ model_type=model_type,
308
+ gender=gender,
309
+ use_face_contour=use_face_contour,
310
+ num_betas=num_betas,
311
+ num_expression_coeffs=num_expression_coeffs,
312
+ ext=ext,
313
+ use_pca=False,
314
+ ).to(device)
315
+
316
+ #data_npz = np.load(f"{output_dir}{res_npz_path}.npz")
317
+ data_np_body = np.load(res_npz_path, allow_pickle=True)
318
+ gt_np_body = np.load(gt_npz_path, allow_pickle=True)
319
+
320
+ if not os.path.exists(output_dir): os.makedirs(output_dir)
321
+ filenames = []
322
+ if not use_matplotlib:
323
+ import trimesh
324
+ #import pyrender
325
+ from pyvirtualdisplay import Display
326
+ display = Display(visible=0, size=(640, 480))
327
+ display.start()
328
+ faces = np.load(f"{model_folder}/smplx/SMPLX_NEUTRAL_2020.npz", allow_pickle=True)["f"]
329
+ seconds = 1
330
+ #data_npz["jaw_pose"].shape[0]
331
+ n = data_np_body["poses"].shape[0]
332
+ beta = torch.from_numpy(data_np_body["betas"]).to(torch.float32).unsqueeze(0).to(device)
333
+ beta = beta.repeat(n, 1)
334
+ expression = torch.from_numpy(data_np_body["expressions"][:n]).to(torch.float32).to(device)
335
+ jaw_pose = torch.from_numpy(data_np_body["poses"][:n, 66:69]).to(torch.float32).to(device)
336
+ pose = torch.from_numpy(data_np_body["poses"][:n]).to(torch.float32).to(device)
337
+ transl = torch.from_numpy(data_np_body["trans"][:n]).to(torch.float32).to(device)
338
+ # print(beta.shape, expression.shape, jaw_pose.shape, pose.shape, transl.shape, pose[:,:3].shape)
339
+ output = model(betas=beta, transl=transl, expression=expression, jaw_pose=jaw_pose,
340
+ global_orient=pose[:,:3], body_pose=pose[:,3:21*3+3], left_hand_pose=pose[:,25*3:40*3], right_hand_pose=pose[:,40*3:55*3],
341
+ leye_pose=pose[:, 69:72],
342
+ reye_pose=pose[:, 72:75],
343
+ return_verts=True)
344
+ vertices_all = output["vertices"].cpu().detach().numpy()
345
+
346
+ beta1 = torch.from_numpy(gt_np_body["betas"]).to(torch.float32).unsqueeze(0).to(device)
347
+ expression1 = torch.from_numpy(gt_np_body["expressions"][:n]).to(torch.float32).to(device)
348
+ jaw_pose1 = torch.from_numpy(gt_np_body["poses"][:n,66:69]).to(torch.float32).to(device)
349
+ pose1 = torch.from_numpy(gt_np_body["poses"][:n]).to(torch.float32).to(device)
350
+ transl1 = torch.from_numpy(gt_np_body["trans"][:n]).to(torch.float32).to(device)
351
+ output1 = model(betas=beta1, transl=transl1, expression=expression1, jaw_pose=jaw_pose1, global_orient=pose1[:,:3], body_pose=pose1[:,3:21*3+3], left_hand_pose=pose1[:,25*3:40*3], right_hand_pose=pose1[:,40*3:55*3],
352
+ leye_pose=pose1[:, 69:72],
353
+ reye_pose=pose1[:, 72:75],return_verts=True)
354
+ vertices1_all = output1["vertices"].cpu().detach().numpy()
355
+ if args.debug:
356
+ seconds = 1
357
+ else:
358
+ seconds = vertices_all.shape[0]//30
359
+ # camera_settings = None
360
+ time_s = time.time()
361
+ generate_images(int(seconds*30), vertices_all, vertices1_all, faces, output_dir, use_matplotlib, filenames)
362
+ filenames = [f"{output_dir}frame_{i}.png" for i in range(int(seconds*30))]
363
+ # print(time.time()-time_s)
364
+ # for i in tqdm(range(seconds*30)):
365
+ # vertices = vertices_all[i]
366
+ # vertices1 = vertices1_all[i]
367
+ # filename = f"{output_dir}frame_{i}.png"
368
+ # filenames.append(filename)
369
+ # #time_s = time.time()
370
+ # #print(vertices.shape)
371
+ # if use_matplotlib:
372
+ # fig = plt.figure(figsize=(20, 10))
373
+ # ax = fig.add_subplot(121, projection="3d")
374
+ # fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
375
+ # #ax.view_init(elev=0, azim=90)
376
+ # x = vertices[:, 0]
377
+ # y = vertices[:, 1]
378
+ # z = vertices[:, 2]
379
+ # ax.scatter(x, y, z, s=0.5)
380
+ # ax.set_xlim([-1.0, 1.0])
381
+ # ax.set_ylim([-0.5, 1.5])#heigth
382
+ # ax.set_zlim([-0, 2])#depth
383
+ # ax.set_box_aspect((1,1,1))
384
+ # else:
385
+ # mesh = trimesh.Trimesh(vertices, faces)
386
+ # if i == 0:
387
+ # scene = mesh.scene()
388
+ # camera_params = {
389
+ # 'fov': scene.camera.fov,
390
+ # 'resolution': scene.camera.resolution,
391
+ # 'focal': scene.camera.focal,
392
+ # 'z_near': scene.camera.z_near,
393
+ # "z_far": scene.camera.z_far,
394
+ # 'transform': scene.graph[scene.camera.name][0]
395
+ # }
396
+ # else:
397
+ # scene = mesh.scene()
398
+ # scene.camera.fov = camera_params['fov']
399
+ # scene.camera.resolution = camera_params['resolution']
400
+ # scene.camera.z_near = camera_params['z_near']
401
+ # scene.camera.z_far = camera_params['z_far']
402
+ # scene.graph[scene.camera.name] = camera_params['transform']
403
+ # fig, ax =plt.subplots(1,2, figsize=(16, 6))
404
+ # image = scene.save_image(resolution=[640, 480], visible=False)
405
+ # #print((time.time()-time_s))
406
+ # im0 = ax[0].imshow(image_from_bytes(image))
407
+ # ax[0].axis('off')
408
+
409
+ # # beta1 = torch.from_numpy(gt_np_body["betas"]).to(torch.float32).unsqueeze(0)
410
+ # # expression1 = torch.from_numpy(gt_np_body["expressions"][i]).to(torch.float32).unsqueeze(0)
411
+ # # jaw_pose1 = torch.from_numpy(gt_np_body["poses"][i][66:69]).to(torch.float32).unsqueeze(0)
412
+ # # pose1 = torch.from_numpy(gt_np_body["poses"][i]).to(torch.float32).unsqueeze(0)
413
+ # # transl1 = torch.from_numpy(gt_np_body["trans"][i]).to(torch.float32).unsqueeze(0)
414
+ # # #print(beta.shape, expression.shape, jaw_pose.shape, pose.shape, transl.shape)global_orient=pose[0:1,:3],
415
+ # # output1 = model(betas=beta1, transl=transl1, expression=expression1, jaw_pose=jaw_pose1, global_orient=pose1[0:1,:3], body_pose=pose1[0:1,3:21*3+3], left_hand_pose=pose1[0:1,25*3:40*3], right_hand_pose=pose1[0:1,40*3:55*3], return_verts=True)
416
+ # # vertices1 = output1["vertices"].cpu().detach().numpy()[0]
417
+
418
+ # if use_matplotlib:
419
+ # ax2 = fig.add_subplot(122, projection="3d")
420
+ # ax2.set_box_aspect((1,1,1))
421
+ # fig.subplots_adjust(left=0, right=1, bottom=0, top=1)
422
+ # #ax2.view_init(elev=0, azim=90)
423
+ # x1 = vertices1[:, 0]
424
+ # y1 = vertices1[:, 1]
425
+ # z1 = vertices1[:, 2]
426
+ # ax2.scatter(x1, y1, z1, s=0.5)
427
+ # ax2.set_xlim([-1.0, 1.0])
428
+ # ax2.set_ylim([-0.5, 1.5])#heigth
429
+ # ax2.set_zlim([-0, 2])
430
+ # plt.savefig(filename, bbox_inches='tight')
431
+ # plt.close(fig)
432
+ # else:
433
+ # mesh1 = trimesh.Trimesh(vertices1, faces)
434
+ # if i == 0:
435
+ # scene1 = mesh1.scene()
436
+ # camera_params1 = {
437
+ # 'fov': scene1.camera.fov,
438
+ # 'resolution': scene1.camera.resolution,
439
+ # 'focal': scene1.camera.focal,
440
+ # 'z_near': scene1.camera.z_near,
441
+ # "z_far": scene1.camera.z_far,
442
+ # 'transform': scene1.graph[scene1.camera.name][0]
443
+ # }
444
+ # else:
445
+ # scene1 = mesh1.scene()
446
+ # scene1.camera.fov = camera_params1['fov']
447
+ # scene1.camera.resolution = camera_params1['resolution']
448
+ # scene1.camera.z_near = camera_params1['z_near']
449
+ # scene1.camera.z_far = camera_params1['z_far']
450
+ # scene1.graph[scene1.camera.name] = camera_params1['transform']
451
+ # image1 = scene1.save_image(resolution=[640, 480], visible=False)
452
+ # im1 = ax[1].imshow(image_from_bytes(image1))
453
+ # ax[1].axis('off')
454
+ # plt.savefig(filename, bbox_inches='tight')
455
+ # plt.close(fig)
456
+
457
+ # display.stop()
458
+ # print(filenames)
459
+ images = [imageio.imread(filename) for filename in filenames]
460
+ imageio.mimsave(f"{output_dir}raw_{res_npz_path.split('/')[-1][:-4]}.mp4", images, fps=30)
461
+ for filename in filenames:
462
+ os.remove(filename)
463
+
464
+ video = mp.VideoFileClip(f"{output_dir}raw_{res_npz_path.split('/')[-1][:-4]}.mp4")
465
+ # audio, sr = librosa.load(audio_path)
466
+ # audio = audio[:seconds*sr]
467
+ # print(audio.shape, seconds, sr)
468
+ # import soundfile as sf
469
+ # sf.write(f"{output_dir}{res_npz_path.split('/')[-1][:-4]}.wav", audio, 16000, 'PCM_24')
470
+ # audio_tmp = librosa.output.write_wav(f"{output_dir}{res_npz_path.split('/')[-1][:-4]}.wav", audio, sr=16000)
471
+ audio = mp.AudioFileClip(audio_path)
472
+ if audio.duration > video.duration:
473
+ audio = audio.subclip(0, video.duration)
474
+ final_clip = video.set_audio(audio)
475
+ final_clip.write_videofile(f"{output_dir}{res_npz_path.split('/')[-1][4:-4]}.mp4")
476
+ os.remove(f"{output_dir}raw_{res_npz_path.split('/')[-1][:-4]}.mp4")
477
+
478
+ def print_exp_info(args):
479
+ logger.info(pprint.pformat(vars(args)))
480
+ logger.info(f"# ------------ {args.name} ----------- #")
481
+ logger.info("PyTorch version: {}".format(torch.__version__))
482
+ logger.info("CUDA version: {}".format(torch.version.cuda))
483
+ logger.info("{} GPUs".format(torch.cuda.device_count()))
484
+ logger.info(f"Random Seed: {args.random_seed}")
485
+
486
+ def args2csv(args, get_head=False, list4print=None):
487
+ if list4print is None: list4print = []  # avoid sharing one mutable default list across calls
+ for k, v in args.items():
488
+ if isinstance(args[k], dict):
489
+ args2csv(args[k], get_head, list4print)
490
+ else: list4print.append(k) if get_head else list4print.append(v)
491
+ return list4print
492
+
493
+ class EpochTracker:
494
+ def __init__(self, metric_names, metric_directions):
495
+ assert len(metric_names) == len(metric_directions), "Metric names and directions should have the same length"
496
+
497
+
498
+ self.metric_names = metric_names
499
+ self.states = ['train', 'val', 'test']
500
+ self.types = ['last', 'best']
501
+
502
+
503
+ self.values = {name: {state: {type_: {'value': np.inf if not is_higher_better else -np.inf, 'epoch': 0}
504
+ for type_ in self.types}
505
+ for state in self.states}
506
+ for name, is_higher_better in zip(metric_names, metric_directions)}
507
+
508
+ self.loss_meters = {name: {state: AverageMeter(f"{name}_{state}")
509
+ for state in self.states}
510
+ for name in metric_names}
511
+
512
+
513
+ self.is_higher_better = {name: direction for name, direction in zip(metric_names, metric_directions)}
514
+ self.train_history = {name: [] for name in metric_names}
515
+ self.val_history = {name: [] for name in metric_names}
516
+
517
+
518
+ def update_meter(self, name, state, value):
519
+ self.loss_meters[name][state].update(value)
520
+
521
+
522
+ def update_values(self, name, state, epoch):
523
+ value_avg = self.loss_meters[name][state].avg
524
+ new_best = False
525
+
526
+
527
+ if ((value_avg < self.values[name][state]['best']['value'] and not self.is_higher_better[name]) or
528
+ (value_avg > self.values[name][state]['best']['value'] and self.is_higher_better[name])):
529
+ self.values[name][state]['best']['value'] = value_avg
530
+ self.values[name][state]['best']['epoch'] = epoch
531
+ new_best = True
532
+ self.values[name][state]['last']['value'] = value_avg
533
+ self.values[name][state]['last']['epoch'] = epoch
534
+ return new_best
535
+
536
+
537
+ def get(self, name, state, type_):
538
+ return self.values[name][state][type_]
539
+
540
+
541
+ def reset(self):
542
+ for name in self.metric_names:
543
+ for state in self.states:
544
+ self.loss_meters[name][state].reset()
545
+
546
+
547
+ def flatten_values(self):
548
+ flat_dict = {}
549
+ for name in self.metric_names:
550
+ for state in self.states:
551
+ for type_ in self.types:
552
+ value_key = f"{name}_{state}_{type_}"
553
+ epoch_key = f"{name}_{state}_{type_}_epoch"
554
+ flat_dict[value_key] = self.values[name][state][type_]['value']
555
+ flat_dict[epoch_key] = self.values[name][state][type_]['epoch']
556
+ return flat_dict
557
+
558
+ def update_and_plot(self, name, epoch, save_path):
559
+ new_best_train = self.update_values(name, 'train', epoch)
560
+ new_best_val = self.update_values(name, 'val', epoch)
561
+
562
+
563
+ self.train_history[name].append(self.loss_meters[name]['train'].avg)
564
+ self.val_history[name].append(self.loss_meters[name]['val'].avg)
565
+
566
+
567
+ train_values = self.train_history[name]
568
+ val_values = self.val_history[name]
569
+ epochs = list(range(1, len(train_values) + 1))
570
+
571
+
572
+ plt.figure(figsize=(10, 6))
573
+ plt.plot(epochs, train_values, label='Train')
574
+ plt.plot(epochs, val_values, label='Val')
575
+ plt.title(f'Train vs Val {name} over epochs')
576
+ plt.xlabel('Epochs')
577
+ plt.ylabel(name)
578
+ plt.legend()
579
+ plt.savefig(save_path)
580
+ plt.close()
581
+
582
+
583
+ return new_best_train, new_best_val
584
+
585
+
586
+
587
+
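A minimal sketch of how the tracker above is meant to be driven from a training loop; the metric name, values, and plot path are illustrative:

tracker = EpochTracker(["loss"], [False])            # False = lower is better
for epoch in range(2):
    tracker.update_meter("loss", "train", 0.5 / (epoch + 1))
    tracker.update_meter("loss", "val", 0.6 / (epoch + 1))
    new_best_train, new_best_val = tracker.update_and_plot("loss", epoch, f"loss_epoch{epoch}.png")
    tracker.reset()
print(tracker.get("loss", "val", "best"))            # {'value': 0.3, 'epoch': 1}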
588
+ def record_trial(args, tracker):
589
+ """
590
+ 1. record notes, score, env_name, experiments_path
591
+ """
592
+ csv_path = args.out_path + "custom/" +args.csv_name+".csv"
593
+ all_print_dict = vars(args)
594
+ all_print_dict.update(tracker.flatten_values())
595
+ if not os.path.exists(csv_path):
596
+ pd.DataFrame([all_print_dict]).to_csv(csv_path, index=False)
597
+ else:
598
+ df_existing = pd.read_csv(csv_path)
599
+ df_new = pd.DataFrame([all_print_dict])
600
+ df_aligned = pd.concat([df_existing, df_new]).fillna("")  # DataFrame.append was removed in pandas 2.0
601
+ df_aligned.to_csv(csv_path, index=False)
602
+
603
+
604
+ def set_random_seed(args):
605
+ os.environ['PYTHONHASHSEED'] = str(args.random_seed)
606
+ random.seed(args.random_seed)
607
+ np.random.seed(args.random_seed)
608
+ torch.manual_seed(args.random_seed)
609
+ torch.cuda.manual_seed_all(args.random_seed)
610
+ torch.cuda.manual_seed(args.random_seed)
611
+ torch.backends.cudnn.deterministic = args.deterministic #args.CUDNN_DETERMINISTIC
612
+ torch.backends.cudnn.benchmark = args.benchmark
613
+ torch.backends.cudnn.enabled = args.cudnn_enabled
614
+
615
+
616
+ def save_checkpoints(save_path, model, opt=None, epoch=None, lrs=None):
617
+ if lrs is not None:
618
+ states = { 'model_state': model.state_dict(),
619
+ 'epoch': epoch + 1,
620
+ 'opt_state': opt.state_dict(),
621
+ 'lrs':lrs.state_dict(),}
622
+ elif opt is not None:
623
+ states = { 'model_state': model.state_dict(),
624
+ 'epoch': epoch + 1,
625
+ 'opt_state': opt.state_dict(),}
626
+ else:
627
+ states = { 'model_state': model.state_dict(),}
628
+ torch.save(states, save_path)
629
+
630
+
631
+ def load_checkpoints(model, save_path, load_name='model'):
632
+ states = torch.load(save_path)
633
+ new_weights = OrderedDict()
634
+ flag=False
635
+ for k, v in states['model_state'].items():
636
+ #print(k)
637
+ if "module" not in k:
638
+ break
639
+ else:
640
+ new_weights[k[7:]]=v
641
+ flag=True
642
+ if flag:
643
+ try:
644
+ model.load_state_dict(new_weights)
645
+ except Exception:  # fall back to loading the raw (non-stripped) state dict
646
+ #print(states['model_state'])
647
+ model.load_state_dict(states['model_state'])
648
+ else:
649
+ model.load_state_dict(states['model_state'])
650
+ logger.info(f"load self-pretrained checkpoints for {load_name}")
651
+
652
+
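A brief sketch of the save/load round trip above; the model, optimiser, and file name are placeholders:

import torch
import torch.nn as nn

net = nn.Linear(10, 10)
opt = torch.optim.Adam(net.parameters())
save_checkpoints("last.bin", net, opt=opt, epoch=0)
load_checkpoints(net, "last.bin", load_name="demo")   # also strips a leading "module." from DataParallel keys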
653
+ def model_complexity(model, args):
654
+ from ptflops import get_model_complexity_info
655
+ flops, params = get_model_complexity_info(model, (args.T_GLOBAL._DIM, args.TRAIN.CROP, args.TRAIN),
656
+ as_strings=False, print_per_layer_stat=False)
657
+ logger.info('{:<30} {:<8} BFlops'.format('Computational complexity: ', flops / 1e9))
658
+ logger.info('{:<30} {:<8} MParams'.format('Number of parameters: ', params / 1e6))
659
+
660
+
661
+ class AverageMeter(object):
662
+ """Computes and stores the average and current value"""
663
+ def __init__(self, name, fmt=':f'):
664
+ self.name = name
665
+ self.fmt = fmt
666
+ self.reset()
667
+
668
+ def reset(self):
669
+ self.val = 0
670
+ self.avg = 0
671
+ self.sum = 0
672
+ self.count = 0
673
+
674
+ def update(self, val, n=1):
675
+ self.val = val
676
+ self.sum += val * n
677
+ self.count += n
678
+ self.avg = self.sum / self.count
679
+
680
+ def __str__(self):
681
+ fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
682
+ return fmtstr.format(**self.__dict__)
683
+
684
+
685
+
686
+ class MultiLMDBManager:
687
+ def __init__(self, base_dir, max_db_size=10*1024*1024*1024): # 10GB default size
688
+ self.base_dir = base_dir
689
+ self.max_db_size = max_db_size
690
+ self.current_db_size = 0
691
+ self.current_db_idx = 0
692
+ self.current_lmdb_env = None
693
+ self.sample_to_db_mapping = {}
694
+ self.sample_counter = 0
695
+ self.db_paths = []
696
+
697
+ def get_new_lmdb_path(self):
698
+ db_path = os.path.join(self.base_dir, f"db_{self.current_db_idx:03d}")
699
+ self.db_paths.append(db_path)
700
+ return db_path
701
+
702
+ def init_new_db(self):
703
+ if self.current_lmdb_env is not None:
704
+ self.current_lmdb_env.sync()
705
+ self.current_lmdb_env.close()
706
+
707
+ new_db_path = self.get_new_lmdb_path()
708
+ self.current_lmdb_env = lmdb.open(new_db_path, map_size=self.max_db_size)
709
+ self.current_db_size = 0
710
+ self.current_db_idx += 1
711
+ return self.current_lmdb_env
712
+
713
+ def add_sample(self, sample_data):
714
+ if self.current_lmdb_env is None:
715
+ self.init_new_db()
716
+
717
+ v = pickle.dumps(sample_data)
718
+ sample_size = len(v)
719
+
720
+ try:
721
+ sample_key = "{:008d}".format(self.sample_counter).encode("ascii")
722
+ with self.current_lmdb_env.begin(write=True) as txn:
723
+ txn.put(sample_key, v)
724
+ self.sample_to_db_mapping[self.sample_counter] = self.current_db_idx - 1
725
+
726
+ except lmdb.MapFullError:
727
+ self.init_new_db()
728
+ sample_key = "{:008d}".format(self.sample_counter).encode("ascii")
729
+ with self.current_lmdb_env.begin(write=True) as txn:
730
+ txn.put(sample_key, v)
731
+ self.sample_to_db_mapping[self.sample_counter] = self.current_db_idx - 1
732
+
733
+ self.current_db_size += sample_size
734
+ self.sample_counter += 1
735
+
736
+ def save_mapping(self):
737
+ mapping_path = os.path.join(self.base_dir, "sample_db_mapping.pkl")
738
+ with open(mapping_path, 'wb') as f:
739
+ pickle.dump({
740
+ 'mapping': self.sample_to_db_mapping,
741
+ 'db_paths': self.db_paths
742
+ }, f)
743
+
744
+ def close(self):
745
+ if self.current_lmdb_env is not None:
746
+ self.current_lmdb_env.sync()
747
+ self.current_lmdb_env.close()
748
+ self.save_mapping()
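A short sketch of the intended write-side usage of the LMDB manager above; the cache directory, shard size, and payload layout are placeholders:

import os
import numpy as np

os.makedirs("./lmdb_cache", exist_ok=True)
manager = MultiLMDBManager("./lmdb_cache", max_db_size=512 * 1024 * 1024)   # 512 MB shards
for i in range(100):
    sample = {"idx": i, "pose": np.zeros((32, 165), dtype=np.float32)}
    manager.add_sample(sample)           # pickled and stored under an 8-digit ascii key
manager.close()                          # flushes the open shard and writes sample_db_mapping.pkl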