# SpeakerID/m2d/evar/config/m2d.yaml

# AR parameters such as FFT parameters.
# audio_repr names the AR implementation to use (looks like module.Class).
audio_repr: ar_m2d.AR_M2D
# Alternative weight_file (disabled):
# weight_file: m2d_vit_base-80x208p16x16-random/random
# Relative path; the directory it is resolved against is not visible here.
weight_file: ../weights/msm_mae_vit_base-80x608p16x16-220924-mr75/checkpoint-300.pth
# 3840 matches the non-flattened feature size noted at flat_features.
feature_d: 3840
sample_rate: 16000
# Alternative sample rate (disabled):
# sample_rate: 22050
# NOTE(review): n_fft, window_size and hop_size are presumably in samples
# (400 / 160 samples = 25 ms / 10 ms at 16 kHz) - confirm against the AR code.
n_fft: 400
window_size: 400
hop_size: 160
# NOTE(review): f_min / f_max are presumably in Hz - confirm.
n_mels: 80
f_min: 50
f_max: 8000
window: hanning

# Model-specific parameters.
# Whether to use the CLS token.
cls_token: false
# List of layers to stack.
output_layers: [-1]

# Parameters shared by linear evaluation and fine-tuning.
training_mask: 0.0
# Features are 768-d when true, 3840-d otherwise.
flat_features: false

# Linear evaluation parameters.
batch_size: 128
# 0.00003 = 3e-5
lr_lineareval: 0.00003
report_per_epochs: 50
early_stop_epochs: 20

# Fine-tuning parameters.
# Warm up for the first 2K steps regardless of the epoch number.
# NOTE(review): the value is 1000 while the note says 2K steps - confirm the
# unit the consumer applies to warmup_epochs.
warmup_epochs: 1000
# Mixup is set to 0, as per the paper for the VC1 dataset.
mixup: 0.0
ft_bs: 108
# ft_lr: 2.0
ft_lr: 0.001
# -1 means no early stopping.
ft_early_stop_epochs: -1
ft_epochs: 7000
# Frequencies are not masked because they distinguish speakers from each other.
ft_freq_mask: 0
# How ft_time_mask is chosen:
# - The default length of the VC1 dataset is 8.2 sec, i.e. 820 frames, and the
#   default ft_time_mask for VC1 is 192: 192 / 820 = 0.23414634146.
# - With 5-sec slicing the number of frames will be 500, so the max
#   ft_time_mask will be 0.23414634146 * 500 = 117.073170732.
# - With 3-sec slicing the number of frames will be 300, so the max
#   ft_time_mask will be 0.23414634146 * 300 = 70.243902439.
ft_time_mask: 70
# Unused.
ft_noise: 0.0
# Random resized crop. Set to false because the paper didn't use random
# resized crop for the VC1 dataset.
ft_rrc: false