-
Notifications
You must be signed in to change notification settings - Fork 1
/
m2d.yaml
48 lines (44 loc) · 1.59 KB
/
m2d.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# AR parameters such as FFT parameters.
# Audio representation to use; presumably resolved by the loader as a
# `module.Class` reference -- confirm against the code that reads this file.
audio_repr: ar_m2d.AR_M2D
# weight_file: m2d_vit_base-80x208p16x16-random/random
# Checkpoint to load, given as a relative path (it starts with `../`).
# NOTE(review): the directory name contains `80x608p16x16`, which presumably
# encodes the input size and patch size -- verify against the checkpoint.
weight_file: ../weights/msm_mae_vit_base-80x608p16x16-220924-mr75/checkpoint-300.pth
# 3840 = 768 * 5; matches the "768-d if True else 3840-d" note on flat_features.
feature_d: 3840
sample_rate: 16000
# sample_rate: 22050
# Assuming n_fft/window_size/hop_size are in samples: at sample_rate 16000,
# 400 samples = 25 ms and 160 samples = 10 ms, i.e. 100 frames per second.
# That agrees with the "8.2-sec ... 820 frames" note in the fine-tuning section.
n_fft: 400
window_size: 400
hop_size: 160
n_mels: 80
# Mel filterbank frequency range (presumably Hz); 8000 is half of sample_rate 16000.
f_min: 50
f_max: 8000
# Window function name; accepted spellings depend on the consumer -- TODO confirm.
window: hanning
# Model specific parameters.
# Booleans are written in canonical lowercase form (`true`/`false`).
cls_token: false  # whether to use the CLS token
output_layers: [-1]  # list of layers to stack
# Linear evaluation/Fine-tuning common parameters.
training_mask: 0.0
# Booleans are written in canonical lowercase form (`true`/`false`).
flat_features: false  # 768-d if true else 3840-d
# Linear evaluation parameters.
batch_size: 128
# Learning rate for linear evaluation (3e-5).
lr_lineareval: 0.00003
# NOTE(review): presumably results are reported every 50 epochs -- confirm.
report_per_epochs: 50
# NOTE(review): presumably the early-stopping patience in epochs; the fine-tuning
# counterpart (ft_early_stop_epochs) uses -1 to mean "no early stopping".
early_stop_epochs: 20
# Fine-tuning parameters.
# NOTE(review): the value is 1000 while the note says 2K steps -- confirm how
# the trainer maps this setting to warmup steps.
warmup_epochs: 1000  # warm up for the first 2K steps regardless of epoch number
mixup: 0.0  # set mixup to 0 as per the paper for the VC1 dataset
ft_bs: 108
# ft_lr: 2.0
ft_lr: 0.001
ft_early_stop_epochs: -1  # -1: no early stopping
ft_epochs: 7000
ft_freq_mask: 0  # do not mask frequencies because they distinguish speakers from each other
# How ft_time_mask was derived:
#   - The default length of the VC1 dataset is 8.2 sec and the number of frames is 820.
#   - The default ft_time_mask for the VC1 dataset is 192.
#   - 192/820 = 0.23414634146
#   - With 5-sec slicing the number of frames will be 500,
#     so the max ft_time_mask will be 0.23414634146 * 500 = 117.073170732
#   - With 3-sec slicing the number of frames will be 300,
#     so the max ft_time_mask will be 0.23414634146 * 300 = 70.243902439
ft_time_mask: 70
ft_noise: 0.0  # unused
# Random resized crop. Set to false because the paper didn't use random
# resized crop for the VC1 dataset.
ft_rrc: false