pretrain.sh · executable file · 60 lines (55 loc) · 1.76 KB
#!/usr/bin/env bash
#SBATCH --time=48:00:00
#SBATCH --partition=learnlab
#SBATCH --constraint=volta32gb
#SBATCH -o slurm/%x.%j.out
#SBATCH -e slurm/%x.%j.err
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=80
#SBATCH --mem=256GB
#SBATCH --job-name=omnitab
# Notes
# 1. The effective batch size is 8 (#GPUs) * 2 (per-device batch size) * 36 (gradient accumulation) = 576.
# 2. The pretraining dataset has ~2M examples, which corresponds to ~17k update steps over 5 epochs.
# 3. The pretraining dataset takes 1-2 hours to process using 10 workers.
#    To accelerate, process it using all cores, cache the result (caching is enabled by default), then run multi-GPU training (see the sketch below).
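# One way to apply note 3 (a sketch, not part of the original recipe): first run
# the same command as below on a CPU node, single-process and with all cores, e.g.
# replacing the launcher line with
#   python run.py \
# and the workers flag with
#   --preprocessing_num_workers "$(nproc)" \
# Once the cached dataset has been written, cancel that run and submit this script.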
# activate env here (change based on your cluster)
module purge
module load anaconda3
. /usr/share/modules/init/sh
eval "$(conda shell.bash hook)"
conda activate omnitab
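# Optional guard (not in the original script): fail fast if the conda env did not
# activate, instead of running with the wrong Python. CONDA_DEFAULT_ENV is set by
# `conda activate`.
if [[ "${CONDA_DEFAULT_ENV:-}" != "omnitab" ]]; then
  echo "Expected conda env 'omnitab' to be active, got '${CONDA_DEFAULT_ENV:-none}'" >&2
  exit 1
fi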
# positional arguments (example values shown in the trailing comments)
data=$1 # omnitab_download/pretrain_data
model=$2 # microsoft/tapex-large
output=$3 # output/omnitab-large
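# Example submission (paths are the example values from the comments above; adjust
# for your cluster):
#   sbatch pretrain.sh omnitab_download/pretrain_data microsoft/tapex-large output/omnitab-large
# Optional sanity check (not in the original script): require all three arguments.
if [[ $# -ne 3 ]]; then
  echo "Usage: $0 <pretrain_data_dir> <model_name_or_path> <output_dir>" >&2
  exit 1
fi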
python -m torch.distributed.launch --nproc_per_node=8 run.py \
--do_train \
--do_eval \
--pretraindata_dir ${data} \
--dataset_name wikitablequestions \
--remove_unused_columns false \
--model_name_or_path ${model} \
--tokenizer_name neulab/omnitab-large \
--overwrite_output_dir \
--output_dir ${output} \
--max_source_length 1024 \
--max_target_length 1024 \
--val_max_target_length 128 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 36 \
--per_device_eval_batch_size 6 \
--num_train_epochs 5.0 \
--warmup_ratio 0.1 \
--learning_rate 2e-5 \
--fp16 \
--preprocessing_num_workers 80 \
--logging_steps 10 \
--eval_steps 1000 \
--save_steps 5000 \
--evaluation_strategy steps \
--predict_with_generate \
--num_beams 5 \
--generation_max_length 128