-
Notifications
You must be signed in to change notification settings - Fork 0
/
deploy_glue.py
172 lines (151 loc) Β· 5.58 KB
/
deploy_glue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from azureml.core import Dataset, Experiment, RunConfiguration, Workspace
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import CondaDependencies, MpiConfiguration
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter, StepSequence
from azureml.pipeline.steps import EstimatorStep, PythonScriptStep
from azureml.train.dnn import PyTorch
def find_or_create_compute_target(
    workspace,
    name,
    vm_size="STANDARD_NC6",
    min_nodes=0,
    max_nodes=1,
    idle_seconds_before_scaledown=1200,
    vm_priority="lowpriority",
):
    """Return the compute target called ``name``, provisioning it if it is missing.

    The target is first looked up in ``workspace``. If that lookup raises
    ``ComputeTargetException``, a new ``AmlCompute`` target is created from
    the sizing arguments below.

    Args:
        workspace: Workspace object the compute target belongs to.
        name: Name of the compute target to look up or create.
        vm_size: VM size forwarded to the provisioning configuration.
        min_nodes: Minimum node count forwarded to the provisioning
            configuration.
        max_nodes: Maximum node count forwarded to the provisioning
            configuration.
        idle_seconds_before_scaledown: Idle time, in seconds, forwarded to the
            provisioning configuration.
        vm_priority: VM priority forwarded to the provisioning configuration.

    Returns:
        The existing or newly created compute target.
    """
    try:
        target = ComputeTarget(workspace=workspace, name=name)
    except ComputeTargetException:
        # Lookup failed, so provision a new cluster with the requested sizing.
        config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            # Previously accepted by this function but never forwarded, so the
            # caller's value was silently ignored.
            idle_seconds_before_scaledown=idle_seconds_before_scaledown,
            vm_priority=vm_priority,
        )
        target = ComputeTarget.create(workspace, name, config)
    # NOTE(review): the wait runs for both the existing and the freshly
    # created target here — confirm this matches the intended behavior.
    target.wait_for_completion(show_output=True)
    return target
# Relative paths to the folders that hold our scripts: one is used as the
# source directory of the preparation step, the other of the training estimator.
prep_project_folder = "glue_prep"
train_project_folder = "glue_train"
# Name of the script that runs data preprocessing/preparation
prepare_script_name = "prepare.py"
# Name of the script that does the training
train_script_name = "train.py"
# Your runs will get grouped within this experiment name (name it whatever you want)
# NOTE(review): "mprc" looks like a transposition of the GLUE task name "mrpc"
# used as the default of the "task" pipeline parameter — confirm before renaming,
# since changing it would group new runs under a different experiment.
experiment_name = "glue_benchmark_mprc"
# A name for your compute target (name it whatever you want)
compute_target_name = "gpu-cluster"
# Azure-specific VM size name.
# NOTE(review): the original description here ("K80 GPU, 6 cores, 56GB RAM, and
# 380GB Disk Space") appears to describe the STANDARD_NC6 size, not the
# STANDARD_NV24 size selected below — confirm the specs against the pricing page.
# Here's a good link on different VMs and their pricing...
# https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/
vm_size = "STANDARD_NV24"  # "STANDARD_NV12"
# The reference name for the prepared/preprocessed data; it is used both as the
# PipelineData name and as the registered dataset name.
# This will come through as an env variable in prepare.py
prepared_data_dir = "prepared_data"
# Resolve the workspace, its default datastore, and the experiment that
# groups the submitted runs.
workspace = Workspace.from_config()
datastore = workspace.get_default_datastore()
experiment = Experiment(workspace=workspace, name=experiment_name)

# Reuse the named compute target or provision it with the configured VM size.
compute_target = find_or_create_compute_target(
    workspace=workspace,
    name=compute_target_name,
    vm_size=vm_size,
)

# Intermediate pipeline output of the preparation step, exposed as a dataset
# and registered under the same name.
prepared_dataset = (
    PipelineData(prepared_data_dir, datastore=datastore)
    .as_dataset()
    .register(name=prepared_data_dir)
)
# Run configuration used by the preparation step.
run_config = RunConfiguration()

# Python packages installed into the preparation step's environment.
conda = CondaDependencies.create(
    pin_sdk_version=False,
    pip_packages=[
        "azureml-sdk",
        "azureml-dataprep[fuse,pandas]",
        "torch==1.5.0",
        "nlp==0.2.0",
        "transformers==2.11.0",
    ],
)
# Pass "--pre" through to pip for this environment.
conda.set_pip_option("--pre")

run_config.environment.python.conda_dependencies = conda
# Pipeline parameters: each one carries a default value here and is passed to
# the step scripts as a command-line argument.
model_name_param = PipelineParameter(name="model_name_or_path", default_value="bert-base-cased")
max_seq_len_param = PipelineParameter(name="max_seq_length", default_value=128)
task_param = PipelineParameter(name="task", default_value="mrpc")
learning_rate_param = PipelineParameter(name="learning_rate", default_value=2e-5)
seed_param = PipelineParameter(name="seed", default_value=1)
train_batch_size_param = PipelineParameter(name="train_batch_size", default_value=64)
eval_batch_size_param = PipelineParameter(name="eval_batch_size", default_value=64)
max_epochs_param = PipelineParameter(name="max_epochs", default_value=3)
num_gpus_param = PipelineParameter(name="gpus", default_value=2)
num_workers_param = PipelineParameter(name="num_workers", default_value=2)
# Step 1: run the preparation script and write its result to the prepared dataset.
prepare_step = PythonScriptStep(
    name="Preparation Step",
    source_directory=prep_project_folder,
    script_name=prepare_script_name,
    arguments=[
        "--model_name_or_path",
        model_name_param,
        "--max_seq_length",
        max_seq_len_param,
    ],
    outputs=[prepared_dataset],
    runconfig=run_config,
    compute_target=compute_target,
    allow_reuse=True,
)
# PyTorch estimator that runs the training script on the GPU compute target.
estimator = PyTorch(
    source_directory=train_project_folder,
    entry_script=train_script_name,
    compute_target=compute_target,
    framework_version="1.5",
    use_gpu=True,
    pip_packages=[
        "azureml-sdk",
        "nlp==0.2.0",
        "pytorch-lightning==0.8.0rc4",
        "transformers==2.11.0",
        "pandas",
        "scipy",
        "scikit-learn",
    ],
)
# Step 2: run the estimator with the prepared dataset mounted as input.
# Each flag is listed next to the pipeline parameter (or literal) it receives.
train_step = EstimatorStep(
    name="Training Step",
    estimator=estimator,
    compute_target=compute_target,
    inputs=[prepared_dataset.as_mount()],
    estimator_entry_script_arguments=[
        "--model_name_or_path", model_name_param,
        "--task", task_param,
        "--max_seq_length", max_seq_len_param,
        "--max_epochs", max_epochs_param,
        "--learning_rate", learning_rate_param,
        "--seed", seed_param,
        "--gpus", num_gpus_param,
        "--num_workers", num_workers_param,
        "--train_batch_size", train_batch_size_param,
        "--eval_batch_size", eval_batch_size_param,
        "--output_dir", "./outputs",
        "--do_train",
        "--do_predict",
    ],
)
# Chain the two steps in order and wrap them in a pipeline.
step_sequence = StepSequence(steps=[prepare_step, train_step])
pipeline = Pipeline(workspace=workspace, steps=step_sequence)

# Submit a single experiment run using the default pipeline parameters.
run = experiment.submit(pipeline)

# Example sweep: run the three listed models over 5 random seeds
# (15 experiment runs total).
# for seed in range(5):
#     for model in ["distilbert-base-cased", "bert-base-cased", "albert-base-v2"]:
#         run = experiment.submit(
#             pipeline,
#             pipeline_parameters={
#                 "model_name_or_path": model,
#                 "task": "cola",
#                 "train_batch_size": 32,
#                 "eval_batch_size": 32,
#                 "gpus": 4,
#                 "seed": seed,
#                 "num_workers": 16,
#                 "max_epochs": 4,
#             },
#         )