-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Support aml #2615
Support aml #2615
Changes from 85 commits
dcd2ffd
a738331
3177aeb
2aafac1
0435b7f
2e5ef51
6d7bc62
c67b162
e13a620
59d4a71
eae0540
81c49cf
92cab3a
0674d88
e5b9665
67ef648
3b8b6fb
1e626fd
c6b6061
916e444
b8e47be
0ee933a
c7973be
c094057
c2735d3
d0b2504
c8d4696
1a9f19f
3f4c177
648e0bb
2fa4a77
caeffb8
d0768b0
8dff16f
bea8ed6
e297aa5
500c1cb
d880512
5c33d11
45424e8
9ca3444
5018039
283bceb
0c67c5c
671f5d8
a65a810
6d36ae5
5e352f7
b9d1aa5
a3a91d8
57c300e
69a5170
edc4608
c1f0239
af97bb1
10feb6a
c00cd31
78f1386
586d6ac
2db8ff8
f631e4c
5982fb3
f687a6e
476ffec
0f2367c
9d7bd3c
130ed27
ab86080
4b11a53
15ee064
7c48610
c0c7d96
93eefb2
53cea0f
34d9351
25a9dab
cada76a
de7dc7c
428dc3d
2e9c70e
8cf8583
fd5fd9e
54a22af
525b961
8ec5e7d
bdd3840
b341dce
ce81c51
e66dc23
9cf6744
bd77f5c
ddfb0cc
65660e6
fec8a67
5200a3a
51befa5
478629f
c299ce1
0517e13
fc4b978
e527743
b047681
4acc7e8
ec1475a
8eaeebf
56b6818
e09ff79
ecf615d
a7a3baf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
**Run an Experiment on Azure Machine Learning** | ||
=== | ||
NNI supports running experiments on [AML](https://azure.microsoft.com/en-us/services/machine-learning/), called aml mode. | |
|
||
## Setup environment | ||
Step 1. Install NNI by following the install guide [here](../Tutorial/QuickStart.md). | |
|
||
Step 2. Create an AML account by following the document [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli). | |
|
||
Step 3. Get your account information. | ||
![](../../img/aml_account.png) | ||
|
||
Step 4. Install the AML package environment. | |
``` | ||
python3 -m pip install azureml --user | ||
python3 -m pip install azureml-sdk --user | ||
``` | ||
|
||
## Run an experiment | ||
Use `examples/trials/mnist-tfv1` as an example. The NNI config YAML file's content is like: | ||
|
||
```yaml | ||
authorName: default | ||
experimentName: example_mnist | ||
trialConcurrency: 1 | ||
maxExecDuration: 1h | ||
maxTrialNum: 10 | ||
trainingServicePlatform: aml | ||
searchSpacePath: search_space.json | ||
#choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner | ||
#SMAC (SMAC should be installed through nnictl) | ||
builtinTunerName: TPE | ||
classArgs: | ||
#choice: maximize, minimize | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
computeTarget: ${replace_to_your_computeTarget} | |
nodeCount: 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Where is docker image? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed, missed this variable in doc. |
||
amlConfig: | ||
subscriptionId: ${replace_to_your_subscriptionId} | ||
resourceGroup: ${replace_to_your_resourceGroup} | ||
workspaceName: ${replace_to_your_workspaceName} | ||
|
||
``` | ||
|
||
Note: You should set `trainingServicePlatform: aml` in the NNI config YAML file if you want to start an experiment in aml mode. | |
|
||
Compared with [LocalMode](LocalMode.md), the trial configuration in aml mode has these additional keys: | |
* computeTarget | ||
* required key. The compute cluster name you want to use in your AML workspace. | |
* nodeCount | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think nodeCount can default to 1 because multi-machine runs are seldom used. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, perhaps hide this variable is better, has removed. |
||
* required key. The number of nodes to use for one run. See the [RunConfiguration reference](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py#variables). | |
|
||
amlConfig: | ||
* subscriptionId | ||
* the subscriptionId of your account | ||
* resourceGroup | ||
* the resourceGroup of your account | ||
* workspaceName | ||
* the workspaceName of your account | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
authorName: default | ||
experimentName: example_mnist_pytorch | ||
trialConcurrency: 1 | ||
maxExecDuration: 1h | ||
maxTrialNum: 10 | ||
trainingServicePlatform: aml | ||
searchSpacePath: search_space.json | ||
#choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner | ||
#SMAC (SMAC should be installed through nnictl) | ||
builtinTunerName: TPE | ||
classArgs: | ||
#choice: maximize, minimize | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
computeTarget: ${replace_to_your_computeTarget} | |
nodeCount: 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Each trial will use one node, i.e., all 8 GPUs? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed. |
||
amlConfig: | ||
subscriptionId: ${replace_to_your_subscriptionId} | ||
resourceGroup: ${replace_to_your_resourceGroup} | ||
workspaceName: ${replace_to_your_workspaceName} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
authorName: default | ||
squirrelsc marked this conversation as resolved.
Show resolved
Hide resolved
|
||
experimentName: example_mnist | ||
trialConcurrency: 1 | ||
maxExecDuration: 1h | ||
maxTrialNum: 10 | ||
trainingServicePlatform: aml | ||
searchSpacePath: search_space.json | ||
#choice: true, false | ||
useAnnotation: false | ||
tuner: | ||
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner | ||
#SMAC (SMAC should be installed through nnictl) | ||
builtinTunerName: TPE | ||
classArgs: | ||
#choice: maximize, minimize | ||
optimize_mode: maximize | ||
trial: | ||
command: python3 mnist.py | ||
codeDir: . | ||
computeTarget: ${replace_to_your_computeTarget} | |
nodeCount: 1 | ||
amlConfig: | ||
subscriptionId: ${replace_to_your_subscriptionId} | ||
resourceGroup: ${replace_to_your_resourceGroup} | ||
workspaceName: ${replace_to_your_workspaceName} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT license. | ||
|
||
import os | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing license There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed. |
||
import sys | ||
import time | ||
import json | ||
from argparse import ArgumentParser | ||
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig | ||
from azureml.core.compute import ComputeTarget | ||
from azureml.core.run import RUNNING_STATES, RunStatus, Run | ||
from azureml.core import Workspace | ||
from azureml.core.conda_dependencies import CondaDependencies | ||
|
||
if __name__ == "__main__":
    # Entry point of the AML helper process.
    #
    # The script submits one run to Azure Machine Learning, prints the run id
    # as its first line of output, and then serves a line-based protocol on
    # stdin/stdout:
    #   update_status   -> prints "status:<run status>"
    #   tracking_url    -> prints "tracking_url:<portal url>"
    #   receive         -> prints "receive:<json of run metrics>"
    #   command:<text>  -> logs <text> on the run under the key "nni_manager"
    #   stop            -> cancels the run and exits with status 0
    parser = ArgumentParser()
    parser.add_argument('--subscription_id', help='the subscription id of aml')
    parser.add_argument('--resource_group', help='the resource group of aml')
    parser.add_argument('--workspace_name', help='the workspace name of aml')
    parser.add_argument('--compute_target', help='the compute cluster name of aml')
    parser.add_argument('--docker_image', help='the docker image of job')
    parser.add_argument('--experiment_name', help='the experiment name')
    parser.add_argument('--script_dir', help='script directory')
    parser.add_argument('--script_name', help='script name')
    # argparse returns strings unless a type is given; parse the node count as
    # an integer so run_config.node_count does not receive a str.
    parser.add_argument('--node_count', type=int, default=1, help='node count of run')
    args = parser.parse_args()

    ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name)
    compute_target = ComputeTarget(workspace=ws, name=args.compute_target)
    experiment = Experiment(ws, args.experiment_name)

    # Build the run configuration: pip dependencies, docker image, target, size.
    run_config = RunConfiguration()
    dependencies = CondaDependencies()
    dependencies.add_pip_package("azureml-sdk")
    dependencies.add_pip_package("azureml")
    run_config.environment.python.conda_dependencies = dependencies
    run_config.environment.docker.enabled = True
    run_config.environment.docker.base_image = args.docker_image
    run_config.target = compute_target
    run_config.node_count = args.node_count

    config = ScriptRunConfig(source_directory=args.script_dir, script=args.script_name, run_config=run_config)
    run = experiment.submit(config)
    # First line of output: the run id.
    print(run.get_details()["runId"])

    while True:
        raw_line = sys.stdin.readline()
        if not raw_line:
            # readline() returns '' only at EOF, i.e. the controlling process
            # closed stdin. Stop serving instead of spinning in a busy loop.
            # The submitted run is left untouched.
            break
        line = raw_line.rstrip()
        if line == 'update_status':
            print('status:' + run.get_status())
        elif line == 'tracking_url':
            print('tracking_url:' + run.get_portal_url())
        elif line == 'stop':
            run.cancel()
            sys.exit(0)
        elif line == 'receive':
            print('receive:' + json.dumps(run.get_metrics()))
        elif line:
            # "command:<payload>": everything after the first ':' is the payload.
            head, _, payload = line.partition(':')
            if head == 'command':
                run.log('nni_manager', payload)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the MIT license. | ||
|
||
'use strict'; | ||
|
||
import * as fs from 'fs'; | ||
import * as request from 'request'; | ||
import * as path from 'path'; | ||
import { Deferred } from 'ts-deferred'; | ||
import { PythonShell } from 'python-shell'; | ||
|
||
/**
 * Client for one Azure Machine Learning run, driven through the `amlUtil.py`
 * helper script that is spawned with `PythonShell`.
 *
 * Communication is line based: commands are written with `send()` and the
 * script answers with lines of the form `<tag>:<payload>`, delivered as
 * `'message'` events.
 */
export class AMLClient {
    public subscriptionId: string;
    public resourceGroup: string;
    public workspaceName: string;
    public experimentId: string;
    public image: string;
    public scriptName: string;
    /** The helper process; `undefined` until `submit()` has been called. */
    public pythonShellClient: undefined | PythonShell;
    public codeDir: string;
    public nodeCount: number;
    public computeTarget: string;

    constructor(
        subscriptionId: string,
        resourceGroup: string,
        workspaceName: string,
        experimentId: string,
        computeTarget: string,
        nodeCount: number,
        image: string,
        scriptName: string,
        codeDir: string,
    ) {
        this.subscriptionId = subscriptionId;
        this.resourceGroup = resourceGroup;
        this.workspaceName = workspaceName;
        this.experimentId = experimentId;
        this.image = image;
        this.nodeCount = nodeCount;
        this.scriptName = scriptName;
        this.codeDir = codeDir;
        this.computeTarget = computeTarget;
    }

    /**
     * Starts the helper script with this client's settings.
     *
     * @returns a promise resolved with the first line the script prints
     *          (the helper is expected to print the run id first).
     */
    public submit(): Promise<string> {
        const shell = new PythonShell('amlUtil.py', {
            scriptPath: './config/aml',
            pythonOptions: ['-u'], // get print results in real-time
            args: [
                '--subscription_id', this.subscriptionId,
                '--resource_group', this.resourceGroup,
                '--workspace_name', this.workspaceName,
                '--compute_target', this.computeTarget,
                '--docker_image', this.image,
                '--experiment_name', `nni_exp_${this.experimentId}`,
                '--script_dir', this.codeDir,
                '--script_name', this.scriptName,
                '--node_count', this.nodeCount.toString()
            ]
        });
        this.pythonShellClient = shell;
        return new Promise<string>((resolve) => {
            // `once` so the listener does not stay attached for later replies.
            shell.once('message', (envId: string) => {
                resolve(envId);
            });
        });
    }

    /**
     * Asks the helper script to stop.
     *
     * @throws Error when `submit()` has not been called yet.
     */
    public stop(): void {
        this.requireClient().send('stop');
    }

    /**
     * Requests the tracking URL of the run.
     *
     * @returns a promise resolved with the text after `tracking_url:`,
     *          colons included.
     * @throws Error (synchronously) when `submit()` has not been called yet.
     */
    public getTrackingUrl(): Promise<string> {
        return this.exchange('tracking_url', 'tracking_url');
    }

    /**
     * Requests the current status of the run.
     *
     * @param oldStatus previously known status; kept for interface
     *                  compatibility, the promise now always waits for the
     *                  actual `status:` reply.
     * @returns a promise resolved with the text after `status:`.
     * @throws Error (synchronously) when `submit()` has not been called yet.
     */
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    public updateStatus(oldStatus: string): Promise<string> {
        return this.exchange('update_status', 'status');
    }

    /**
     * Sends `command:<message>` to the helper script.
     *
     * @throws Error when `submit()` has not been called yet.
     */
    public sendCommand(message: string): void {
        this.requireClient().send(`command:${message}`);
    }

    /**
     * Requests the data published by the run.
     *
     * @returns a promise resolved with the JSON-decoded payload of the
     *          `receive:` reply.
     * @throws Error (synchronously) when `submit()` has not been called yet.
     */
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    public receiveCommand(): Promise<any> {
        return this.exchange('receive', 'receive').then((payload: string) => JSON.parse(payload));
    }

    /** Returns the helper process or throws when it has not been started. */
    private requireClient(): PythonShell {
        if (this.pythonShellClient === undefined) {
            throw Error('python shell client not initialized!');
        }
        return this.pythonShellClient;
    }

    /**
     * Sends `command` and waits for the reply line that starts with
     * `<replyTag>:`. Unrelated messages are ignored, and the listener is
     * removed once the reply arrives so listeners do not pile up across calls.
     */
    private exchange(command: string, replyTag: string): Promise<string> {
        // Resolved outside the promise so a missing client throws synchronously.
        const client = this.requireClient();
        const prefix = `${replyTag}:`;
        return new Promise<string>((resolve) => {
            const onMessage = (message: unknown): void => {
                if (typeof message !== 'string' || !message.startsWith(prefix)) {
                    return;
                }
                client.removeListener('message', onMessage);
                // Slice instead of split/join so ':' inside the payload survives.
                resolve(message.slice(prefix.length));
            };
            client.on('message', onMessage);
            client.send(command);
        });
    }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replace it with a placeholder
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed.