Skip to content

Commit

Permalink
fix: remove non-orchestration mode from the project as it's not maint…
Browse files Browse the repository at this point in the history
…ained anymore, at least for the medium term.

 The only mode flame will support for now is the orchestration mode, which means that all the workers are managed by the flame system.
  • Loading branch information
openwithcode committed Mar 28, 2023
1 parent 1c21080 commit 63a5f81
Show file tree
Hide file tree
Showing 34 changed files with 200 additions and 710 deletions.
60 changes: 26 additions & 34 deletions api/job_components.partials.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,46 +79,38 @@ CommBackend:
########################################
DataSpec:
description: Dataset specification
type: object
properties:
fromUser:
$ref: '#/components/schemas/FromUser'
fromSystem:
$ref: '#/components/schemas/FromSystem'
type: array
items:
$ref: '#/components/schemas/RoleDatasetGroups'
example:
fromUser:
default: 1 # there is one user-fed dataset under default realm
uk: 2 # there are two user-fed datasets (hence two compute nodes) under uk realm
us/foo: 3 # there are three user-fed datasets under us/foo realm
fromSystem:
trainer:
"default/eu":
- role: trainer
datasetGroups:
"eu":
- 61609290fa724deafdb2a4fa # id of dataset registered in the flame system
- 61609290fa724deafdb2a4fb # id of dataset registered in the flame system
"default/us":
"us":
- 61609290fa724deafdb2a4fc # id of dataset registered in the flame system

########################################
# FromUser
########################################
FromUser:
# an object is a key-value pair of realm and count
type: object
additionalProperties:
type: integer
format: int32

########################################
# FromSystem
########################################
FromSystem:
RoleDatasetGroups:
description: Dataset specification
type: object
additionalProperties:
type: object
additionalProperties:
type: array
items:
type: string
properties:
role:
type: string
datasetGroups:
type: object
additionalProperties:
type: array
items:
type: string
example:
role: trainer
datasetGroups:
"eu":
- 61609290fa724deafdb2a4fa # id of dataset registered in the flame system
- 61609290fa724deafdb2a4fb # id of dataset registered in the flame system
"us":
- 61609290fa724deafdb2a4fc # id of dataset registered in the flame system

########################################
# Model Spec
Expand Down
20 changes: 4 additions & 16 deletions cmd/controller/app/job/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,10 @@ func (b *JobBuilder) setup() error {
b.roleCode = zippedRoleCode

// Iterating for each dataset id to fetch dataset info and update the datasets array.
for roleName, groups := range b.jobSpec.DataSpec.FromSystem {
for _, dataSpec := range b.jobSpec.DataSpec {
roleName := dataSpec.Role
groups := dataSpec.DatasetGroups

if len(groups) == 0 {
return fmt.Errorf("no dataset group specified for trainer role %s", roleName)
}
Expand Down Expand Up @@ -220,21 +223,6 @@ func (b *JobBuilder) build() ([]objects.Task, []string, error) {
continue
}

// TODO: this is absolute and should be removed
for group, count := range b.jobSpec.DataSpec.FromUser {
for i := 0; i < int(count); i++ {
task := tmpl.Task

task.Type = openapi.USER
task.JobConfig.Realm = group
task.JobConfig.GroupAssociation = b.getGroupAssociationByGroup(roleName, group)

task.GenerateTaskId(i)

tasks = append(tasks, task)
}
}

var count int
groups := sortedKeys(b.datasets[roleName])

Expand Down
66 changes: 42 additions & 24 deletions cmd/controller/app/job/builder_example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,13 @@ func Test_asyncfl_hier_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -169,9 +172,12 @@ func Test_distributed_training(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{dataset1ID, dataset2ID, dataset3ID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{dataset1ID, dataset2ID, dataset3ID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -242,10 +248,13 @@ func Test_hier_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -335,11 +344,14 @@ func Test_medmnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{
dataset1ID, dataset2ID, dataset3ID, dataset4ID, dataset5ID,
dataset6ID, dataset7ID, dataset8ID, dataset9ID, dataset10ID,
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{
dataset1ID, dataset2ID, dataset3ID, dataset4ID, dataset5ID,
dataset6ID, dataset7ID, dataset8ID, dataset9ID, dataset10ID,
},
},
},
}
Expand Down Expand Up @@ -402,9 +414,12 @@ func Test_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{datasetID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{datasetID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -471,11 +486,14 @@ func Test_parallel_experiment(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"asia": []string{datasetAsiaID},
"uk": []string{datasetEuUkID},
"us": []string{datasetUsWestID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"asia": []string{datasetAsiaID},
"uk": []string{datasetEuUkID},
"us": []string{datasetUsWestID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down
2 changes: 1 addition & 1 deletion cmd/flamectl/resources/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func createJobSpec(data []byte, jobFile string) (bool, openapi.JobSpec) {
}

//validate data spec
dataSpec := openapi.DataSpec{}
var dataSpec []openapi.RoleDatasetGroups
dataSpecPath := fmt.Sprintf("%s/%s", path.Dir(jobFile), createJobRequest.DataSpecPath)

err = util.ReadFileToStruct(dataSpecPath, &dataSpec)
Expand Down
12 changes: 0 additions & 12 deletions docs/01-introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,4 @@ and makes worker creation requests.
9. Step 9: The flamelet monitors the execution of the task and updates the state once the task execution is over.
10. Step 10: In the meantime, the controller also monitors a job's status and takes action when necessary (e.g., deallocating workers).

The above workflow can have some variations depending on the deployment mode.
There are two types of deployment modes: **orchestration** and **non-orchestration**.
In orchestration mode, all the workers are under the management of flame system through the help of cluster orchestrators.
On the other hand, in non-orchestration mode, the workers of consuming data (e.g., training worker) are not under management of the flame system.
The non-orchestration mode is useful in one of the following situations:
* when the flame system doesn't have permission to utilize resources of geo-distributed clusters
* when the geo-distributed clusters are not under the management of one organization
* when participants of a FL job want to have a control over when to join or leave the job

In non-orchestration mode, the flame system is only responsible for managing (i.e., (de)allocation) non-data consuming workers (e.g., model aggregating workers).
The system supports a hybrid mode where some are managed workers and others are non-managed workers.

Note that the flame system is in active development and not all the functionalities are supported yet.
2 changes: 1 addition & 1 deletion docs/03-a-ubuntu.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ Error: INSTALLATION FAILED: failed post-install: timed out waiting for the condi
This issue may be because container images are large or the Internet connection is slow.
The issue has been reported in minikube [github](https://github.com/kubernetes/minikube/issues/14789).
The latest minikube still doesn't contain the patched component (cri-dockerd 0.2.6).
A workaround is to pull images manually (e.g. `minikube ssh docker pull ciscoresearch/flame:latest`).
A workaround is to pull images manually (e.g. `minikube ssh docker pull ciscoresearch/flame:v0.2.2`).
The command `kubectl get pods -n flame` gives a list of pods and their status.
The pods with `ErrImagePull` or `ImagePullBackOff` status are ones that might be affected by the issue.
Identifying the required image can be done by running a `kubectl describe` command
Expand Down
4 changes: 2 additions & 2 deletions docs/04-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ flamectl create dataset dataset.json
```
The last command returns the dataset's ID if successful.
If you want to start a two-trainer example, you need to create one more dataset because flame automatically assigns a trainer to a new dataset.
As the dataset ID is a unique key based on both URL in `dataset.json` and user ID in `${Home}/.flame/config.yaml`, you can modify either URL or user id. Or you can simply duplicate the same dataset's ID in `job.json`.
As the dataset ID is a unique key based on both URL in `dataset.json` and user ID in `${HOME}/.flame/config.yaml`, you can modify either URL or user id. Or you can simply duplicate the same dataset's ID in `dataSpec.json`.

### Step 5: modify a job specification

With your choice of text editor, modify `job.json` to specify correct dataset's ID and save the change.
With your choice of text editor, modify `dataSpec.json` to specify correct dataset's ID and save the change.

### Step 6: create a job
```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/05-flame-basics.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ For role, it has two attributes: *isDataConsumer* and *replica*.

**isDataconsumer**: this is a boolean attribute to denote that a role is supposed to consume data.
If the attribute is set, it indicates workers created from this role are training workers.
It has an important implication. In the orchestration mode, the number of specified datasets corresponds to the number of workers from the role with isDataConsumer attribute set.
It has an important implication, indicating the number of specified datasets corresponds to the number of workers from the role with isDataConsumer attribute set.

**replica**: This is applied to the roles with no isDataConsumer attribute set. This feature is for high availability.
It is yet to be implemented and will be supported in the future.
Expand Down
Loading

0 comments on commit 63a5f81

Please sign in to comment.