Skip to content

Commit

Permalink
fix: remove non-orchestration mode from the project as it's not maint…
Browse files Browse the repository at this point in the history
…ained anymore, at least for the medium term.

 The only mode flame will support for now is the orchestration mode, which means that all the workers are managed by the flame system.
  • Loading branch information
openwithcode committed Mar 28, 2023
1 parent 1c21080 commit 63a5f81
Show file tree
Hide file tree
Showing 34 changed files with 200 additions and 710 deletions.
60 changes: 26 additions & 34 deletions api/job_components.partials.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,46 +79,38 @@ CommBackend:
########################################
DataSpec:
description: Dataset specification
type: object
properties:
fromUser:
$ref: '#/components/schemas/FromUser'
fromSystem:
$ref: '#/components/schemas/FromSystem'
type: array
items:
$ref: '#/components/schemas/RoleDatasetGroups'
example:
fromUser:
default: 1 # there is one user-fed dataset under default realm
uk: 2 # there are two user-fed datasets (hence two compute nodes) under uk realm
us/foo: 3 # there are three user-fed datasets under us/foo realm
fromSystem:
trainer:
"default/eu":
- role: trainer
datasetGroups:
"eu":
- 61609290fa724deafdb2a4fa # id of dataset registered in the flame system
- 61609290fa724deafdb2a4fb # id of dataset registered in the flame system
"default/us":
"us":
- 61609290fa724deafdb2a4fc # id of dataset registered in the flame system

########################################
# FromUser
########################################
FromUser:
# an object is a key-value pair of realm and count
type: object
additionalProperties:
type: integer
format: int32

########################################
# FromSystem
########################################
FromSystem:
RoleDatasetGroups:
description: Dataset specification
type: object
additionalProperties:
type: object
additionalProperties:
type: array
items:
type: string
properties:
role:
type: string
datasetGroups:
type: object
additionalProperties:
type: array
items:
type: string
example:
role: trainer
datasetGroups:
"eu":
- 61609290fa724deafdb2a4fa # id of dataset registered in the flame system
- 61609290fa724deafdb2a4fb # id of dataset registered in the flame system
"us":
- 61609290fa724deafdb2a4fc # id of dataset registered in the flame system

########################################
# Model Spec
Expand Down
20 changes: 4 additions & 16 deletions cmd/controller/app/job/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,10 @@ func (b *JobBuilder) setup() error {
b.roleCode = zippedRoleCode

// Iterating for each dataset id to fetch dataset info and update the datasets array.
for roleName, groups := range b.jobSpec.DataSpec.FromSystem {
for _, dataSpec := range b.jobSpec.DataSpec {
roleName := dataSpec.Role
groups := dataSpec.DatasetGroups

if len(groups) == 0 {
return fmt.Errorf("no dataset group specified for trainer role %s", roleName)
}
Expand Down Expand Up @@ -220,21 +223,6 @@ func (b *JobBuilder) build() ([]objects.Task, []string, error) {
continue
}

// TODO: this is absolute and should be removed
for group, count := range b.jobSpec.DataSpec.FromUser {
for i := 0; i < int(count); i++ {
task := tmpl.Task

task.Type = openapi.USER
task.JobConfig.Realm = group
task.JobConfig.GroupAssociation = b.getGroupAssociationByGroup(roleName, group)

task.GenerateTaskId(i)

tasks = append(tasks, task)
}
}

var count int
groups := sortedKeys(b.datasets[roleName])

Expand Down
66 changes: 42 additions & 24 deletions cmd/controller/app/job/builder_example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,13 @@ func Test_asyncfl_hier_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -169,9 +172,12 @@ func Test_distributed_training(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{dataset1ID, dataset2ID, dataset3ID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{dataset1ID, dataset2ID, dataset3ID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -242,10 +248,13 @@ func Test_hier_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"eu": []string{datasetEuGermanyID, datasetEuUkID},
"na": []string{datasetNaCanadaID, datasetNaUsID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -335,11 +344,14 @@ func Test_medmnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{
dataset1ID, dataset2ID, dataset3ID, dataset4ID, dataset5ID,
dataset6ID, dataset7ID, dataset8ID, dataset9ID, dataset10ID,
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{
dataset1ID, dataset2ID, dataset3ID, dataset4ID, dataset5ID,
dataset6ID, dataset7ID, dataset8ID, dataset9ID, dataset10ID,
},
},
},
}
Expand Down Expand Up @@ -402,9 +414,12 @@ func Test_mnist(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"us": []string{datasetID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"us": []string{datasetID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down Expand Up @@ -471,11 +486,14 @@ func Test_parallel_experiment(t *testing.T) {

var jobSpecData openapi.JobSpec
readFileToStruct(t, rootExample+"/job.json", &jobSpecData)
jobSpecData.DataSpec.FromSystem = map[string]map[string][]string{
"trainer": {
"asia": []string{datasetAsiaID},
"uk": []string{datasetEuUkID},
"us": []string{datasetUsWestID},
jobSpecData.DataSpec = []openapi.RoleDatasetGroups{
{
Role: "trainer",
DatasetGroups: map[string][]string{
"asia": []string{datasetAsiaID},
"uk": []string{datasetEuUkID},
"us": []string{datasetUsWestID},
},
},
}
jobStatus, err := dbService.CreateJob(userID, jobSpecData)
Expand Down
2 changes: 1 addition & 1 deletion cmd/flamectl/resources/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func createJobSpec(data []byte, jobFile string) (bool, openapi.JobSpec) {
}

//validate data spec
dataSpec := openapi.DataSpec{}
var dataSpec []openapi.RoleDatasetGroups
dataSpecPath := fmt.Sprintf("%s/%s", path.Dir(jobFile), createJobRequest.DataSpecPath)

err = util.ReadFileToStruct(dataSpecPath, &dataSpec)
Expand Down
12 changes: 0 additions & 12 deletions docs/01-introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,4 @@ and makes worker creation requests.
9. Step 9: The flamelet monitors the execution of the task and updates the state once the task execution is over.
10. Step 10: In the meantime, the controller also monitors a job's status and takes action when necessary (e.g., deallocating workers).

The above workflow can have some variations depending on the deployment mode.
There are two types of deployment modes: **orchestration** and **non-orchestration**.
In orchestration mode, all the workers are under the management of flame system through the help of cluster orchestrators.
On the other hand, in non-orchestration mode, the workers of consuming data (e.g., training worker) are not under management of the flame system.
The non-orchestration mode is useful in one of the following situations:
* when the flame system doesn't have permission to utilize resources of geo-distributed clusters
* when the geo-distributed clusters are not under the management of one organization
* when participants of a FL job want to have a control over when to join or leave the job

In non-orchestration mode, the flame system is only responsible for managing (i.e., (de)allocation) non-data consuming workers (e.g., model aggregating workers).
The system supports a hybrid mode where some are managed workers and others are non-managed workers.

Note that the flame system is in active development and not all the functionalities are supported yet.
2 changes: 1 addition & 1 deletion docs/03-a-ubuntu.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ Error: INSTALLATION FAILED: failed post-install: timed out waiting for the condi
This issue may be because container images are large or the Internet connection is slow.
The issue has been reported in minikube [github](https://github.com/kubernetes/minikube/issues/14789).
The latest minikube still doesn't contain the patched component (cri-dockerd 0.2.6).
A workaround is to pull images manually (e.g. `minikube ssh docker pull ciscoresearch/flame:latest`).
A workaround is to pull images manually (e.g. `minikube ssh docker pull ciscoresearch/flame:v0.2.2`).
The command `kubectl get pods -n flame` gives a list of pods and their status.
The pods with `ErrImagePull` or `ImagePullBackOff` status are ones that might be affected by the issue.
Identifying the required image can be done by running a `kubectl describe` command
Expand Down
4 changes: 2 additions & 2 deletions docs/04-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,11 @@ flamectl create dataset dataset.json
```
The last command returns the dataset's ID if successful.
If you want to start a two-trainer example, you need to create one more dataset because flame automatically assigns a trainer to a new dataset.
As the dataset ID is a unique key based on both URL in `dataset.json` and user ID in `${Home}/.flame/config.yaml`, you can modify either URL or user id. Or you can simply duplicate the same dataset's ID in `job.json`.
As the dataset ID is a unique key based on both URL in `dataset.json` and user ID in `${HOME}/.flame/config.yaml`, you can modify either URL or user id. Or you can simply duplicate the same dataset's ID in `dataSpec.json`.

### Step 5: modify a job specification

With your choice of text editor, modify `job.json` to specify correct dataset's ID and save the change.
With your choice of text editor, modify `dataSpec.json` to specify correct dataset's ID and save the change.

### Step 6: create a job
```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/05-flame-basics.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ For role, it has two attributes: *isDataConsumer* and *replica*.

**isDataconsumer**: this is a boolean attribute to denote that a role is supposed to consume data.
If the attribute is set, it indicates workers created from this role are training workers.
It has an important implication. In the orchestration mode, the number of specified datasets corresponds to the number of workers from the role with isDataConsumer attribute set.
It has an important implication, indicating the number of specified datasets corresponds to the number of workers from the role with isDataConsumer attribute set.

**replica**: This is applied to the roles with no isDataConsumer attribute set. This feature is for high availability.
It is yet to be implemented and will be supported in the future.
Expand Down
Loading

0 comments on commit 63a5f81

Please sign in to comment.