Skip to content

Commit

Permalink
Adding LIFL's shared memory backend (#567)
Browse files Browse the repository at this point in the history
* Added FedScale@7ec441c as submodule

* Adding script to partition the FedScale dataset into multiple clients.

* update

* update git modules

* Add fedscale package

* Adding FedScale dataset partitioner (#531)

* Added FedScale@7ec441c as submodule

* Adding script to partition the FedScale dataset into multiple clients.

* update

* update git modules

* Add fedscale package

* Avoid duplicated backend instances

* Avoid creating multiple instances of the same backend in a channel (#532)

* Avoid duplicated backend instances

* Add example to use the Femnist dataset from FedScale (1 aggregator + 1 trainer)

* Add Hier_Femnist example

* Add round-robin pairing in coordinator

* Add 3-level hierarchy: top-mid-leaf---trainer

* Remove submodule

* Add download.sh to download FedScale dataset

* moving files

* update dataset directory in fedscale example

* fixing the protocol in leaf aggregator

* Update Readme to introduce LIFL

* Fix Readme

* Removed .gitmodules as per request

* Update README; update year; create directory for lifl docs

* update year in fedscale_dataset_partitioner.py

* add pip instruction for gdown; remove random_pair

* Adding shared memory backend components

* update backends.py and config.py

* Add config files for shm backend in coord_3_hier_syncfl_mnist

* Remove flame/backend/lifl/

* Adding support for eager aggregation

* Add shm backend

* Clean up code of shared memory backend

* Adding scripts for upgrading kernel and installing libbpf

* Change execution permission of SPRIGHT scripts

* Make 1024 a constant in shm.py

* Add config files for using shared memory backend within hierarchy

* Simplifying the implementation of the top aggregator in eager_syncfl
  • Loading branch information
ShixiongQi authored May 20, 2024
1 parent 25a1e23 commit e350a32
Show file tree
Hide file tree
Showing 42 changed files with 2,952 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
{
"taskid": "59d06b7526964db86cf37c70e8e0cdb6bd7aa744",
"backend": "p2p",
"brokers": [
{
"host": "localhost",
"sort": "mqtt"
},
{
"host": "localhost:10104",
"sort": "p2p"
},
{
"host": "localhost:10105",
"sort": "shm"
}
],
"groupAssociation": {
"leaf-agg-coord-channel": "default",
"param-channel": "default",
"hier-channel": "default"
},
"channels": [
{
"name": "leaf-agg-coord-channel",
"description": "Channel between leaf aggregator and coordinator",
"pair": [
"leaf-aggregator",
"coordinator"
],
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"funcTags": {
"leaf-aggregator": [
"coordinate"
],
"coordinator": [
"coordinateWithLeafAgg"
]
}
},
{
"description": "Model update is sent from leaf aggregator to mid aggregator and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "hier-channel",
"pair": [
"middle-aggregator",
"leaf-aggregator"
],
"backend": "shm",
"funcTags": {
"middle-aggregator": [
"distribute",
"aggregate"
],
"leaf-aggregator": [
"fetch",
"upload"
]
}
},
{
"description": "Model update is sent from leaf aggregator to trainer and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "param-channel",
"pair": [
"leaf-aggregator",
"trainer"
],
"funcTags": {
"leaf-aggregator": [
"distribute",
"aggregate"
],
"trainer": [
"fetch",
"upload"
]
}
}
],
"dataset": "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz",
"dependencies": [
"numpy >= 1.2.0"
],
"hyperparameters": {
"batchSize": 32,
"learningRate": 0.01,
"rounds": 5
},
"baseModel": {
"name": "",
"version": 1
},
"job": {
"id": "622a358619ab59012eabeefb",
"name": "mnist"
},
"registry": {
"sort": "dummy",
"uri": "http://flame-mlflow:5000"
},
"selector": {
"sort": "default",
"kwargs": {}
},
"maxRunTime": 300,
"realm": "default-cluster",
"role": "leaf-aggregator"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
{
"taskid": "59d06b7526964db86cf37c70e8e0cdb6bd7aa745",
"backend": "p2p",
"brokers": [
{
"host": "localhost",
"sort": "mqtt"
},
{
"host": "localhost:10104",
"sort": "p2p"
},
{
"host": "localhost:10105",
"sort": "shm"
}
],
"groupAssociation": {
"leaf-agg-coord-channel": "default",
"param-channel": "default",
"hier-channel": "default"
},
"channels": [
{
"name": "leaf-agg-coord-channel",
"description": "Channel between leaf aggregator and coordinator",
"pair": [
"leaf-aggregator",
"coordinator"
],
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"funcTags": {
"leaf-aggregator": [
"coordinate"
],
"coordinator": [
"coordinateWithLeafAgg"
]
}
},
{
"description": "Model update is sent from leaf aggregator to mid aggregator and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "hier-channel",
"pair": [
"middle-aggregator",
"leaf-aggregator"
],
"backend": "shm",
"funcTags": {
"middle-aggregator": [
"distribute",
"aggregate"
],
"leaf-aggregator": [
"fetch",
"upload"
]
}
},
{
"description": "Model update is sent from leaf aggregator to trainer and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "param-channel",
"pair": [
"leaf-aggregator",
"trainer"
],
"funcTags": {
"leaf-aggregator": [
"distribute",
"aggregate"
],
"trainer": [
"fetch",
"upload"
]
}
}
],
"dataset": "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz",
"dependencies": [
"numpy >= 1.2.0"
],
"hyperparameters": {
"batchSize": 32,
"learningRate": 0.01,
"rounds": 5
},
"baseModel": {
"name": "",
"version": 1
},
"job": {
"id": "622a358619ab59012eabeefb",
"name": "mnist"
},
"registry": {
"sort": "dummy",
"uri": ""
},
"selector": {
"sort": "default",
"kwargs": {}
},
"maxRunTime": 300,
"realm": "default-cluster",
"role": "leaf-aggregator"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
{
"taskid": "49d06b7526964db86cf37c70e8e0cdb6bd7aa746",
"backend": "p2p",
"brokers": [
{
"host": "localhost",
"sort": "mqtt"
},
{
"host": "localhost:10104",
"sort": "p2p"
},
{
"host": "localhost:10105",
"sort": "shm"
}
],
"groupAssociation": {
"middle-agg-coord-channel": "default",
"hier-channel": "default",
"global-channel": "default"
},
"channels": [
{
"name": "middle-agg-coord-channel",
"description": "Channel between middle aggregator and coordinator",
"pair": [
"middle-aggregator",
"coordinator"
],
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"funcTags": {
"middle-aggregator": [
"coordinate"
],
"coordinator": [
"coordinateWithMidAgg"
]
}
},
{
"description": "Model update is sent from mid aggregator to global aggregator and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "global-channel",
"pair": [
"top-aggregator",
"middle-aggregator"
],
"backend": "shm",
"funcTags": {
"top-aggregator": [
"distribute",
"aggregate"
],
"middle-aggregator": [
"fetch",
"upload"
]
}
},
{
"description": "Model update is sent from mid aggregator to leaf aggregator and vice-versa",
"groupBy": {
"type": "tag",
"value": [
"default"
]
},
"name": "hier-channel",
"pair": [
"middle-aggregator",
"leaf-aggregator"
],
"backend": "shm",
"funcTags": {
"middle-aggregator": [
"distribute",
"aggregate"
],
"leaf-aggregator": [
"fetch",
"upload"
]
}
}
],
"dataset": "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz",
"dependencies": [
"numpy >= 1.2.0"
],
"hyperparameters": {
"batchSize": 32,
"learningRate": 0.01,
"rounds": 5
},
"baseModel": {
"name": "",
"version": 1
},
"job": {
"id": "622a358619ab59012eabeefb",
"name": "mnist"
},
"registry": {
"sort": "dummy",
"uri": "http://flame-mlflow:5000"
},
"selector": {
"sort": "default",
"kwargs": {}
},
"maxRunTime": 300,
"realm": "default-cluster",
"role": "middle-aggregator"
}
Loading

0 comments on commit e350a32

Please sign in to comment.