cisco-open · jaemin-shin · May 19, 2023 · May 19, 2023 · myungjin · May 19, 2023
diff --git a/lib/python/flame/mode/horizontal/asyncfl/middle_aggregator.py b/lib/python/flame/mode/horizontal/asyncfl/middle_aggregator.py
@@ -202,6 +202,7 @@ def _aggregate_weights(self, tag: str) -> None:
             tres = TrainResult(weights, count, version)
             # save training result from trainer in a disk cache
             self.cache[end] = tres
+            logger.debug(f"received {len(self.cache)} trainer updates in cache")
 
             self._agg_goal_weights = self.optimizer.do(
                 self._agg_goal_weights, self.cache, total=count, version=self._round

diff --git a/lib/python/flame/mode/horizontal/asyncfl/top_aggregator.py b/lib/python/flame/mode/horizontal/asyncfl/top_aggregator.py
@@ -89,6 +89,7 @@ def _aggregate_weights(self, tag: str) -> None:
             tres = TrainResult(weights, count, version)
             # save training result from trainer in a disk cache
             self.cache[end] = tres
+            logger.debug(f"received {len(self.cache)} trainer updates in cache")
 
             self._agg_goal_weights = self.optimizer.do(
                 self._agg_goal_weights, self.cache, total=count, version=self._round

diff --git a/lib/python/flame/mode/horizontal/feddyn/top_aggregator.py b/lib/python/flame/mode/horizontal/feddyn/top_aggregator.py
@@ -96,6 +96,8 @@ def _aggregate_weights(self, tag: str) -> None:
                 # save training result from trainer in a disk cache
                 self.cache[end] = tres
 
+        logger.debug(f"received {len(self.cache)} trainer updates in cache")
+
         # optimizer conducts optimization (in this case, aggregation)
         global_weights = self.optimizer.do(
             deepcopy(self.cld_weights),

diff --git a/lib/python/flame/mode/horizontal/oort/top_aggregator.py b/lib/python/flame/mode/horizontal/oort/top_aggregator.py
@@ -105,6 +105,8 @@ def _aggregate_weights(self, tag: str) -> None:
                 if received_end_count == aggr_num:
                     break
 
+        logger.debug(f"received {len(self.cache)} trainer updates in cache")
+
         # optimizer conducts optimization (in this case, aggregation)
         global_weights = self.optimizer.do(
             deepcopy(self.weights), self.cache, total=total

diff --git a/lib/python/flame/mode/horizontal/syncfl/middle_aggregator.py b/lib/python/flame/mode/horizontal/syncfl/middle_aggregator.py
@@ -75,7 +75,12 @@ def internal_init(self) -> None:
         self._round = 1
         self._work_done = False
 
+        # disk cache is used for saving memory in case model is large
+        # automatic eviction of disk cache is disabled with cull_limit 0
         self.cache = Cache()
+        self.cache.reset("size_limit", 1e15)
+        self.cache.reset("cull_limit", 0)
+
         self.dataset_size = 0
 
         # save distribute tag in an instance variable
@@ -182,6 +187,8 @@ def _aggregate_weights(self, tag: str) -> None:
                 # save training result from trainer in a disk cache
                 self.cache[end] = tres
 
+        logger.debug(f"received {len(self.cache)} trainer updates in cache")
+
         # optimizer conducts optimization (in this case, aggregation)
         global_weights = self.optimizer.do(
             deepcopy(self.weights), self.cache, total=total

diff --git a/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py b/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py
@@ -93,7 +93,11 @@ def internal_init(self) -> None:
         self.metrics = dict()
 
         # disk cache is used for saving memory in case model is large
+        # automatic eviction of disk cache is disabled with cull_limit 0
         self.cache = Cache()
+        self.cache.reset('size_limit', 1e15)
+        self.cache.reset('cull_limit', 0)
+
         self.optimizer = optimizer_provider.get(
             self.config.optimizer.sort, **self.config.optimizer.kwargs
         )
@@ -157,6 +161,8 @@ def _aggregate_weights(self, tag: str) -> None:
                 # save training result from trainer in a disk cache
                 self.cache[end] = tres
 
+        logger.debug(f"received {len(self.cache)} trainer updates in cache")
+
         # optimizer conducts optimization (in this case, aggregation)
         global_weights = self.optimizer.do(
             deepcopy(self.weights),