From 4548f06e6a7576467e4ab1de45ff9ef748410295 Mon Sep 17 00:00:00 2001 From: Jaemin Shin <999fg@kaist.ac.kr> Date: Fri, 19 May 2023 11:05:16 -0700 Subject: [PATCH] fix: disable automatic eviction of diskcache at aggregation Python diskcache, which we use for aggregating trainer updates, has its own automatic eviction policy, depending on its size_limit and cull_limit values. Updated it to disable automatic eviction, and added a logger debug line that tells you the number of updates in the cache. --- lib/python/flame/mode/horizontal/syncfl/top_aggregator.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py b/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py index 3d017cdf3..55a2a32c4 100644 --- a/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py +++ b/lib/python/flame/mode/horizontal/syncfl/top_aggregator.py @@ -93,7 +93,11 @@ def internal_init(self) -> None: self.metrics = dict() # disk cache is used for saving memory in case model is large + # automatic eviction of disk cache is disabled with cull_limit 0 self.cache = Cache() + self.cache.reset('size_limit', 1e15) + self.cache.reset('cull_limit', 0) + self.optimizer = optimizer_provider.get( self.config.optimizer.sort, **self.config.optimizer.kwargs ) @@ -157,6 +161,8 @@ def _aggregate_weights(self, tag: str) -> None: # save training result from trainer in a disk cache self.cache[end] = tres + logger.debug(f"received {len(self.cache)} trainer updates in cache") + # optimizer conducts optimization (in this case, aggregation) global_weights = self.optimizer.do( deepcopy(self.weights),