dask · gjoseph92 · Jun 9, 2021 · Jun 9, 2021 · Jun 9, 2021 · Jun 9, 2021
@@ -950,6 +950,9 @@ class TaskGroup:
     _start: double
     _stop: double
     _all_durations: object
+    _last_worker: WorkerState
+    _last_worker_tasks_left: int  # TODO Py_ssize_t?
+    _last_worker_priority: tuple  # TODO remove (debugging only)
 
     def __init__(self, name: str):
         self._name = name
@@ -964,6 +967,9 @@ def __init__(self, name: str):
         self._start = 0.0
         self._stop = 0.0
         self._all_durations = defaultdict(float)
+        self._last_worker = None
+        self._last_worker_tasks_left = 0
+        self._last_worker_priority = ()
 
     @property
     def name(self):
@@ -1009,6 +1015,26 @@ def start(self):
     def stop(self):
         return self._stop
 
+    @property
+    def last_worker(self):
+        return self._last_worker
+
+    @property
+    def last_worker_tasks_left(self):
+        return self._last_worker_tasks_left
+
+    @last_worker_tasks_left.setter
+    def last_worker_tasks_left(self, n: int):
+        self._last_worker_tasks_left = n
+
+    @property
+    def last_worker_priority(self):
+        return self._last_worker_priority
+
+    @last_worker_priority.setter
+    def last_worker_priority(self, x: tuple):
+        self._last_worker_priority = x
+
     @ccall
     def add(self, o):
         ts: TaskState = o
@@ -2337,14 +2363,20 @@ def decide_worker(self, ts: TaskState) -> WorkerState:
             ts.state = "no-worker"
             return ws
 
-        if ts._dependencies or valid_workers is not None:
+        if (
+            ts._dependencies
+            or valid_workers is not None
+            or ts._group._last_worker is not None
+        ):
             ws = decide_worker(
                 ts,
                 self._workers_dv.values(),
                 valid_workers,
                 partial(self.worker_objective, ts),
+                self._total_nthreads,
             )
         else:
+            # Fastpath when there are no related tasks or restrictions
             worker_pool = self._idle or self._workers
             worker_pool_dv = cast(dict, worker_pool)
             wp_vals = worker_pool.values()
@@ -2366,6 +2398,14 @@ def decide_worker(self, ts: TaskState) -> WorkerState:
             else:  # dumb but fast in large case
                 ws = wp_vals[self._n_tasks % n_workers]
 
+            # TODO repeated logic from `decide_worker`
+            print(f"nodeps / no last worker fastpah - {ts.group_key} -> {ws.name}")
+            ts._group._last_worker = ws
+            ts._group._last_worker_tasks_left = math.floor(
+                len(ts._group) / self._total_nthreads
+            )
+            ts._group._last_worker_priority = ts._priority
+
         if self._validate:
             assert ws is None or isinstance(ws, WorkerState), (
                 type(ws),
@@ -7471,7 +7511,11 @@ def _reevaluate_occupancy_worker(state: SchedulerState, ws: WorkerState):
 @cfunc
 @exceptval(check=False)
 def decide_worker(
-    ts: TaskState, all_workers, valid_workers: set, objective
+    ts: TaskState,
+    all_workers,
+    valid_workers: set,
+    objective,
+    total_nthreads: Py_ssize_t,
 ) -> WorkerState:
     """
     Decide which worker should take task *ts*.
@@ -7488,36 +7532,92 @@ def decide_worker(
     of bytes sent between workers.  This is determined by calling the
     *objective* function.
     """
-    ws: WorkerState = None
     wws: WorkerState
-    dts: TaskState
+
+    group: TaskGroup = ts._group
+    ws: WorkerState = group._last_worker
+
+    if valid_workers is not None:
+        total_nthreads = sum(wws._nthreads for wws in valid_workers)
+
+    group_tasks_per_worker = len(group) / total_nthreads
+    ignore_deps_while_picking: bool = False
+
+    # Try to schedule sibling root-like tasks on the same workers, so subsequent reduction tasks
+    # don't require data transfer. Assumes `decide_worker` is being called in priority order.
+    if (
+        ws is not None  # there is a previous worker
+        and group_tasks_per_worker > 1  # group is larger than cluster
+        and (  # is a root-like task (task group is large, but depends on very few tasks)
+            sum(map(len, group._dependencies)) < 5  # TODO what number
+        )
+    ):
+        if group._last_worker_tasks_left > 0:
+            # Previous worker not fully assigned
+            group._last_worker_tasks_left -= 1
+            if group._last_worker_priority >= ts.priority:
+                print(
+                    f"decide_worker called out of priority order: {group._last_worker_priority} >= {ts.priority}.\n"
+                    f"{ts=}\n"
+                    f"{group.last_worker=}\n"
+                    f"{group.last_worker_tasks_left=}\n"
+                    f"{group_tasks_per_worker=}\n"
+                )
+            group._last_worker_priority = ts.priority
+            print(f"reusing worker - {ts.group_key} -> {ws.name}")
+            return ws
+
+        # Previous worker is fully assigned, so pick a new worker.
+        # Since this is a root-like task, we should ignore the placement of its dependencies while selecting workers.
+        # Every worker is going to end up running this type of task eventually, and any dependencies will have to be
+        # transferred to all workers, so there's no gain from only considering workers where the dependencies already live.
+        # Indeed, we _must_ consider all workers, otherwise we would keep picking the same "new" worker(s) every time,
+        # since there are only N workers to choose from that actually have the dependency (where N <= n_deps).
+        ignore_deps_while_picking = True
+        print(f"{ts.group_key} is root-like but {ws.name} is full - picking a new worker")
+
+    # Not a root-like task; pick the best worker among the valid workers
+    # that hold at least one dependency of this task.
     deps: set = ts._dependencies
+    dts: TaskState
     candidates: set
     assert all([dts._who_has for dts in deps])
-    if ts._actor:
-        candidates = set(all_workers)
+    if ignore_deps_while_picking:
+        candidates = valid_workers if valid_workers is not None else set(all_workers)
     else:
-        candidates = {wws for dts in deps for wws in dts._who_has}
-    if valid_workers is None:
-        if not candidates:
+        if ts._actor:
             candidates = set(all_workers)
-    else:
-        candidates &= valid_workers
-        if not candidates:
-            candidates = valid_workers
+        else:
+            candidates = {wws for dts in deps for wws in dts._who_has}
+        if valid_workers is None:
             if not candidates:
-                if ts._loose_restrictions:
-                    ws = decide_worker(ts, all_workers, None, objective)
-                return ws
+                candidates = set(all_workers)
+        else:
+            candidates &= valid_workers
+            if not candidates:
+                candidates = valid_workers
+                if not candidates:
+                    if ts._loose_restrictions:
+                        ws = decide_worker(ts, all_workers, None, objective, total_nthreads)
+                    return ws
 
     ncandidates: Py_ssize_t = len(candidates)
     if ncandidates == 0:
+        print(f"no candidates - {ts.group_key}")
         pass
     elif ncandidates == 1:
+        # NOTE: this is the ideal case: all the deps are already on the same worker.
+        # We did a good job in previous `decide_worker`s!
         for ws in candidates:
             break
+        print(f"1 candidate - {ts.group_key} -> {ws.name}")
     else:
         ws = min(candidates, key=objective)
+        print(f"picked worker - {ts.group_key} -> {ws.name}")
+
+    group._last_worker = ws
+    group._last_worker_tasks_left = math.floor(group_tasks_per_worker)
+    group._last_worker_priority = ts.priority
     return ws