[BUG] Setting Dask Cluster through UCX not working #872

Closed
kshitizgupta21 opened this issue Jun 9, 2021 · 2 comments
Labels: bug (Something isn't working), P0, PyTorch

Comments

@kshitizgupta21

I tried setting up a LocalCUDACluster with the ucx protocol, but I get ModuleNotFoundError: No module named 'ucp'. I'm running this in the NVTabular merlin-pytorch-training:0.5.3 NGC container, and I thought that container had everything we need for UCX.

Here's my code:

protocol = "ucx"  
# Select GPUs to place workers. Here 1st and 2nd GPU
# If you want the first 4 GPUs it would be 0,1,2,3 and so on
visible_devices = "0,1"  
# Get the IP Address
cmd = "hostname --all-ip-addresses"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
IPADDR = str(output.decode()).split()[0]
cluster = LocalCUDACluster(
    ip=IPADDR,
    protocol=protocol,
    enable_nvlink=True,
    CUDA_VISIBLE_DEVICES=visible_devices,
    local_directory=dask_workdir,
    device_memory_limit=0.8 # This can be changed depending on your workflow
)
# Create the distributed client
client = Client(cluster)
client

Here's the error traceback:

/opt/conda/lib/python3.8/site-packages/dask_cuda/local_cuda_cluster.py:219: UserWarning: When using NVLink we recommend setting a `rmm_pool_size`. Please see: https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes for more details
  warnings.warn(
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-17-f505f8fa4f24> in <module>
     14 IPADDR = str(output.decode()).split()[0]
     15 
---> 16 cluster = LocalCUDACluster(
     17     ip=IPADDR,
     18     protocol=protocol,
/opt/conda/lib/python3.8/site-packages/dask_cuda/local_cuda_cluster.py in __init__(self, n_workers, threads_per_worker, processes, memory_limit, device_memory_limit, CUDA_VISIBLE_DEVICES, data, local_directory, protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, ucx_net_devices, rmm_pool_size, rmm_managed_memory, rmm_log_directory, jit_unspill, log_spilling, **kwargs)
    284         )
    285 
--> 286         super().__init__(
    287             n_workers=0,
    288             threads_per_worker=threads_per_worker,
/opt/conda/lib/python3.8/site-packages/distributed/deploy/local.py in __init__(self, name, n_workers, threads_per_worker, processes, loop, start, host, ip, scheduler_port, silence_logs, dashboard_address, worker_dashboard_address, diagnostics_port, services, worker_services, service_kwargs, asynchronous, security, protocol, blocked_handlers, interface, worker_class, scheduler_kwargs, **worker_kwargs)
    229         workers = {i: worker for i in range(n_workers)}
    230 
--> 231         super().__init__(
    232             name=name,
    233             scheduler=scheduler,
/opt/conda/lib/python3.8/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close)
    280         if not self.asynchronous:
    281             self._loop_runner.start()
--> 282             self.sync(self._start)
    283             self.sync(self._correct_state)
    284 
/opt/conda/lib/python3.8/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    186             return future
    187         else:
--> 188             return sync(self.loop, func, *args, **kwargs)
    189 
    190     def _log(self, log):
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    351     if error[0]:
    352         typ, exc, tb = error[0]
--> 353         raise exc.with_traceback(tb)
    354     else:
    355         return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
    334             if callback_timeout is not None:
    335                 future = asyncio.wait_for(future, callback_timeout)
--> 336             result[0] = yield future
    337         except Exception as exc:
    338             error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/deploy/spec.py in _start(self)
    309                 cls = import_term(cls)
    310             self.scheduler = cls(**self.scheduler_spec.get("options", {}))
--> 311             self.scheduler = await self.scheduler
    312         self.scheduler_comm = rpc(
    313             getattr(self.scheduler, "external_address", None) or self.scheduler.address,
/opt/conda/lib/python3.8/site-packages/distributed/core.py in _()
    283                         )
    284                 else:
--> 285                     await self.start()
    286                     self.status = Status.running
    287             return self
/opt/conda/lib/python3.8/site-packages/distributed/scheduler.py in start(self)
   3462 
   3463         for addr in self._start_address:
-> 3464             await self.listen(
   3465                 addr,
   3466                 allow_offload=False,
/opt/conda/lib/python3.8/site-packages/distributed/core.py in listen(self, port_or_addr, allow_offload, **kwargs)
    398             addr = port_or_addr
    399             assert isinstance(addr, str)
--> 400         listener = await listen(
    401             addr,
    402             self.handle_comm,
/opt/conda/lib/python3.8/site-packages/distributed/comm/core.py in _()
    206     def __await__(self):
    207         async def _():
--> 208             await self.start()
    209             return self
    210 
/opt/conda/lib/python3.8/site-packages/distributed/comm/ucx.py in start(self)
    389                 await self.comm_handler(ucx)
    390 
--> 391         init_once()
    392         self.ucp_server = ucx_create_listener(serve_forever, port=self._input_port)
    393 
/opt/conda/lib/python3.8/site-packages/distributed/comm/ucx.py in init_once()
     54         return
     55 
---> 56     import ucp as _ucp
     57 
     58     ucp = _ucp
ModuleNotFoundError: No module named 'ucp'
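
The traceback bottoms out at import ucp inside distributed/comm/ucx.py, i.e. the ucx-py package is simply not present in the environment. As a quick sanity check (a minimal sketch, assuming it is run in the same Python environment the cluster would use), you can confirm whether ucx-py is importable before building the cluster:

# Minimal diagnostic: the "ucx" protocol needs the ucx-py package (imported as "ucp").
# If this import fails, LocalCUDACluster(protocol="ucx") will fail the same way.
try:
    import ucp
    print("ucx-py found, version:", getattr(ucp, "__version__", "unknown"))
except ModuleNotFoundError:
    print("ucx-py (module 'ucp') is not installed in this environment")
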
@kshitizgupta21 kshitizgupta21 added the bug Something isn't working label Jun 9, 2021
@benfred benfred added the P0 label Jun 10, 2021
@benfred
Member

benfred commented Jun 28, 2021

This is on all our staging containers -

@benfred
Member

benfred commented Jul 12, 2021

This is on our staging containers and will be in the 0.6 release.
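
For anyone still on the 0.5.x images in the meantime, one possible workaround (a sketch, not a step confirmed in this thread, assuming conda and the rapidsai/conda-forge channels are usable inside the container) is to install ucx-py yourself, e.g. conda install -c rapidsai -c nvidia -c conda-forge ucx-py "ucx-proc=*=gpu", and then build the cluster with an RMM pool as the UserWarning in the traceback recommends:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Hypothetical revision of the cluster from the report once ucx-py is importable.
# rmm_pool_size is added because dask-cuda warns that NVLink works best with an
# RMM pool; "24GB" is only an illustrative value, size it to your GPUs.
cluster = LocalCUDACluster(
    protocol="ucx",
    enable_nvlink=True,
    CUDA_VISIBLE_DEVICES="0,1",
    rmm_pool_size="24GB",
    device_memory_limit=0.8,
)
client = Client(cluster)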

@benfred benfred closed this as completed Jul 12, 2021