[BUG] Setting Dask Cluster through UCX not working #872

Closed
kshitizgupta21 opened this issue Jun 9, 2021 · 2 comments
Labels: bug (Something isn't working), P0, PyTorch

Comments

@kshitizgupta21

I tried setting up a LocalCUDACluster with the ucx protocol, but I get ModuleNotFoundError: No module named 'ucp'. I'm running this in the NVTabular merlin-pytorch-training:0.5.3 NGC container, and I thought that container had everything we need for UCX.

Here's my code:

protocol = "ucx"  
# Select GPUs to place workers. Here 1st and 2nd GPU
# If you want the first 4 GPUs it would be 0,1,2,3 and so on
visible_devices = "0,1"  
# Get the IP Address
cmd = "hostname --all-ip-addresses"
process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
output, error = process.communicate()
IPADDR = str(output.decode()).split()[0]
cluster = LocalCUDACluster(
    ip=IPADDR,
    protocol=protocol,
    enable_nvlink=True,
    CUDA_VISIBLE_DEVICES=visible_devices,
    local_directory=dask_workdir,
    device_memory_limit=0.8 # This can be changed depending on your workflow
)
# Create the distributed client
client = Client(cluster)
client

Here's the error traceback:

/opt/conda/lib/python3.8/site-packages/dask_cuda/local_cuda_cluster.py:219: UserWarning: When using NVLink we recommend setting a `rmm_pool_size`. Please see: https://dask-cuda.readthedocs.io/en/latest/ucx.html#important-notes for more details
  warnings.warn(
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-17-f505f8fa4f24> in <module>
     14 IPADDR = str(output.decode()).split()[0]
     15 
---> 16 cluster = LocalCUDACluster(
     17     ip=IPADDR,
     18     protocol=protocol,
/opt/conda/lib/python3.8/site-packages/dask_cuda/local_cuda_cluster.py in __init__(self, n_workers, threads_per_worker, processes, memory_limit, device_memory_limit, CUDA_VISIBLE_DEVICES, data, local_directory, protocol, enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, ucx_net_devices, rmm_pool_size, rmm_managed_memory, rmm_log_directory, jit_unspill, log_spilling, **kwargs)
    284         )
    285 
--> 286         super().__init__(
    287             n_workers=0,
    288             threads_per_worker=threads_per_worker,
/opt/conda/lib/python3.8/site-packages/distributed/deploy/local.py in __init__(self, name, n_workers, threads_per_worker, processes, loop, start, host, ip, scheduler_port, silence_logs, dashboard_address, worker_dashboard_address, diagnostics_port, services, worker_services, service_kwargs, asynchronous, security, protocol, blocked_handlers, interface, worker_class, scheduler_kwargs, **worker_kwargs)
    229         workers = {i: worker for i in range(n_workers)}
    230 
--> 231         super().__init__(
    232             name=name,
    233             scheduler=scheduler,
/opt/conda/lib/python3.8/site-packages/distributed/deploy/spec.py in __init__(self, workers, scheduler, worker, asynchronous, loop, security, silence_logs, name, shutdown_on_close)
    280         if not self.asynchronous:
    281             self._loop_runner.start()
--> 282             self.sync(self._start)
    283             self.sync(self._correct_state)
    284 
/opt/conda/lib/python3.8/site-packages/distributed/deploy/cluster.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    186             return future
    187         else:
--> 188             return sync(self.loop, func, *args, **kwargs)
    189 
    190     def _log(self, log):
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    351     if error[0]:
    352         typ, exc, tb = error[0]
--> 353         raise exc.with_traceback(tb)
    354     else:
    355         return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
    334             if callback_timeout is not None:
    335                 future = asyncio.wait_for(future, callback_timeout)
--> 336             result[0] = yield future
    337         except Exception as exc:
    338             error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
    760 
    761                     try:
--> 762                         value = future.result()
    763                     except Exception:
    764                         exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/deploy/spec.py in _start(self)
    309                 cls = import_term(cls)
    310             self.scheduler = cls(**self.scheduler_spec.get("options", {}))
--> 311             self.scheduler = await self.scheduler
    312         self.scheduler_comm = rpc(
    313             getattr(self.scheduler, "external_address", None) or self.scheduler.address,
/opt/conda/lib/python3.8/site-packages/distributed/core.py in _()
    283                         )
    284                 else:
--> 285                     await self.start()
    286                     self.status = Status.running
    287             return self
/opt/conda/lib/python3.8/site-packages/distributed/scheduler.py in start(self)
   3462 
   3463         for addr in self._start_address:
-> 3464             await self.listen(
   3465                 addr,
   3466                 allow_offload=False,
/opt/conda/lib/python3.8/site-packages/distributed/core.py in listen(self, port_or_addr, allow_offload, **kwargs)
    398             addr = port_or_addr
    399             assert isinstance(addr, str)
--> 400         listener = await listen(
    401             addr,
    402             self.handle_comm,
/opt/conda/lib/python3.8/site-packages/distributed/comm/core.py in _()
    206     def __await__(self):
    207         async def _():
--> 208             await self.start()
    209             return self
    210 
/opt/conda/lib/python3.8/site-packages/distributed/comm/ucx.py in start(self)
    389                 await self.comm_handler(ucx)
    390 
--> 391         init_once()
    392         self.ucp_server = ucx_create_listener(serve_forever, port=self._input_port)
    393 
/opt/conda/lib/python3.8/site-packages/distributed/comm/ucx.py in init_once()
     54         return
     55 
---> 56     import ucp as _ucp
     57 
     58     ucp = _ucp
ModuleNotFoundError: No module named 'ucp'
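
The traceback bottoms out at import ucp inside distributed/comm/ucx.py, i.e. the ucx-py package is simply not present in the environment. As a quick sanity check (a minimal sketch, assuming it is run in the same Python environment the cluster would use), you can confirm whether ucx-py is importable before building the cluster:

# Minimal diagnostic: the "ucx" protocol needs the ucx-py package (imported as "ucp").
# If this import fails, LocalCUDACluster(protocol="ucx") will fail the same way.
try:
    import ucp
    print("ucx-py found, version:", getattr(ucp, "__version__", "unknown"))
except ModuleNotFoundError:
    print("ucx-py (module 'ucp') is not installed in this environment")
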
@kshitizgupta21 kshitizgupta21 added the bug Something isn't working label Jun 9, 2021
@benfred benfred added the P0 label Jun 10, 2021
@benfred
Member

benfred commented Jun 28, 2021

This is on all our staging containers -

@benfred
Member

benfred commented Jul 12, 2021

This is on our staging containers and will be in the 0.6 release.
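
For anyone still on the 0.5.x images in the meantime, one possible workaround (a sketch, not a step confirmed in this thread, assuming conda and the rapidsai/conda-forge channels are usable inside the container) is to install ucx-py yourself, e.g. conda install -c rapidsai -c nvidia -c conda-forge ucx-py "ucx-proc=*=gpu", and then build the cluster with an RMM pool as the UserWarning in the traceback recommends:

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# Hypothetical revision of the cluster from the report once ucx-py is importable.
# rmm_pool_size is added because dask-cuda warns that NVLink works best with an
# RMM pool; "24GB" is only an illustrative value, size it to your GPUs.
cluster = LocalCUDACluster(
    protocol="ucx",
    enable_nvlink=True,
    CUDA_VISIBLE_DEVICES="0,1",
    rmm_pool_size="24GB",
    device_memory_limit=0.8,
)
client = Client(cluster)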

@benfred benfred closed this as completed Jul 12, 2021