Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-enable NVML monitoring for WSL #6119

Merged
merged 11 commits on May 4, 2022
51 changes: 38 additions & 13 deletions distributed/diagnostics/nvml.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
import os
from platform import uname

from packaging.version import parse as parse_version

import dask

try:
Expand All @@ -10,6 +12,7 @@

nvmlInitialized = False
nvmlLibraryNotFound = False
nvmlWslInsufficientDriver = False
nvmlOwnerPID = None
charlesbluca marked this conversation as resolved.
Show resolved Hide resolved


Expand All @@ -21,17 +24,21 @@ def _in_wsl():


def init_once():
global nvmlInitialized, nvmlLibraryNotFound, nvmlOwnerPID
global nvmlInitialized, nvmlLibraryNotFound, nvmlWslInsufficientDriver, nvmlOwnerPID

if dask.config.get("distributed.diagnostics.nvml") is False or _in_wsl():
nvmlInitialized = False
# nvml monitoring disabled
if dask.config.get("distributed.diagnostics.nvml") is False:
return

if pynvml is None or (nvmlInitialized is True and nvmlOwnerPID == os.getpid()):
# nvml is already initialized on this process
if nvmlInitialized and nvmlOwnerPID == os.getpid():
return

nvmlInitialized = True
nvmlOwnerPID = os.getpid()
# nvml failed to initialize due to missing / outdated requirements
if pynvml is None or nvmlLibraryNotFound or nvmlWslInsufficientDriver:
return

# attempt to initialize nvml
try:
pynvml.nvmlInit()
except (
Expand All @@ -40,11 +47,26 @@ def init_once():
pynvml.NVMLError_Unknown,
):
nvmlLibraryNotFound = True
return

# set a minimum driver version for WSL so we can assume certain queries work
if (
not nvmlLibraryNotFound
and parse_version(pynvml.nvmlSystemGetDriverVersion().decode())
< parse_version("512.15")
charlesbluca marked this conversation as resolved.
Show resolved Hide resolved
and _in_wsl()
):
nvmlWslInsufficientDriver = True
return

# initialization was successful
nvmlInitialized = True
nvmlOwnerPID = os.getpid()


def device_get_count():
init_once()
if nvmlLibraryNotFound or not nvmlInitialized:
if not nvmlInitialized:
return 0
else:
return pynvml.nvmlDeviceGetCount()
Expand All @@ -53,8 +75,14 @@ def device_get_count():
def _pynvml_handles():
count = device_get_count()
if count == 0:
if nvmlLibraryNotFound:
raise RuntimeError("PyNVML is installed, but NVML is not")
if pynvml is None or nvmlLibraryNotFound:
raise RuntimeError(
"NVML monitoring requires PyNVML and NVML to be installed"
)
charlesbluca marked this conversation as resolved.
Show resolved Hide resolved
if nvmlWslInsufficientDriver:
raise RuntimeError(
"NVML is installed, but NVIDIA drivers are outdated for WSL"
charlesbluca marked this conversation as resolved.
Show resolved Hide resolved
)
else:
raise RuntimeError("No GPUs available")

Expand All @@ -80,13 +108,10 @@ def has_cuda_context():
index of the device for which there's a CUDA context.
"""
init_once()
if nvmlLibraryNotFound or not nvmlInitialized:
if not nvmlInitialized:
return False
for index in range(device_get_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
# TODO: WSL doesn't support this NVML query yet; when NVML monitoring is enabled
# there we may need to wrap this in a try/except block.
# See https://github.com/dask/distributed/pull/5568
if hasattr(pynvml, "nvmlDeviceGetComputeRunningProcesses_v2"):
running_processes = pynvml.nvmlDeviceGetComputeRunningProcesses_v2(handle)
else:
Expand Down