Skip to content

Commit

Permalink
handle gpu without fan
Browse files Browse the repository at this point in the history
  • Loading branch information
Ed-Yang committed May 17, 2021
1 parent 1fbccd4 commit 3ed7171
Show file tree
Hide file tree
Showing 14 changed files with 140 additions and 47 deletions.
4 changes: 3 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"program": "${file}",
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"justMyCode": false,
"args": [
"--verbose",
],
Expand Down Expand Up @@ -54,7 +55,8 @@
"program": "${workspaceFolder}/gpuctl/gpu_main.py",
"console": "integratedTerminal",
"args": [
"--slots", "0000:01:00.0",
// "--slots", "0000:01:00.0",
// "--slots", "0000:81:00.0",
"--set-speed", "50"
],
"env": {
Expand Down
9 changes: 6 additions & 3 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@
"-p",
"test_*.py"
],
"python.testing.pytestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.nosetestsEnabled": false,
"python.testing.unittestEnabled": true,
"python.pythonPath": "venv/bin/python"
"python.testing.unittestEnabled": false,
"python.pythonPath": "venv/bin/python",
"python.testing.pytestArgs": [
"tests"
]
}
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ Currently, the example of action script are based on the [miner's remote managem
## Environment and Installation

Intel CPU/16G RAM
Ubuntu 18.4/Python3
HiveOS/Ubuntu 18.04/Python3

* Environment setup

Expand Down Expand Up @@ -231,11 +231,9 @@ If a failure is detected, the gpuctl will invoke the given script with slot name
```
```shell
ID Slot Name Vendor PCI-ID Temp. Fan PWR Working
-- ------------ -------- ----------- ----- ---- ------- -------
1 0000:01:00.0 NVIDIA [10DE:1C03] 61c 50% 74.76w True
2 0000:0b:00.0 AMD [1002:67DF] 78c 47% 81.00w True
3 0000:0d:00.0 NVIDIA [10DE:1C03] 49c 50% 72.60w True
Set slot 0000:01:00.0 fan speed to 50%
Set slot 0000:0b:00.0 fan speed to 50%
Set slot 0000:0d:00.0 fan speed to 50%
```
* Example 3) If the temperature of a GPU is over 65c, activate the fan speed control for the specific GPU.
Expand Down Expand Up @@ -522,3 +520,4 @@ Use PhoenixMiner 5.3b as example:
* [GPUFan](https://github.com/milani/gpufan)
* [PyOpenCL Samples](https://github.com/virus-warnning/pyopencl_samples)
* [Associating OpenCL device ids with GPUs](https://anteru.net/blog/2014/associating-opencl-device-ids-with-gpus/)
* [NVIDIA/Tips and tricks](https://wiki.archlinux.org/title/NVIDIA/Tips_and_tricks)
5 changes: 5 additions & 0 deletions gpuctl/eth_ctl.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ class EthCtl:
def __init__(self, **kwargs):

valid_keys = ["base", "temp", "rate", "wait", "rmode", "delay", "script", "verbose"]

for k in kwargs.keys():
if k not in valid_keys:
return None

for key in valid_keys:
setattr(self, key, kwargs.get(key))

Expand Down
2 changes: 1 addition & 1 deletion gpuctl/eth_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import syslog

from gpuctl import __version__
from gpuctl import DRYRUN, GpuCtl, logger
from gpuctl import DRYRUN, logger
from gpuctl import PciDev, GpuDev, GpuAMD, GpuNV

from gpuctl import EthCtl, scan_miner
Expand Down
2 changes: 2 additions & 0 deletions gpuctl/gpu_amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ def set_speed(self, speed):
self.speed = speed
logger.debug(f'[{self.pci_dev.slot_name}/{self.name}] set speed {speed}% pwm {pwm}')

return True

def get_speed(self):
pwn = fv.read_file_value(self.hwmon_dir, GpuAMD.PWM_VAL)
speed = int(pwn/(self.max-self.min) * 100)
Expand Down
11 changes: 9 additions & 2 deletions gpuctl/gpu_ctl.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,13 @@ def __init__(self, **kwargs):
self.vendors = None

# overwrite default value with arguments
valid_keys = ["slots", "gpu_devices",
valid_keys = ["gpu_devices",
"fan", "curve", "delta", "temp", "tas", "verbose"]

for k in kwargs.keys():
if k not in valid_keys:
return None

for key in valid_keys:
setattr(self, key, kwargs.get(key))

Expand Down Expand Up @@ -170,7 +175,9 @@ def stop(self):
self.thread.join()

def set_interval(self, intvl=None, wait_period=None):

"""
wait interval must greater than interval
"""
interval = intvl if intvl else self.interval
wait_period = wait_period if wait_period else self.wait

Expand Down
23 changes: 16 additions & 7 deletions gpuctl/gpu_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,16 @@ def sigstop(a, b):
if gpu_dev and gpu_dev.is_gpu() and pdev.slot_name not in slot_names:
gpu_devices.append(gpu_dev)

if args.set_speed != None:
for gpu in gpu_devices:
pdev = gpu.pci_dev
# set fan speed
print(f"Set slot {pdev.slot_name} fan speed to {args.set_speed}%")
rv = gpu.set_speed(args.set_speed)
if not rv:
print(f"Failed to set slot {pdev.slot_name} fan speed !!!")
sys.exit(0)

# list monitored devices
print("\n")
print("ID Slot Name Vendor PCI-ID Temp. Fan PWR Working")
Expand All @@ -169,12 +179,13 @@ def sigstop(a, b):
cnt = 1
for gpu in gpu_devices:
pdev = gpu.pci_dev
# set fan speed
if args.set_speed != None:
gpu.set_speed(args.set_speed)
working = gpu.is_working()
t_str = gpu.get_temperature() if gpu.get_temperature() else '--'
s_str = gpu.get_speed() if gpu.get_speed() else '--'
p_str = gpu.get_pwr() if gpu.get_pwr() else '--'

msg = f"{cnt:2} {pdev.slot_name} {pdev.vendor_name():8} [{pdev.vendor_id}:{pdev.device_id}] "
msg += f"{gpu.get_temperature():4}c {gpu.get_speed():3}% {gpu.get_pwr():6.2f}w {working}"
msg += f"{t_str:4}c {s_str:3}% {p_str:6.2f}w {working}"
print(msg)
cnt += 1

Expand All @@ -183,16 +194,14 @@ def sigstop(a, b):
if args.list:
sys.exit(0)

if args.set_speed != None:
sys.exit(0)

if len(gpu_devices) == 0:
print('No GPU found, abort !\n')
sys.exit(0)

# remove not working devices
for gpu in list(gpu_devices):
if gpu.is_working() == False:
print(f'slot {gpu.pci_dev.slot_name} is malfunction, removed !\n')
gpu_devices.remove(gpu)

gpu_ctl = GpuCtl(gpu_devices=gpu_devices,
Expand Down
8 changes: 5 additions & 3 deletions gpuctl/gpu_mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,22 @@ def get_speed(self):

def set_speed(self, percentage):
self.speed = percentage
return True

class GpuNak(GpuDev):
    """Mock GPU that is detected as a GPU but answers every request negatively.

    All sensor queries return None and every command reports failure, so
    callers can be exercised against a device that yields no readings.
    """

    def is_gpu(self):
        """Return True so the device is still accepted as a GPU."""
        return True

    def is_working(self):
        """Return False: this mock always reports itself as not working."""
        return False

    def get_temperature(self):
        """Return None, i.e. no temperature reading is available."""
        return None

    def get_speed(self):
        """Return None, i.e. no fan speed reading is available."""
        return None

    def set_speed(self, percentage):
        """Remember the requested speed but report the operation as failed.

        Args:
            percentage: requested fan speed, stored in ``self.speed``.

        Returns:
            False, always.
        """
        self.speed = percentage
        return False

23 changes: 15 additions & 8 deletions gpuctl/gpu_nv.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,55 +65,62 @@ def is_gpu(self):
def set_speed(self, speed):
    """Set a fixed fan speed for this GPU through ``nvidia-settings``.

    The command sets ``GPUFanControlState=1`` on the GPU and assigns
    ``GPUTargetFanSpeed`` on the fan with the same index.

    Args:
        speed: target fan speed in percent.

    Returns:
        True when DRYRUN is set or the command succeeded; False when the
        device has no NVML handle or the command failed.
    """
    if DRYRUN:
        return True

    if self.nvh is None:
        return False

    cmd = f"nvidia-settings -c {self.display} -a [gpu:{self.nv_id}]/GPUFanControlState=1 -a [fan:{self.nv_id}]/GPUTargetFanSpeed={speed}"
    try:
        sc.exec_cmd(cmd)
        logger.debug(
            f'[{self.pci_dev.slot_name}/{self.name}] set speed {speed}%')
        self.speed = speed
    except Exception:
        # Some cards have no fan installed (e.g. Tesla P4), so a failure
        # here is expected; report it to the caller instead of logging
        # an error.
        return False

    return True

def get_speed(self):
    """Query the target fan speed of this GPU through ``nvidia-settings``.

    Returns:
        The fan speed as an int, or None when the query fails or its
        output cannot be parsed as an integer.
    """
    # NOTE: nvmlDeviceGetFanSpeed reports a wrong speed, so nvidia-settings
    # is used instead.
    cmd = f"nvidia-settings -t -q [fan:{self.nv_id}]/GPUTargetFanSpeed"
    try:
        return int(sc.exec_cmd(cmd))
    except Exception:
        # Some cards have no fan installed (e.g. Tesla P4); treat that as
        # "no reading" rather than an error worth logging.
        return None

def get_temperature(self):
    """Read the GPU temperature through NVML.

    On success the value is also cached in ``self.temperature``.

    Returns:
        The value reported by ``nvmlDeviceGetTemperature`` for the
        ``NVML_TEMPERATURE_GPU`` sensor, or None when the device has no
        NVML handle or the query fails.
    """
    if self.nvh is None:
        return None

    t = None
    try:
        t = nv.nvmlDeviceGetTemperature(self.nvh, nv.NVML_TEMPERATURE_GPU)
        self.temperature = t
    except Exception:
        logger.error(f"{self.pci_dev.slot_name}/{self.name}] get temperature failed !!")
    return t

def get_pwr(self):
    """Read the current power usage through NVML.

    On success the value is also cached in ``self.pwr``.

    Returns:
        ``nvmlDeviceGetPowerUsage(...) / 1000`` as a float, or None when
        the device has no NVML handle or the query fails. None (not 0)
        is used so callers can tell "no reading" from a real zero.
    """
    if self.nvh is None:
        return None

    pwr = None
    try:
        pwr = nv.nvmlDeviceGetPowerUsage(self.nvh) / 1000
        self.pwr = pwr
    except Exception:
        logger.error(f"{self.pci_dev.slot_name}/{self.name}] get pwr failed !!")

    return pwr
2 changes: 1 addition & 1 deletion gpuctl/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

__all__ = ['__version__']

__version__ = '0.3.6'
__version__ = '0.3.7'

5 changes: 5 additions & 0 deletions scripts/nv-temp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

# Print the core temperature of the NVIDIA cards.
# Queries the `gpucoretemp` attribute with nvidia-settings; `-t` is passed
# so that only the values are printed (terse output).

nvidia-settings -q gpucoretemp -t
47 changes: 34 additions & 13 deletions tests/test_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,43 @@
from re import U
import unittest

from gpuctl import PciDev, GpuDev, GpuCtl
from gpuctl import PciDev, GpuDev, GpuAMD, GpuNV

class TestSlot(unittest.TestCase):

def test_valid_slot(self):
pass

def test_invalid_slot(self):
gpu_ctl = None
try:
gpu_ctl = GpuCtl(slot_name='aaaa')
self.assertFalse(True)
except:
pass

class TestGpu(unittest.TestCase):
    """Smoke tests: sensor getters must not raise on any discovered GPU."""

    @staticmethod
    def _discover_gpus():
        """Return a GPU wrapper for each discovered AMD/NVIDIA GPU device."""
        gpu_devices = []
        for pdev in PciDev.discovery(vendor_filter=['AMD', 'NVIDIA']):
            gpu = None
            if pdev.is_amd():
                gpu = GpuAMD(pdev)
            if pdev.is_nvidia():
                gpu = GpuNV(pdev)
            if gpu and gpu.is_gpu():
                gpu_devices.append(gpu)
        return gpu_devices

    def test_get_temp(self):
        """Reading the temperature of every GPU must not raise."""
        for gpu in self._discover_gpus():
            gpu.get_temperature()

    def test_get_speed(self):
        """Reading the fan speed of every GPU must not raise."""
        for gpu in self._discover_gpus():
            gpu.get_speed()

class TestCurve(unittest.TestCase):
def test_valid_curve(self):
curve = [[0,0], [10,10], [50,50], [100,100]]
rv = GpuDev.check_curve(curve)
Expand Down
35 changes: 33 additions & 2 deletions tests/test_gpuctl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@
from gpuctl import GpuOk, GpuNak

class TestGpuCtl(unittest.TestCase):

def test_invalid_key(self):
# Construct GpuCtl with the keyword 'slots', which is not in its list of
# valid keys, and check how the constructor reacts.
gpu_ctl = None
try:
gpu_ctl = GpuCtl(slots='aaaa')
self.assertEqual(gpu_ctl, None)
except:
# NOTE(review): this bare except also catches the AssertionError raised by
# assertEqual above, so as written the test cannot fail. Confirm the
# intended contract (raise vs. None) and tighten the check.
pass

def test_discover_all(self):

pci_devices = PciDev.discovery()
Expand Down Expand Up @@ -68,9 +77,12 @@ def test_over_temp(self):
gpu_dev = GpuNak(pdev)
self.assertIsNotNone(gpu_dev)

gpu_ctl = GpuCtl(gpu_devices=[gpu_dev], fam=10, temp=20, tas='./tests/ok.sh')
gpu_ctl = GpuCtl(gpu_devices=[gpu_dev], fan=10, temp=20, tas='./tests/ok.sh')
self.assertNotEqual(gpu_ctl, None)

rv = gpu_ctl.set_interval(wait_period=10)
self.assertTrue(rv)

gpu_ctl.set_interval(wait_period=3)
gpu_ctl.start()
time.sleep(5)
gpu_ctl.stop()
Expand All @@ -96,5 +108,24 @@ def test_interval(self):
rv = gpu_ctl.set_interval(intvl=2, wait_period=1)
self.assertFalse(rv)

def test_nak_gpu(self):
# Run the controller against a GpuNak mock device to exercise a GPU
# that yields no readings.
slot_name = '1111:11:11.1'
pci_id = '1111:1111'
pdev = PciDev(slot_name, pci_id, 'mock pci')
self.assertIsNotNone(pdev)
# The mock PCI id is expected to map to the 'Other' vendor name.
self.assertEqual(pdev.vendor_name(), 'Other')

gpu_dev = GpuNak(pdev)
self.assertIsNotNone(gpu_dev)

gpu_ctl = GpuCtl(gpu_devices=[gpu_dev], verbose=True)

rv = gpu_ctl.set_interval(wait_period=10)
self.assertTrue(rv)

# Let the controller run for a few seconds; the test passes as long as
# start/stop complete without raising.
gpu_ctl.start()
time.sleep(5)
gpu_ctl.stop()

if __name__ == '__main__':
unittest.main()

0 comments on commit 3ed7171

Please sign in to comment.