Skip to content

Commit

Permalink
Merge pull request #2189 from CoderTH/support-mig-gpu-operator
Browse files Browse the repository at this point in the history
update mig config & upgrade gpu-operator version
  • Loading branch information
wawa0210 authored Jun 20, 2024
2 parents 05ae57a + 0809d38 commit bddc1d7
Show file tree
Hide file tree
Showing 11 changed files with 164 additions and 64 deletions.
5 changes: 4 additions & 1 deletion charts/gpu-operator/config
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,7 @@ export DAOCLOUD_REPO_PROJECT=community
export CUSTOM_SHELL=custom.sh
export APPEND_VALUES_FILE=appendValues.yaml
export NO_IMAGE=true
export NO_TRIVY=true
export NO_TRIVY=true

# gpu-operator version
export CUSTOM_VERSION=${VERSION}+1
19 changes: 19 additions & 0 deletions charts/gpu-operator/custom.sh

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion charts/gpu-operator/gpu-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ kubeVersion: '>= 1.16.0-0'
name: gpu-operator
sources:
- https://github.com/NVIDIA/gpu-operator
version: v23.9.0
version: v23.9.0+1
dependencies:
- name: gpu-operator
version: "v23.9.0"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# ConfigMap named device-plugin-config, installed into the release namespace.
# It carries one device-plugin configuration per MIG strategy (mixed, single,
# none); the three entries are identical except for flags.migStrategy.
# NOTE(review): the nesting below was reconstructed from the device plugin
# config file layout (flags -> plugin, top-level resources) - confirm against
# the upstream template before merging.
apiVersion: v1
kind: ConfigMap
metadata:
  name: device-plugin-config
  # Quoted so a namespace that looks like a number or boolean stays a string.
  namespace: {{ .Release.Namespace | quote }}
data:
  # Configuration with migStrategy set to "mixed".
  mixed: |-
    version: v1
    flags:
      migStrategy: mixed
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
  # Configuration with migStrategy set to "single".
  single: |-
    version: v1
    flags:
      migStrategy: single
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
  # Configuration with migStrategy set to "none".
  none: |-
    version: v1
    flags:
      migStrategy: none
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,7 @@ operator:
requests:
cpu: 200m
memory: 100Mi
mig:
strategy: single
mig: {}
driver:
enabled: true
nvidiaDriverCRD:
Expand Down Expand Up @@ -261,9 +260,9 @@ devicePlugin:
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
name: "device-plugin-config"
# Default config name within the ConfigMap
default: ""
default: "none"
# Data section for the ConfigMap to create (i.e. only applies when create=true)
data: {}
# standalone dcgm hostengine
Expand Down Expand Up @@ -323,7 +322,7 @@ gfd:
value: "true"
resources: {}
migManager:
enabled: false
enabled: true
repository: nvcr.m.daocloud.io
image: nvidia/cloud-native/k8s-mig-manager
version: v0.5.5-ubuntu20.04
Expand Down
3 changes: 0 additions & 3 deletions charts/gpu-operator/gpu-operator/values-schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,9 @@ gpu-operator:
image: nvidia/k8s/container-toolkit
version: v1.14.3-centos7
migManager:
enabled: false
config:
name: "default-mig-parted-config"
default: "all-disabled"
mig:
strategy: single

node-feature-discovery:
enableNodeFeatureApi: true
23 changes: 0 additions & 23 deletions charts/gpu-operator/gpu-operator/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,6 @@
"migManager": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"config": {
"type": "object",
"properties": {
Expand All @@ -81,26 +78,6 @@
}
}
}
},
"mig": {
"type": "object",
"properties": {
"strategy": {
"type": "string",
"enum": [
"single",
"mixed"
]
}
}
},
"node-feature-discovery": {
"type": "object",
"properties": {
"enableNodeFeatureApi": {
"type": "boolean"
}
}
}
}
}
Expand Down
9 changes: 4 additions & 5 deletions charts/gpu-operator/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ gpu-operator:
requests:
cpu: 200m
memory: 100Mi
mig:
strategy: single
mig: {}
driver:
enabled: true
nvidiaDriverCRD:
Expand Down Expand Up @@ -262,9 +261,9 @@ gpu-operator:
# Create a ConfigMap (default: false)
create: false
# ConfigMap name (either existing or to create a new one with create=true above)
name: ""
name: "device-plugin-config"
# Default config name within the ConfigMap
default: ""
default: "none"
# Data section for the ConfigMap to create (i.e. only applies when create=true)
data: {}
# standalone dcgm hostengine
Expand Down Expand Up @@ -324,7 +323,7 @@ gpu-operator:
value: "true"
resources: {}
migManager:
enabled: false
enabled: true
repository: nvcr.m.daocloud.io
image: nvidia/cloud-native/k8s-mig-manager
version: v0.5.5-ubuntu20.04
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# ConfigMap named device-plugin-config, installed into the release namespace.
# It carries one device-plugin configuration per MIG strategy (mixed, single,
# none); the three entries are identical except for flags.migStrategy.
# NOTE(review): the nesting below was reconstructed from the device plugin
# config file layout (flags -> plugin, top-level resources) - confirm against
# the upstream template before merging.
apiVersion: v1
kind: ConfigMap
metadata:
  name: device-plugin-config
  # Quoted so a namespace that looks like a number or boolean stays a string.
  namespace: {{ .Release.Namespace | quote }}
data:
  # Configuration with migStrategy set to "mixed".
  mixed: |-
    version: v1
    flags:
      migStrategy: mixed
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
  # Configuration with migStrategy set to "single".
  single: |-
    version: v1
    flags:
      migStrategy: single
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
  # Configuration with migStrategy set to "none".
  none: |-
    version: v1
    flags:
      migStrategy: none
      failOnInitError: true
      nvidiaDriverRoot: "/run/nvidia/driver"
      gdsEnabled: false
      mofedEnabled: false
      plugin:
        passDeviceSpecs: true
        deviceListStrategy:
          - envvar
        deviceIDStrategy: uuid
        cdiAnnotationPrefix: cdi.k8s.io/
        nvidiaCTKPath: "/usr/bin/nvidia-ctk"
        containerDriverRoot: "/run/nvidia/driver"
    resources:
      gpus:
        - pattern: "*"
          name: nvidia.com/gpu
3 changes: 0 additions & 3 deletions charts/gpu-operator/parent/values-schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,9 @@ gpu-operator:
image: nvidia/k8s/container-toolkit
version: v1.14.3-centos7
migManager:
enabled: false
config:
name: "default-mig-parted-config"
default: "all-disabled"
mig:
strategy: single

node-feature-discovery:
enableNodeFeatureApi: true
23 changes: 0 additions & 23 deletions charts/gpu-operator/parent/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,6 @@
"migManager": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean"
},
"config": {
"type": "object",
"properties": {
Expand All @@ -81,26 +78,6 @@
}
}
}
},
"mig": {
"type": "object",
"properties": {
"strategy": {
"type": "string",
"enum": [
"single",
"mixed"
]
}
}
},
"node-feature-discovery": {
"type": "object",
"properties": {
"enableNodeFeatureApi": {
"type": "boolean"
}
}
}
}
}
Expand Down

0 comments on commit bddc1d7

Please sign in to comment.