Skip to content

Commit

Permalink
feat: Deploy GenAI in Helm (#8727)
Browse files Browse the repository at this point in the history
  • Loading branch information
tayritenour authored Feb 7, 2024
1 parent 8e067d9 commit 762fcef
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 6 deletions.
25 changes: 25 additions & 0 deletions helm/charts/determined/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,28 @@
{{- /*
determined.masterPort renders the fixed port literal 8081.
*/ -}}
{{- define "determined.masterPort" -}}
{{- 8081 -}}
{{- end -}}

{{- /*
determined.dbHost renders the database host name.
Uses .Values.db.hostAddress when it is set; otherwise falls back to the
release-scoped name "determined-db-service-<release name>".
*/ -}}
{{- define "determined.dbHost" -}}
{{- $fallbackHost := printf "determined-db-service-%s" .Release.Name -}}
{{- .Values.db.hostAddress | default $fallbackHost -}}
{{- end -}}

{{- /*
genai.PVCName renders the name of the PersistentVolumeClaim used by GenAI.
Uses .Values.genai.sharedPVCName when it is set; otherwise falls back to the
release-scoped name "genai-pvc-<release name>".
*/ -}}
{{- define "genai.PVCName" -}}
{{- $generatedName := printf "genai-pvc-%s" .Release.Name -}}
{{- .Values.genai.sharedPVCName | default $generatedName -}}
{{- end -}}

{{- /*
genai.allResourcePoolNames renders a JSON array holding the pool_name of every
entry in .Values.resourcePools. Rendering fails when .Values.resourcePools is
empty or unset. Callers decode the result with fromJsonArray.
*/ -}}
{{- define "genai.allResourcePoolNames" -}}
{{- $pools := required "A valid .Values.resourcePools entry required!" .Values.resourcePools }}
{{- $names := list -}}
{{- range $pools }}
{{- $names = mustAppend $names .pool_name }}
{{- end }}
{{ $names | toJson }}
{{- end }}
82 changes: 82 additions & 0 deletions helm/charts/determined/templates/genai/genai-deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
{{- if .Values.genai.version }}
{{- /*
GenAI Deployment. Rendered only when .Values.genai.version is set.

Runs a single replica of the genai image with:
  - connection settings for the Determined master and the database,
  - the GenAI PVC mounted at /run/determined/workdir/shared_fs,
  - the resource-pool metadata ConfigMap mounted at
    /run/determined/workdir/rp_config.

All env values are quoted so that numeric-looking inputs (for example a
version such as 1.20) still render as strings, which EnvVar.value requires.
*/ -}}
{{- /* genai.version is used for CI to override the appVersion. */ -}}
{{- $tag := required "A valid .Values.genai.version entry required!" .Values.genai.version }}
{{- /* Prefer the GenAI-specific pull secret; fall back to the chart-wide one. */ -}}
{{- $pullSecret := .Values.genai.imagePullSecretName | default .Values.imagePullSecretName }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: genai-deployment-{{ .Release.Name }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: genai-{{ .Release.Name }}
    release: {{ .Release.Name }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: genai-{{ .Release.Name }}
  template:
    metadata:
      labels:
        app: genai-{{ .Release.Name }}
        determined-system: master
    spec:
      priorityClassName: determined-system-priority
      {{- /* serviceAccountName replaces the deprecated serviceAccount alias. */}}
      serviceAccountName: genai-{{ .Release.Name }}
      containers:
        - name: genai-{{ .Release.Name }}
          image: {{ .Values.imageRegistry }}/genai:{{ $tag }}
          imagePullPolicy: "Always"
          env:
            - name: DET_MASTER
              value: "determined-master-service-{{ .Release.Name }}:{{ .Values.masterPort }}"
            - name: DB_NAME
              value: lore
            - name: DB_USER
              value: {{ .Values.db.user | quote }}
            {{- /* NOTE(review): the password is rendered in plain text into the pod spec; consider a secretKeyRef. */}}
            - name: DB_PASSWORD
              value: {{ .Values.db.password | quote }}
            - name: DB_PORT
              value: {{ .Values.db.port | quote }}
            - name: DB_HOST
              value: {{ include "determined.dbHost" . | quote }}
            - name: LORE_DOCKER_TAG_SUFFIX
              value: {{ $tag | quote }}
            - name: K8S_SHARED_PVC_NAME
              value: {{ include "genai.PVCName" . | quote }}
            - name: RESOURCE_POOL_EXTRA_METADATA_PATH
              value: /run/determined/workdir/rp_config/resource_pool_metadata.yaml
          volumeMounts:
            - name: genai-pvc-storage
              mountPath: /run/determined/workdir/shared_fs
              readOnly: false
            - name: genai-resource-pool-metadata
              mountPath: /run/determined/workdir/rp_config
          resources:
            {{- /* Emit each section only when it has at least one entry. */}}
            {{- if or .Values.genai.cpuRequest .Values.genai.memRequest }}
            requests:
              {{- if .Values.genai.cpuRequest }}
              cpu: {{ .Values.genai.cpuRequest | quote }}
              {{- end }}
              {{- if .Values.genai.memRequest }}
              memory: {{ .Values.genai.memRequest | quote }}
              {{- end }}
            {{- end }}
            {{- if or .Values.genai.cpuLimit .Values.genai.memLimit }}
            limits:
              {{- if .Values.genai.cpuLimit }}
              cpu: {{ .Values.genai.cpuLimit | quote }}
              {{- end }}
              {{- if .Values.genai.memLimit }}
              memory: {{ .Values.genai.memLimit | quote }}
              {{- end }}
            {{- end }}
      {{- if $pullSecret }}
      imagePullSecrets:
        - name: {{ $pullSecret | quote }}
      {{- end }}
      volumes:
        - name: genai-pvc-storage
          persistentVolumeClaim:
            claimName: {{ include "genai.PVCName" . | quote }}
        - name: genai-resource-pool-metadata
          configMap:
            name: genai-resource-pool-metadata-{{ .Release.Name }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- if and .Values.genai.version (not .Values.genai.sharedPVCName) }}
{{- /*
Generated PersistentVolumeClaim for GenAI. Rendered only when GenAI is enabled
(.Values.genai.version is set) and no existing claim is supplied through
.Values.genai.sharedPVCName.

Both .Values.genai.generatedPVC.storageSize and
.Values.genai.generatedPVC.storageClassName are required in that case.
generatedPVC is defaulted to an empty dict first so that a missing section
produces the descriptive "required" message instead of a nil-pointer error.
*/ -}}
{{- $generated := .Values.genai.generatedPVC | default (dict) }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "genai.PVCName" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: {{ include "genai.PVCName" . }}
    release: {{ .Release.Name }}
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: {{ required "A valid .Values.genai.generatedPVC.storageSize entry is required if a .Values.genai.sharedPVCName is not specified" $generated.storageSize | quote }}
  storageClassName: {{ required "A valid .Values.genai.generatedPVC.storageClassName entry is required if a .Values.genai.sharedPVCName is not specified" $generated.storageClassName | quote }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{- if .Values.genai.version }}
{{- /*
ConfigMap carrying .Values.genai.extra_resource_pool_metadata as the file
resource_pool_metadata.yaml. Rendered only when .Values.genai.version is set.

Before rendering, every key of extra_resource_pool_metadata is checked against
the pool names from .Values.resourcePools; an unknown key aborts rendering.
*/ -}}
{{- $knownPools := include "genai.allResourcePoolNames" . | fromJsonArray }}
{{- $poolMetadata := required "A valid .Values.genai.extra_resource_pool_metadata entry required!" .Values.genai.extra_resource_pool_metadata }}
{{- range $poolName, $_ := $poolMetadata }}
{{- if not (has $poolName $knownPools) }}
{{- fail (printf ".Values.genai.extra_resource_pool_metadata defines a resource_pool '%s' which is not present in the .Values.resourcePools" $poolName) }}
{{- end }}
{{- end }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: genai-resource-pool-metadata-{{ .Release.Name }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: genai-{{ .Release.Name }}
    release: {{ .Release.Name }}
data:
  resource_pool_metadata.yaml: |
    {{- toYaml $poolMetadata | nindent 4 }}
{{- end }}
10 changes: 10 additions & 0 deletions helm/charts/determined/templates/genai/genai-service-account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{{- if .Values.genai.version }}
{{- /*
ServiceAccount named genai-<release name>. Rendered only when
.Values.genai.version is set.
*/ -}}
{{- $genaiName := printf "genai-%s" .Release.Name }}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ $genaiName }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: {{ $genaiName }}
    release: {{ .Release.Name }}
{{ end -}}
18 changes: 18 additions & 0 deletions helm/charts/determined/templates/genai/genai-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{{- if .Values.genai.version }}
{{- /*
Service in front of the GenAI pods (selected by app: genai-<release name>).
Rendered only when .Values.genai.version is set. .Values.genai.port is
required and is used as both the service port and the target port.

NOTE(review): type LoadBalancer exposes the service outside the cluster;
confirm this is intended rather than a cluster-internal service type.
*/ -}}
{{- $genaiPort := required "A valid Values.genai.port entry required!" .Values.genai.port }}
apiVersion: v1
kind: Service
metadata:
  name: genai-service-{{ .Release.Name }}
  namespace: {{ .Release.Namespace }}
  labels:
    app: genai-{{ .Release.Name }}
    release: {{ .Release.Name }}
spec:
  ports:
    - port: {{ $genaiPort }}
      targetPort: {{ $genaiPort }}
      protocol: TCP
  type: LoadBalancer
  selector:
    app: genai-{{ .Release.Name }}
{{ end }}
19 changes: 14 additions & 5 deletions helm/charts/determined/templates/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,7 @@ stringData:
db:
user: {{ required "A valid Values.db.user entry required!" .Values.db.user | quote }}
password: {{ required "A valid Values.db.password entry required!" .Values.db.password | quote }}
{{- if .Values.db.hostAddress }}
host: {{ .Values.db.hostAddress }}
{{- else }}
host: determined-db-service-{{ .Release.Name }}
{{- end }}
host: {{ include "determined.dbHost" . }}
port: {{ .Values.db.port }}
name: {{ .Values.db.name | quote }}
{{- if .Values.db.sslMode }}
Expand Down Expand Up @@ -136,6 +132,19 @@ stringData:
{{- end }}
{{- end }}
{{- if .Values.genai.version }}
feature_switches:
- genai
{{- $port := (required "A valid .Values.genai.port entry required!" .Values.genai.port) }}
__internal:
proxied_servers:
- destination: "http://genai-service-{{ .Release.Name }}:{{ $port }}/lore"
path_prefix: /lore
- destination: "http://genai-service-{{ .Release.Name }}:{{ $port }}/genai"
path_prefix: /genai
{{- end}}
resource_manager:
type: "kubernetes"
namespace: {{ .Release.Namespace }}
Expand Down
53 changes: 52 additions & 1 deletion helm/charts/determined/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ checkpointStorage:
# storage beyond initial testing as most Kubernetes cluster nodes do not have a shared file
# system.
type: shared_fs
hostPath: /checkpoints
hostPath: /tmp/checkpoints
# By default, shared_fs is not mounted to the server pod. Change this to true to enable checkpoint downloads from the server.
mountToServer: false
# For storing in GCS.
Expand Down Expand Up @@ -338,3 +338,54 @@ resourcePools:
- pool_name: default
# defaultAuxResourcePool: default
# defaultComputeResourcePool: default

## GenAI deployment settings.
genai:
  ## GenAI version to deploy. The GenAI templates are rendered only when
  ## this is set; leave it unset to skip the GenAI deployment.
  # version: "release"

  ## Port used by the GenAI service.
  port: 9011

  ## Name of the secret used to pull the GenAI image.
  ## NOTE(review): the GenAI deployment template references the top-level
  ## `imagePullSecretName`; confirm which key is meant to be set.
  # imagePullSecretName:

  ## Memory request for the GenAI pod.
  memRequest: 1Gi

  ## CPU request for the GenAI pod.
  cpuRequest: 100m

  ## Memory limit for the GenAI pod.
  # memLimit: 1Gi

  ## CPU limit for the GenAI pod.
  # cpuLimit: 2

  ## Name of an existing PVC to use as the GenAI shared file system.
  ## Note: a GenAI deployment needs either `sharedPVCName`, or the
  ## `generatedPVC` settings below (`storageSize` and `storageClassName`)
  ## so that a new PVC can be generated.
  # sharedPVCName:

  ## Spec of the PVC generated for GenAI when `sharedPVCName` is unset.
  ## Note: generating a shared PVC requires access to a StorageClass that
  ## can provide a ReadWriteMany volume.
  generatedPVC:
    ## StorageClass name of the generated PVC.
    storageClassName: standard-rwx

    ## Size of the generated PVC.
    storageSize: 100Gi

  ## Extra resource pool metadata: hardcoded information about the GPUs
  ## available to each resource pool. Kubernetes does not provide this
  ## information, so it is supplied here directly.
  ## Note: every resource pool named here must also be listed in
  ## `.Values.resourcePools`.
  # extra_resource_pool_metadata:
  #   a100:
  #     gpu_type: A100
  #     max_agents: 3
  #   v100:
  #     gpu_type: V100
  #     max_agents: 2

0 comments on commit 762fcef

Please sign in to comment.