From 1e407b080a3514b8d0260072fbe92271641d44f8 Mon Sep 17 00:00:00 2001 From: Arnaldo Garcia Rincon Date: Thu, 21 Apr 2022 21:44:30 +0000 Subject: [PATCH] variants: add aws-ecs-1-nvidia Signed-off-by: Arnaldo Garcia Rincon --- .github/workflows/build.yml | 8 +++ QUICKSTART-ECS.md | 22 ++++++++ README.md | 2 +- packages/os/os.spec | 11 ++-- .../logdog/conf/logdog.aws-ecs-1-nvidia.conf | 1 + sources/models/shared-defaults/ecs.toml | 28 +++++++++++ .../defaults.d/10-defaults.toml | 1 + .../defaults.d/20-aws-host-containers.toml | 1 + .../defaults.d/25-cf-signal.toml | 1 + .../defaults.d/30-metrics.toml | 1 + .../defaults.d/51-docker-services.toml | 1 + .../defaults.d/52-aws-ecs-1.toml | 1 + .../defaults.d/53-docker-daemon.toml | 1 + .../defaults.d/60-lockdown-none.toml | 1 + .../defaults.d/70-oci-hooks.toml | 1 + sources/models/src/aws-ecs-1-nvidia/mod.rs | 30 +++++++++++ .../aws-ecs-1/defaults.d/52-aws-ecs-1.toml | 29 +---------- variants/Cargo.lock | 23 +++++++++ variants/Cargo.toml | 1 + variants/README.md | 6 +++ variants/aws-ecs-1-nvidia/Cargo.toml | 50 +++++++++++++++++++ variants/aws-ecs-1-nvidia/build.rs | 9 ++++ variants/aws-ecs-1-nvidia/lib.rs | 1 + 23 files changed, 196 insertions(+), 34 deletions(-) create mode 120000 sources/logdog/conf/logdog.aws-ecs-1-nvidia.conf create mode 100644 sources/models/shared-defaults/ecs.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/10-defaults.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/20-aws-host-containers.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/25-cf-signal.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/30-metrics.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/51-docker-services.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/52-aws-ecs-1.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/53-docker-daemon.toml create mode 120000 
sources/models/src/aws-ecs-1-nvidia/defaults.d/60-lockdown-none.toml create mode 120000 sources/models/src/aws-ecs-1-nvidia/defaults.d/70-oci-hooks.toml create mode 100644 sources/models/src/aws-ecs-1-nvidia/mod.rs mode change 100644 => 120000 sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml create mode 100644 variants/aws-ecs-1-nvidia/Cargo.toml create mode 100644 variants/aws-ecs-1-nvidia/build.rs create mode 100644 variants/aws-ecs-1-nvidia/lib.rs diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b89eae9a6c7..af6a993eb9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -77,6 +77,14 @@ jobs: arch: aarch64 supported: true fetch-upstream: "true" + - variant: aws-ecs-1-nvidia + arch: x86_64 + supported: true + fetch-upstream: "true" + - variant: aws-ecs-1-nvidia + arch: aarch64 + supported: true + fetch-upstream: "true" fail-fast: false steps: - uses: actions/checkout@v3 diff --git a/QUICKSTART-ECS.md b/QUICKSTART-ECS.md index d538fd89eaa..200163a279d 100644 --- a/QUICKSTART-ECS.md +++ b/QUICKSTART-ECS.md @@ -220,3 +220,25 @@ aws ec2 run-instances --key-name YOUR_KEY_NAME \ And remember, if you used a public subnet, add `--associate-public-ip-address` or attach an Elastic IP after launch. Once it launches, you should be able to run tasks on your Bottlerocket instance using the ECS API and console. + + +### aws-ecs-*-nvidia variants + +The `aws-ecs-*-nvidia` variants include the required packages and configurations to leverage NVIDIA GPUs. +They come with the [NVIDIA Tesla driver](https://docs.nvidia.com/datacenter/tesla/drivers/index.html) along with the libraries required by the [CUDA toolkit](https://developer.nvidia.com/cuda-toolkit) included in your ECS tasks. +In hosts with multiple GPUs (ex. 
EC2 `g4dn` instances) you can assign one or multiple GPUs per container by specifying the resource requirements in your container definitions as described in the [official ECS documentation](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-gpu.html): + +```json +{ + "containerDefinitions": [ + { + "resourceRequirements" : [ + { + "type" : "GPU", + "value" : "2" + } + ] + } + ] +} +``` diff --git a/README.md b/README.md index 3644ac77ad7..543a47b8318 100644 --- a/README.md +++ b/README.md @@ -749,7 +749,7 @@ There are a few important caveats about the provided kdump support: ### NVIDIA GPUs Support Bottlerocket's `nvidia` variants include the required packages and configurations to leverage NVIDIA GPUs. The official AMIs for these variants can be used with EC2 GPU-equipped instance types such as: `p2`, `p3`, `p4`, `g4dn`, `g5` and `g5g`. -Please see [QUICKSTART-EKS](QUICKSTART-EKS.md#aws-k8s--nvidia-variants) for further details about Kubernetes variants. +Please see [QUICKSTART-EKS](QUICKSTART-EKS.md#aws-k8s--nvidia-variants) for further details about Kubernetes variants, and [QUICKSTART-ECS](QUICKSTART-ECS.md#aws-ecs--nvidia-variants) for ECS variants. 
## Details diff --git a/packages/os/os.spec b/packages/os/os.spec index 3c713802567..5a2ed502eb9 100644 --- a/packages/os/os.spec +++ b/packages/os/os.spec @@ -1,5 +1,6 @@ %global _cross_first_party 1 %global _is_k8s_variant %(if echo %{_cross_variant} | grep -Fqw "k8s"; then echo 1; else echo 0; fi) +%global _is_ecs_variant %(if echo %{_cross_variant} | grep -Fqw "ecs"; then echo 1; else echo 0; fi) %global _is_aws_variant %(if echo %{_cross_variant} | grep -Fqw "aws"; then echo 1; else echo 0; fi) %global _is_vendor_variant %(if echo %{_cross_variant} | grep -Fqw "nvidia"; then echo 1; else echo 0; fi) %undefine _debugsource_packages @@ -94,7 +95,7 @@ Requires: %{_cross_os}shibaken Requires: %{_cross_os}cfsignal %endif -%if "%{_cross_variant}" == "aws-ecs-1" +%if %{_is_ecs_variant} Requires: %{_cross_os}ecs-settings-applier %endif @@ -227,7 +228,7 @@ Summary: Bottlerocket certificates handler %description -n %{_cross_os}certdog %{summary}. -%if "%{_cross_variant}" == "aws-ecs-1" +%if %{_is_ecs_variant} %package -n %{_cross_os}ecs-settings-applier Summary: Settings generator for ECS %description -n %{_cross_os}ecs-settings-applier @@ -340,7 +341,7 @@ echo "** Output from non-static builds:" -p prairiedog \ -p certdog \ -p shimpei \ -%if "%{_cross_variant}" == "aws-ecs-1" +%if %{_is_ecs_variant} -p ecs-settings-applier \ %endif %if %{_is_aws_variant} @@ -377,7 +378,7 @@ for p in \ signpost updog metricdog logdog \ ghostdog bootstrap-containers \ shimpei \ -%if "%{_cross_variant}" == "aws-ecs-1" +%if %{_is_ecs_variant} ecs-settings-applier \ %endif %if %{_is_aws_variant} @@ -562,7 +563,7 @@ install -p -m 0644 %{S:300} %{buildroot}%{_cross_udevrulesdir}/80-ephemeral-stor %files -n %{_cross_os}logdog %{_cross_bindir}/logdog -%if "%{_cross_variant}" == "aws-ecs-1" +%if %{_is_ecs_variant} %files -n %{_cross_os}ecs-settings-applier %{_cross_bindir}/ecs-settings-applier %endif diff --git a/sources/logdog/conf/logdog.aws-ecs-1-nvidia.conf 
b/sources/logdog/conf/logdog.aws-ecs-1-nvidia.conf new file mode 120000 index 00000000000..3d5782ca503 --- /dev/null +++ b/sources/logdog/conf/logdog.aws-ecs-1-nvidia.conf @@ -0,0 +1 @@ +logdog.aws-ecs-1.conf \ No newline at end of file diff --git a/sources/models/shared-defaults/ecs.toml b/sources/models/shared-defaults/ecs.toml new file mode 100644 index 00000000000..0eb26a9b4f8 --- /dev/null +++ b/sources/models/shared-defaults/ecs.toml @@ -0,0 +1,28 @@ +# ECS +[services.ecs] +restart-commands = ["/usr/bin/ecs-settings-applier", "/bin/systemctl try-reload-or-restart ecs.service"] +configuration-files = ["ecs-config"] + +[configuration-files.ecs-config] +path = "/etc/ecs/ecs.config" +template-path = "/usr/share/templates/ecs.config" + +[metadata.settings.ecs] +affected-services = ["ecs"] + +[settings.ecs] +allow-privileged-containers = false +logging-drivers = ["json-file", "awslogs", "none"] +loglevel = "info" + +# Metrics +[settings.metrics] +service-checks = ["apiserver", "chronyd", "containerd", "host-containerd", "docker", "ecs"] + +# Network +[metadata.settings.network] +affected-services = ["containerd", "docker", "ecs", "host-containerd", "host-containers"] + +# Image registry credentials +[metadata.settings.container-registry.credentials] +affected-services = ["ecs", "host-containers", "bootstrap-containers"] diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/10-defaults.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/10-defaults.toml new file mode 120000 index 00000000000..a202ba61a4c --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/10-defaults.toml @@ -0,0 +1 @@ +../../../shared-defaults/defaults.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/20-aws-host-containers.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/20-aws-host-containers.toml new file mode 120000 index 00000000000..4d404d663cd --- /dev/null +++ 
b/sources/models/src/aws-ecs-1-nvidia/defaults.d/20-aws-host-containers.toml @@ -0,0 +1 @@ +../../../shared-defaults/aws-host-containers.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/25-cf-signal.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/25-cf-signal.toml new file mode 120000 index 00000000000..a33d541652c --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/25-cf-signal.toml @@ -0,0 +1 @@ +../../../shared-defaults/cf-signal.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/30-metrics.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/30-metrics.toml new file mode 120000 index 00000000000..99f0b2b6980 --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/30-metrics.toml @@ -0,0 +1 @@ +../../../shared-defaults/metrics.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/51-docker-services.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/51-docker-services.toml new file mode 120000 index 00000000000..a8512f25961 --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/51-docker-services.toml @@ -0,0 +1 @@ +../../../shared-defaults/docker-services.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/52-aws-ecs-1.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/52-aws-ecs-1.toml new file mode 120000 index 00000000000..06ea554fe9f --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/52-aws-ecs-1.toml @@ -0,0 +1 @@ +../../../shared-defaults/ecs.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/53-docker-daemon.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/53-docker-daemon.toml new file mode 120000 index 00000000000..09b29470e5b --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/53-docker-daemon.toml @@ -0,0 +1 @@ +../../../shared-defaults/docker-daemon-nvidia.toml 
\ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/60-lockdown-none.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/60-lockdown-none.toml new file mode 120000 index 00000000000..cced543330e --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/60-lockdown-none.toml @@ -0,0 +1 @@ +../../../shared-defaults/lockdown-none.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/defaults.d/70-oci-hooks.toml b/sources/models/src/aws-ecs-1-nvidia/defaults.d/70-oci-hooks.toml new file mode 120000 index 00000000000..82b0def22bb --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/defaults.d/70-oci-hooks.toml @@ -0,0 +1 @@ +../../../shared-defaults/nvidia-oci-hooks-docker.toml \ No newline at end of file diff --git a/sources/models/src/aws-ecs-1-nvidia/mod.rs b/sources/models/src/aws-ecs-1-nvidia/mod.rs new file mode 100644 index 00000000000..84535c2c19e --- /dev/null +++ b/sources/models/src/aws-ecs-1-nvidia/mod.rs @@ -0,0 +1,30 @@ +use model_derive::model; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +use crate::modeled_types::Identifier; +use crate::{ + AwsSettings, BootstrapContainer, CloudFormationSettings, ECSSettings, HostContainer, + KernelSettings, MetricsSettings, NetworkSettings, NtpSettings, OciHooks, PemCertificate, + RegistrySettings, UpdatesSettings, +}; + +// Note: we have to use 'rename' here because the top-level Settings structure is the only one +// that uses its name in serialization; internal structures use the field name that points to it +#[model(rename = "settings", impl_default = true)] +struct Settings { + motd: String, + updates: UpdatesSettings, + host_containers: HashMap, + bootstrap_containers: HashMap, + ntp: NtpSettings, + network: NetworkSettings, + kernel: KernelSettings, + aws: AwsSettings, + ecs: ECSSettings, + metrics: MetricsSettings, + pki: HashMap, + container_registry: RegistrySettings, + oci_hooks: OciHooks, + 
cloudformation: CloudFormationSettings, +} diff --git a/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml b/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml deleted file mode 100644 index 0eb26a9b4f8..00000000000 --- a/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml +++ /dev/null @@ -1,28 +0,0 @@ -# ECS -[services.ecs] -restart-commands = ["/usr/bin/ecs-settings-applier", "/bin/systemctl try-reload-or-restart ecs.service"] -configuration-files = ["ecs-config"] - -[configuration-files.ecs-config] -path = "/etc/ecs/ecs.config" -template-path = "/usr/share/templates/ecs.config" - -[metadata.settings.ecs] -affected-services = ["ecs"] - -[settings.ecs] -allow-privileged-containers = false -logging-drivers = ["json-file", "awslogs", "none"] -loglevel = "info" - -# Metrics -[settings.metrics] -service-checks = ["apiserver", "chronyd", "containerd", "host-containerd", "docker", "ecs"] - -# Network -[metadata.settings.network] -affected-services = ["containerd", "docker", "ecs", "host-containerd", "host-containers"] - -# Image registry credentials -[metadata.settings.container-registry.credentials] -affected-services = ["ecs", "host-containers", "bootstrap-containers"] diff --git a/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml b/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml new file mode 120000 index 00000000000..06ea554fe9f --- /dev/null +++ b/sources/models/src/aws-ecs-1/defaults.d/52-aws-ecs-1.toml @@ -0,0 +1 @@ +../../../shared-defaults/ecs.toml \ No newline at end of file diff --git a/variants/Cargo.lock b/variants/Cargo.lock index a117c803360..6f7f424f643 100644 --- a/variants/Cargo.lock +++ b/variants/Cargo.lock @@ -39,6 +39,22 @@ dependencies = [ "release", ] +[[package]] +name = "aws-ecs-1-nvidia" +version = "0.1.0" +dependencies = [ + "docker-cli", + "docker-engine", + "docker-init", + "docker-proxy", + "ecs-agent", + "ecs-gpu-init", + "kernel-5_10", + "kmod-5_10-nvidia", + "nvidia-container-toolkit", + "release", 
+] + [[package]] name = "aws-iam-authenticator" version = "0.1.0" @@ -258,6 +274,13 @@ dependencies = [ "glibc", ] +[[package]] +name = "ecs-gpu-init" +version = "0.1.0" +dependencies = [ + "glibc", +] + [[package]] name = "filesystem" version = "0.1.0" diff --git a/variants/Cargo.toml b/variants/Cargo.toml index f95e2efd1a9..e130ea17a5f 100644 --- a/variants/Cargo.toml +++ b/variants/Cargo.toml @@ -2,6 +2,7 @@ members = [ "aws-dev", "aws-ecs-1", + "aws-ecs-1-nvidia", "aws-k8s-1.19", "aws-k8s-1.20", "aws-k8s-1.21", diff --git a/variants/README.md b/variants/README.md index 2a05c690062..517c4c83c7a 100644 --- a/variants/README.md +++ b/variants/README.md @@ -80,6 +80,12 @@ This variant is compatible with Kubernetes 1.22, 1.23, and 1.24 clusters. The [aws-ecs-1](aws-ecs-1/Cargo.toml) variant includes the packages needed to run an [Amazon ECS](https://ecs.aws) container instance in AWS. +### aws-ecs-1-nvidia: Amazon ECS container instance + +The [aws-ecs-1-nvidia](aws-ecs-1-nvidia/Cargo.toml) variant includes the packages needed to run an [Amazon ECS](https://ecs.aws) +container instance in AWS. +It also includes the required packages to configure containers to leverage NVIDIA GPUs. + ### aws-dev: AWS development build The [aws-dev](aws-dev/Cargo.toml) variant has useful packages for local development of the OS. 
// Build-script entry point: runs `buildsys build-variant` as a child process and waits for it.
//
// Returns the `std::io::Error` from `status()` if the command cannot be run, and terminates
// this process with exit code 1 when the child reports an unsuccessful status.
fn main() -> Result<(), std::io::Error> {
    let mut buildsys = Command::new("buildsys");
    buildsys.arg("build-variant");

    match buildsys.status()?.success() {
        true => Ok(()),
        // `exit` never returns, so this arm needs no value.
        false => exit(1),
    }
}
b/variants/aws-ecs-1-nvidia/lib.rs @@ -0,0 +1 @@ +// not used