From 744ff5e920b0fc3069d2f86f09c821dde5ff9372 Mon Sep 17 00:00:00 2001 From: Scott Wittenburg Date: Wed, 22 May 2024 17:40:27 -0600 Subject: [PATCH] First pass at snapshot pruning --- .github/workflows/custom_docker_builds.yml | 2 +- images/snapshot-release-tags/Dockerfile | 2 +- .../snapshot-release-tags/prune_snapshots.py | 85 +++++++++++++++++++ images/snapshot-release-tags/requirements.txt | 1 + .../snapshot-release-tags/cron-jobs.yaml | 47 +++++++++- terraform/modules/prune_snapshots.tf | 58 +++++++++++++ 6 files changed, 192 insertions(+), 3 deletions(-) create mode 100644 images/snapshot-release-tags/prune_snapshots.py create mode 100644 terraform/modules/prune_snapshots.tf diff --git a/.github/workflows/custom_docker_builds.yml b/.github/workflows/custom_docker_builds.yml index 8733d3290..b9c11fb9d 100644 --- a/.github/workflows/custom_docker_builds.yml +++ b/.github/workflows/custom_docker_builds.yml @@ -38,7 +38,7 @@ jobs: - docker-image: ./images/python-aws-bash image-tags: ghcr.io/spack/python-aws-bash:0.0.2 - docker-image: ./images/snapshot-release-tags - image-tags: ghcr.io/spack/snapshot-release-tags:0.0.4 + image-tags: ghcr.io/spack/snapshot-release-tags:0.0.5 - docker-image: ./images/cache-indexer image-tags: ghcr.io/spack/cache-indexer:0.0.3 - docker-image: ./analytics diff --git a/images/snapshot-release-tags/Dockerfile b/images/snapshot-release-tags/Dockerfile index bfbb6e1e6..9594a3b7f 100644 --- a/images/snapshot-release-tags/Dockerfile +++ b/images/snapshot-release-tags/Dockerfile @@ -9,4 +9,4 @@ RUN pip install -r requirements.txt COPY . . 
-CMD [ "python", "./snapshot_release_tags.py" ]
+ENTRYPOINT [ "python" ]
diff --git a/images/snapshot-release-tags/prune_snapshots.py b/images/snapshot-release-tags/prune_snapshots.py
new file mode 100644
index 000000000..5dcf7a93e
--- /dev/null
+++ b/images/snapshot-release-tags/prune_snapshots.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""Prune expired develop snapshots.
+
+For every ``develop-YYYY-MM-DD`` snapshot tag of spack/spack except the most
+recent N, delete the snapshot's binary mirror and then the tag itself.
+"""
+
+import argparse
+import os
+import re
+import subprocess
+
+import sentry_sdk
+from github import Github
+
+
+sentry_sdk.init(
+    # This cron job only runs once weekly,
+    # so just record all transactions.
+    traces_sample_rate=1.0,
+)
+
+TAG_REF_REGEX = re.compile(r"^refs/tags/(develop-\d{4}-\d{2}-\d{2})$")
+
+
+def non_negative_int(value):
+    """argparse type: parse *value* as an int and reject negative numbers."""
+    number = int(value)
+    if number < 0:
+        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
+    return number
+
+
+def select_pruning_candidates(refs, keep_n):
+    """Return ``(mirror_prefix, ref)`` pairs for every snapshot to prune.
+
+    Refs that do not look exactly like ``refs/tags/develop-YYYY-MM-DD`` are
+    reported and ignored, so they are never pruned and never use up one of
+    the ``keep_n`` slots. The result is ordered oldest first.
+    """
+    snapshots = []
+    for ref in refs:
+        m = TAG_REF_REGEX.search(ref.ref)
+        if not m:
+            print(f"Unable to parse {ref.ref}, skipping")
+            continue
+        snapshots.append((m.group(1), ref))
+
+    # ISO dates sort chronologically when compared as strings.
+    snapshots.sort(key=lambda snapshot: snapshot[0])
+
+    # Not snapshots[:-keep_n]: with keep_n == 0 that is snapshots[:0], which
+    # would prune nothing instead of everything.
+    return snapshots[: max(len(snapshots) - keep_n, 0)]
+
+
+def main():
+    """Delete the mirror and tag of all but the newest --keep-last-n snapshots.
+
+    Raises:
+        Exception: if the GITHUB_TOKEN environment variable is not set.
+    """
+    if "GITHUB_TOKEN" not in os.environ:
+        raise Exception("GITHUB_TOKEN environment is not set")
+
+    parser = argparse.ArgumentParser(
+        prog="prune_snapshots.py",
+        description="Prune expired snapshots",
+    )
+
+    parser.add_argument(
+        "-k",
+        "--keep-last-n",
+        type=non_negative_int,
+        default=8,
+        help="Prune all but most recent --keep-last-n",
+    )
+    parser.add_argument(
+        "-m",
+        "--mirror-root",
+        default="s3://spack-binaries",
+        help=("Root url of mirror where snapshot binaries are mirrored"),
+    )
+
+    args = parser.parse_args()
+
+    keep_n = args.keep_last_n
+    # Drop trailing slashes so the joined url never contains "//".
+    mirror_root_url = args.mirror_root.rstrip("/")
+
+    # Use the GitHub API to list the snapshot tags of spack/spack.
+    github_token = os.environ.get("GITHUB_TOKEN")
+    py_github = Github(github_token)
+    py_gh_repo = py_github.get_repo("spack/spack", lazy=True)
+
+    # Get a list of all the tags matching the develop snapshot pattern
+    snapshot_tags = py_gh_repo.get_git_matching_refs("tags/develop-")
+
+    # Everything except the keep_n most recent snapshots, oldest first
+    pruning_candidates = select_pruning_candidates(snapshot_tags, keep_n)
+
+    print("Deleting the following snapshots:")
+    for mirror_prefix, tag in pruning_candidates:
+        url_to_prune = f"{mirror_root_url}/{mirror_prefix}"
+
+        print(f" Ref: {tag.ref}, Mirror: {url_to_prune}")
+
+        # First, try to delete the mirror associated with the snapshot
+        try:
+            subprocess.run(["aws", "s3", "rm", "--recursive", url_to_prune], check=True)
+        except subprocess.CalledProcessError as cpe:
+            print(f"Failed to delete the mirror url {url_to_prune}, skipping")
+            # Report the failure; the job otherwise still exits successfully.
+            sentry_sdk.capture_exception(cpe)
+            continue
+
+        # If mirror deletion succeeded, also delete the tag from GitHub
+        tag.delete()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/images/snapshot-release-tags/requirements.txt b/images/snapshot-release-tags/requirements.txt
index f16b4a991..c5aabef8b 100644
--- a/images/snapshot-release-tags/requirements.txt
+++ b/images/snapshot-release-tags/requirements.txt
@@ -1,3 +1,4 @@
+awscli==1.32.101
 certifi==2023.5.7
 cffi==1.15.1
 charset-normalizer==3.1.0
diff --git a/k8s/production/custom/snapshot-release-tags/cron-jobs.yaml b/k8s/production/custom/snapshot-release-tags/cron-jobs.yaml
index deded8bab..a8ea4cfe0 100644
--- a/k8s/production/custom/snapshot-release-tags/cron-jobs.yaml
+++ b/k8s/production/custom/snapshot-release-tags/cron-jobs.yaml
@@ -16,12 +16,57 @@ spec:
           restartPolicy: Never
           containers:
           - name: snapshot-release-tags
-            image: ghcr.io/spack/snapshot-release-tags:0.0.4
+            image: ghcr.io/spack/snapshot-release-tags:0.0.5
imagePullPolicy: IfNotPresent resources: requests: cpu: 500m memory: 500M + command: + - "./snapshot_release_tags.py" + env: + - name: GITHUB_TOKEN + valueFrom: + secretKeyRef: + name: gh-gl-sync + key: github-public-repo-token + envFrom: + - configMapRef: + name: python-scripts-sentry-config + nodeSelector: + spack.io/node-pool: base +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: prune-snapshots + namespace: custom +spec: + schedule: "0 22 * * 6" # 10pm on Saturday + concurrencyPolicy: Forbid + jobTemplate: + spec: + activeDeadlineSeconds: 1200 # terminate any running job after 20 minutes + backoffLimit: 0 + template: + spec: + serviceAccountName: prune-snapshots + restartPolicy: Never + containers: + - name: snapshot-release-tags + image: ghcr.io/spack/snapshot-release-tags:0.0.5 + imagePullPolicy: IfNotPresent + resources: + requests: + cpu: 500m + memory: 500M + command: + - "./prune_snapshots.py" + args: + - "--keep-last-n" + - "8" + - "--mirror-root" + - "s3://spack-binaries" env: - name: GITHUB_TOKEN valueFrom: diff --git a/terraform/modules/prune_snapshots.tf b/terraform/modules/prune_snapshots.tf new file mode 100644 index 000000000..786427ba6 --- /dev/null +++ b/terraform/modules/prune_snapshots.tf @@ -0,0 +1,58 @@ +# IAM Role for granting delete access to spack-binaries bucket for the snapshot pruner +resource "aws_iam_role" "delete_spack_binaries" { + name = "DeleteFromBucketSpackBinaries${local.suffix}" + description = "Managed by Terraform. 
Grants Kubernetes pods access to delete objects from the spack-binaries S3 bucket" + assume_role_policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + "Effect" : "Allow", + "Principal" : { + "Federated" : module.eks.oidc_provider_arn + }, + "Action" : "sts:AssumeRoleWithWebIdentity", + "Condition" : { + "StringEquals" : { + "${module.eks.oidc_provider}:aud" : "sts.amazonaws.com" + } + } + } + ] + }) +} + +resource "aws_iam_policy" "delete_spack_binaries" { + name = "DeleteObjectsFromBucketSpackBinaries${local.suffix}" + description = "Allows deletion of any object in the ${module.protected_binary_mirror.bucket_name} bucket." + policy = jsonencode({ + "Version" : "2012-10-17", + "Statement" : [ + { + "Effect" : "Allow", + "Action" : "s3:DeleteObject", + "Resource" : "${module.protected_binary_mirror.bucket_arn}/*" + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "delete_spack_binaries" { + role = aws_iam_role.delete_spack_binaries.name + policy_arn = aws_iam_policy.delete_spack_binaries.arn +} + +resource "kubectl_manifest" "snapshot_pruner_service_account" { + yaml_body = <<-YAML + apiVersion: v1 + kind: ServiceAccount + metadata: + name: prune-snapshots + namespace: custom + annotations: + # DeleteFromBucketSpackBinaries + eks.amazonaws.com/role-arn: ${aws_iam_role.delete_spack_binaries.arn} + YAML + depends_on = [ + aws_iam_role_policy_attachment.delete_spack_binaries + ] +}