diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index a2aae0772b15..7f843de1a55c 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -179,7 +179,15 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 0e748adeb69e..db04b5de7ddc 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -239,11 +239,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project @@ -282,16 +277,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -377,29 +362,12 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - LD_LIBRARY_PATH="${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib" - export LD_LIBRARY_PATH - echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> $GITHUB_ENV - - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -421,12 +389,12 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -481,11 +449,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String 
id: set-up-connstr run: | @@ -507,16 +470,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -584,11 +537,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in @@ -617,16 +565,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -685,11 +623,6 @@ jobs: path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -711,16 +644,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index f1c39e7e4f5b..a69686bf2a6e 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -63,14 +63,16 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/build-push-action@v6 with: context: . provenance: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 24ad26205b60..a3246987e2b8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -751,14 +751,16 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: context: . 
build-args: | @@ -829,11 +831,12 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it at an additional level of parallelism blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -849,7 +852,7 @@ jobs: password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -868,7 +871,7 @@ jobs: - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -889,7 +892,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . @@ -1365,3 +1368,31 @@ jobs: with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing us to require only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix), + # which would otherwise require changes in branch protection rules + # + # Note that we can't add an external check (like `neon-cloud-e2e`) this way; we still need to use the GitHub UI for that. + # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - check-codestyle-python + - check-codestyle-rust + - regress-tests + - test-images + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 000000000000..ed4e6be71239 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,155 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
+ required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, gen3, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + + - name: Show my own (github runner) external IP address - useful for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s https://github.com/gitapi/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then + break + elif [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into
this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 000000000000..e21e45c929c8 --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,115 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon 
Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index dd09abddb848..000000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
- # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-${{ runner.arch }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 5393538c5902..6dae8e340348 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6811,6 +6811,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", diff --git a/Dockerfile b/Dockerfile index f0197758e48b..a41598ef72cd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,7 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ + --bin storage_scrubber \ --locked --release \ && cachepot -s @@ -83,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 30314376efdb..4826b7914e42 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -66,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && apt update \ + && apt install -y docker-ce docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409ae..41a52ef5b641 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -873,9 +873,8 @@ impl ComputeNode { Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. #[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b61..863fa9468ff4 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index cd430bfab7d4..0acd83753eff 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. 
pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 are metadata keys. pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81ae..401887d3629c 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 61a255cdbc80..ecc543917e56 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -228,6 +228,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, @@ -432,6 +437,37 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Serialize, + Deserialize, + strum_macros::FromRepr, + strum_macros::EnumString, +)] +#[strum(serialize_all = "kebab-case")] +pub enum ImageCompressionAlgorithm { + /// Disabled for writes, and never decompress during reading. + /// Never set this after you've enabled compression once! + DisabledNoDecompress, + /// Disabled for writes; decompression is still supported on the read path. + Disabled, + /// Zstandard compression. Level 0 and None both mean the default level. Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). + Zstd { + level: Option, + }, +} + +impl ImageCompressionAlgorithm { + pub fn allow_decompression(&self) -> bool { + !matches!(self, ImageCompressionAlgorithm::DisabledNoDecompress) + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -643,6 +679,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC.
+ pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e99b..fa3f2cba58d7 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -176,20 +175,8 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -197,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -207,7 +194,7 @@ mod tests { let input = "local_path = '.' timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -229,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -257,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee6547..be005622199d 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d1a..0de2890bb414 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. 
+#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. // TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c144..261ca2cc1ac0 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4fd..2a397d97d2b9 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -94,6 +94,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 000000000000..ab5f7bdd95ab --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c6143..ea09a011e5cf 100644 --- a/pageserver/ctl/src/main.rs +++ 
b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bdf1..39d4e46c9663 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -421,6 +421,10 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +433,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f36e63f035c7..b7c9af224404 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -50,6 +50,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; @@ -90,6 +91,9 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::DisabledNoDecompress; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -159,7 +163,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -285,12 +289,16 @@ pub struct PageServerConf { pub validate_vectored_get: bool, + pub image_compression: ImageCompressionAlgorithm, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. 
When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. /// /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -395,7 +403,11 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, + image_compression: BuilderValue, + ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -482,8 +494,10 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -667,10 +681,18 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { + self.image_compression = BuilderValue::Set(value); + } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -727,7 +749,9 @@ impl PageServerConfigBuilder { get_impl, max_vectored_read_bytes, validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -918,7 +942,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -946,7 +970,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1004,9 +1028,15 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "image_compression" => { + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) + } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) 
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1088,8 +1118,10 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), } } } @@ -1328,7 +1360,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1401,7 +1435,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); @@ -1681,6 +1717,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e673f..d215fd2b7d2d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 58ff6e3f83cc..5ba329f05ece 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -265,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. 
+ requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. + type: string + format: hex responses: "200": description: OK diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1fda2eaa854e..893302b7d6d9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -22,6 +22,7 @@ use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -42,7 +43,7 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; @@ -227,7 +228,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -406,6 +407,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -426,6 +429,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, @@ -1191,10 +1196,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1296,7 +1306,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) @@ -1527,15 +1537,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? 
- .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 000000000000..7fe8fedc6394 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,46 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +use crate::tenant::ephemeral_file; + +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[default] + PageCached, + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub(crate) enum Inner { + PageCached, + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub(crate) fn inner(&self) -> &Arc { + &self.0 + } +} + +impl L0FlushConfig { + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. } => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 353f97264c5f..ac6b9b4f2a60 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 50ab0b7a8d84..59b729363147 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -1076,21 +1094,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { + global_metric: &'a Histogram, -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - 
self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1121,7 +1130,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1146,7 +1158,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: [Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1224,27 +1237,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1263,12 +1281,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, op, - } + }) } } @@ -1315,17 +1341,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) - .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1336,7 +1354,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + 
assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -1449,7 +1472,6 @@ pub(crate) enum ComputeCommandKind { PageStreamV2, PageStream, Basebackup, - GetLastRecordRlsn, Fullbackup, ImportBasebackup, ImportWal, @@ -2104,6 +2126,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2177,6 +2201,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2229,6 +2262,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2286,6 +2321,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2319,14 +2358,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 695f635c1b44..07365b5eb85e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1651,53 +1651,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - 
.for_command(ComputeCommandKind::GetLastRecordRlsn) - .inc(); - - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 3ffbaf98c69f..eef8dc104c69 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -166,6 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -294,6 +296,8 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -529,6 +533,15 @@ impl From for GcError { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. /// @@ -667,6 +680,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -681,6 +695,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -980,6 +995,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1349,7 +1365,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -1800,9 +1816,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. 
- if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} @@ -2469,6 +2491,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() @@ -2556,6 +2579,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_global_state, } } @@ -2563,36 +2587,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) - } else { - // The config should almost always exist for a tenant directory: - // - When attaching a tenant, the config is the first thing we write - // - When detaching a tenant, we atomically move the directory to a tmp location - // before deleting contents. - // - // The very rare edge case that can result in a missing config is if we crash during attach - // between creating directory and writing config. Callers should handle that as if the - // directory didn't exist. - anyhow::bail!("tenant config not found in {}", config_path); - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. + + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + Ok(toml_edit::de::from_str::(&config)?) 
} #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2600,7 +2623,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await @@ -2611,7 +2634,7 @@ impl Tenant { tenant_shard_id: &TenantShardId, config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2620,22 +2643,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // @@ -2853,6 +2874,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2861,6 +2883,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -2912,7 +2959,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3296,6 +3343,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3632,6 +3680,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3821,6 +3870,8 @@ 
pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant @@ -3908,7 +3959,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::GcInfo; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6204,27 +6255,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6300,27 +6330,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6396,9 +6405,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6464,17 +6482,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6497,7 +6527,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - 
.inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6523,9 +6553,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6573,15 +6612,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. // - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6621,13 +6666,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(Bytes::from("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(Bytes::from("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6637,7 +6682,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -6658,8 +6707,8 @@ mod tests { Bytes::from_static(b"value 5@0x20"), Bytes::from_static(b"value 6@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x40"), - Bytes::from_static(b"value 9@0x40"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), ]; for (idx, expected) in expected_result.iter().enumerate() { @@ -6747,10 +6796,10 @@ mod tests { lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] @@ -6814,7 +6863,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6938,15 +6990,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below 
the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. // - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6996,13 +7054,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; @@ -7012,7 +7070,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -7027,6 +7089,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } @@ -7039,8 +7102,8 @@ mod tests { Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x10@0x40"), - Bytes::from_static(b"value 9@0x10@0x40"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cefbd..0705182d5db2 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,13 +6,20 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
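// A minimal illustrative sketch of the header layout described above; the helper below is
// hypothetical (not part of this file) and assumes the constants introduced further down:
// lengths are capped at 28 bits (MAX_SUPPORTED_LEN), 0x80 marks a 4-byte uncompressed
// header, and 0x80 | 0x10 marks zstd. The writer stores the compressed length and falls
// back to the uncompressed form whenever compression does not shrink the blob.
fn encode_len_header(len: usize, zstd: bool) -> Vec<u8> {
    if len < 0x80 {
        return vec![len as u8]; // short blob: 1-byte header, high bit clear, never compressed
    }
    assert!(len <= 0x0fff_ffff); // must fit in the low 28 bits
    let mut hdr = (len as u32).to_be_bytes();
    hdr[0] |= if zstd { 0x90 } else { 0x80 }; // high bit plus compression bits
    hdr.to_vec()
}
// e.g. encode_len_header(5, false) == [0x05]
//      encode_len_header(0x1234, true) == [0x90, 0x00, 0x12, 0x34]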
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +const BYTE_UNCOMPRESSED: u8 = 0x80; +const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -219,6 +272,22 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, + ) -> (B::Buf, Result) { + self.write_blob_maybe_compressed( + srcbuf, + ctx, + ImageCompressionAlgorithm::DisabledNoDecompress, + ) + .await + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + algorithm: ImageCompressionAlgorithm, ) -> (B::Buf, Result) { let offset = self.offset; @@ -226,29 +295,61 @@ impl BlobWriter { let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + ( + self.write_all(io_buf, ctx).await, + srcbuf.slice_full().into_inner(), + ) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf.slice_full().into_inner(), ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + ImageCompressionAlgorithm::Zstd { level } => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + let slice = srcbuf.slice_full(); + encoder.write_all(&slice[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + if compressed.len() < len { + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, slice.into_inner()) + } else { + (BYTE_UNCOMPRESSED, len, slice.into_inner()) + } + } + ImageCompressionAlgorithm::Disabled + | ImageCompressionAlgorithm::DisabledNoDecompress => { + (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) + } + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf, ctx).await, srcbuf) } } .await; @@ -257,7 +358,12 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf, ctx).await; + (Slice::into_inner(srcbuf.slice(..)), res) + } else { + self.write_all(srcbuf, ctx).await + }; (srcbuf, res.map(|_| offset)) } } @@ -295,6 +401,13 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs, false).await + } + + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); @@ -305,7 +418,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = if compression { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await + } else { + wtr.write_blob(blob.clone(), &ctx).await + }; let offs = res?; offsets.push(offs); } @@ -319,7 +441,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = 
rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ -353,6 +475,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -361,10 +485,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index b406d5033243..3324e840ecf1 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// @@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -166,11 +196,25 @@ pub struct FileBlockReader<'a> { /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + Self::new_with_compression(file, file_id, false) + } + + pub fn new_with_compression( + file: &'a VirtualFile, + file_id: FileId, + compressed_reads: bool, + ) -> Self { + FileBlockReader { + file_id, + file, + compressed_reads, + } } /// Read a page from the underlying file into given buffer. @@ -217,7 +261,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf15373..bb65ae24fc5e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. + pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac8706493..43b9fff28d98 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -19,14 +20,23 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -49,6 +59,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
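// For example, assuming PAGE_SZ is 8192: 16384 written bytes load as a 16384-byte buffer,
// while 16385 written bytes load as 24576 bytes, the trailing 8191 of them zeroed.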
+ pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -116,19 +163,40 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, } } + + /// Return the byte range within `file` that has been written though `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. + fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { @@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here } - }, + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. + unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c5b..fe310acab888 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
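// For example, assuming PAGE_SZ is 8192: with 100 bytes pending in the buffered writer,
// the returned slice is 8192 bytes long and bytes 100..8192 are the zero padding.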
+ pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 08c3f19b6f75..b0159e22bfc0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,8 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -272,7 +273,7 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -296,7 +297,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -346,56 +347,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
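// (Unexpected filesystem errors on this path are no longer returned to the caller; they are
// treated as fatal I/O errors via MaybeFatalIo::fatal_err, as in the body below.)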
fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -405,32 +382,51 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("read tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) + dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? 
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs } #[derive(Debug, thiserror::Error)] @@ -472,7 +468,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -590,31 +586,23 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + )), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -649,8 +637,7 @@ pub async fn init_tenant_mgr( }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -662,23 +649,18 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); - - let tenant = Tenant::spawn( +) -> Arc { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); + + Tenant::spawn( conf, tenant_shard_id, resources, @@ -687,9 +669,7 @@ fn tenant_spawn( init_order, mode, ctx, - ); - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -971,7 +952,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1001,7 +983,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1067,7 +1050,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1086,12 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1110,13 +1095,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1129,7 +1116,7 @@ impl TenantManager { None, spawn_mode, ctx, - )?; + ); TenantSlot::Attached(tenant) } @@ -1143,7 +1130,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1250,7 +1237,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1984,7 +1971,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa97..bc9364de61d4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620ebf..23354417e788 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as @@ -124,6 +143,9 @@ pub struct TimelineInputs { /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. @@ -234,6 +256,13 @@ pub(super) async fn gather_inputs( None }; + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. 
     let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -248,6 +277,8 @@ pub(super) async fn gather_inputs(
         .map(|lsn| (lsn, LsnKind::BranchPoint))
         .collect::<Vec<_>>();
+    lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+
     drop(gc_info);
     // Add branch points we collected earlier, just in case there were any that were
@@ -296,6 +327,7 @@ pub(super) async fn gather_inputs(
         if kind == LsnKind::BranchPoint {
             branchpoint_segments.insert((timeline_id, lsn), segments.len());
         }
+
         segments.push(SegmentMeta {
             segment: Segment {
                 parent: Some(parent),
@@ -306,7 +338,45 @@ pub(super) async fn gather_inputs(
             timeline_id: timeline.timeline_id,
             kind,
         });
-        parent += 1;
+
+        parent = segments.len() - 1;
+
+        if kind == LsnKind::LeasePoint {
+            // Needs `LeaseStart` and `LeaseEnd` as well to model a lease as a read-only branch that never writes data
+            // (i.e. its LSN has not advanced from ancestor_lsn), and therefore the three segments have the same LSN
+            // value. Without the other two segments, the calculation code would not count the leased LSN as a point
+            // to be retained.
+            // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug.
+            //
+            // Alt Design: rewrite the entire calculation code to be independent of timeline id. Both leases and
+            // branch points can be given a synthetic id so we can unite them.
+            let mut lease_parent = parent;
+
+            // Start of a lease.
+            segments.push(SegmentMeta {
+                segment: Segment {
+                    parent: Some(lease_parent),
+                    lsn: lsn.0,
+                    size: None, // Filled in later, if necessary
+                    needed: lsn > next_gc_cutoff, // only needed if the point is within retention.
+                },
+                timeline_id: timeline.timeline_id,
+                kind: LsnKind::LeaseStart,
+            });
+            lease_parent += 1;
+
+            // End of the lease.
+ segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -332,6 +402,7 @@ pub(super) async fn gather_inputs( pitr_cutoff, next_gc_cutoff, retention_param_cutoff, + lease_points, }); } @@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/2210CD0", "pitr_cutoff": "0/2210CD0", "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", @@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/1817770", "pitr_cutoff": "0/1817770", "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", @@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/18B3D98", "pitr_cutoff": "0/18B3D98", "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -749,7 +823,8 @@ fn verify_size_for_one_branch() { "horizon_cutoff": "47/240A5860", "pitr_cutoff": "47/240A5860", "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index c2d4a2776b1d..685f6dce60e7 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -452,7 +452,12 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = self.blob_writer.write_blob(val, ctx).await; + // We don't want to use compression in delta layer creation + let compression = ImageCompressionAlgorithm::DisabledNoDecompress; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad46..4a1b3a02377a 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -165,6 +165,7 @@ pub struct ImageLayerInner { file_id: FileId, max_vectored_read_bytes: Option, + compressed_reads: bool, } impl std::fmt::Debug for ImageLayerInner { @@ -178,7 +179,8 @@ impl std::fmt::Debug for ImageLayerInner { impl ImageLayerInner { pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = 
DiskBtreeReader::<_, KEY_SIZE>::new( self.index_start_blk, self.index_root_blk, @@ -266,9 +268,10 @@ impl ImageLayer { async fn load_inner(&self, ctx: &RequestContext) -> Result { let path = self.path(); - let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, ctx) - .await - .and_then(|res| res)?; + let loaded = + ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, None, false, ctx) + .await + .and_then(|res| res)?; // not production code let actual_layer_name = LayerName::from_str(path.file_name().unwrap()).unwrap(); @@ -377,6 +380,7 @@ impl ImageLayerInner { lsn: Lsn, summary: Option, max_vectored_read_bytes: Option, + support_compressed_reads: bool, ctx: &RequestContext, ) -> Result, anyhow::Error> { let file = match VirtualFile::open(path, ctx).await { @@ -420,6 +424,7 @@ impl ImageLayerInner { file, file_id, max_vectored_read_bytes, + compressed_reads: support_compressed_reads, key_range: actual_summary.key_range, })) } @@ -430,7 +435,8 @@ impl ImageLayerInner { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); @@ -490,12 +496,14 @@ impl ImageLayerInner { &self, ctx: &RequestContext, ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); let mut result = Vec::new(); let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let cursor = block_reader.block_cursor(); while let Some(item) = stream.next().await { // TODO: dedup code with get_reconstruct_value @@ -530,7 +538,8 @@ impl ImageLayerInner { .into(), ); - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); @@ -691,7 +700,8 @@ impl ImageLayerInner { #[cfg(test)] pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); + let block_reader = + FileBlockReader::new_with_compression(&self.file, self.file_id, self.compressed_reads); let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); ImageLayerIterator { diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6624fb7e6ba5..e1eaea90af57 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,13 +6,14 @@ //! 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -410,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -620,6 +622,13 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); + use l0_flush::Inner; + let _concurrency_permit = match &*l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { @@ -645,28 +654,77 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); - - let cursor = inner.file.block_cursor(); + match &*l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let mut buf = Vec::new(); + + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. 
pass that `buf` into `put_value_bytes` + // => https://github.com/neondatabase/neon/issues/8183 + cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, ctx) + .await; + res?; + } + } - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + // Hold the permit until the IO is done; if we didn't, one could drop this future, + // thereby releasing the permit, but the Vec remains allocated until the IO completes. + // => we'd have more concurrenct Vec than allowed as per the semaphore. + drop(_concurrency_permit); } } // MAX is used here because we identify L0 layers by full key range - let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?; + let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?; Ok(Some(delta_layer)) } } diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 5dd947253578..afd11780e77d 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1096,19 +1096,10 @@ impl LayerInner { match rx.await { Ok(Ok(res)) => Ok(res), - Ok(Err(e)) => { - // sleep already happened in the spawned task, if it was not cancelled - match e.downcast_ref::() { - // If the download failed due to its cancellation token, - // propagate the cancellation error upstream. - Some(remote_storage::DownloadError::Cancelled) => { - Err(DownloadError::DownloadCancelled) - } - // FIXME: this is not embedding the error because historically it would had - // been output to compute, however that is no longer the case. 
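The `Direct` flush path above deliberately keeps the l0_flush concurrency permit alive until the delta-layer write has finished, so the number of fully materialized ephemeral-file buffers is bounded by the semaphore rather than by however many flush futures happen to exist. A minimal sketch of that pattern, with illustrative names rather than the actual pageserver API:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;

/// Stand-in for writing the buffer out as a delta layer.
async fn write_out(_buf: &[u8]) {}

/// Bound the number of concurrently materialized buffers by the semaphore:
/// the permit is acquired before the large allocation and dropped only after
/// the write completes, so at most `permits` buffers can be alive at once.
async fn flush_one(concurrency: Arc<Semaphore>, load: impl std::future::Future<Output = Vec<u8>>) {
    let permit = concurrency
        .acquire_owned()
        .await
        .expect("semaphore closed");
    let buf = load.await; // the big allocation lives while we hold the permit
    write_out(&buf).await;
    drop(permit); // release only once the buffer is no longer needed
}
```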
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( @@ -1694,6 +1685,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), + owner.conf.image_compression.allow_decompression(), ctx, ) .await diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8dd0a23f4637..92baf1073aae 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; @@ -65,7 +66,6 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, @@ -90,6 +90,10 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; @@ -208,6 +212,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -360,6 +365,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -433,6 +439,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { @@ -457,6 +465,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -845,6 +856,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. 
+ pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() @@ -996,6 +1019,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1228,7 +1252,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1258,11 +1282,25 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) @@ -1554,7 +1592,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1563,6 +1607,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; @@ -2339,6 +2385,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -2376,6 +2423,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4417,6 +4466,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check + /// 2.1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4439,22 +4540,7 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. 
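The predicate above combines two triggers: a WAL-distance check (as before) and a new wall-clock check whose period depends on the tenant's logical size. A stripped-down sketch of just that decision, ignoring the exact-vs-approximate logical size handling and using plain integers for the LSN distance:

```rust
use std::time::{Duration, Instant};

const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; // 2 GiB, as in the hunk above

/// `wal_since_last_check` plays the role of `lsn - last_image_layer_creation_check_at`.
fn should_check(
    wal_since_last_check: u64,
    min_distance: u64,
    logical_size: u64,
    checkpoint_timeout: Duration,
    last_check: Option<Instant>,
) -> bool {
    let distance_based = wal_since_last_check >= min_distance;

    // Large tenants are re-checked every checkpoint timeout, small ones every 48 hours.
    let period = if logical_size >= LARGE_TENANT_THRESHOLD {
        checkpoint_timeout
    } else {
        Duration::from_secs(3600 * 48)
    };
    let time_based = match last_check {
        Some(t) => t.elapsed() >= period,
        None => true,
    };

    distance_based || time_based
}
```

In the real code either trigger firing updates both the stored LSN and the stored instant, so the two clocks advance together.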
- distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; @@ -4711,6 +4797,42 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. +#[cfg(test)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, @@ -5511,37 +5633,65 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let end_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). 
+ pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(end_key, self, ctx).await?; + let delta_layer = delta_layer_writer + .finish(deltas.key_range.end, self, ctx) + .await?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424dde..b0088f4ea228 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb9141..3b755bb0420c 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31fe9..1894e8c72a5c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. 
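The LSN-invariant check in the timeline.rs hunk above relies on half-open range overlap: two delta layers conflict only if their `[start, end)` LSN ranges intersect and are not identical. A small sketch of the overlap predicate itself, generic over any ordered type:

```rust
use std::ops::Range;

/// Half-open overlap: `[a.start, a.end)` and `[b.start, b.end)` intersect
/// unless one of them ends at or before the point where the other starts.
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

#[test]
fn ranges_touching_only_at_the_boundary_do_not_overlap() {
    assert!(!overlaps_with(&(0..10), &(10..20))); // share only the boundary LSN
    assert!(overlaps_with(&(0..11), &(10..20))); // LSN 10 is in both ranges
    assert!(overlaps_with(&(0..20), &(5..7))); // containment counts as overlap
}
```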
@@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -238,12 +237,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 000000000000..f8496b31259d --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
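To make the bit counting in `rho()` above concrete, here is a direct Rust translation (the extension itself stays in C; this is only for illustration), using `leading_zeros()` in place of `pg_leftmost_one_pos32`:

```rust
/// 1-based position of the first set bit within the top `b` bits of `x`,
/// scanning from the most significant bit; `b + 1` if none of them is set.
fn rho(x: u32, b: u8) -> u8 {
    if x == 0 {
        return b + 1;
    }
    let j = (x.leading_zeros() + 1) as u8; // == 32 - pg_leftmost_one_pos32(x)
    if j > b {
        b + 1
    } else {
        j
    }
}

#[test]
fn rho_matches_the_examples_in_the_comment() {
    // "Considering the first 10 bits of x": shift the 10-bit patterns to the top.
    assert_eq!(rho(0b1000000000u32 << 22, 10), 1);
    assert_eq!(rho(0b0010000000u32 << 22, 10), 3);
    assert_eq!(rho(0, 10), 11); // b + 1 when no bit is set
}
```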
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 000000000000..9256cb9afa2f --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
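`estimateSHLL()` above differs from the classic estimator only in how a register's value is read: each register keeps one timestamp per possible rank, and `getMaximum()` returns the highest rank whose timestamp still falls inside the queried window. A compact sketch of that readout, with `i64` standing in for `TimestampTz`:

```rust
/// Highest rank whose timestamp is inside the window starting at `since`; 0 if none.
fn max_rank_since(register: &[i64], since: i64) -> u8 {
    let mut max = 0u8;
    for (rank, &stamp) in register.iter().enumerate() {
        if stamp >= since {
            max = rank as u8;
        }
    }
    max
}

#[test]
fn old_observations_drop_out_of_the_window() {
    // Index = rank, value = last time that rank was observed.
    let register = [0, 5, 90, 40];
    assert_eq!(max_rank_since(&register, 50), 2); // only rank 2 is recent enough
    assert_eq!(max_rank_since(&register, 1), 3); // whole history: rank 3 wins
}
```

The per-register ranks obtained this way feed the usual HyperLogLog harmonic-mean formula, which is why the window can be chosen at query time without keeping separate sketches per period.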
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. + */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe71..73a001b6ba72 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) + if (PQstatus(shard->conn) == CONNECTION_BAD) { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); return false; } - shard->state = PS_Connecting_Startup; /* fallthrough */ } diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 000000000000..042effe3461c --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 000000000000..bea72d1a6b17 --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 13712724399d..252810b5b02e 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.2.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.2.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 77% rename from pgxn/neon_test_utils/neon_test_utils--1.2.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index f84a24ec8d48..3b8794a8cff4 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.2.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -45,3 +45,21 @@ CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index c7b9191ddc12..f22afd70c4fa 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.2' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 071dc122edbd..650ef7405d64 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -42,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -489,3 +491,24 @@ neon_xlogflush(PG_FUNCTION_ARGS) XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. 
+ */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf55800c..7f4cb2c0100c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512bce..4e393fddb2aa 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb971..c5c4f6a1ed09 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -65,6 +65,8 @@ impl Cache for TimedLru { struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. - if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. 
+ #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa79..af5511d7ec24 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -399,15 +399,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) } diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index d28d13ba692b..9abf24ab7ffa 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -82,41 +82,19 @@ impl CouldRetry for ConsoleError { .details .error_info .map_or(Reason::Unknown, |e| e.reason); - match reason { - // not a transitive error - Reason::RoleProtected => false, - // on retry, it will still not be found - Reason::ResourceNotFound - | Reason::ProjectNotFound - | Reason::EndpointNotFound - | Reason::BranchNotFound => false, - // we were asked to go away - Reason::RateLimitExceeded - | Reason::NonDefaultBranchComputeTimeExceeded - | Reason::ActiveTimeQuotaExceeded - | Reason::ComputeTimeQuotaExceeded - | Reason::WrittenDataQuotaExceeded - | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, - // transitive error. control plane is currently busy - // but might be ready soon - Reason::RunningOperations => true, - Reason::ConcurrencyLimitReached => true, - Reason::LockAlreadyTaken => true, - // unknown error. better not retry it. 
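With `ttl` and `update_ttl_on_retrieval` moved into each cache entry above, different entries in the same `TimedLru` can now expire on different schedules (the wake-compute error entries later in this patch use a 30-second TTL that is not refreshed on reads). Conceptually, ignoring the LRU bookkeeping, and not the real `TimedLru` code:

```rust
use std::time::{Duration, Instant};

/// Per-entry expiry state, mirroring the fields added to `Entry` above.
struct Entry<V> {
    expires_at: Instant,
    ttl: Duration,
    update_ttl_on_retrieval: bool,
    value: V,
}

impl<V> Entry<V> {
    fn get(&mut self, now: Instant) -> Option<&V> {
        if now >= self.expires_at {
            return None; // expired; the real cache also evicts the slot
        }
        if self.update_ttl_on_retrieval {
            // Sliding expiration: a read pushes the deadline out by this entry's own TTL.
            self.expires_at = now + self.ttl;
        }
        Some(&self.value)
    }
}
```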
- Reason::Unknown => false, - } + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, @@ -199,6 +177,34 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. + Reason::Unknown => false, + } + } } #[derive(Copy, Clone, Debug, Deserialize)] @@ -206,7 +212,7 @@ pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index bec55a83435f..7a9637066fb1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -317,8 +317,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f49567e..a6e67be22f13 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,7 +9,7 @@ use super::{ use crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, @@ -17,10 +17,10 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, @@ -273,26 +273,34 @@ impl super::Api for Api { ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. 
+ debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e362..cfc1f8e89e3f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 8119f39fae6b..5186a9e1b0f1 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -540,8 +540,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c81373c77c7d..4d580e57ed7e 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -126,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. - /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. 
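The wake_compute changes above make the node-info cache hold a Result, so that non-retryable control-plane failures (quota errors and the like) are cached for roughly 30 seconds instead of being re-requested on every connection attempt, while `Reason::can_retry()` decides which errors are transient and therefore must not be cached. A minimal, self-contained sketch of that negative-caching idea, using a plain HashMap with per-entry expiry and stand-in `Node`/`ConsoleError` types rather than the proxy's actual `TimedLru`, `NodeInfo`, or `ConsoleError` (every name in the sketch is illustrative only):

use std::collections::HashMap;
use std::time::{Duration, Instant};

// Stand-in types: the real cache stores the proxy's NodeInfo / ConsoleError.
#[derive(Clone, Debug)]
struct Node { host: String }
#[derive(Clone, Debug)]
struct ConsoleError { reason: String, retryable: bool }

struct TtlCache {
    entries: HashMap<String, (Instant, Result<Node, ConsoleError>)>,
}

impl TtlCache {
    fn new() -> Self {
        Self { entries: HashMap::new() }
    }

    // Return a cached value (positive or negative) if it has not expired yet.
    fn get(&self, key: &str) -> Option<Result<Node, ConsoleError>> {
        match self.entries.get(key) {
            Some((expires_at, value)) if Instant::now() < *expires_at => Some(value.clone()),
            _ => None,
        }
    }

    fn insert(&mut self, key: String, value: Result<Node, ConsoleError>, ttl: Duration) {
        self.entries.insert(key, (Instant::now() + ttl, value));
    }
}

fn wake(cache: &mut TtlCache, key: &str) -> Result<Node, ConsoleError> {
    // Serve both cached successes and cached failures.
    if let Some(cached) = cache.get(key) {
        return cached;
    }
    // Pretend the control plane rejected the request with a quota error.
    let err = ConsoleError { reason: "quota exceeded".into(), retryable: false };
    if !err.retryable {
        // Cache the failure briefly so immediate retries don't hit the control plane again.
        cache.insert(key.to_owned(), Err(err.clone()), Duration::from_secs(30));
    }
    Err(err)
}

fn main() {
    let mut cache = TtlCache::new();
    assert!(wake(&mut cache, "ep-cool-name").is_err()); // first call reaches the "control plane"
    assert!(cache.get("ep-cool-name").is_some());       // second call is served from the cache
}

Only non-retryable failures are stored: a transient "control plane busy" error is expected to clear on its own, so caching it would just delay recovery.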
@@ -446,6 +445,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt @@ -553,16 +565,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) } #[test] diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 9ce1112cec43..f57da5c7cbf1 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -15,12 +15,19 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, @@ -64,11 +71,17 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), @@ -198,11 +211,17 @@ impl GlobalTimelines { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. 
+ if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); @@ -229,7 +248,7 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, @@ -241,6 +260,11 @@ impl GlobalTimelines { // Timeline already exists, return it. return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -300,17 +324,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -339,12 +365,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -354,11 +394,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -374,7 +409,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -420,19 +462,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32cf..f687b24320ce 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -259,7 +259,7 @@ pub(crate) enum BlobDataParseResult { Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? Some((layer_filename, gen)) if gen.len() == 8 => { diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 000000000000..24668b65169a --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,97 @@ +use futures::StreamExt; +use pageserver::tenant::storage_layer::LayerName; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, list_objects_with_retries, + metadata_stream::stream_tenants, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let mut tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + let mut objects = Vec::new(); + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(tenant_shard_id) = tenants.next().await { + let tenant_shard_id = tenant_shard_id?; + let mut tenant_root = target.tenant_root(&tenant_shard_id); + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = 
None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + }) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + object_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + tenant_ctr += 1; + if tenant_ctr % 50 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", objects.len() + ); + } + } + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc0c..6adaa5d38f6b 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed248..10699edd3c94 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,6 +1,7 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; +use storage_scrubber::find_large_objects; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -72,6 +73,12 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + }, } #[tokio::main] @@ -86,6 +93,7 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. 
} => "find-large-objects", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -199,5 +207,15 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::FindLargeObjects { + min_size, + ignore_deltas, + } => { + let summary = + find_large_objects::find_large_objects(bucket_config, min_size, ignore_deltas) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f28..c019cbbc7790 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e1c851435142..5fb4d948175f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -943,6 +943,8 @@ def __exit__( # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1167,7 +1169,9 @@ def __init__(self, config: NeonEnvBuilder): if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) cfg["safekeepers"].append(sk_cfg) @@ -1212,11 +1216,11 @@ def start(self, timeout_in_seconds: Optional[int] = None): for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. 
""" - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -2111,6 +2115,21 @@ def stop(self, immediate: bool = False) -> "NeonStorageController": self.running = False return self + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2451,6 +2470,38 @@ def consistency_check(self): ) log.info("storage controller passed consistency check") + def poll_node_status( + self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + """ + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -3850,9 +3901,17 @@ def create( pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 3da0be802116..03aee9e5c597 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -599,6 +599,22 @@ def timeline_get_lsn_by_timestamp( res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef751d..c437258c6f87 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -42,10 +42,6 @@ def single_timeline( log.info("detach template tenant form 
pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa98..33848b06d35c 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -55,10 +55,6 @@ def setup_template(env: NeonEnv): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab72..b41ae601975f 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1d579214b0c5..60861cf939b8 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. 
+ + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 
+100,7 @@ def record(metric, **kwargs): "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -96,7 +142,7 @@ def setup_wrapper(env: NeonEnv): r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -118,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: @@ -157,8 +199,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. @@ -172,6 +214,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 92e05663ce20..88296a7fbdec 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index a4c8c8ac421a..3a6113706fa9 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,89 @@ import concurrent.futures import random import time +from collections import defaultdict +from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. 
+ This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. + """ + tenants = env.storage_controller.tenant_list() + + intent = dict() + observed = dict() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + observed[t["tenant_shard_id"]] = int(node_id) + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) + + if "attached" in t["intent"]: + intent[t["tenant_shard_id"]] = t["intent"]["attached"] + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + log.info(f"{tenant_placement=}") + + matching = { + tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure @@ -44,7 +115,8 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. @@ -60,14 +132,6 @@ def test_storage_controller_many_tenants( ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. 
ps.allowed_errors.append(".*request was dropped before completing.*") @@ -79,6 +143,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -195,10 +261,44 @@ def check_memory(): env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b836..7c2b1b40e091 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . 
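The test-fixture additions above (`retryable_node_operation`, `poll_node_status`) and the rolling drain/fill loop in the storage-controller scale test all follow the same pattern: poll a node's scheduling policy with a bounded number of attempts and a fixed backoff, tolerating transient API failures along the way. Below is a rough Rust rendering of that polling pattern; the status source is an arbitrary closure and the policy strings are only examples borrowed from the test, not a definitive client for the storage controller API.

use std::thread::sleep;
use std::time::Duration;

/// Poll `get_status` until it returns `desired`, retrying up to `max_attempts`
/// times with a fixed backoff, tolerating transient errors from the status call.
fn poll_until<F>(
    mut get_status: F,
    desired: &str,
    max_attempts: u32,
    backoff: Duration,
) -> Result<(), String>
where
    F: FnMut() -> Result<String, String>,
{
    for attempt in 1..=max_attempts {
        match get_status() {
            Ok(status) if status == desired => return Ok(()),
            Ok(status) => eprintln!("attempt {attempt}: status is {status}, want {desired}"),
            Err(e) => eprintln!("attempt {attempt}: status call failed: {e}"),
        }
        if attempt < max_attempts {
            sleep(backoff);
        }
    }
    Err(format!("node never reached {desired} after {max_attempts} attempts"))
}

fn main() {
    // Simulated node that reaches "PauseForRestart" on the third poll.
    let mut polls = 0;
    let result = poll_until(
        || {
            polls += 1;
            Ok(if polls >= 3 { "PauseForRestart".to_string() } else { "Draining".to_string() })
        },
        "PauseForRestart",
        24,
        Duration::from_millis(10),
    );
    assert!(result.is_ok());
}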
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e68b..099a4ade2c4d 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97ec..32c1c52eea44 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" 
[[package]] name = "cfg-if" @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = 
"b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ -401,11 +389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -438,7 +425,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = 
"0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" 
+version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = 
"852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b0643..27d01810bd52 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb67..3e214de785b3 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 04028388207c..6006e61ee22e 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9ddcc..6e8613095f15 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd02314..a66d09c542f9 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f8e..d6815fbb5fa2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1a0..0e5dfdafcb0a 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9ddb..20bb10f76c37 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eacef..19311808b6b8 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d22..d2bba23d2912 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1edc7..7f3f7f2e84e7 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": "sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", + "engines": { + "node": ">=4" + } + }, "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" }, "node_modules/pg-types": { - "version": 
"2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", "dependencies": { "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" }, "engines": { - "node": ">=4" + "node": ">=10" } }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + 
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cfcd..f791d184c5f8 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e117c2140f5e..f2ee2b70aac6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f6a..922a21a99929 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def op(): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"show {env.initial_tenant}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d00..4d2cdb8e320a 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. 
- pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 000000000000..ae3dded437a0 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515c5..3b2218dd9b09 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f712..6465bdfd217d 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.4'") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = 
cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e51d..41283e4d2ca0 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,7 +4,6 @@ from string import ascii_lowercase import pytest -from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -13,7 +12,7 @@ logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -326,12 +325,17 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. 
- payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. - if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b486502672..e83aaf91c60f 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c0a..7ce38c5c3c82 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None: # validated before restart. 
assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +547,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262e5..37ff923632d2 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def run_pgbench(connstr: str): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8431840dc069..0416078ebc67 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -16,6 +16,8 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -59,7 +61,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. 
We should @@ -73,6 +75,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -83,9 +99,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active @@ -102,6 +115,15 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b26bd3422f30..fac7fe9deef6 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -355,13 +355,6 @@ def churn_while_failpoints_active(result): env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. 
- env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 8267d3f36c0b..d414f986e655 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -1144,10 +1144,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1b294fb2d0aa..d37f7aae3dfd 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -60,11 +60,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. - pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -484,9 +479,6 @@ def handler(request: Request): # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -1054,13 +1046,6 @@ def tenants_placed(): online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. 
- ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", @@ -1518,49 +1503,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - -def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1601,10 +1543,10 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1614,12 +1556,12 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1657,15 +1599,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, max_attempts=3, backoff=2, ) - poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) env.storage_controller.cancel_node_drain(ps_id_to_drain) - poll_node_status(env, ps_id_to_drain, 
"Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d7f396262059..91caad722051 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -54,4 +54,4 @@ def insert_data(pub): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(60, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d7c..80fb2b55b8b2 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index d3fba32a19e0..1d7c8b8e31f0 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -67,8 +67,9 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2056840558e6..b165588636c7 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -349,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. 
Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -422,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9fe732e28806..43e9a0d36e80 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -203,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 6c85ddebbcfb..70e8fe67d595 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,6 +10,7 @@ Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -710,3 +711,90 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare a LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal(lease_res, ro_branch_res) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, perform an action, and insert more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it. 
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start("main", tenant_id=tenant) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch`are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. + flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad367367..04b3fdd80fa5 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,35 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() + + # Failure to write a config to local disk makes the pageserver assume that local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): + + # Storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path + neon_simple_env.storage_controller.allowed_errors.extend( + [ + ".*Reconcile not done yet while creating tenant.*", + ".*Reconcile error: receive body: error sending request.*", + ".*Error processing HTTP request: InternalServerError.*", + ] + ) + + with pytest.raises(Exception, match="error sending request"): _ = neon_simple_env.neon_cli.create_tenant() + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it. + neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. 
- pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() @@ -369,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive.