Performance test for pgvector HNSW index build and queries (#7873)
## Problem

We want to regularly verify the performance of pgvector HNSW parallel
index builds and of parallel similarity search using HNSW indexes.
The first release that considerably improved index-build parallelism was
pgvector 0.7.0, and we want to make sure that we do not regress due to our
Neon compute VM settings (swap, memory overcommit, Postgres configuration, etc.).

## Summary of changes

Prepare a Neon project with 1 million OpenAI vector embeddings (vector
size 1536).
Run HNSW indexing operations in the regression test for the various
distance metrics.
Run similarity queries using pgbench with 100 concurrent clients.
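
For the query phase, the load is driven roughly like this (a sketch only: the
custom script name is illustrative and the exact pgbench flags come from the
workflow and the pgbench helper; only the 100-client figure is taken from above):

```bash
# 100 concurrent clients, each looping over a custom HNSW similarity-search script;
# the 15-minute duration matches TEST_PG_BENCH_DURATIONS_MATRIX in the workflow below.
pgbench --no-vacuum --client=100 --jobs=100 --time=900 \
  --file=pgvector_hnsw_queries.sql \
  "$BENCHMARK_CONNSTR"
```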

I have also added the relevant metrics to the Grafana dashboards pgbench
and olape.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Bodobolero and bayandin committed May 28, 2024
1 parent 4a0ce95 commit fabeff8
Showing 10 changed files with 395 additions and 3 deletions.
100 changes: 97 additions & 3 deletions .github/workflows/benchmarking.yml
@@ -38,6 +38,11 @@ on:
        description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch'
        required: false
        default: false
      run_only_pgvector_tests:
        type: boolean
        description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run'
        required: false
        default: false
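
With this input in place, a pgvector-only run can be dispatched manually. A
minimal sketch using the GitHub CLI (branch selection and any other inputs are
left to the operator):

```bash
# Dispatch the benchmarking workflow, running only the pgvector jobs
gh workflow run benchmarking.yml -f run_only_pgvector_tests=true
```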

defaults:
run:
@@ -50,6 +55,7 @@ concurrency:

jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -120,6 +126,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

generate-matrices:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
@@ -197,6 +204,7 @@ jobs:
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
pgbench-compare:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
needs: [ generate-matrices ]

strategy:
@@ -343,6 +351,92 @@ jobs:
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

  pgbench-pgvector:
    env:
      TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
      TEST_PG_BENCH_SCALES_MATRIX: "1"
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 16
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
      PLATFORM: "neon-captest-pgvector"

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
      options: --init

    steps:
      - uses: actions/checkout@v4

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-release-artifact
          path: /tmp/neon/
          prefix: latest

      - name: Add Postgres binaries to PATH
        run: |
          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
      - name: Set up Connection String
        id: set-up-connstr
        run: |
          CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
          QUERIES=("SELECT version()")
          QUERIES+=("SHOW neon.tenant_id")
          QUERIES+=("SHOW neon.timeline_id")
          for q in "${QUERIES[@]}"; do
            psql ${CONNSTR} -c "${q}"
          done
      - name: Benchmark pgvector hnsw indexing
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance/test_perf_olap.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

      - name: Benchmark pgvector hnsw queries
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

      - name: Create Allure report
        if: ${{ !cancelled() }}
        uses: ./.github/actions/allure-report-generate

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}


clickbench-compare:
# ClickBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -351,7 +445,7 @@ jobs:
#
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, pgbench-compare ]

strategy:
@@ -455,7 +549,7 @@ jobs:
# We might change it after https://github.com/neondatabase/neon/issues/2900.
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, clickbench-compare ]

strategy:
@@ -557,7 +651,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

user-examples-compare:
if: ${{ !cancelled() }}
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
needs: [ generate-matrices, tpch-compare ]

strategy:
1 change: 1 addition & 0 deletions pyproject.toml
@@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api"
exclude = [
"^vendor/",
"^target/",
"test_runner/performance/pgvector/loaddata.py",
]
check_untyped_defs = true
# Help mypy find imports when running against list of individual files.
47 changes: 47 additions & 0 deletions test_runner/performance/pgvector/HNSW_build.sql
@@ -0,0 +1,47 @@

\set ECHO queries
\timing

-- prepare test table
DROP TABLE IF EXISTS hnsw_test_table;
CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
INSERT INTO hnsw_test_table SELECT * FROM documents;
CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create HNSW index for the supported distance metrics
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
-- note: in a second psql session we can monitor the progress of the index build phases using
-- the following query:
-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;

-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'hnsw_test_table'
AND a.attname = 'embeddings';

-- show table sizes
\dt+
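
To run this build script by hand against the prepared project and watch it work
through the build phases, something along these lines is enough (a sketch; the
connection string is a placeholder):

```bash
# Kick off the HNSW index builds (the long-running part)
psql "$CONNSTR" -f test_runner/performance/pgvector/HNSW_build.sql

# Meanwhile, from a second terminal, poll the build phases with the
# pg_stat_progress_create_index query quoted in the comments above:
psql "$CONNSTR" -c "SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS pct
                    FROM pg_stat_progress_create_index;"
```
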
52 changes: 52 additions & 0 deletions test_runner/performance/pgvector/IVFFLAT_build.sql
@@ -0,0 +1,52 @@

\set ECHO queries
\timing

-- prepare test table
DROP TABLE IF EXISTS ivfflat_test_table;
CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
INSERT INTO ivfflat_test_table SELECT * FROM documents;
CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create ivfflat index for the supported distance metrics
-- the formula for lists is # rows / 1000, or sqrt(# rows) if # rows > 1 million
-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents
-- so we use 1000 lists
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table
USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);

\d ivfflat_test_table


-- show all indexes built on the table
SELECT
idx.relname AS index_name,
tbl.relname AS table_name,
am.amname AS access_method,
a.attname AS column_name,
opc.opcname AS operator_class
FROM
pg_index i
JOIN
pg_class idx ON idx.oid = i.indexrelid
JOIN
pg_class tbl ON tbl.oid = i.indrelid
JOIN
pg_am am ON am.oid = idx.relam
JOIN
pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN
pg_opclass opc ON opc.oid = i.indclass[0]
WHERE
tbl.relname = 'ivfflat_test_table'
AND a.attname = 'embeddings';
-- show table sizes
\dt+
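
On the query side, IVFFlat recall is governed by ivfflat.probes (how many of the
1000 lists are scanned per query). A minimal tuning sketch; the connection string
is a placeholder and 32 probes is only a starting point, following the pgvector
guidance of roughly sqrt(lists):

```bash
psql "$CONNSTR" <<'SQL'
SET ivfflat.probes = 32;   -- default is 1; sqrt(1000) is about 32
SELECT title, embeddings <=> (SELECT embeddings FROM ivfflat_test_table LIMIT 1) AS distance
FROM ivfflat_test_table
ORDER BY 2
LIMIT 30;
SQL
```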


38 changes: 38 additions & 0 deletions test_runner/performance/pgvector/README.md
@@ -0,0 +1,38 @@
---
dataset_info:
features:
- name: _id
dtype: string
- name: title
dtype: string
- name: text
dtype: string
- name: text-embedding-3-large-1536-embedding
sequence: float64
splits:
- name: train
num_bytes: 12679725776
num_examples: 1000000
download_size: 9551862565
dataset_size: 12679725776
configs:
- config_name: default
data_files:
- split: train
path: data/train-*
license: mit
task_categories:
- feature-extraction
language:
- en
size_categories:
- 1M<n<10M
---


1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions

- Created: February 2024.
- Text used for Embedding: title (string) + text (string)
- Embedding Model: OpenAI text-embedding-3-large
- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M)
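
Getting these embeddings into the Neon project is a two-step affair: download the
Parquet shards locally, then run the loader script added below. A rough sketch,
assuming the shards are hosted in a Hugging Face dataset repo with the layout
described above (the repo id, connection string, and local paths are placeholders):

```bash
# Fetch the train split (data/train-*.parquet) to a local directory
huggingface-cli download <dataset-repo> --repo-type dataset --local-dir ./embeddings

# Bulk-load every *.parquet file into the `documents` table
python test_runner/performance/pgvector/loaddata.py "postgres://user:pass@host/db" ./embeddings/data
```
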
72 changes: 72 additions & 0 deletions test_runner/performance/pgvector/loaddata.py
@@ -0,0 +1,72 @@
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values


def print_usage():
    print("Usage: loaddata.py <CONNSTR> <DATADIR>")


def main(conn_str, directory_path):
    # Connection to PostgreSQL
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # Run SQL statements
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS documents;")
            cursor.execute(
                """
                CREATE TABLE documents (
                    _id TEXT PRIMARY KEY,
                    title TEXT,
                    text TEXT,
                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
                );
                """
            )
            conn.commit()

            # List and sort Parquet files
            parquet_files = sorted(Path(directory_path).glob("*.parquet"))

            for file in parquet_files:
                print(f"Loading {file} into PostgreSQL")
                df = pd.read_parquet(file)

                print(df.head())

                data_list = [
                    (
                        row["_id"],
                        row["title"],
                        row["text"],
                        np.array(row["text-embedding-3-large-1536-embedding"]),
                    )
                    for index, row in df.iterrows()
                ]
                # Use execute_values to perform batch insertion
                execute_values(
                    cursor,
                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
                    data_list,
                )
                # Commit after we insert all embeddings
                conn.commit()

                print(f"Loaded {file} into PostgreSQL")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print_usage()
        sys.exit(1)

    conn_str = sys.argv[1]
    directory_path = sys.argv[2]
    main(conn_str, directory_path)
@@ -0,0 +1,10 @@
with x (x) as (
select "embeddings" as x
from hnsw_test_table
TABLESAMPLE SYSTEM (1)
LIMIT 1
)
SELECT title, "embeddings" <=> (select x from x) as distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
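
This is the script each pgbench client loops over: TABLESAMPLE SYSTEM (1) samples
roughly 1% of the table's pages at random, so with LIMIT 1 every execution searches
around a different query vector. To check by hand that the ORDER BY ... LIMIT
pattern is served by the HNSW index and to see the effect of the query-time recall
knob, something like this works (a sketch; the connection string is a placeholder
and ef_search = 100 is illustrative, the pgvector default being 40):

```bash
psql "$CONNSTR" <<'SQL'
SET hnsw.ef_search = 100;   -- higher = better recall, slower queries
EXPLAIN (ANALYZE, BUFFERS)
SELECT title, "embeddings" <=> (SELECT "embeddings" FROM hnsw_test_table LIMIT 1) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;
SQL
```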

1 comment on commit fabeff8

@github-actions

3232 tests run: 3086 passed, 0 failed, 146 skipped (full report)


Flaky tests (3)

Postgres 15

  • test_pageserver_restarts_under_worload: release

Postgres 14

  • test_pageserver_restarts_under_worload: release
  • test_vm_bit_clear_on_heap_lock: debug

Code coverage* (full report)

  • functions: 31.4% (6447 of 20543 functions)
  • lines: 48.3% (49949 of 103355 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
fabeff8 at 2024-05-28T12:25:30.845Z :recycle:
