Add new macros for diff calculation, and unit tests (#99)

* Add macro for new hash-based comparison strategy * split out SF-focused version of macro * Fix change to complex object * Fix overuse of star * switch from compare rels to compare queries * provide wrapping parens * switch to array of columns for PK * split unit tests into own files, change unit tests to array pk * tidy up get_comp_bounds * fix arg rename * add quick_are_queries_identical and unit tests * Move data tests into own directory * Add test for multiple PKs * fix incorrect unit test configs * make data types for id and id_2 big enough nums * Mock event_time response * fix hardcoded value in quick_are_qs_identical * Add unit tests for null handling (still broken) * Rename columns to be more unique * Steal surrogate key macro from utils * Use generated surrogate key across the board in place of PK * rm my profile reference * Update quick_are_queries_identical.sql * Add diagram explaining comparison bounds * Add comments explaining warehouse-specific optimisations * cross-db support * subq * no postgres or redshift for a sec * add default var values for compare wrappers * avoid lateral alias reference for BQ * BQ doesn't support count(arg1, arg2) * re-enable redshift * Alias subq for redshift * remove extra comma * add row status of nonunique_pk * remove redundant test and wrapper model * Create json-y tests for snowflake * Add workaround for redshift to support count num rows in status * skip incompatible tests * Fix redshift lack of bool_or support in window funcs * add skip exclusions for everything else * fix incorrect skip tag application * Move user configs to project.yml from profiles * Temporarily disable unpassable redshift tests * add temp skip to circle's config.yml * forgot tag: method * Temporarily skip reworked_compare_all_statuses_different_column_set * Skip another test redshift * disable unsupported tests BQ * postgres too? 
* Fixes for postgres * namespace macros * It's a postgres problem, not a redshift problem * Handle postgres 63 char limit * Add databricks * Rename tests to data_tests * Found a better workaround for missing count distinct window * actually call the macro * disable syntax-failing tests on dbx
dbt-labs · May 27, 2024 · 9da3c51 · 9da3c51
1 parent 8473293
commit 9da3c51
Show file tree

Hide file tree

Showing 45 changed files with 997 additions and 36 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -33,7 +33,7 @@ jobs:
             . dbt_venv/bin/activate
 
             python -m pip install --upgrade pip setuptools
-            python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery
+            python -m pip install --pre dbt-core dbt-postgres dbt-redshift dbt-snowflake dbt-bigquery dbt-databricks
 
             mkdir -p ~/.dbt
             cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml
@@ -52,8 +52,8 @@ jobs:
             dbt deps --target postgres
             dbt seed --target postgres --full-refresh
             dbt compile --target postgres
-            dbt run --target postgres
-            dbt test --target postgres
+            dbt run --target postgres --exclude tag:skip+ tag:temporary_skip+
+            dbt test --target postgres --exclude tag:skip+ tag:temporary_skip+
 
       - run:
           name: "Run Tests - Redshift"
@@ -64,8 +64,8 @@ jobs:
             dbt deps --target redshift
             dbt seed --target redshift --full-refresh
             dbt compile --target redshift
-            dbt run --target redshift
-            dbt test --target redshift
+            dbt run --target redshift --exclude tag:skip+ tag:temporary_skip+
+            dbt test --target redshift --exclude tag:skip+ tag:temporary_skip+
 
       - run:
           name: "Run Tests - Snowflake"
@@ -76,8 +76,8 @@ jobs:
             dbt deps --target snowflake
             dbt seed --target snowflake --full-refresh
             dbt compile --target snowflake
-            dbt run --target snowflake
-            dbt test --target snowflake
+            dbt run --target snowflake --exclude tag:skip+ tag:temporary_skip+
+            dbt test --target snowflake --exclude tag:skip+ tag:temporary_skip+
 
       - run:
           name: "Run Tests - BigQuery"
@@ -91,9 +91,20 @@ jobs:
             dbt deps --target bigquery
             dbt seed --target bigquery --full-refresh
             dbt compile --target bigquery
-            dbt run --target bigquery --full-refresh
-            dbt test --target bigquery
+            dbt run --target bigquery --full-refresh --exclude tag:skip+ tag:temporary_skip+
+            dbt test --target bigquery --exclude tag:skip+ tag:temporary_skip+
 
+      - run:  # Databricks leg: same deps -> seed -> compile -> run -> test sequence as the other warehouse steps
+          name: "Run Tests - Databricks"
+          command: |
+            . dbt_venv/bin/activate
+            echo `pwd`
+            cd integration_tests
+            dbt deps --target databricks
+            dbt seed --target databricks --full-refresh
+            dbt compile --target databricks
+            dbt run --target databricks --exclude tag:skip+ tag:temporary_skip+
+            dbt test --target databricks --exclude tag:skip+ tag:temporary_skip+
 
       - save_cache:
           key: deps1-{{ .Branch }}
@@ -115,3 +126,4 @@ workflows:
             - profile-redshift
             - profile-snowflake
             - profile-bigquery
+            - profile-databricks
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 target/
 dbt_packages/
 logs/
-logfile
+logfile
+.DS_Store
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,21 @@
+{
+    "yaml.schemas": {
+        "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_yml_files-latest.json": [
+            "/**/*.yml",
+            "!profiles.yml",
+            "!dbt_project.yml",
+            "!packages.yml",
+            "!selectors.yml",
+            "!profile_template.yml"
+        ],
+        "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/dbt_project-latest.json": [
+            "dbt_project.yml"
+        ],
+        "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/selectors-latest.json": [
+            "selectors.yml"
+        ],
+        "https://raw.githubusercontent.com/dbt-labs/dbt-jsonschema/main/schemas/latest/packages-latest.json": [
+            "packages.yml"
+        ]
+    }
+}
diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml
@@ -2,10 +2,6 @@
 # HEY! This file is used in the dbt-audit-helper integrations tests with CircleCI.
 # You should __NEVER__ check credentials into version control. Thanks for reading :)
 
-config:
-    send_anonymous_usage_stats: False
-    use_colors: True
-
 integration_tests:
   target: postgres
   outputs:
@@ -27,15 +23,15 @@ integration_tests:
       dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}"
       port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}"
       schema: audit_helper_integration_tests_redshift
-      threads: 1
+      threads: 8
 
     bigquery:
       type: bigquery
       method: service-account
       keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}"
       project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}"
       schema: audit_helper_integration_tests_bigquery
-      threads: 1
+      threads: 8
 
     snowflake:
       type: snowflake
@@ -46,4 +42,12 @@ integration_tests:
       database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}"
       warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}"
       schema: audit_helper_integration_tests_snowflake
-      threads: 1
+      threads: 8
+
+    databricks:  # connection settings are read from the DATABRICKS_TEST_* env vars
+      type: databricks
+      schema: audit_helper_integration_tests_databricks
+      host: "{{ env_var('DATABRICKS_TEST_HOST') }}"
+      http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}"
+      token: "{{ env_var('DATABRICKS_TEST_ACCESS_TOKEN') }}"
+      threads: 10
diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
@@ -17,3 +17,15 @@ clean-targets:         # directories to be removed by `dbt clean`
 
 seeds:
   +quote_columns: false
+
+vars:
+  compare_queries_summarize: true  # default read via var() by the unit_compare_queries wrapper model
+  reworked_compare__primary_key_columns: ['col1']
+  reworked_compare__columns: ['col1']
+  reworked_compare__event_time: null  # explicit null: no event_time unless overridden
+  quick_are_queries_identical_cols: ['col1']  # default read by unit_quick_are_queries_identical
+  quick_are_queries_identical_event_time: null  # explicit null: no event_time unless overridden
+
+flags:
+  send_anonymous_usage_stats: false
+  use_colors: true
diff --git a/...re_all_columns_concat_pk_with_summary.sql → ...re_all_columns_concat_pk_with_summary.sql b/...re_all_columns_concat_pk_with_summary.sql → ...re_all_columns_concat_pk_with_summary.sql
diff --git a/...all_columns_concat_pk_without_summary.sql → ...all_columns_concat_pk_without_summary.sql b/...all_columns_concat_pk_without_summary.sql → ...all_columns_concat_pk_without_summary.sql
diff --git a/...dels/compare_all_columns_where_clause.sql → ...ests/compare_all_columns_where_clause.sql b/...dels/compare_all_columns_where_clause.sql → ...ests/compare_all_columns_where_clause.sql
diff --git a/...dels/compare_all_columns_with_summary.sql → ...ests/compare_all_columns_with_summary.sql b/...dels/compare_all_columns_with_summary.sql → ...ests/compare_all_columns_with_summary.sql
diff --git a/..._all_columns_with_summary_and_exclude.sql → ..._all_columns_with_summary_and_exclude.sql b/..._all_columns_with_summary_and_exclude.sql → ..._all_columns_with_summary_and_exclude.sql
diff --git a/...s/compare_all_columns_without_summary.sql → ...s/compare_all_columns_without_summary.sql b/...s/compare_all_columns_without_summary.sql → ...s/compare_all_columns_without_summary.sql
diff --git a/integration_tests/models/compare_queries.sql → ...sts/models/data_tests/compare_queries.sql b/integration_tests/models/compare_queries.sql → ...sts/models/data_tests/compare_queries.sql
diff --git a/...are_queries_concat_pk_without_summary.sql → ...are_queries_concat_pk_without_summary.sql b/...are_queries_concat_pk_without_summary.sql → ...are_queries_concat_pk_without_summary.sql
diff --git a/...s/models/compare_queries_with_summary.sql → ...ta_tests/compare_queries_with_summary.sql b/...s/models/compare_queries_with_summary.sql → ...ta_tests/compare_queries_with_summary.sql
diff --git a/...odels/compare_queries_without_summary.sql → ...tests/compare_queries_without_summary.sql b/...odels/compare_queries_without_summary.sql → ...tests/compare_queries_without_summary.sql
diff --git a/...tests/models/compare_relation_columns.sql → ...s/data_tests/compare_relation_columns.sql b/...tests/models/compare_relation_columns.sql → ...s/data_tests/compare_relation_columns.sql
diff --git a/...e_relations_concat_pk_without_summary.sql → ...e_relations_concat_pk_without_summary.sql b/...e_relations_concat_pk_without_summary.sql → ...e_relations_concat_pk_without_summary.sql
diff --git a/...models/compare_relations_with_exclude.sql → ..._tests/compare_relations_with_exclude.sql b/...models/compare_relations_with_exclude.sql → ..._tests/compare_relations_with_exclude.sql
diff --git a/...models/compare_relations_with_summary.sql → ..._tests/compare_relations_with_summary.sql b/...models/compare_relations_with_summary.sql → ..._tests/compare_relations_with_summary.sql
diff --git a/...els/compare_relations_without_exclude.sql → ...sts/compare_relations_without_exclude.sql b/...els/compare_relations_without_exclude.sql → ...sts/compare_relations_without_exclude.sql
diff --git a/...els/compare_relations_without_summary.sql → ...sts/compare_relations_without_summary.sql b/...els/compare_relations_without_summary.sql → ...sts/compare_relations_without_summary.sql
diff --git a/...ation_tests/models/compare_row_counts.sql → .../models/data_tests/compare_row_counts.sql b/...ation_tests/models/compare_row_counts.sql → .../models/data_tests/compare_row_counts.sql
diff --git a/...s/models/compare_which_columns_differ.sql → ...ta_tests/compare_which_columns_differ.sql b/...s/models/compare_which_columns_differ.sql → ...ta_tests/compare_which_columns_differ.sql
diff --git a/...are_which_columns_differ_exclude_cols.sql → ...are_which_columns_differ_exclude_cols.sql b/...are_which_columns_differ_exclude_cols.sql → ...are_which_columns_differ_exclude_cols.sql
diff --git a/integration_tests/models/schema.yml → ...ration_tests/models/data_tests/schema.yml b/integration_tests/models/schema.yml → ...ration_tests/models/data_tests/schema.yml
@@ -2,96 +2,96 @@ version: 2
 
 models:
   - name: compare_queries
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_relations_without_exclude')
 
   - name: compare_queries_concat_pk_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_without_summary')
 
   - name: compare_queries_with_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_with_summary')
 
   - name: compare_queries_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_without_summary')
 
   - name: compare_relations_with_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_with_summary')
 
   - name: compare_relations_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_without_summary')
 
   - name: compare_relations_with_exclude
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_relations_with_exclude')
 
   - name: compare_relations_without_exclude
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_relations_without_exclude')
 
   - name: compare_all_columns_with_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_with_summary')
 
   - name: compare_all_columns_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_without_summary')
 
   - name: compare_all_columns_concat_pk_with_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_concat_pk_with_summary')
 
   - name: compare_all_columns_concat_pk_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_concat_pk_without_summary')
 
   - name: compare_all_columns_with_summary_and_exclude
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_with_summary_and_exclude')
 
   - name: compare_all_columns_where_clause
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_all_columns_where_clause')
 
   - name: compare_relation_columns
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_relation_columns')
 
   - name: compare_relations_concat_pk_without_summary
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_without_summary')
 
   - name: compare_which_columns_differ
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_which_columns_differ')
 
   - name: compare_which_columns_differ_exclude_cols
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_which_columns_differ_exclude_cols')
 
   - name: compare_row_counts
-    tests:
+    data_tests:
       - dbt_utils.equality:
           compare_model: ref('expected_results__compare_row_counts')
diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_a.sql
@@ -0,0 +1 @@
+{# Placeholder row so the model builds; unit tests that take this model as an input supply their own mocked rows. #} select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at
diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_model_b.sql
@@ -0,0 +1 @@
+{# Placeholder row so the model builds; unit tests that take this model as an input supply their own mocked rows. #} select 12 as id, 22 as id_2, 'xyz' as col1, 'tuv' as col2, 123 as col3, {{ dbt.current_timestamp() }} as created_at
diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_a.sql
@@ -0,0 +1,3 @@
+{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }}
+{# Tagged 'skip' on the targets listed above (CI excludes tag:skip+); object_construct is presumably unavailable there and supported on the remaining targets, e.g. Snowflake - TODO confirm. #}
+select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2
diff --git a/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql b/integration_tests/models/unit_test_placeholder_models/unit_test_struct_model_b.sql
@@ -0,0 +1,3 @@
+{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }}
+{# Tagged 'skip' on the targets listed above (CI excludes tag:skip+); object_construct is presumably unavailable there and supported on the remaining targets, e.g. Snowflake - TODO confirm. #}
+select 1 as id, 'John Doe' as col1, object_construct('street', '123 Main St', 'city', 'Anytown', 'state', 'CA') as col2
diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql b/integration_tests/models/unit_test_wrappers/unit_compare_queries.sql
@@ -0,0 +1,8 @@
+{# Wrapper model for unit tests: passes "select *" over unit_test_model_a and unit_test_model_b to audit_helper.compare_queries; summarize comes from the compare_queries_summarize var so each unit test can override it. #}
+{{ 
+    audit_helper.compare_queries(
+        "select * from " ~ ref('unit_test_model_a'),
+        "select * from " ~ ref('unit_test_model_b'),
+        summarize = var('compare_queries_summarize')
+    ) 
+}}
diff --git a/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml b/integration_tests/models/unit_test_wrappers/unit_compare_queries.yml
@@ -0,0 +1,47 @@
+unit_tests:
+  - name: identical_records_compare_queries  # summarize on: identical inputs give one in_a/in_b row
+    model: unit_compare_queries
+    description: "The world's most basic unit test."
+
+    given:  # both inputs are mocked with the same three rows
+      - input: ref('unit_test_model_a')
+        rows:
+          - {id: 1, col1: abc, col2: def}
+          - {id: 2, col1: hij, col2: klm}
+          - {id: 3, col1: nop, col2: qrs}
+      - input: ref('unit_test_model_b')
+        rows:
+          - {id: 1, col1: abc, col2: def}
+          - {id: 2, col1: hij, col2: klm}
+          - {id: 3, col1: nop, col2: qrs}
+
+    expect:
+      rows:
+        - {in_a: true, in_b: true}
+
+    overrides:
+      vars:
+        compare_queries_summarize: true
+
+  - name: identical_records_compare_queries_no_summarize  # summarize off: identical inputs give no rows
+    model: unit_compare_queries
+    description: "The world's second most basic unit test."
+
+    given:  # both inputs are mocked with the same three rows
+      - input: ref('unit_test_model_a')
+        rows:
+          - {id: 1, col1: abc, col2: def}
+          - {id: 2, col1: hij, col2: klm}
+          - {id: 3, col1: nop, col2: qrs}
+      - input: ref('unit_test_model_b')
+        rows:
+          - {id: 1, col1: abc, col2: def}
+          - {id: 2, col1: hij, col2: klm}
+          - {id: 3, col1: nop, col2: qrs}
+
+    expect:
+      rows: []
+
+    overrides:
+      vars:
+        compare_queries_summarize: false
diff --git a/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql b/integration_tests/models/unit_test_wrappers/unit_quick_are_queries_identical.sql
@@ -0,0 +1,10 @@
+{{ config(tags=['skip' if (target.type in ['redshift', 'bigquery', 'postgres', 'databricks']) else 'runnable']) }}
+{# Wrapper model for unit tests around audit_helper.quick_are_queries_identical; columns and event_time come from project vars. Tagged 'skip' on the targets listed above, which CI excludes via tag:skip+. #}
+{{ 
+    audit_helper.quick_are_queries_identical(
+        "select * from " ~ ref('unit_test_model_a'),
+        "select * from " ~ ref('unit_test_model_b'),
+        columns=var('quick_are_queries_identical_cols'),
+        event_time=var('quick_are_queries_identical_event_time')
+    ) 
+}}