From 54b30646f3f61a437297f24ab410d460cbf6b7ba Mon Sep 17 00:00:00 2001 From: Gian Merlino Date: Tue, 6 Feb 2024 03:02:05 -0800 Subject: [PATCH] Add sqlReverseLookupThreshold for ReverseLookupRule. (#15832) If lots of keys map to the same value, reversing a LOOKUP call can slow things down unacceptably. To protect against this, this patch introduces a parameter sqlReverseLookupThreshold representing the maximum size of an IN filter that will be created as part of lookup reversal. If inSubQueryThreshold is set to a smaller value than sqlReverseLookupThreshold, then inSubQueryThreshold will be used instead. This allows users to use that single parameter to control IN sizes if they wish. --- .../benchmark/lookup/LookupBenchmarkUtil.java | 2 +- .../lookup/SqlReverseLookupBenchmark.java | 16 +++++---- docs/querying/lookups.md | 4 +-- docs/querying/sql-query-context.md | 3 +- .../sql/calcite/rule/ReverseLookupRule.java | 12 +++++-- .../CalciteLookupFunctionQueryTest.java | 33 +++++++++++++++++-- 6 files changed, 56 insertions(+), 14 deletions(-) diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/LookupBenchmarkUtil.java b/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/LookupBenchmarkUtil.java index 95957fdd3bda..72a809e32fef 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/LookupBenchmarkUtil.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/LookupBenchmarkUtil.java @@ -77,7 +77,7 @@ public LookupExtractor build(Iterable> keyValuePairs) return new MapLookupExtractor(map, false); } }, - REVERSIBLE { + IMMUTABLE { @Override public LookupExtractor build(Iterable> keyValuePairs) { diff --git a/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/SqlReverseLookupBenchmark.java b/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/SqlReverseLookupBenchmark.java index c2131c252c88..bd4dcd4d9ad8 100644 --- a/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/SqlReverseLookupBenchmark.java +++ b/benchmarks/src/test/java/org/apache/druid/benchmark/lookup/SqlReverseLookupBenchmark.java @@ -72,7 +72,7 @@ public class SqlReverseLookupBenchmark /** * Type of lookup to benchmark. All are members of enum {@link LookupBenchmarkUtil.LookupType}. */ - @Param({"hashmap"}) + @Param({"hashmap", "immutable"}) private String lookupType; /** @@ -84,7 +84,7 @@ public class SqlReverseLookupBenchmark /** * Average number of keys that map to each value. */ - @Param({"100000", "200000", "400000", "800000", "1600000"}) + @Param({"1000", "5000", "10000", "100000"}) private int keysPerValue; private SqlEngine engine; @@ -133,8 +133,10 @@ public void tearDown() throws Exception @OutputTimeUnit(TimeUnit.MILLISECONDS) public void planEquals(Blackhole blackhole) { - final String sql = - "SELECT COUNT(*) FROM foo WHERE LOOKUP(dimZipf, 'benchmark-lookup', 'N/A') = '0'"; + final String sql = StringUtils.format( + "SELECT COUNT(*) FROM foo WHERE LOOKUP(dimZipf, 'benchmark-lookup', 'N/A') = '%s'", + LookupBenchmarkUtil.makeKeyOrValue(0) + ); try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) { final PlannerResult plannerResult = planner.plan(); blackhole.consume(plannerResult); @@ -146,8 +148,10 @@ public void planEquals(Blackhole blackhole) @OutputTimeUnit(TimeUnit.MILLISECONDS) public void planNotEquals(Blackhole blackhole) { - final String sql = - "SELECT COUNT(*) FROM foo WHERE LOOKUP(dimZipf, 'benchmark-lookup', 'N/A') <> '0'"; + final String sql = StringUtils.format( + "SELECT COUNT(*) FROM foo WHERE LOOKUP(dimZipf, 'benchmark-lookup', 'N/A') <> '%s'", + LookupBenchmarkUtil.makeKeyOrValue(0) + ); try (final DruidPlanner planner = plannerFactory.createPlannerForTesting(engine, sql, ImmutableMap.of())) { final PlannerResult plannerResult = planner.plan(); blackhole.consume(plannerResult); diff --git a/docs/querying/lookups.md b/docs/querying/lookups.md index aac7abd4cd63..bbc1b03faca8 100644 --- a/docs/querying/lookups.md +++ b/docs/querying/lookups.md @@ -155,8 +155,8 @@ You can see the difference in the native query that is generated during SQL plan can retrieve with [`EXPLAIN PLAN FOR`](sql.md#explain-plan). When a lookup is reversed in this way, the `lookup` function disappears and is replaced by a simpler filter, typically of type `equals` or `in`. -Lookups are not reversed if the number of matching keys exceeds the [`inSubQueryThreshold`](sql-query-context.md) for -the query. +Lookups are not reversed if the number of matching keys exceeds the [`sqlReverseLookupThreshold`](sql-query-context.md) +or [`inSubQueryThreshold`](sql-query-context.md) for the query. This rewrite adds some planning time that may become noticeable for larger lookups, especially if many keys map to the same value. You can see the impact on planning time in the `sqlQuery/planningTimeMs` metric. You can also measure the diff --git a/docs/querying/sql-query-context.md b/docs/querying/sql-query-context.md index f90f995e58f9..f8b1576a913b 100644 --- a/docs/querying/sql-query-context.md +++ b/docs/querying/sql-query-context.md @@ -47,7 +47,8 @@ Configure Druid SQL query planning using the parameters in the table below. |`useNativeQueryExplain`|If true, `EXPLAIN PLAN FOR` will return the explain plan as a JSON representation of equivalent native query(s), else it will return the original version of explain plan generated by Calcite.

This property is provided for backwards compatibility. It is not recommended to use this parameter unless you were depending on the older behavior.|`druid.sql.planner.useNativeQueryExplain` on the Broker (default: true)| |`sqlFinalizeOuterSketches`|If false (default behavior in Druid 25.0.0 and later), `DS_HLL`, `DS_THETA`, and `DS_QUANTILES_SKETCH` return sketches in query results, as documented. If true (default behavior in Druid 24.0.1 and earlier), sketches from these functions are finalized when they appear in query results.

This property is provided for backwards compatibility with behavior in Druid 24.0.1 and earlier. It is not recommended to use this parameter unless you were depending on the older behavior. Instead, use a function that does not return a sketch, such as `APPROX_COUNT_DISTINCT_DS_HLL`, `APPROX_COUNT_DISTINCT_DS_THETA`, `APPROX_QUANTILE_DS`, `DS_THETA_ESTIMATE`, or `DS_GET_QUANTILE`.|`druid.query.default.context.sqlFinalizeOuterSketches` on the Broker (default: false)| |`sqlUseBoundAndSelectors`|If false (default behavior if `druid.generic.useDefaultValueForNull=false` in Druid 27.0.0 and later), the SQL planner will use [equality](./filters.md#equality-filter), [null](./filters.md#null-filter), and [range](./filters.md#range-filter) filters instead of [selector](./filters.md#selector-filter) and [bounds](./filters.md#bound-filter). This value must be set to `false` for correct behavior for filtering `ARRAY` typed values. | Defaults to same value as `druid.generic.useDefaultValueForNull`, which is `false`| -|`sqlReverseLookup`|Whether to consider the [reverse-lookup rewrite](lookups.md#reverse-lookup) of the `LOOKUP` function during SQL planning.|true| +|`sqlReverseLookup`|Whether to consider the [reverse-lookup rewrite](lookups.md#reverse-lookup) of the `LOOKUP` function during SQL planning.

Calls to `LOOKUP` are only reversed when the number of matching keys is lower than both `inSubQueryThreshold` and `sqlReverseLookupThreshold`.|true| +|`sqlReverseLookupThreshold`|Maximum size of `IN` filter to create when applying a [reverse-lookup rewrite](lookups.md#reverse-lookup). If a `LOOKUP` call matches more keys than this threshold, it is left as-is.

If `inSubQueryThreshold` is lower than `sqlReverseLookupThreshold`, the `inSubQueryThreshold` is used as the threshold instead.|10000| |`sqlPullUpLookup`|Whether to consider the [pull-up rewrite](lookups.md#pull-up) of the `LOOKUP` function during SQL planning.|true| |`enableJoinLeftTableScanDirect`|`false`|This flag applies to queries which have joins. For joins, where left child is a simple scan with a filter, by default, druid will run the scan as a query and the join the results to the right child on broker. Setting this flag to true overrides that behavior and druid will attempt to push the join to data servers instead. Please note that the flag could be applicable to queries even if there is no explicit join. since queries can internally translated into a join by the SQL planner.| |`maxNumericInFilters`|`-1`|Max limit for the amount of numeric values that can be compared for a string type dimension when the entire SQL WHERE clause of a query translates only to an [OR](../querying/filters.md#or) of [Bound filter](../querying/filters.md#bound-filter). By default, Druid does not restrict the amount of of numeric Bound Filters on String columns, although this situation may block other queries from running. Set this parameter to a smaller value to prevent Druid from running queries that have prohibitively long segment processing times. The optimal limit requires some trial and error; we recommend starting with 100. Users who submit a query that exceeds the limit of `maxNumericInFilters` should instead rewrite their queries to use strings in the `WHERE` clause instead of numbers. For example, `WHERE someString IN (‘123’, ‘456’)`. This value cannot exceed the set system configuration `druid.sql.planner.maxNumericInFilters`. This value is ignored if `druid.sql.planner.maxNumericInFilters` is not set explicitly.| diff --git a/sql/src/main/java/org/apache/druid/sql/calcite/rule/ReverseLookupRule.java b/sql/src/main/java/org/apache/druid/sql/calcite/rule/ReverseLookupRule.java index 3deacaee355d..f53cc3e282a0 100644 --- a/sql/src/main/java/org/apache/druid/sql/calcite/rule/ReverseLookupRule.java +++ b/sql/src/main/java/org/apache/druid/sql/calcite/rule/ReverseLookupRule.java @@ -83,10 +83,15 @@ public class ReverseLookupRule extends RelOptRule implements SubstitutionRule */ public static final String CTX_MAX_OPTIMIZE_COUNT = "maxOptimizeCountForDruidReverseLookupRule"; + /** + * Context parameter to prevent creating too-large IN filters as a result of reverse lookups. + */ + public static final String CTX_THRESHOLD = "sqlReverseLookupThreshold"; + /** * Context parameter for tests, to allow us to force the case where we avoid creating a bunch of ORs. */ - public static final String CTX_MAX_IN_SIZE = "maxInSizeForDruidReverseLookupRule"; + public static final int DEFAULT_THRESHOLD = 10000; private final PlannerContext plannerContext; @@ -103,7 +108,10 @@ public void onMatch(RelOptRuleCall call) final int maxOptimizeCount = plannerContext.queryContext().getInt(CTX_MAX_OPTIMIZE_COUNT, Integer.MAX_VALUE); final int maxInSize = - plannerContext.queryContext().getInt(CTX_MAX_IN_SIZE, plannerContext.queryContext().getInSubQueryThreshold()); + Math.min( + plannerContext.queryContext().getInSubQueryThreshold(), + plannerContext.queryContext().getInt(CTX_THRESHOLD, DEFAULT_THRESHOLD) + ); final ReverseLookupShuttle reverseLookupShuttle = new ReverseLookupShuttle( plannerContext, filter.getCluster().getRexBuilder(), diff --git a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteLookupFunctionQueryTest.java b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteLookupFunctionQueryTest.java index fe8b454b4c0b..41885eabb499 100644 --- a/sql/src/test/java/org/apache/druid/sql/calcite/CalciteLookupFunctionQueryTest.java +++ b/sql/src/test/java/org/apache/druid/sql/calcite/CalciteLookupFunctionQueryTest.java @@ -29,6 +29,7 @@ import org.apache.druid.query.Druids; import org.apache.druid.query.InlineDataSource; import org.apache.druid.query.Query; +import org.apache.druid.query.QueryContexts; import org.apache.druid.query.QueryDataSource; import org.apache.druid.query.aggregation.CountAggregatorFactory; import org.apache.druid.query.aggregation.FilteredAggregatorFactory; @@ -381,12 +382,12 @@ public void testFilterInOverMaxSize() { cannotVectorize(); - // Set CTX_MAX_IN_SIZE = 1 to stop the LOOKUP call from being reversed. + // Set sqlReverseLookupThreshold = 1 to stop the LOOKUP call from being reversed. final ImmutableMap queryContext = ImmutableMap.builder() .putAll(QUERY_CONTEXT_DEFAULT) .put(PlannerContext.CTX_SQL_REVERSE_LOOKUP, true) - .put(ReverseLookupRule.CTX_MAX_IN_SIZE, 1) + .put(ReverseLookupRule.CTX_THRESHOLD, 1) .build(); testQuery( @@ -404,6 +405,34 @@ public void testFilterInOverMaxSize() ); } + @Test + public void testFilterInOverMaxSize2() + { + cannotVectorize(); + + // Set inSubQueryThreshold = 1 to stop the LOOKUP call from being reversed. + final ImmutableMap queryContext = + ImmutableMap.builder() + .putAll(QUERY_CONTEXT_DEFAULT) + .put(PlannerContext.CTX_SQL_REVERSE_LOOKUP, true) + .put(QueryContexts.IN_SUB_QUERY_THRESHOLD_KEY, 1) + .build(); + + testQuery( + buildFilterTestSql("LOOKUP(dim1, 'lookyloo') = 'xabc' OR LOOKUP(dim1, 'lookyloo') = 'x6'"), + queryContext, + NullHandling.sqlCompatible() + ? buildFilterTestExpectedQuery( + expressionVirtualColumn("v0", "lookup(\"dim1\",'lookyloo')", ColumnType.STRING), + in("v0", ImmutableList.of("x6", "xabc"), null) + ) + : buildFilterTestExpectedQuery( + in("dim1", ImmutableList.of("x6", "xabc"), EXTRACTION_FN) + ), + ImmutableList.of(new Object[]{"xabc", 1L}) + ); + } + @Test public void testFilterInOrIsNull() {