kuzudb · andyfengHKU · Mar 24, 2023 · Mar 23, 2023
diff --git a/src/binder/query/query_graph.cpp b/src/binder/query/query_graph.cpp
@@ -5,9 +5,9 @@ namespace binder {
 
 std::size_t SubqueryGraphHasher::operator()(const SubqueryGraph& key) const {
     if (0 == key.queryRelsSelector.count()) {
-        return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryNodesSelector);
+        return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryNodesSelector);
     }
-    return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryRelsSelector);
+    return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryRelsSelector);
 }
 
 bool SubqueryGraph::containAllVariables(std::unordered_set<std::string>& variables) const {

diff --git a/src/include/binder/query/reading_clause/query_graph.h b/src/include/binder/query/reading_clause/query_graph.h
@@ -8,7 +8,7 @@
 namespace kuzu {
 namespace binder {
 
-const uint8_t MAX_NUM_VARIABLES = 64;
+constexpr static uint8_t MAX_NUM_QUERY_VARIABLES = 64;
 
 class QueryGraph;
 struct SubqueryGraph;
@@ -25,8 +25,8 @@ struct SubqueryGraphHasher {
 struct SubqueryGraph {
 
     const QueryGraph& queryGraph;
-    std::bitset<MAX_NUM_VARIABLES> queryNodesSelector;
-    std::bitset<MAX_NUM_VARIABLES> queryRelsSelector;
+    std::bitset<MAX_NUM_QUERY_VARIABLES> queryNodesSelector;
+    std::bitset<MAX_NUM_QUERY_VARIABLES> queryRelsSelector;
 
     explicit SubqueryGraph(const QueryGraph& queryGraph) : queryGraph{queryGraph} {}
 

diff --git a/src/include/planner/logical_plan/logical_operator/schema.h b/src/include/planner/logical_plan/logical_operator/schema.h
@@ -11,12 +11,6 @@ using f_group_pos = uint32_t;
 using f_group_pos_set = std::unordered_set<f_group_pos>;
 constexpr f_group_pos INVALID_F_GROUP_POS = UINT32_MAX;
 
-class Schema;
-struct SchemaHasher;
-struct SchemaApproximateEquality;
-template<typename T>
-using schema_map_t = std::unordered_map<Schema*, T, SchemaHasher, SchemaApproximateEquality>;
-
 class FactorizationGroup {
     friend class Schema;
 
@@ -139,14 +133,6 @@ class Schema {
     binder::expression_vector expressionsInScope;
 };
 
-struct SchemaHasher {
-    std::size_t operator()(const Schema* const& schema) const;
-};
-
-struct SchemaApproximateEquality {
-    bool operator()(const Schema* const& left, const Schema* const& right) const;
-};
-
 class SchemaUtils {
 public:
     static std::vector<binder::expression_vector> getExpressionsPerGroup(

diff --git a/src/include/planner/subplans_table.h b/src/include/planner/subplans_table.h
@@ -13,14 +13,67 @@ namespace kuzu {
 namespace planner {
 
 const uint64_t MAX_LEVEL_TO_PLAN_EXACTLY = 7;
-const uint64_t MAX_NUM_SUBGRAPHS_PER_LEVEL = 50;
-const uint64_t MAX_NUM_PLANS_PER_SUBGRAPH = 50;
 
-class SubPlansTable {
-    struct PlanSet;
-    // Each dp level is a map from sub query graph to a set of plans
-    using dp_level_t = subquery_graph_V_map_t<std::unique_ptr<PlanSet>>;
+// Different from vanilla dp algorithm where one optimal plan is kept per subgraph, we keep multiple
+// plans each with a different factorization structure. The following example will explain our
+// rationale.
+// Given a triangle with an outgoing edge
+// MATCH (a)->(b)->(c), (a)->(c), (c)->(d)
+// At level 3 (assume level is based on num of nodes) for subgraph "abc", if we ignore factorization
+// structure, the 3 plans that intersect on "a", "b", or "c" are considered homogeneous and one of
+// them will be picked.
+// Then at level 4 for subgraph "abcd", we know the plan intersecting on "c" will be worse because
+// we need to further flatten it and extend to "d".
+// Therefore, we try to be factorization aware when keeping optimal plans.
+class SubgraphPlans {
+public:
+    explicit SubgraphPlans(const SubqueryGraph& subqueryGraph);
+
+    void addPlan(std::unique_ptr<LogicalPlan> plan); // takes ownership of plan
+
+    std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }
+
+private:
+    // To balance computation time, we encode plan by only considering the flat information of the
+    // nodes that are involved in current subgraph.
+    std::bitset<MAX_NUM_QUERY_VARIABLES> encodePlan(const LogicalPlan& plan);
+
+private:
+    constexpr static uint32_t MAX_NUM_PLANS = 10; // cap checked in addPlan()
 
+private:
+    binder::expression_vector nodeIDsToEncode; // internal IDs of the subgraph's nodes
+    std::vector<std::unique_ptr<LogicalPlan>> plans; // plans kept for this subgraph
+    std::unordered_map<std::bitset<MAX_NUM_QUERY_VARIABLES>, common::vector_idx_t>
+        encodedPlan2PlanIdx; // plan code (see encodePlan) -> index into plans
+};
+
+// A DPLevel is a collection of plans per subgraph. All subgraphs should have the same number of
+// variables.
+class DPLevel {
+public:
+    inline bool contains(const SubqueryGraph& subqueryGraph) const {
+        return subgraph2Plans.contains(subqueryGraph);
+    }
+
+    inline SubgraphPlans* getSubgraphPlans(const SubqueryGraph& subqueryGraph) {
+        return subgraph2Plans.at(subqueryGraph).get(); // non-owning; check contains() first
+    }
+
+    std::vector<SubqueryGraph> getSubqueryGraphs(); // copies of the subgraphs at this level
+
+    void addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan);
+
+    inline void clear() { subgraph2Plans.clear(); }
+
+private:
+    constexpr static uint32_t MAX_NUM_SUBGRAPH = 50; // cap checked in addPlan()
+
+private:
+    subquery_graph_V_map_t<std::unique_ptr<SubgraphPlans>> subgraph2Plans;
+};
+
+class SubPlansTable {
 public:
     void resize(uint32_t newSize);
 
@@ -35,21 +88,12 @@ class SubPlansTable {
     void clear();
 
 private:
-    struct PlanSet {
-        std::vector<std::unique_ptr<LogicalPlan>> plans;
-        schema_map_t<common::vector_idx_t> schemaToPlanIdx;
-
-        inline std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }
-
-        void addPlan(std::unique_ptr<LogicalPlan> plan);
-    };
-
-    dp_level_t* getDPLevel(const SubqueryGraph& subqueryGraph) const {
+    DPLevel* getDPLevel(const SubqueryGraph& subqueryGraph) const {
         return dpLevels[subqueryGraph.getTotalNumVariables()].get();
     }
 
 private:
-    std::vector<std::unique_ptr<dp_level_t>> dpLevels;
+    std::vector<std::unique_ptr<DPLevel>> dpLevels;
 };
 
 } // namespace planner

diff --git a/src/planner/join_order_enumerator.cpp b/src/planner/join_order_enumerator.cpp
@@ -262,8 +262,11 @@ void JoinOrderEnumerator::planWCOJoin(const SubqueryGraph& subgraph,
         relSubgraph.addQueryRel(relPos);
         assert(context->subPlansTable->containSubgraphPlans(relSubgraph));
         auto& relPlanCandidates = context->subPlansTable->getSubgraphPlans(relSubgraph);
-        assert(relPlanCandidates.size() == 2); // 2 directions
-        relPlans.push_back(getWCOJBuildPlanForRel(relPlanCandidates, *boundNode));
+        auto relPlan = getWCOJBuildPlanForRel(relPlanCandidates, *boundNode);
+        if (relPlan == nullptr) { // Cannot find a suitable rel plan.
+            return;
+        }
+        relPlans.push_back(std::move(relPlan));
     }
     auto predicates =
         getNewlyMatchedExpressions(prevSubgraphs, newSubgraph, context->getWhereExpressions());

diff --git a/src/planner/operator/schema.cpp b/src/planner/operator/schema.cpp
@@ -107,43 +107,6 @@ size_t Schema::getNumGroups(bool isFlat) const {
     return result;
 }
 
-std::size_t SchemaHasher::operator()(const Schema* const& schema) const {
-    return std::hash<size_t>{}(schema->getNumFlatGroups()) ^
-           std::hash<size_t>{}(schema->getNumUnFlatGroups());
-}
-
-// We use this equality in join order enumeration to make sure at each DP level, we don't just keep
-// the best plan, but keep best plan for each unique factorization schema.
-// In order to balance enumeration time, we use an approximate equality check to reduce computation.
-// We check the following
-// - number of factorization groups
-// - number of unFlat factorization groups
-// - number of expressions
-// - if an expression has the same flat/unFlat flag in both schemas
-bool SchemaApproximateEquality::operator()(
-    const Schema* const& left, const Schema* const& right) const {
-    if (left->getNumGroups() != right->getNumGroups()) {
-        return false;
-    }
-    if (left->getNumUnFlatGroups() != right->getNumUnFlatGroups()) {
-        return false;
-    }
-    if (left->getExpressionsInScope().size() != right->getExpressionsInScope().size()) {
-        return false;
-    }
-    for (auto& expression : left->getExpressionsInScope()) {
-        if (!right->isExpressionInScope(*expression)) {
-            return false;
-        }
-        auto leftGroupPos = left->getGroupPos(*expression);
-        auto rightGroupPos = right->getGroupPos(*expression);
-        if (left->getGroup(leftGroupPos)->isFlat() != right->getGroup(rightGroupPos)->isFlat()) {
-            return false;
-        }
-    }
-    return true;
-}
-
 std::vector<binder::expression_vector> SchemaUtils::getExpressionsPerGroup(
     const binder::expression_vector& expressions, const Schema& schema) {
     std::vector<binder::expression_vector> result;

diff --git a/src/planner/subplans_table.cpp b/src/planner/subplans_table.cpp
@@ -5,11 +5,65 @@
 namespace kuzu {
 namespace planner {
 
+SubgraphPlans::SubgraphPlans(const kuzu::binder::SubqueryGraph& subqueryGraph) {
+    const auto& queryGraph = subqueryGraph.queryGraph;
+    for (auto nodePos = 0u; nodePos < queryGraph.getNumQueryNodes(); ++nodePos) {
+        if (subqueryGraph.queryNodesSelector[nodePos]) { // node belongs to this subgraph
+            nodeIDsToEncode.push_back(queryGraph.getQueryNode(nodePos)->getInternalIDProperty());
+        }
+    }
+}
+
+void SubgraphPlans::addPlan(std::unique_ptr<LogicalPlan> plan) {
+    const auto planCode = encodePlan(*plan);
+    const auto it = encodedPlan2PlanIdx.find(planCode);
+    if (it != encodedPlan2PlanIdx.end()) { // known encoding: keep only the cheaper plan
+        if (plan->getCost() < plans[it->second]->getCost()) {
+            plans[it->second] = std::move(plan);
+        }
+        return;
+    }
+    if (plans.size() >= MAX_NUM_PLANS) { // full: the cap only limits distinct encodings
+        return;
+    }
+    encodedPlan2PlanIdx.insert({planCode, plans.size()});
+    plans.push_back(std::move(plan));
+}
+
+std::bitset<MAX_NUM_QUERY_VARIABLES> SubgraphPlans::encodePlan(const LogicalPlan& plan) {
+    auto schema = plan.getSchema();
+    std::bitset<MAX_NUM_QUERY_VARIABLES> result; // default-constructed: all bits are zero
+    KU_ASSERT(nodeIDsToEncode.size() <= MAX_NUM_QUERY_VARIABLES); // one bit per node ID
+    for (auto i = 0u; i < nodeIDsToEncode.size(); ++i) { // bit i set <=> i-th node ID is flat
+        result[i] = schema->getGroup(schema->getGroupPos(*nodeIDsToEncode[i]))->isFlat();
+    }
+    return result;
+}
+
+std::vector<SubqueryGraph> DPLevel::getSubqueryGraphs() {
+    std::vector<SubqueryGraph> subqueryGraphs; // copies of every key stored at this level
+    for (const auto& entry : subgraph2Plans) {
+        subqueryGraphs.push_back(entry.first);
+    }
+    return subqueryGraphs;
+}
+
+void DPLevel::addPlan(
+    const kuzu::binder::SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
+    if (!contains(subqueryGraph)) {
+        if (subgraph2Plans.size() >= MAX_NUM_SUBGRAPH) { // level is full: drop new subgraphs
+            return;
+        }
+        subgraph2Plans.insert({subqueryGraph, std::make_unique<SubgraphPlans>(subqueryGraph)});
+    }
+    subgraph2Plans.at(subqueryGraph)->addPlan(std::move(plan)); // admitted subgraphs keep updating
+}
+
 void SubPlansTable::resize(uint32_t newSize) {
     auto prevSize = dpLevels.size();
     dpLevels.resize(newSize);
     for (auto i = prevSize; i < newSize; ++i) {
-        dpLevels[i] = std::make_unique<dp_level_t>();
+        dpLevels[i] = std::make_unique<DPLevel>();
     }
 }
 
@@ -21,26 +75,16 @@ std::vector<std::unique_ptr<LogicalPlan>>& SubPlansTable::getSubgraphPlans(
     const SubqueryGraph& subqueryGraph) {
     auto dpLevel = getDPLevel(subqueryGraph);
     KU_ASSERT(dpLevel->contains(subqueryGraph));
-    return dpLevel->at(subqueryGraph)->getPlans();
+    return dpLevel->getSubgraphPlans(subqueryGraph)->getPlans();
 }
 
 std::vector<SubqueryGraph> SubPlansTable::getSubqueryGraphs(uint32_t level) {
-    std::vector<SubqueryGraph> result;
-    for (auto& [subGraph, _] : *dpLevels[level]) {
-        result.push_back(subGraph);
-    }
-    return result;
+    return dpLevels[level]->getSubqueryGraphs();
 }
 
 void SubPlansTable::addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
     auto dpLevel = getDPLevel(subqueryGraph);
-    if (dpLevel->size() > MAX_NUM_SUBGRAPHS_PER_LEVEL) {
-        return;
-    }
-    if (!dpLevel->contains(subqueryGraph)) {
-        dpLevel->emplace(subqueryGraph, std::make_unique<PlanSet>());
-    }
-    dpLevel->at(subqueryGraph)->addPlan(std::move(plan));
+    dpLevel->addPlan(subqueryGraph, std::move(plan));
 }
 
 void SubPlansTable::clear() {
@@ -49,25 +93,5 @@ void SubPlansTable::clear() {
     }
 }
 
-void SubPlansTable::PlanSet::addPlan(std::unique_ptr<LogicalPlan> plan) {
-    if (plans.size() >= MAX_NUM_PLANS_PER_SUBGRAPH) {
-        return;
-    }
-    auto schema = plan->getSchema();
-    if (!schemaToPlanIdx.contains(schema)) { // add plan if this is a new factorization schema
-        schemaToPlanIdx.insert({schema, plans.size()});
-        plans.push_back(std::move(plan));
-    } else { // swap plan for lower cost under the same factorization schema
-        auto idx = schemaToPlanIdx.at(schema);
-        assert(idx < MAX_NUM_PLANS_PER_SUBGRAPH);
-        auto currentPlan = plans[idx].get();
-        if (currentPlan->getCost() > plan->getCost()) {
-            plans[idx] = std::move(plan);
-            schemaToPlanIdx.erase(schema);
-            schemaToPlanIdx.insert({schema, idx});
-        }
-    }
-}
-
 } // namespace planner
 } // namespace kuzu