Skip to content

Commit

Permalink
Encode plan when inserting to dp table
Browse files Browse the repository at this point in the history
  • Loading branch information
andyfengHKU committed Mar 23, 2023
1 parent b999827 commit 11910ae
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 109 deletions.
4 changes: 2 additions & 2 deletions src/binder/query/query_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ namespace binder {

// Hash functor for SubqueryGraph keys.
// When no query rel is selected (the rel selector has no bit set) the hash is taken over the
// node selector bitset; otherwise it is taken over the rel selector bitset alone.
std::size_t SubqueryGraphHasher::operator()(const SubqueryGraph& key) const {
if (0 == key.queryRelsSelector.count()) {
return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryNodesSelector);
return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryNodesSelector);
}
return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryRelsSelector);
return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryRelsSelector);
}

bool SubqueryGraph::containAllVariables(std::unordered_set<std::string>& variables) const {
Expand Down
6 changes: 3 additions & 3 deletions src/include/binder/query/reading_clause/query_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
namespace kuzu {
namespace binder {

const uint8_t MAX_NUM_VARIABLES = 64;
constexpr static uint8_t MAX_NUM_QUERY_VARIABLES = 64;

class QueryGraph;
struct SubqueryGraph;
Expand All @@ -25,8 +25,8 @@ struct SubqueryGraphHasher {
struct SubqueryGraph {

const QueryGraph& queryGraph;
std::bitset<MAX_NUM_VARIABLES> queryNodesSelector;
std::bitset<MAX_NUM_VARIABLES> queryRelsSelector;
std::bitset<MAX_NUM_QUERY_VARIABLES> queryNodesSelector;
std::bitset<MAX_NUM_QUERY_VARIABLES> queryRelsSelector;

explicit SubqueryGraph(const QueryGraph& queryGraph) : queryGraph{queryGraph} {}

Expand Down
14 changes: 0 additions & 14 deletions src/include/planner/logical_plan/logical_operator/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ using f_group_pos = uint32_t;
using f_group_pos_set = std::unordered_set<f_group_pos>;
constexpr f_group_pos INVALID_F_GROUP_POS = UINT32_MAX;

class Schema;
struct SchemaHasher;
struct SchemaApproximateEquality;
template<typename T>
using schema_map_t = std::unordered_map<Schema*, T, SchemaHasher, SchemaApproximateEquality>;

class FactorizationGroup {
friend class Schema;

Expand Down Expand Up @@ -139,14 +133,6 @@ class Schema {
binder::expression_vector expressionsInScope;
};

// Hash functor over Schema pointers; supplied as the hasher of schema_map_t.
struct SchemaHasher {
std::size_t operator()(const Schema* const& schema) const;
};

// Key-equality functor over Schema pointers; supplied as the key-equality predicate of
// schema_map_t. As the name says, the comparison is approximate rather than a full structural
// equality of the two schemas.
struct SchemaApproximateEquality {
bool operator()(const Schema* const& left, const Schema* const& right) const;
};

class SchemaUtils {
public:
static std::vector<binder::expression_vector> getExpressionsPerGroup(
Expand Down
73 changes: 56 additions & 17 deletions src/include/planner/subplans_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,62 @@ namespace kuzu {
namespace planner {

const uint64_t MAX_LEVEL_TO_PLAN_EXACTLY = 7;
const uint64_t MAX_NUM_SUBGRAPHS_PER_LEVEL = 50;
const uint64_t MAX_NUM_PLANS_PER_SUBGRAPH = 50;

class SubPlansTable {
struct PlanSet;
// Each dp level is a map from sub query graph to a set of plans
using dp_level_t = subquery_graph_V_map_t<std::unique_ptr<PlanSet>>;
// Different from the vanilla dp algorithm, where one optimal plan is kept per subgraph, we keep
// multiple plans, each with a different factorization structure. The following example explains
// our rationale.
// Given a triangle with an outgoing edge
// MATCH (a)->(b)->(c), (a)->(c), (c)->(d)
// At level 3 (assume level is based on num of nodes) for subgraph "abc", if we ignore the
// factorization structure, the 3 plans that intersect on "a", "b", or "c" are considered
// homogeneous and only one of them will be picked.
// Then at level 4 for subgraph "abcd", we know the plan that intersects on "c" will be worse
// because we need to further flatten it and extend to "d".
// Therefore, we try to be factorization aware when keeping optimal plans.
// The set of candidate plans kept for one subquery graph. Every plan is summarised by a
// "plan code" (see encodePlan) and at most one plan is stored per distinct code.
class SubgraphPlans {
public:
    // Records which node IDs of the subgraph take part in plan encoding.
    // explicit: a SubqueryGraph must never convert silently into a SubgraphPlans.
    explicit SubgraphPlans(const SubqueryGraph& subqueryGraph);

    // Offers a plan to this set and takes ownership of it. The plan is stored if its code is
    // new; if a plan with the same code is already stored, the cheaper of the two is kept.
    void addPlan(std::unique_ptr<LogicalPlan> plan);

    // Mutable access to the stored plans.
    std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }

private:
    // To balance computation time, we encode a plan by only considering the flat information of
    // the nodes that are involved in the current subgraph.
    std::bitset<MAX_NUM_QUERY_VARIABLES> encodePlan(const LogicalPlan& plan);

private:
    // Internal ID properties of the query nodes selected by the subgraph; bit i of a plan code
    // describes nodeIDsToEncode[i].
    binder::expression_vector nodeIDsToEncode;
    // Bound on the number of plans kept for one subgraph.
    constexpr static uint32_t MAX_NUM_PLANS = 10;
    std::vector<std::unique_ptr<LogicalPlan>> plans;
    // Plan code -> index into `plans` of the plan currently kept for that code.
    std::unordered_map<std::bitset<MAX_NUM_QUERY_VARIABLES>, common::vector_idx_t> planCode2planIdx;
};

// A DPLevel is a collection of plans per subgraph. All subgraphs in one level should have the
// same number of variables.
class DPLevel {
public:
    // Whether a plan set exists for the given subgraph. Read-only, hence const.
    bool contains(const SubqueryGraph& subqueryGraph) const {
        return subgraph2Plans.contains(subqueryGraph);
    }

    // Returns the plan set of the given subgraph. The lookup uses at(), so the subgraph must
    // already be present in this level; callers are expected to check contains() first.
    SubgraphPlans* getSubgraphPlans(const SubqueryGraph& subqueryGraph) {
        return subgraph2Plans.at(subqueryGraph).get();
    }

    // Returns the subgraphs that currently have a plan set in this level.
    std::vector<SubqueryGraph> getSubqueryGraphs();

    // Offers a plan for the given subgraph, creating the subgraph's plan set on first use.
    void addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan);

    // Drops every subgraph and all of its plans.
    void clear() { subgraph2Plans.clear(); }

private:
    // Bound on the number of subgraphs tracked in one level.
    constexpr static uint32_t MAX_NUM_SUBGRAPH = 50;
    subquery_graph_V_map_t<std::unique_ptr<SubgraphPlans>> subgraph2Plans;
};

class SubPlansTable {
public:
void resize(uint32_t newSize);

Expand All @@ -35,21 +83,12 @@ class SubPlansTable {
void clear();

private:
struct PlanSet {
std::vector<std::unique_ptr<LogicalPlan>> plans;
schema_map_t<common::vector_idx_t> schemaToPlanIdx;

inline std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }

void addPlan(std::unique_ptr<LogicalPlan> plan);
};

dp_level_t* getDPLevel(const SubqueryGraph& subqueryGraph) const {
DPLevel* getDPLevel(const SubqueryGraph& subqueryGraph) const {
return dpLevels[subqueryGraph.getTotalNumVariables()].get();
}

private:
std::vector<std::unique_ptr<dp_level_t>> dpLevels;
std::vector<std::unique_ptr<DPLevel>> dpLevels;
};

} // namespace planner
Expand Down
7 changes: 5 additions & 2 deletions src/planner/join_order_enumerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,11 @@ void JoinOrderEnumerator::planWCOJoin(const SubqueryGraph& subgraph,
relSubgraph.addQueryRel(relPos);
assert(context->subPlansTable->containSubgraphPlans(relSubgraph));
auto& relPlanCandidates = context->subPlansTable->getSubgraphPlans(relSubgraph);
assert(relPlanCandidates.size() == 2); // 2 directions
relPlans.push_back(getWCOJBuildPlanForRel(relPlanCandidates, *boundNode));
auto relPlan = getWCOJBuildPlanForRel(relPlanCandidates, *boundNode);
if (relPlan == nullptr) { // Cannot find a suitable rel plan.
return;
}
relPlans.push_back(std::move(relPlan));
}
auto predicates =
getNewlyMatchedExpressions(prevSubgraphs, newSubgraph, context->getWhereExpressions());
Expand Down
37 changes: 0 additions & 37 deletions src/planner/operator/schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,43 +107,6 @@ size_t Schema::getNumGroups(bool isFlat) const {
return result;
}

// Hashes a schema by two counts only: its number of flat groups and its number of unFlat
// groups. The two per-count hashes are combined with XOR.
std::size_t SchemaHasher::operator()(const Schema* const& schema) const {
    const std::hash<size_t> countHasher{};
    const auto flatPart = countHasher(schema->getNumFlatGroups());
    const auto unFlatPart = countHasher(schema->getNumUnFlatGroups());
    return flatPart ^ unFlatPart;
}

// Equality used during join order enumeration so that, at each DP level, we keep the best plan
// per distinct factorization schema rather than a single best plan overall.
// To keep enumeration cheap the comparison is approximate. Two schemas are treated as equal when
// - they have the same number of factorization groups,
// - they have the same number of unFlat factorization groups,
// - they have the same number of expressions in scope, and
// - every expression in scope on the left is also in scope on the right and sits in a group with
//   the same flat/unFlat flag on both sides.
bool SchemaApproximateEquality::operator()(
    const Schema* const& left, const Schema* const& right) const {
    // Cheap count-based rejections first, evaluated in the same order as listed above.
    if (left->getNumGroups() != right->getNumGroups() ||
        left->getNumUnFlatGroups() != right->getNumUnFlatGroups() ||
        left->getExpressionsInScope().size() != right->getExpressionsInScope().size()) {
        return false;
    }
    for (auto& expr : left->getExpressionsInScope()) {
        if (!right->isExpressionInScope(*expr)) {
            return false;
        }
        const auto posInLeft = left->getGroupPos(*expr);
        const auto posInRight = right->getGroupPos(*expr);
        const auto flatInLeft = left->getGroup(posInLeft)->isFlat();
        const auto flatInRight = right->getGroup(posInRight)->isFlat();
        if (flatInLeft != flatInRight) {
            return false;
        }
    }
    return true;
}

std::vector<binder::expression_vector> SchemaUtils::getExpressionsPerGroup(
const binder::expression_vector& expressions, const Schema& schema) {
std::vector<binder::expression_vector> result;
Expand Down
92 changes: 58 additions & 34 deletions src/planner/subplans_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,65 @@
namespace kuzu {
namespace planner {

// Walks every query node position of the underlying query graph and, for each position selected
// by the subgraph's node selector, remembers that node's internal ID property. The remembered
// expressions are the ones later inspected by encodePlan.
SubgraphPlans::SubgraphPlans(const kuzu::binder::SubqueryGraph& subqueryGraph) {
    const auto& graph = subqueryGraph.queryGraph;
    const auto numNodes = graph.getNumQueryNodes();
    for (auto nodePos = 0u; nodePos < numNodes; ++nodePos) {
        if (!subqueryGraph.queryNodesSelector[nodePos]) {
            continue; // node is not part of this subgraph
        }
        nodeIDsToEncode.push_back(graph.getQueryNode(nodePos)->getInternalIDProperty());
    }
}

// Offers a plan to this subgraph's plan set, taking ownership of it.
// - If a plan with the same plan code is already stored, the cheaper of the two is kept. This
//   never grows the set, so it is allowed even when the set is full.
// - Otherwise the plan is appended, unless MAX_NUM_PLANS plans are already stored, in which case
//   it is dropped.
// The previous `plans.size() > MAX_NUM_PLANS` guard admitted MAX_NUM_PLANS + 1 plans and, once
// tripped, also rejected cheaper replacements for codes that were already stored.
void SubgraphPlans::addPlan(std::unique_ptr<LogicalPlan> plan) {
    const auto planCode = encodePlan(*plan);
    const auto existing = planCode2planIdx.find(planCode);
    if (existing != planCode2planIdx.end()) {
        auto& incumbent = plans[existing->second];
        if (plan->getCost() < incumbent->getCost()) {
            incumbent = std::move(plan);
        }
        return;
    }
    if (plans.size() >= MAX_NUM_PLANS) {
        return; // set is full; do not admit a new plan code
    }
    planCode2planIdx.emplace(planCode, plans.size());
    plans.push_back(std::move(plan));
}

std::bitset<MAX_NUM_QUERY_VARIABLES> SubgraphPlans::encodePlan(const LogicalPlan& plan) {
auto schema = plan.getSchema();
std::bitset<MAX_NUM_QUERY_VARIABLES> result;
result.reset();
for (auto i = 0; i < nodeIDsToEncode.size(); ++i) {
result[i] = schema->getGroup(schema->getGroupPos(*nodeIDsToEncode[i]))->isFlat();
}
return result;
}

std::vector<SubqueryGraph> DPLevel::getSubqueryGraphs() {
std::vector<SubqueryGraph> result;
for (auto& [subGraph, _] : subgraph2Plans) {
result.push_back(subGraph);
}
return result;
}

// Offers a plan for the given subgraph, taking ownership of it.
// - If the subgraph is already tracked in this level, the plan is forwarded to its plan set.
// - Otherwise a plan set is created first, unless the level already tracks MAX_NUM_SUBGRAPH
//   subgraphs, in which case the plan is dropped.
// The previous `size() > MAX_NUM_SUBGRAPH` guard admitted MAX_NUM_SUBGRAPH + 1 subgraphs and,
// once tripped, also discarded plans for subgraphs the level was already tracking.
void DPLevel::addPlan(
    const kuzu::binder::SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
    auto entry = subgraph2Plans.find(subqueryGraph);
    if (entry == subgraph2Plans.end()) {
        if (subgraph2Plans.size() >= MAX_NUM_SUBGRAPH) {
            return; // level is full; do not start tracking another subgraph
        }
        entry =
            subgraph2Plans.emplace(subqueryGraph, std::make_unique<SubgraphPlans>(subqueryGraph))
                .first;
    }
    entry->second->addPlan(std::move(plan));
}

// Resizes the table to newSize DP levels. Slots that already existed are left as they are; every
// slot added by the resize is filled with a freshly allocated, empty level object.
void SubPlansTable::resize(uint32_t newSize) {
auto prevSize = dpLevels.size();
dpLevels.resize(newSize);
// Only the newly created tail [prevSize, newSize) needs a level object.
for (auto i = prevSize; i < newSize; ++i) {
dpLevels[i] = std::make_unique<dp_level_t>();
dpLevels[i] = std::make_unique<DPLevel>();
}
}

Expand All @@ -21,26 +75,16 @@ std::vector<std::unique_ptr<LogicalPlan>>& SubPlansTable::getSubgraphPlans(
const SubqueryGraph& subqueryGraph) {
auto dpLevel = getDPLevel(subqueryGraph);
KU_ASSERT(dpLevel->contains(subqueryGraph));
return dpLevel->at(subqueryGraph)->getPlans();
return dpLevel->getSubgraphPlans(subqueryGraph)->getPlans();
}

// Returns copies of the subquery graphs stored at the given DP level. `level` indexes dpLevels
// directly and is not range-checked here.
std::vector<SubqueryGraph> SubPlansTable::getSubqueryGraphs(uint32_t level) {
std::vector<SubqueryGraph> result;
for (auto& [subGraph, _] : *dpLevels[level]) {
result.push_back(subGraph);
}
return result;
return dpLevels[level]->getSubqueryGraphs();
}

// Adds a plan for the given subquery graph, taking ownership of it. The plan goes to the DP
// level returned by getDPLevel for that subquery graph.
void SubPlansTable::addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
auto dpLevel = getDPLevel(subqueryGraph);
if (dpLevel->size() > MAX_NUM_SUBGRAPHS_PER_LEVEL) {
return;
}
if (!dpLevel->contains(subqueryGraph)) {
dpLevel->emplace(subqueryGraph, std::make_unique<PlanSet>());
}
dpLevel->at(subqueryGraph)->addPlan(std::move(plan));
dpLevel->addPlan(subqueryGraph, std::move(plan));
}

void SubPlansTable::clear() {
Expand All @@ -49,25 +93,5 @@ void SubPlansTable::clear() {
}
}

// Offers a plan to this plan set, taking ownership of it. Nothing is accepted once
// MAX_NUM_PLANS_PER_SUBGRAPH plans are stored. Otherwise:
// - a plan whose factorization schema is new (per the map's equality) is appended;
// - a plan whose schema matches a stored one replaces it only if it is strictly cheaper.
void SubPlansTable::PlanSet::addPlan(std::unique_ptr<LogicalPlan> plan) {
    if (plans.size() >= MAX_NUM_PLANS_PER_SUBGRAPH) {
        return;
    }
    auto schema = plan->getSchema();
    auto existing = schemaToPlanIdx.find(schema);
    if (existing == schemaToPlanIdx.end()) { // add plan if this is a new factorization schema
        schemaToPlanIdx.insert({schema, plans.size()});
        plans.push_back(std::move(plan));
        return;
    }
    // swap plan for lower cost under the same factorization schema
    const auto idx = existing->second;
    assert(idx < MAX_NUM_PLANS_PER_SUBGRAPH);
    if (plans[idx]->getCost() > plan->getCost()) {
        // The stored key is the Schema* obtained from the plan being replaced. Remove that key
        // BEFORE the old plan is destroyed: erasing afterwards makes the map hash/compare
        // against a pointer whose pointee presumably died with the old plan (assumes the plan
        // owns its schema - confirm against LogicalPlan).
        schemaToPlanIdx.erase(existing);
        plans[idx] = std::move(plan);
        schemaToPlanIdx.insert({schema, idx});
    }
}

} // namespace planner
} // namespace kuzu

0 comments on commit 11910ae

Please sign in to comment.