Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Encode plan when inserting to dp table #1406

Merged
merged 1 commit into from
Mar 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/binder/query/query_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ namespace binder {

// Hash functor for SubqueryGraph: hashes the node selector bitset when no rel bit is set,
// otherwise hashes the rel selector bitset.
// NOTE(review): each return below appears twice, once per bitset-size constant name. This looks
// like the before/after lines of a rename as rendered by the diff view — confirm against the file.
std::size_t SubqueryGraphHasher::operator()(const SubqueryGraph& key) const {
if (0 == key.queryRelsSelector.count()) {
// No query rel is selected, so fall back to the node selector.
return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryNodesSelector);
return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryNodesSelector);
}
return std::hash<std::bitset<MAX_NUM_VARIABLES>>{}(key.queryRelsSelector);
return std::hash<std::bitset<MAX_NUM_QUERY_VARIABLES>>{}(key.queryRelsSelector);
}

bool SubqueryGraph::containAllVariables(std::unordered_set<std::string>& variables) const {
Expand Down
6 changes: 3 additions & 3 deletions src/include/binder/query/reading_clause/query_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
namespace kuzu {
namespace binder {

const uint8_t MAX_NUM_VARIABLES = 64;
constexpr static uint8_t MAX_NUM_QUERY_VARIABLES = 64;

class QueryGraph;
struct SubqueryGraph;
Expand All @@ -25,8 +25,8 @@ struct SubqueryGraphHasher {
struct SubqueryGraph {

const QueryGraph& queryGraph;
std::bitset<MAX_NUM_VARIABLES> queryNodesSelector;
std::bitset<MAX_NUM_VARIABLES> queryRelsSelector;
std::bitset<MAX_NUM_QUERY_VARIABLES> queryNodesSelector;
std::bitset<MAX_NUM_QUERY_VARIABLES> queryRelsSelector;

explicit SubqueryGraph(const QueryGraph& queryGraph) : queryGraph{queryGraph} {}

Expand Down
14 changes: 0 additions & 14 deletions src/include/planner/logical_plan/logical_operator/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,6 @@ using f_group_pos = uint32_t;
using f_group_pos_set = std::unordered_set<f_group_pos>;
constexpr f_group_pos INVALID_F_GROUP_POS = UINT32_MAX;

class Schema;
struct SchemaHasher;
struct SchemaApproximateEquality;
template<typename T>
using schema_map_t = std::unordered_map<Schema*, T, SchemaHasher, SchemaApproximateEquality>;

class FactorizationGroup {
friend class Schema;

Expand Down Expand Up @@ -139,14 +133,6 @@ class Schema {
binder::expression_vector expressionsInScope;
};

// Hash functor over Schema pointers; operator() is only declared here.
struct SchemaHasher {
std::size_t operator()(const Schema* const& schema) const;
};

// Equality functor over Schema pointers; operator() is only declared here.
// NOTE(review): the name suggests the comparison is approximate rather than exact — confirm
// against the definition before relying on it for strict equality.
struct SchemaApproximateEquality {
bool operator()(const Schema* const& left, const Schema* const& right) const;
};

class SchemaUtils {
public:
static std::vector<binder::expression_vector> getExpressionsPerGroup(
Expand Down
78 changes: 61 additions & 17 deletions src/include/planner/subplans_table.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,67 @@ namespace kuzu {
namespace planner {

const uint64_t MAX_LEVEL_TO_PLAN_EXACTLY = 7;
const uint64_t MAX_NUM_SUBGRAPHS_PER_LEVEL = 50;
const uint64_t MAX_NUM_PLANS_PER_SUBGRAPH = 50;

class SubPlansTable {
struct PlanSet;
// Each dp level is a map from sub query graph to a set of plans
using dp_level_t = subquery_graph_V_map_t<std::unique_ptr<PlanSet>>;
// Different from the vanilla dp algorithm, where one optimal plan is kept per subgraph, we keep
// multiple plans, each with a different factorization structure. The following example explains
// our rationale.
// Given a triangle with an outgoing edge
// MATCH (a)->(b)->(c), (a)->(c), (c)->(d)
// At level 3 (assuming the level is based on the number of nodes) for subgraph "abc", if we ignore
// the factorization structure, the 3 plans that intersect on "a", "b", or "c" are considered
// homogeneous and only one of them will be picked.
// Then at level 4 for subgraph "abcd", we know the plan that intersects on "c" will be worse
// because we need to further flatten it and extend to "d".
// Therefore, we try to be factorization-aware when keeping optimal plans.
// Holds the candidate logical plans kept for one subquery graph, indexed by a bitset encoding of
// each plan.
class SubgraphPlans {
public:
// Records, for every query node selected in subqueryGraph, the internal ID property expression
// that encodePlan() later inspects.
SubgraphPlans(const SubqueryGraph& subqueryGraph);

// Offers a plan to this set. A plan whose encoding is not yet present is appended; otherwise it
// replaces the stored plan with the same encoding only if its cost is lower. Nothing is added once
// the number of stored plans exceeds MAX_NUM_PLANS.
void addPlan(std::unique_ptr<LogicalPlan> plan);

// Returns a mutable reference to the stored plans.
std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }

private:
// To balance computation time, we encode a plan by only considering the flat/unflat information
// of the nodes that are involved in the current subgraph. Bit i of the result is the flat flag of
// the factorization group holding nodeIDsToEncode[i].
std::bitset<MAX_NUM_QUERY_VARIABLES> encodePlan(const LogicalPlan& plan);

private:
// Threshold on the number of stored plans checked by addPlan().
constexpr static uint32_t MAX_NUM_PLANS = 10;
andyfengHKU marked this conversation as resolved.
Show resolved Hide resolved

private:
// Internal ID property expressions of the subgraph's selected nodes, in node-position order.
binder::expression_vector nodeIDsToEncode;
// Stored plans; indices into this vector are the values of encodedPlan2PlanIdx.
std::vector<std::unique_ptr<LogicalPlan>> plans;
// Maps a plan encoding to the index in `plans` of the plan kept for that encoding.
std::unordered_map<std::bitset<MAX_NUM_QUERY_VARIABLES>, common::vector_idx_t>
encodedPlan2PlanIdx;
};

// A DPLevel is a collection of plans per subgraph. All subgraphs in a level should have the same
// number of variables.
class DPLevel {
public:
// Returns true if plans have been registered for subqueryGraph at this level.
inline bool contains(const SubqueryGraph& subqueryGraph) {
return subgraph2Plans.contains(subqueryGraph);
}

// Returns the plan set for subqueryGraph. Uses at(), so the subgraph must already be present;
// callers are expected to check contains() first.
inline SubgraphPlans* getSubgraphPlans(const SubqueryGraph& subqueryGraph) {
return subgraph2Plans.at(subqueryGraph).get();
}

// Returns a copy of every subquery graph that has plans at this level.
std::vector<SubqueryGraph> getSubqueryGraphs();

// Registers plan under subqueryGraph, creating the subgraph's plan set on first use. Nothing is
// added once the number of subgraphs exceeds MAX_NUM_SUBGRAPH.
void addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan);

// Drops every subgraph and its plans.
inline void clear() { subgraph2Plans.clear(); }

private:
// Threshold on the number of subgraphs checked by addPlan().
constexpr static uint32_t MAX_NUM_SUBGRAPH = 50;
andyfengHKU marked this conversation as resolved.
Show resolved Hide resolved

private:
// Plan set owned per subquery graph.
subquery_graph_V_map_t<std::unique_ptr<SubgraphPlans>> subgraph2Plans;
};

class SubPlansTable {
public:
void resize(uint32_t newSize);

Expand All @@ -35,21 +88,12 @@ class SubPlansTable {
void clear();

private:
struct PlanSet {
std::vector<std::unique_ptr<LogicalPlan>> plans;
schema_map_t<common::vector_idx_t> schemaToPlanIdx;

inline std::vector<std::unique_ptr<LogicalPlan>>& getPlans() { return plans; }

void addPlan(std::unique_ptr<LogicalPlan> plan);
};

dp_level_t* getDPLevel(const SubqueryGraph& subqueryGraph) const {
DPLevel* getDPLevel(const SubqueryGraph& subqueryGraph) const {
return dpLevels[subqueryGraph.getTotalNumVariables()].get();
}

private:
std::vector<std::unique_ptr<dp_level_t>> dpLevels;
std::vector<std::unique_ptr<DPLevel>> dpLevels;
};

} // namespace planner
Expand Down
7 changes: 5 additions & 2 deletions src/planner/join_order_enumerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,11 @@ void JoinOrderEnumerator::planWCOJoin(const SubqueryGraph& subgraph,
relSubgraph.addQueryRel(relPos);
assert(context->subPlansTable->containSubgraphPlans(relSubgraph));
auto& relPlanCandidates = context->subPlansTable->getSubgraphPlans(relSubgraph);
assert(relPlanCandidates.size() == 2); // 2 directions
relPlans.push_back(getWCOJBuildPlanForRel(relPlanCandidates, *boundNode));
auto relPlan = getWCOJBuildPlanForRel(relPlanCandidates, *boundNode);
if (relPlan == nullptr) { // Cannot find a suitable rel plan.
return;
}
relPlans.push_back(std::move(relPlan));
}
auto predicates =
getNewlyMatchedExpressions(prevSubgraphs, newSubgraph, context->getWhereExpressions());
Expand Down
37 changes: 0 additions & 37 deletions src/planner/operator/schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,43 +107,6 @@ size_t Schema::getNumGroups(bool isFlat) const {
return result;
}

// Hashes a schema by XOR-ing the hashes of its flat and unflat group counts.
std::size_t SchemaHasher::operator()(const Schema* const& schema) const {
    const std::hash<size_t> countHasher{};
    const auto flatGroupsHash = countHasher(schema->getNumFlatGroups());
    const auto unFlatGroupsHash = countHasher(schema->getNumUnFlatGroups());
    return flatGroupsHash ^ unFlatGroupsHash;
}

// Approximate schema equality used during join order enumeration so that each DP level keeps the
// best plan per distinct factorization schema rather than a single best plan overall.
// To keep enumeration cheap, two schemas are treated as equal when all of the following match:
// - the number of factorization groups
// - the number of unFlat factorization groups
// - the number of expressions in scope
// - for every expression in scope on the left, the right also has it in scope and its group has
//   the same flat/unFlat flag
bool SchemaApproximateEquality::operator()(
    const Schema* const& left, const Schema* const& right) const {
    // Cheap count comparisons first; short-circuiting keeps the original evaluation order.
    const bool sameCounts =
        left->getNumGroups() == right->getNumGroups() &&
        left->getNumUnFlatGroups() == right->getNumUnFlatGroups() &&
        left->getExpressionsInScope().size() == right->getExpressionsInScope().size();
    if (!sameCounts) {
        return false;
    }
    for (auto& expr : left->getExpressionsInScope()) {
        if (!right->isExpressionInScope(*expr)) {
            return false;
        }
        const bool leftIsFlat = left->getGroup(left->getGroupPos(*expr))->isFlat();
        const bool rightIsFlat = right->getGroup(right->getGroupPos(*expr))->isFlat();
        if (leftIsFlat != rightIsFlat) {
            return false;
        }
    }
    return true;
}

std::vector<binder::expression_vector> SchemaUtils::getExpressionsPerGroup(
const binder::expression_vector& expressions, const Schema& schema) {
std::vector<binder::expression_vector> result;
Expand Down
92 changes: 58 additions & 34 deletions src/planner/subplans_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,65 @@
namespace kuzu {
namespace planner {

// Collects the internal ID property of every query node selected in subqueryGraph, in node
// position order. These expressions are what encodePlan() looks up in a plan's schema.
SubgraphPlans::SubgraphPlans(const kuzu::binder::SubqueryGraph& subqueryGraph) {
    const auto& graph = subqueryGraph.queryGraph;
    for (auto nodePos = 0u; nodePos < graph.getNumQueryNodes(); ++nodePos) {
        if (!subqueryGraph.queryNodesSelector[nodePos]) {
            continue; // node is not part of this subgraph
        }
        nodeIDsToEncode.push_back(graph.getQueryNode(nodePos)->getInternalIDProperty());
    }
}

// Offers a plan to this subgraph's plan set.
//
// Plans are keyed by their encoding (see encodePlan). If a plan with the same encoding is already
// stored, the cheaper of the two is kept; this never grows the set, so it is allowed even when the
// set is full. A plan with a new encoding is appended only while fewer than MAX_NUM_PLANS plans
// are stored, so the set never holds more than MAX_NUM_PLANS plans.
//
// @param plan the candidate plan; ownership is taken, and the plan is dropped if not kept.
void SubgraphPlans::addPlan(std::unique_ptr<LogicalPlan> plan) {
    const auto planCode = encodePlan(*plan);
    // Single lookup instead of contains() followed by at().
    const auto existing = encodedPlan2PlanIdx.find(planCode);
    if (existing != encodedPlan2PlanIdx.end()) {
        auto& keptPlan = plans[existing->second];
        if (plan->getCost() < keptPlan->getCost()) {
            keptPlan = std::move(plan);
        }
        return;
    }
    // The cap only limits growth; `>=` keeps the stored count at or below MAX_NUM_PLANS.
    if (plans.size() >= MAX_NUM_PLANS) {
        return;
    }
    encodedPlan2PlanIdx.emplace(planCode, plans.size());
    plans.push_back(std::move(plan));
}

std::bitset<MAX_NUM_QUERY_VARIABLES> SubgraphPlans::encodePlan(const LogicalPlan& plan) {
auto schema = plan.getSchema();
std::bitset<MAX_NUM_QUERY_VARIABLES> result;
result.reset();
for (auto i = 0; i < nodeIDsToEncode.size(); ++i) {
result[i] = schema->getGroup(schema->getGroupPos(*nodeIDsToEncode[i]))->isFlat();
}
return result;
}

std::vector<SubqueryGraph> DPLevel::getSubqueryGraphs() {
std::vector<SubqueryGraph> result;
for (auto& [subGraph, _] : subgraph2Plans) {
result.push_back(subGraph);
}
return result;
}

// Registers a plan for subqueryGraph at this level.
//
// A subgraph seen for the first time gets a fresh SubgraphPlans, but only while fewer than
// MAX_NUM_SUBGRAPH subgraphs are tracked; beyond that, plans for new subgraphs are dropped so the
// level never holds more than MAX_NUM_SUBGRAPH subgraphs. Plans for subgraphs that are already
// tracked are always forwarded, since they do not grow the level.
//
// @param subqueryGraph the subgraph the plan covers.
// @param plan the candidate plan; ownership is taken, and the plan is dropped if not kept.
void DPLevel::addPlan(
    const kuzu::binder::SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
    // Single lookup instead of contains() + insert() + at().
    auto entry = subgraph2Plans.find(subqueryGraph);
    if (entry == subgraph2Plans.end()) {
        // The cap only limits the number of distinct subgraphs.
        if (subgraph2Plans.size() >= MAX_NUM_SUBGRAPH) {
            return;
        }
        entry =
            subgraph2Plans.emplace(subqueryGraph, std::make_unique<SubgraphPlans>(subqueryGraph))
                .first;
    }
    entry->second->addPlan(std::move(plan));
}

// Resizes the vector of DP levels to newSize and creates a level object for every slot past the
// previous size.
// NOTE(review): the loop body shows two assignments with different level types; this looks like
// the before/after lines of the change as rendered by the diff view — confirm against the file.
void SubPlansTable::resize(uint32_t newSize) {
auto prevSize = dpLevels.size();
dpLevels.resize(newSize);
// Only slots in [prevSize, newSize) are new and still hold null pointers.
for (auto i = prevSize; i < newSize; ++i) {
dpLevels[i] = std::make_unique<dp_level_t>();
dpLevels[i] = std::make_unique<DPLevel>();
}
}

Expand All @@ -21,26 +75,16 @@ std::vector<std::unique_ptr<LogicalPlan>>& SubPlansTable::getSubgraphPlans(
const SubqueryGraph& subqueryGraph) {
auto dpLevel = getDPLevel(subqueryGraph);
KU_ASSERT(dpLevel->contains(subqueryGraph));
return dpLevel->at(subqueryGraph)->getPlans();
return dpLevel->getSubgraphPlans(subqueryGraph)->getPlans();
}

// Returns the subquery graphs stored at the given DP level. `level` indexes dpLevels directly and
// is not range-checked here.
// NOTE(review): the body contains both a manual collection loop and a delegating return; this
// looks like the before/after lines of the change as rendered by the diff view — confirm.
std::vector<SubqueryGraph> SubPlansTable::getSubqueryGraphs(uint32_t level) {
std::vector<SubqueryGraph> result;
for (auto& [subGraph, _] : *dpLevels[level]) {
result.push_back(subGraph);
}
return result;
return dpLevels[level]->getSubqueryGraphs();
}

// Adds plan for subqueryGraph to the DP level returned by getDPLevel(subqueryGraph).
// NOTE(review): the body contains both an inline cap/insert sequence and a delegating call to the
// level's addPlan; this looks like the before/after lines of the change as rendered by the diff
// view — confirm against the file.
void SubPlansTable::addPlan(const SubqueryGraph& subqueryGraph, std::unique_ptr<LogicalPlan> plan) {
auto dpLevel = getDPLevel(subqueryGraph);
// Stop accepting plans once the level holds more than the per-level subgraph limit.
if (dpLevel->size() > MAX_NUM_SUBGRAPHS_PER_LEVEL) {
return;
}
// First plan for this subgraph: create its plan set.
if (!dpLevel->contains(subqueryGraph)) {
dpLevel->emplace(subqueryGraph, std::make_unique<PlanSet>());
}
dpLevel->at(subqueryGraph)->addPlan(std::move(plan));
dpLevel->addPlan(subqueryGraph, std::move(plan));
}

void SubPlansTable::clear() {
Expand All @@ -49,25 +93,5 @@ void SubPlansTable::clear() {
}
}

// Offers a plan to this plan set, which keeps one plan per schema key in schemaToPlanIdx.
// Nothing is added once MAX_NUM_PLANS_PER_SUBGRAPH plans are stored.
void SubPlansTable::PlanSet::addPlan(std::unique_ptr<LogicalPlan> plan) {
if (plans.size() >= MAX_NUM_PLANS_PER_SUBGRAPH) {
return;
}
auto schema = plan->getSchema();
if (!schemaToPlanIdx.contains(schema)) { // add plan if this is a new factorization schema
schemaToPlanIdx.insert({schema, plans.size()});
plans.push_back(std::move(plan));
} else { // swap plan for lower cost under the same factorization schema
auto idx = schemaToPlanIdx.at(schema);
assert(idx < MAX_NUM_PLANS_PER_SUBGRAPH);
auto currentPlan = plans[idx].get();
if (currentPlan->getCost() > plan->getCost()) {
plans[idx] = std::move(plan);
// Re-key the entry with the new plan's schema pointer. Presumably the old key pointed into
// the plan that was just replaced — confirm who owns the Schema.
// NOTE(review): if the replaced plan owned the old key's Schema, the erase below compares
// against a pointer that was freed by the assignment above; erasing before the move would
// avoid that. Verify ownership before reordering.
schemaToPlanIdx.erase(schema);
schemaToPlanIdx.insert({schema, idx});
}
}
}

} // namespace planner
} // namespace kuzu