greenplumn CJoinOrderDPv2 源码
greenplumn CJoinOrderDPv2 代码
文件路径:/src/backend/gporca/libgpopt/include/gpopt/xforms/CJoinOrderDPv2.h
//---------------------------------------------------------------------------
// Greenplum Database
// Copyright (C) 2019 VMware, Inc. or its affiliates.
//
// @filename:
// CJoinOrderDPv2.h
//
// @doc:
// Dynamic programming-based join order generation
//---------------------------------------------------------------------------
#ifndef GPOPT_CJoinOrderDPv2_H
#define GPOPT_CJoinOrderDPv2_H
#include "gpos/base.h"
#include "gpos/common/CBitSet.h"
#include "gpos/common/CHashMap.h"
#include "gpos/common/DbgPrintMixin.h"
#include "gpos/io/IOstream.h"
#include "gpopt/base/CKHeap.h"
#include "gpopt/base/CUtils.h"
#include "gpopt/operators/CExpression.h"
#include "gpopt/xforms/CJoinOrder.h"
namespace gpopt
{
using namespace gpos;
//---------------------------------------------------------------------------
// @class:
// CJoinOrderDPv2
//
// @doc:
// Helper class for creating join orders using dynamic programming
//
// Some terminology:
//
// NIJ: Non-inner join. This is sometimes used instead of left join,
// since we anticipate to extend this code to semi-joins and other
// types of joins.
// Atom: A child of the NAry join. This could be a table or some
// other operator like a groupby or a full outer join.
// Group: A set of atoms (called a "component" in DPv1). Once we generate
// a result expression, each of these sets will be associated
// with a CGroup in MEMO.
//---------------------------------------------------------------------------
class CJoinOrderDPv2 : public CJoinOrder,
public gpos::DbgPrintMixin<CJoinOrderDPv2>
{
private:
// Data structures for DPv2 join enumeration:
//
// Each level l is the set of l-way joins we are considering.
// Level 1 describes the "atoms" the leaves of the original NAry join we are transforming.
//
// Each level consists of a set (an array) of "groups". A group represents what may eventually
// become a group in the MEMO structure, if it is a part of one of the generated top k
// expressions at the top level. It is a set of atoms to be joined (in any order).
// The SGroupInfo struct contains the bitset representing the atoms, the cardinality from the
// derived statistics and an array of SExpressionInfo structs.
//
// Each SExpressionInfo struct describes an expression in a group. Besides a CExpression,
// it also has the SGroupInfo and SExpressionInfo of the children. Each SExpressionInfo
// also has a property. The SExpressionInfo entries in a group all have different properties.
// We only keep expressions that are not dominated by another expression, meaning that there
// is no other expression that can produce a superset of the properties for a less or equal
// cost.
//
//
// SLevelInfo
// +---------------------+
// Level n: | SGroupInfo array ---+----> SGroupInfo
// | optional top k | +------------------+
// +---------------------+ | Atoms (bitset) |
// | cardinality |
// | SExpressionInfo |
// | array |
// +--------+---------+
// v
// SExpressionInfo
// +------------------------+
// | CExpression |
// | child SExpressionInfos |
// | properties |
// +------------------------+
// +------------------------+
// | CExpression |
// | child SExpressionInfos |
// | properties |
// +------------------------+
// ...
// +------------------------+
// | CExpression |
// | child SExpressionInfos |
// | properties |
// +------------------------+
// ...
//
// SLevelInfo
// +---------------------+
// Level 1: | SGroupInfo array ---+----> SGroupInfo SGroupInfo
// | optional top k | +------------------+ +------------------+
// +---------------------+ | Atoms (bitset) | | Atoms (bitset) |
// | cardinality +--------------+ cardinality |
// | ExpressionInfo | | ExpressionInfo |
// | array | | array |
// +--------+---------+ +--------+---------+
// v v
// SExpressionInfo SExpressionInfo
// +------------------------+ +------------------------+
// | CExpression | | CExpression |
// | child SExpressionInfos | | child SExpressionInfos |
// | properties | | properties |
// +------------------------+ +------------------------+
//
// forward declarations, circular reference
struct SGroupInfo;
struct SExpressionInfo;
// Join enumeration algorithm properties, these can be added if an expression satisfies more than one
// consider these as constants, not as a true enum
// note that the numbers (other than the first) must be powers of 2,
// since we add them to make composite properties!!!
// Note also that query, mincard and GreedyAvoidXProd are all greedy algorithms.
// Sorry for the confusion with the term "greedy" used in the optimizer_join_order guc
// and the CXformExpandNAryJoinGreedy classes, where they refer to one type of greedy
// algorithm that avoids cross products.
enum JoinOrderPropType
{
EJoinOrderAny = 0, // the overall best solution (used for exhaustive2)
EJoinOrderQuery = 1, // this expression uses the "query" join order
EJoinOrderMincard = 2, // this expression has the "mincard" property
EJoinOrderGreedyAvoidXProd =
4, // best "greedy" expression with minimal cross products
EJoinOrderHasPS =
8, // best expression with special consideration for DPE
EJoinOrderDP = 16, // best solution using DP
EJoinOrderStats =
32 // this expression is used to calculate the statistics
// (row count) for the group
};
// properties of an expression in the DP structure (also used as required properties)
struct SExpressionProperties
{
// the join order enumeration algorithm for which this is a solution
// (exhaustive enumeration, can use any of these: EJoinOrderAny)
ULONG m_join_order;
SExpressionProperties(ULONG join_order_properties)
: m_join_order(join_order_properties)
{
}
BOOL
Satisfies(ULONG pt) const
{
return pt == (m_join_order & pt);
}
void
Add(const SExpressionProperties &p)
{
m_join_order |= p.m_join_order;
}
BOOL
IsGreedy() const
{
return 0 != (m_join_order & (EJoinOrderQuery + EJoinOrderMincard +
EJoinOrderGreedyAvoidXProd));
}
};
// a simple wrapper of an SGroupInfo * plus an index into its array of SExpressionInfos
// this identifies a group and one expression belonging to that group
struct SGroupAndExpression
{
SGroupInfo *m_group_info{nullptr};
ULONG m_expr_index{gpos::ulong_max};
SGroupAndExpression() = default;
SGroupAndExpression(SGroupInfo *g, ULONG ix)
: m_group_info(g), m_expr_index(ix)
{
}
SExpressionInfo *
GetExprInfo() const
{
return m_expr_index == gpos::ulong_max
? nullptr
: (*m_group_info->m_best_expr_info_array)[m_expr_index];
}
BOOL
IsValid() const
{
return nullptr != m_group_info && gpos::ulong_max != m_expr_index;
}
BOOL
operator==(const SGroupAndExpression &other) const
{
return m_group_info == other.m_group_info &&
m_expr_index == other.m_expr_index;
}
};
// description of an expression in the DP environment
// left and right child of join expressions point to
// child groups + expressions
struct SExpressionInfo : public CRefCount
{
// the expression
CExpression *m_expr;
// left/right child group/expr info (group for left/right child of m_expr),
// we do not keep a refcount for these
SGroupAndExpression m_left_child_expr;
SGroupAndExpression m_right_child_expr;
// derived properties of this expression
SExpressionProperties m_properties;
// in the future, we may add more properties relevant to the cost here,
// like distribution spec, partition selectors
// Stores part keys for atoms that are partitioned tables. NULL otherwise.
CPartKeysArray *m_atom_part_keys_array;
// cost of the expression
CDouble m_cost;
//cost adjustment for the effect of partition selectors, this is always <= 0.0
CDouble m_cost_adj_PS;
// base table rows, -1 if not atom or get/select
CDouble m_atom_base_table_rows;
// stores atom ids that are fufilled by a PS in this expression
CBitSet *m_contain_PS;
SExpressionInfo(CMemoryPool *mp, CExpression *expr,
const SGroupAndExpression &left_child_expr_info,
const SGroupAndExpression &right_child_expr_info,
SExpressionProperties &properties)
: m_expr(expr),
m_left_child_expr(left_child_expr_info),
m_right_child_expr(right_child_expr_info),
m_properties(properties),
m_atom_part_keys_array(nullptr),
m_cost(0.0),
m_cost_adj_PS(0.0),
m_atom_base_table_rows(-1.0),
m_contain_PS(nullptr)
{
m_contain_PS = GPOS_NEW(mp) CBitSet(mp);
this->UnionPSProperties(left_child_expr_info.GetExprInfo());
this->UnionPSProperties(right_child_expr_info.GetExprInfo());
}
SExpressionInfo(CMemoryPool *mp, CExpression *expr,
SExpressionProperties &properties)
: m_expr(expr),
m_properties(properties),
m_atom_part_keys_array(nullptr),
m_cost(0.0),
m_cost_adj_PS(0.0),
m_atom_base_table_rows(-1.0),
m_contain_PS(nullptr)
{
m_contain_PS = GPOS_NEW(mp) CBitSet(mp);
}
~SExpressionInfo() override
{
m_expr->Release();
CRefCount::SafeRelease(m_contain_PS);
}
// cost (use -1 for greedy solutions to ensure we keep all of them)
CDouble
GetCostForHeap() const
{
return m_properties.IsGreedy() ? -1.0 : GetCost();
}
CDouble
GetCost() const
{
return m_cost + m_cost_adj_PS;
}
void
UnionPSProperties(SExpressionInfo *other) const
{
m_contain_PS->Union(other->m_contain_PS);
}
BOOL
ChildrenAreEqual(const SExpressionInfo &other) const
{
return m_left_child_expr == other.m_left_child_expr &&
m_right_child_expr == other.m_right_child_expr;
}
};
using SExpressionInfoArray =
CDynamicPtrArray<SExpressionInfo, CleanupRelease<SExpressionInfo>>;
//---------------------------------------------------------------------------
// @struct:
// SGroupInfo
//
// @doc:
// Struct containing a bitset, representing a group, its best expression, and cost
//
//---------------------------------------------------------------------------
struct SGroupInfo : public CRefCount
{
// the set of atoms, this uniquely identifies the group
CBitSet *m_atoms;
// infos of the best (lowest cost) expressions (so far, if at the current level)
// for each interesting property
SExpressionInfoArray *m_best_expr_info_array;
CDouble m_cardinality;
CDouble m_lowest_expr_cost;
SGroupInfo(CMemoryPool *mp, CBitSet *atoms)
: m_atoms(atoms), m_cardinality(-1.0), m_lowest_expr_cost(-1.0)
{
m_best_expr_info_array = GPOS_NEW(mp) SExpressionInfoArray(mp);
}
~SGroupInfo() override
{
m_atoms->Release();
m_best_expr_info_array->Release();
}
BOOL
IsAnAtom() const
{
return 1 == m_atoms->Size();
}
CDouble
GetCostForHeap() const
{
return m_lowest_expr_cost;
}
};
// dynamic array of SGroupInfo, where each index represents an alternative group of a given level k
using SGroupInfoArray =
CDynamicPtrArray<SGroupInfo, CleanupRelease<SGroupInfo>>;
// info for a join level, the set of all groups representing <m_level>-way joins
struct SLevelInfo : public CRefCount
{
ULONG m_level;
SGroupInfoArray *m_groups;
CKHeap<SGroupInfoArray, SGroupInfo> *m_top_k_groups;
SLevelInfo(ULONG level, SGroupInfoArray *groups)
: m_level(level), m_groups(groups), m_top_k_groups(nullptr)
{
}
~SLevelInfo() override
{
m_groups->Release();
CRefCount::SafeRelease(m_top_k_groups);
}
};
// hashing function
static ULONG
UlHashBitSet(const CBitSet *pbs)
{
GPOS_ASSERT(nullptr != pbs);
return pbs->HashValue();
}
// equality function
static BOOL
FEqualBitSet(const CBitSet *pbsFst, const CBitSet *pbsSnd)
{
GPOS_ASSERT(nullptr != pbsFst);
GPOS_ASSERT(nullptr != pbsSnd);
return pbsFst->Equals(pbsSnd);
}
using ExpressionToEdgeMap =
CHashMap<CExpression, SEdge, CExpression::HashValue, CUtils::Equals,
CleanupRelease<CExpression>, CleanupRelease<SEdge>>;
// dynamic array of SGroupInfos
using BitSetToGroupInfoMap =
CHashMap<CBitSet, SGroupInfo, UlHashBitSet, FEqualBitSet,
CleanupRelease<CBitSet>, CleanupRelease<SGroupInfo>>;
// iterator over group infos in a level
using BitSetToGroupInfoMapIter =
CHashMapIter<CBitSet, SGroupInfo, UlHashBitSet, FEqualBitSet,
CleanupRelease<CBitSet>, CleanupRelease<SGroupInfo>>;
// dynamic array of SLevelInfos, where each index represents the level
using DPv2Levels = CDynamicPtrArray<SLevelInfo, CleanupRelease<SLevelInfo>>;
// an array of an array of groups, organized by level at the first array dimension,
// main data structure for dynamic programming
DPv2Levels *m_join_levels;
// map to find the associated edge in the join graph from a join predicate
ExpressionToEdgeMap *m_expression_to_edge_map;
// map to check whether a DPv2 group already exists
BitSetToGroupInfoMap *m_bitset_to_group_info_map;
// ON predicates for NIJs (non-inner joins, e.g. LOJs)
// currently NIJs are LOJs only, this may change in the future
// if/when we add semijoins, anti-semijoins and relatives
CExpressionArray *m_on_pred_conjuncts;
// association between logical children and inner join/ON preds
// (which of the logical children are right children of NIJs and what ON predicates are they using)
ULongPtrArray *m_child_pred_indexes;
// for each non-inner join (entry in m_on_pred_conjuncts), the required atoms on the left
CBitSetArray *m_non_inner_join_dependencies;
// top K expressions at the top level
CKHeap<SExpressionInfoArray, SExpressionInfo> *m_top_k_expressions;
// top K expressions at top level that contain promising dynamic partiion selectors
// if there are no promising dynamic partition selectors, this will be empty
CKHeap<SExpressionInfoArray, SExpressionInfo> *m_top_k_part_expressions;
// current penalty for cross products (depends on enumeration algorithm)
CDouble m_cross_prod_penalty;
// outer references, if any
CColRefSet *m_outer_refs;
CMemoryPool *m_mp;
SLevelInfo *
Level(ULONG l)
{
return (*m_join_levels)[l];
}
// build expression linking given groups
CExpression *PexprBuildInnerJoinPred(CBitSet *pbsFst, CBitSet *pbsSnd);
// compute cost of a join expression in a group
void ComputeCost(SExpressionInfo *expr_info, CDouble join_cardinality);
// if we need to keep track of used edges, make a map that
// speeds up this usage check
void PopulateExpressionToEdgeMapIfNeeded();
// add a select node with any remaining edges (predicates) that have
// not been incorporated in the join tree
CExpression *AddSelectNodeForRemainingEdges(CExpression *join_expr);
// mark all the edges used in a join tree
void RecursivelyMarkEdgesAsUsed(CExpression *expr);
// enumerate all possible joins between left_level-way joins on the left side
// and right_level-way joins on the right side, resulting in left_level + right_level-way joins
void SearchJoinOrders(ULONG left_level, ULONG right_level);
void GreedySearchJoinOrders(ULONG left_level, JoinOrderPropType algo);
void DeriveStats(CExpression *pexpr) override;
// create a CLogicalJoin and a CExpression to join two groups, for a required property
SExpressionInfo *GetJoinExprForProperties(
SGroupInfo *left_child, SGroupInfo *right_child,
SExpressionProperties &required_properties);
// get a join expression from two child groups with specified child expressions
SExpressionInfo *GetJoinExpr(const SGroupAndExpression &left_child_expr,
const SGroupAndExpression &right_child_expr,
SExpressionProperties &result_properties);
// does "prop" provide all the properties of "other_prop" plus maybe more?
static BOOL IsASupersetOfProperties(SExpressionProperties &prop,
SExpressionProperties &other_prop);
// is one of the properties a subset of the other or are they disjoint?
static BOOL ArePropertiesDisjoint(SExpressionProperties &prop,
SExpressionProperties &other_prop);
// get best expression in a group for a given set of properties
static SGroupAndExpression GetBestExprForProperties(
SGroupInfo *group_info, SExpressionProperties &props);
// add a new property to an existing predicate
static void AddNewPropertyToExpr(SExpressionInfo *expr_info,
SExpressionProperties props);
// enumerate bushy joins (joins where both children are also joins) of level "current_level"
void SearchBushyJoinOrders(ULONG current_level);
// look up an existing group or create a new one, with an expression to be used for stats
SGroupInfo *LookupOrCreateGroupInfo(SLevelInfo *levelInfo, CBitSet *atoms,
SExpressionInfo *stats_expr_info);
// add a new expression to a group, unless there already is an existing expression that dominates it
void AddExprToGroupIfNecessary(SGroupInfo *group_info,
SExpressionInfo *new_expr_info);
void PopulateDPEInfo(SExpressionInfo *join_expr_info,
SGroupInfo *left_group_info,
SGroupInfo *right_group_info);
void FinalizeDPLevel(ULONG level);
SGroupInfoArray *
GetGroupsForLevel(ULONG level) const
{
return (*m_join_levels)[level]->m_groups;
}
ULONG FindLogicalChildByNijId(ULONG nij_num);
static ULONG NChooseK(ULONG n, ULONG k);
BOOL LevelIsFull(ULONG level);
void EnumerateDP();
void EnumerateQuery();
void FindLowestCardTwoWayJoin(JoinOrderPropType prop_type);
void EnumerateMinCard();
void EnumerateGreedyAvoidXProd();
public:
// ctor
CJoinOrderDPv2(CMemoryPool *mp, CExpressionArray *pdrgpexprAtoms,
CExpressionArray *innerJoinConjuncts,
CExpressionArray *onPredConjuncts,
ULongPtrArray *childPredIndexes, CColRefSet *outerRefs);
// dtor
~CJoinOrderDPv2() override;
// main handler
virtual void PexprExpand();
CExpression *GetNextOfTopK();
// check for NIJs
BOOL IsRightChildOfNIJ(SGroupInfo *groupInfo,
CExpression **onPredToUse = nullptr,
CBitSet **requiredBitsOnLeft = nullptr);
// print function
IOstream &OsPrint(IOstream &) const;
static IOstream &OsPrintProperty(IOstream &, SExpressionProperties &);
CXform::EXformId
EOriginXForm() const override
{
return CXform::ExfExpandNAryJoinDPv2;
}
}; // class CJoinOrderDPv2
} // namespace gpopt
#endif // !GPOPT_CJoinOrderDPv2_H
// EOF
相关信息
相关文章
greenplumn CJoinOrderGreedy 源码
greenplumn CJoinOrderMinCard 源码
greenplumn CSubqueryHandler 源码
greenplumn CXformAntiSemiJoinAntiSemiJoinNotInSwap 源码
0
赞
热门推荐
-
2、 - 优质文章
-
3、 gate.io
-
8、 golang
-
9、 openharmony
-
10、 Vue中input框自动聚焦