PostgreSQL 源码解读（91）

编程入门行业动态更新时间:2024-10-24 00:21:41

PostgreSQL <a href=https://www.elefans.com/category/jswz/34/1770099.html style= 源码解读（91）"/>

PostgreSQL 源码解读（91）

本节是ExecHashJoin函数介绍的第二部分，主要介绍了ExecHashJoin中依赖的其他函数的实现逻辑，包括ExecHashTableCreate、ExecChooseHashTableSize等。

一、数据结构

Plan
所有计划节点通过将Plan结构作为第一个字段从Plan结构“派生”。这确保了在将节点转换为计划节点时，一切都能正常工作。(在执行器中以通用方式传递时，节点指针经常被转换为Plan *)

/* ----------------*      Plan node** All plan nodes "derive" from the Plan structure by having the* Plan structure as the first field.  This ensures that everything works* when nodes are cast to Plan's.  (node pointers are frequently cast to Plan** when passed around generically in the executor)* 所有计划节点通过将Plan结构作为第一个字段从Plan结构“派生”。* 这确保了在将节点转换为计划节点时，一切都能正常工作。* (在执行器中以通用方式传递时，节点指针经常被转换为Plan *)** We never actually instantiate any Plan nodes; this is just the common* abstract superclass for all Plan-type nodes.* 从未实例化任何Plan节点;这只是所有Plan-type节点的通用抽象超类。* ----------------*/
typedef struct Plan
{NodeTag     type;//节点类型/** 成本估算信息;estimated execution costs for plan (see costsize.c for more info)*/Cost        startup_cost;   /* 启动成本;cost expended before fetching any tuples */Cost        total_cost;     /* 总成本;total cost (assuming all tuples fetched) *//** 优化器估算信息;planner's estimate of result size of this plan step*/double      plan_rows;      /* 行数;number of rows plan is expected to emit */int         plan_width;     /* 平均行大小(Byte为单位);average row width in bytes *//** 并行执行相关的信息;information needed for parallel query*/bool        parallel_aware; /* 是否参与并行执行逻辑?engage parallel-aware logic? */bool        parallel_safe;  /* 是否并行安全;OK to use as part of parallel plan? *//** Plan类型节点通用的信息.Common structural data for all Plan types.*/int         plan_node_id;   /* unique across entire final plan tree */List       *targetlist;     /* target list to be computed at this node */List       *qual;           /* implicitly-ANDed qual conditions */struct Plan *lefttree;      /* input plan tree(s) */struct Plan *righttree;List       *initPlan;       /* Init Plan nodes (un-correlated expr* subselects) *//** Information for management of parameter-change-driven rescanning* parameter-change-driven重扫描的管理信息.* * extParam includes the paramIDs of all external PARAM_EXEC params* affecting this plan node or its children.  setParam params from the* node's initPlans are not included, but their extParams are.** allParam includes all the extParam paramIDs, plus the IDs of local* params that affect the node (i.e., the setParams of its initplans).* These are _all_ the PARAM_EXEC params that affect this node.*/Bitmapset  *extParam;Bitmapset  *allParam;
} Plan;

JoinState
Hash/NestLoop/Merge Join的基类

/* ----------------*   JoinState information**      Superclass for state nodes of join plans.*      Hash/NestLoop/Merge Join的基类* ----------------*/
typedef struct JoinState
{PlanState   ps;//基类PlanStateJoinType    jointype;//连接类型//在找到一个匹配inner tuple的时候,如需要跳转到下一个outer tuple,则该值为Tbool        single_match;   /* True if we should skip to next outer tuple* after finding one inner match *///连接条件表达式(除了ps.qual)ExprState  *joinqual;       /* JOIN quals (in addition to ps.qual) */
} JoinState;

HashJoinState
Hash Join运行期状态结构体

/* these structs are defined in executor/hashjoin.h: */
typedef struct HashJoinTupleData *HashJoinTuple;
typedef struct HashJoinTableData *HashJoinTable;typedef struct HashJoinState
{JoinState   js;             /* 基类;its first field is NodeTag */ExprState  *hashclauses;//hash连接条件List       *hj_OuterHashKeys;   /* 外表条件链表;list of ExprState nodes */List       *hj_InnerHashKeys;   /* 内表连接条件;list of ExprState nodes */List       *hj_HashOperators;   /* 操作符OIDs链表;list of operator OIDs */HashJoinTable hj_HashTable;//Hash表uint32      hj_CurHashValue;//当前的Hash值int         hj_CurBucketNo;//当前的bucket编号int         hj_CurSkewBucketNo;//行倾斜bucket编号HashJoinTuple hj_CurTuple;//当前元组TupleTableSlot *hj_OuterTupleSlot;//outer relation slotTupleTableSlot *hj_HashTupleSlot;//Hash tuple slotTupleTableSlot *hj_NullOuterTupleSlot;//用于外连接的outer虚拟slotTupleTableSlot *hj_NullInnerTupleSlot;//用于外连接的inner虚拟slotTupleTableSlot *hj_FirstOuterTupleSlot;//int         hj_JoinState;//JoinState状态bool        hj_MatchedOuter;//是否匹配bool        hj_OuterNotEmpty;//outer relation是否为空
} HashJoinState;

HashJoinTable
Hash表数据结构

typedef struct HashJoinTableData
{int         nbuckets;       /* 内存中的hash桶数;# buckets in the in-memory hash table */int         log2_nbuckets;  /* 2的对数(nbuckets必须是2的幂);its log2 (nbuckets must be a power of 2) */int         nbuckets_original;  /* 首次hash时的桶数;# buckets when starting the first hash */int         nbuckets_optimal;   /* 优化后的桶数(每个批次);optimal # buckets (per batch) */int         log2_nbuckets_optimal;  /* 2的对数;log2(nbuckets_optimal) *//* buckets[i] is head of list of tuples in i'th in-memory bucket *///bucket [i]是内存中第i个桶中的元组链表的head itemunion{/* unshared array is per-batch storage, as are all the tuples *///未共享数组是按批处理存储的，所有元组均如此struct HashJoinTupleData **unshared;/* shared array is per-query DSA area, as are all the tuples *///共享数组是每个查询的DSA区域，所有元组均如此dsa_pointer_atomic *shared;}           buckets;bool        keepNulls;      /*如不匹配则存储NULL元组,该值为T;true to store unmatchable NULL tuples */bool        skewEnabled;    /*是否使用倾斜优化?;are we using skew optimization? */HashSkewBucket **skewBucket;    /* 倾斜的hash表桶数;hashtable of skew buckets */int         skewBucketLen;  /* skewBucket数组大小;size of skewBucket array (a power of 2!) */int         nSkewBuckets;   /* 活动的倾斜桶数;number of active skew buckets */int        *skewBucketNums; /* 活动倾斜桶数组索引;array indexes of active skew buckets */int         nbatch;         /* 批次数;number of batches */int         curbatch;       /* 当前批次,第一轮为0;current batch #; 0 during 1st pass */int         nbatch_original;    /* 在开始inner扫描时的批次;nbatch when we started inner scan */int         nbatch_outstart;    /* 在开始outer扫描时的批次;nbatch when we started outer scan */bool        growEnabled;    /* 关闭nbatch增加的标记;flag to shut off nbatch increases */double      totalTuples;    /* 从inner plan获得的元组数;# tuples obtained from inner plan */double      partialTuples;  /* 通过hashjoin获得的inner元组数;# tuples obtained from inner plan by me */double      skewTuples;     /* 倾斜元组数;# tuples inserted into skew tuples *//** These arrays are allocated for the life of the hash join, but only if* nbatch > 1.  A file is opened only when we first write a tuple into it* (otherwise its pointer remains NULL).  Note that the zero'th array* elements never get used, since we will process rather than dump out any* tuples of batch zero.* 这些数组在散列连接的生命周期内分配，但仅当nbatch > 1时分配。* 只有当第一次将元组写入文件时，文件才会打开(否则它的指针将保持NULL)。* 注意，第0个数组元素永远不会被使用，因为批次0的元组永远不会转储.*/BufFile   **innerBatchFile; /* 每个批次的inner虚拟临时文件缓存;buffered virtual temp file per batch */BufFile   **outerBatchFile; /* 每个批次的outer虚拟临时文件缓存;buffered virtual temp file per batch *//** Info about the datatype-specific hash functions for the datatypes being* hashed. These are arrays of the same length as the number of hash join* clauses (hash keys).* 有关正在散列的数据类型的特定于数据类型的散列函数的信息。* 这些数组的长度与散列连接子句(散列键)的数量相同。*/FmgrInfo   *outer_hashfunctions;    /* outer hash函数FmgrInfo结构体;lookup data for hash functions */FmgrInfo   *inner_hashfunctions;    /* inner hash函数FmgrInfo结构体;lookup data for hash functions */bool       *hashStrict;     /* 每个hash操作符是严格?is each hash join operator strict? */Size        spaceUsed;      /* 元组使用的当前内存空间大小;memory space currently used by tuples */Size        spaceAllowed;   /* 空间使用上限;upper limit for space used */Size        spacePeak;      /* 峰值的空间使用;peak space used */Size        spaceUsedSkew;  /* 倾斜哈希表的当前空间使用情况;skew hash table's current space usage */Size        spaceAllowedSkew;   /* 倾斜哈希表的使用上限;upper limit for skew hashtable */MemoryContext hashCxt;      /* 整个散列连接存储的上下文;context for whole-hash-join storage */MemoryContext batchCxt;     /* 该批次存储的上下文;context for this-batch-only storage *//* used for dense allocation of tuples (into linked chunks) *///用于密集分配元组(到链接块中)HashMemoryChunk chunks;     /* 整个批次使用一个链表;one list for the whole batch *//* Shared and private state for Parallel Hash. *///并行hash使用的共享和私有状态HashMemoryChunk current_chunk;  /* 后台进程的当前chunk;this backend's current chunk */dsa_area   *area;           /* 用于分配内存的DSA区域;DSA area to allocate memory from */ParallelHashJoinState *parallel_state;//并行执行状态ParallelHashJoinBatchAccessor *batches;//并行访问器dsa_pointer current_chunk_shared;//当前chunk的开始指针
} HashJoinTableData;typedef struct HashJoinTableData *HashJoinTable;

HashJoinTupleData
Hash连接元组数据

/* ----------------------------------------------------------------*              hash-join hash table structures** Each active hashjoin has a HashJoinTable control block, which is* palloc'd in the executor's per-query context.  All other storage needed* for the hashjoin is kept in private memory contexts, two for each hashjoin.* This makes it easy and fast to release the storage when we don't need it* anymore.  (Exception: data associated with the temp files lives in the* per-query context too, since we always call buffile.c in that context.)* 每个活动的hashjoin都有一个可散列的控制块，它在执行程序的每个查询上下文中都是通过palloc分配的。* hashjoin所需的所有其他存储都保存在私有内存上下文中，每个hashjoin有两个。* 当不再需要它的时候，这使得释放它变得简单和快速。* (例外:与临时文件相关的数据也存在于每个查询上下文中，因为在这种情况下总是调用buffile.c。)** The hashtable contexts are made children of the per-query context, ensuring* that they will be discarded at end of statement even if the join is* aborted early by an error.  (Likewise, any temporary files we make will* be cleaned up by the virtual file manager in event of an error.)* hashtable上下文是每个查询上下文的子上下文，确保在语句结束时丢弃它们，即使连接因错误而提前中止。*   (同样，如果出现错误，虚拟文件管理器将清理创建的任何临时文件。)** Storage that should live through the entire join is allocated from the* "hashCxt", while storage that is only wanted for the current batch is* allocated in the "batchCxt".  By resetting the batchCxt at the end of* each batch, we free all the per-batch storage reliably and without tedium.* 通过整个连接的存储空间应从“hashCxt”分配，而只需要当前批处理的存储空间在“batchCxt”中分配。* 通过在每个批处理结束时重置batchCxt，可以可靠地释放每个批处理的所有存储，而不会感到单调乏味。* * During first scan of inner relation, we get its tuples from executor.* If nbatch > 1 then tuples that don't belong in first batch get saved* into inner-batch temp files. The same statements apply for the* first scan of the outer relation, except we write tuples to outer-batch* temp files.  After finishing the first scan, we do the following for* each remaining batch:*  1. Read tuples from inner batch file, load into hash buckets.*  2. Read tuples from outer batch file, match to hash buckets and output.* 在内部关系的第一次扫描中，从执行者那里得到了它的元组。* 如果nbatch > 1，那么不属于第一批的元组将保存到批内临时文件中。* 相同的语句适用于外关系的第一次扫描，但是我们将元组写入外部批处理临时文件。* 完成第一次扫描后，我们对每批剩余的元组做如下处理: * 1.从内部批处理文件读取元组，加载到散列桶中。* 2.从外部批处理文件读取元组，匹配哈希桶和输出。 ** It is possible to increase nbatch on the fly if the in-memory hash table* gets too big.  The hash-value-to-batch computation is arranged so that this* can only cause a tuple to go into a later batch than previously thought,* never into an earlier batch.  When we increase nbatch, we rescan the hash* table and dump out any tuples that are now of a later batch to the correct* inner batch file.  Subsequently, while reading either inner or outer batch* files, we might find tuples that no longer belong to the current batch;* if so, we just dump them out to the correct batch file.* 如果内存中的哈希表太大，可以动态增加nbatch。* 散列值到批处理的计算是这样安排的:*   这只会导致元组进入比以前认为的更晚的批处理，而不会进入更早的批处理。* 当增加nbatch时，重新扫描哈希表，并将现在属于后面批处理的任何元组转储到正确的内部批处理文件。* 随后，在读取内部或外部批处理文件时，可能会发现不再属于当前批处理的元组;*   如果是这样，只需将它们转储到正确的批处理文件即可。* ----------------------------------------------------------------*//* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */typedef struct HashJoinTupleData
{/* link to next tuple in same bucket *///link同一个桶中的下一个元组union{struct HashJoinTupleData *unshared;dsa_pointer shared;}           next;uint32      hashvalue;      /* 元组的hash值;tuple's hash code *//* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
}           HashJoinTupleData;#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup)  \((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))

二、源码解读

ExecHashTableCreate
ExecHashTableCreate函数初始化hashjoin需要使用的hashtable.

/*----------------------------------------------------------------------------------------------------HJ_BUILD_HASHTABLE 阶段
-----------------------------------------------------------------------------------------------------*//* ----------------*  these are defined to avoid confusion problems with "left"*  and "right" and "inner" and "outer".  The convention is that*  the "left" plan is the "outer" plan and the "right" plan is*  the inner plan, but these make the code more readable.*  这些定义是为了避免“左”和“右”以及“内”和“外”的混淆问题。*  约定是，“左”计划是“外部”计划，“右”计划是内部计划，但是这些计划使代码更具可读性。* ----------------*/
#define innerPlan(node)         (((Plan *)(node))->righttree)
#define outerPlan(node)         (((Plan *)(node))->lefttree)/* ----------------------------------------------------------------*      ExecHashTableCreate**      create an empty hashtable data structure for hashjoin.*      初始化hashjoin需要使用的hashtable.* ----------------------------------------------------------------*/
HashJoinTable
ExecHashTableCreate(HashState *state, List *hashOperators, bool keepNulls)
{Hash       *node;HashJoinTable hashtable;Plan       *outerNode;size_t      space_allowed;int         nbuckets;int         nbatch;double      rows;int         num_skew_mcvs;int         log2_nbuckets;int         nkeys;int         i;ListCell   *ho;MemoryContext oldcxt;/** Get information about the size of the relation to be hashed (it's the* "outer" subtree of this node, but the inner relation of the hashjoin).* Compute the appropriate size of the hash table.* 获取有关要散列的关系大小的信息(它是该节点的“outer”子树，hashjoin的inner relation)。* 计算哈希表的适当大小。*/node = (Hash *) state->ps.plan;//获取Hash节点outerNode = outerPlan(node);//获取outer relation Plan节点/** If this is shared hash table with a partial plan, then we can't use* outerNode->plan_rows to estimate its size.  We need an estimate of the* total number of rows across all copies of the partial plan.* 如果这是带有部分计划(并行处理)的共享哈希表，那么不能使用outerNode->plan_rows来估计它的大小。* 需要估算跨部分计划的所有副本的行总数。*/rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;//获取总行数ExecChooseHashTableSize(rows, outerNode->plan_width,OidIsValid(node->skewTable),state->parallel_state != NULL,state->parallel_state != NULL ?state->parallel_state->nparticipants - 1 : 0,&space_allowed,&nbuckets, &nbatch, &num_skew_mcvs);//计算Hash Table的大小尺寸/* nbuckets must be a power of 2 *///nbuckets(hash桶数)必须是2的n次方log2_nbuckets = my_log2(nbuckets);Assert(nbuckets == (1 << log2_nbuckets));/** Initialize the hash table control block.* 初始化hash表的控制块** The hashtable control block is just palloc'd from the executor's* per-query memory context.  Everything else should be kept inside the* subsidiary hashCxt or batchCxt.* hashtable控件块是从执行程序的每个查询内存上下文中调取的。* 其他内容都应该保存在附属hashCxt或batchCxt中。*/hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));//分配内存hashtable->nbuckets = nbuckets;//桶数hashtable->nbuckets_original = nbuckets;hashtable->nbuckets_optimal = nbuckets;hashtable->log2_nbuckets = log2_nbuckets;hashtable->log2_nbuckets_optimal = log2_nbuckets;hashtable->buckets.unshared = NULL;hashtable->keepNulls = keepNulls;hashtable->skewEnabled = false;hashtable->skewBucket = NULL;hashtable->skewBucketLen = 0;hashtable->nSkewBuckets = 0;hashtable->skewBucketNums = NULL;hashtable->nbatch = nbatch;hashtable->curbatch = 0;hashtable->nbatch_original = nbatch;hashtable->nbatch_outstart = nbatch;hashtable->growEnabled = true;hashtable->totalTuples = 0;hashtable->partialTuples = 0;hashtable->skewTuples = 0;hashtable->innerBatchFile = NULL;hashtable->outerBatchFile = NULL;hashtable->spaceUsed = 0;hashtable->spacePeak = 0;hashtable->spaceAllowed = space_allowed;hashtable->spaceUsedSkew = 0;hashtable->spaceAllowedSkew =hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;hashtable->chunks = NULL;hashtable->current_chunk = NULL;hashtable->parallel_state = state->parallel_state;hashtable->area = state->ps.state->es_query_dsa;hashtable->batches = NULL;#ifdef HJDEBUGprintf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",hashtable, nbatch, nbuckets);
#endif/** Create temporary memory contexts in which to keep the hashtable working* storage.  See notes in executor/hashjoin.h.* 创建临时内存上下文，以便在其中保持散列表的相关信息。* 参见executor/hashjoin.h中的注释。*/hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,"HashTableContext",ALLOCSET_DEFAULT_SIZES);hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,"HashBatchContext",ALLOCSET_DEFAULT_SIZES);/* Allocate data that will live for the life of the hashjoin *///分配内存,切换至hashCxtoldcxt = MemoryContextSwitchTo(hashtable->hashCxt);/** Get info about the hash functions to be used for each hash key. Also* remember whether the join operators are strict.* 获取关于每个散列键要使用的散列函数的信息。* 还要记住连接操作符是否严格。*/nkeys = list_length(hashOperators);//键值数hashtable->outer_hashfunctions =(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));//outer relation所使用的hash函数hashtable->inner_hashfunctions =(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));//inner relation所使用的hash函数hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));//是否严格的操作符i = 0;foreach(ho, hashOperators)//遍历hash操作符{Oid         hashop = lfirst_oid(ho);//hash操作符Oid         left_hashfn;//左函数Oid         right_hashfn;//右函数//获取与给定操作符兼容的标准哈希函数的OID，并根据需要对其LHS和/或RHS数据类型进行操作。if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))//获取hash函数elog(ERROR, "could not find hash function for hash operator %u",hashop);fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);hashtable->hashStrict[i] = op_strict(hashop);i++;}if (nbatch > 1 && hashtable->parallel_state == NULL)//批次>1而且并行状态为NULL{/** allocate and initialize the file arrays in hashCxt (not needed for* parallel case which uses shared tuplestores instead of raw files)* 在hashCxt中分配和初始化文件数组(对于使用共享tuplestore而不是原始文件的并行情况不需要)*/hashtable->innerBatchFile = (BufFile **)palloc0(nbatch * sizeof(BufFile *));//用于缓存该批次的inner relation的tuplehashtable->outerBatchFile = (BufFile **)palloc0(nbatch * sizeof(BufFile *));//用于缓存该批次的outerr relation的tuple/* The files will not be opened until needed... *//* ... but make sure we have temp tablespaces established for them *///这些文件需要时才会打开……//…但是要确保为它们建立了临时表空间PrepareTempTablespaces();}MemoryContextSwitchTo(oldcxt);//切换回原内存上下文if (hashtable->parallel_state)//并行处理{ParallelHashJoinState *pstate = hashtable->parallel_state;Barrier    *build_barrier;/** Attach to the build barrier.  The corresponding detach operation is* in ExecHashTableDetach.  Note that we won't attach to the* batch_barrier for batch 0 yet.  We'll attach later and start it out* in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front* and then loaded while hashing (the standard hybrid hash join* algorithm), and we'll coordinate that using build_barrier.*/build_barrier = &pstate->build_barrier;BarrierAttach(build_barrier);/** So far we have no idea whether there are any other participants,* and if so, what phase they are working on.  The only thing we care* about at this point is whether someone has already created the* SharedHashJoinBatch objects and the hash table for batch 0.  One* backend will be elected to do that now if necessary.*/if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING &&BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECTING)){pstate->nbatch = nbatch;pstate->space_allowed = space_allowed;pstate->growth = PHJ_GROWTH_OK;/* Set up the shared state for coordinating batches. */ExecParallelHashJoinSetUpBatches(hashtable, nbatch);/** Allocate batch 0's hash table up front so we can load it* directly while hashing.*/pstate->nbuckets = nbuckets;ExecParallelHashTableAlloc(hashtable, 0);}/** The next Parallel Hash synchronization point is in* MultiExecParallelHash(), which will progress it all the way to* PHJ_BUILD_DONE.  The caller must not return control from this* executor node between now and then.*/}else//非并行处理{/** Prepare context for the first-scan space allocations; allocate the* hashbucket array therein, and set each bucket "empty".* 为第一次扫描空间分配准备上下文;在其中分配hashbucket数组，并将每个bucket设置为“空”。*/MemoryContextSwitchTo(hashtable->batchCxt);//切换上下文hashtable->buckets.unshared = (HashJoinTuple *)palloc0(nbuckets * sizeof(HashJoinTuple));//分配内存空间/** Set up for skew optimization, if possible and there's a need for* more than one batch.  (In a one-batch join, there's no point in* it.)* 如需要多个批处理,设置倾斜优化。(在单批处理连接中，这是没有意义的。)*/if (nbatch > 1)ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);MemoryContextSwitchTo(oldcxt);//切换上下文}return hashtable;//返回Hash表
}/** This routine fills a FmgrInfo struct, given the OID* of the function to be called.* 给定要调用的函数的OID，这个例程填充一个FmgrInfo结构体。** The caller's CurrentMemoryContext is used as the fn_mcxt of the info* struct; this means that any subsidiary data attached to the info struct* (either by fmgr_info itself, or later on by a function call handler)* will be allocated in that context.  The caller must ensure that this* context is at least as long-lived as the info struct itself.  This is* not a problem in typical cases where the info struct is on the stack or* in freshly-palloc'd space.  However, if one intends to store an info* struct in a long-lived table, it's better to use fmgr_info_cxt.* 调用方的CurrentMemoryContext用作info结构体的fn_mcxt;* 这意味着附加到info结构体的任何附属数据(通过fmgr_info本身，或者稍后通过函数调用处理程序)将在该上下文中分配。* 调用者必须确保这个上下文的生命周期至少与info结构本身一样。* 在信息结构位于堆栈上或在新palloc空间中的典型情况下，这不是一个问题。* 但是，如果希望在long-lived表中存储信息结构，最好使用fmgr_info_cxt。*/voidfmgr_info(Oid functionId, FmgrInfo *finfo){fmgr_info_cxt_security(functionId, finfo, CurrentMemoryContext, false);}

ExecChooseHashTableSize
ExecChooseHashTableSize函数根据给定要散列的关系的估计大小(行数和平均行宽)，计算适当的散列表大小。


/** Compute appropriate size for hashtable given the estimated size of the* relation to be hashed (number of rows and average row width).* 给定要散列的关系的估计大小(行数和平均行宽)，计算适当的散列表大小。** This is exported so that the planner's costsize.c can use it.* 这些信息已导出以便计划器costsize.c可以使用*//* Target bucket loading (tuples per bucket) */
#define NTUP_PER_BUCKET         1void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,bool try_combined_work_mem,int parallel_workers,size_t *space_allowed,int *numbuckets,int *numbatches,int *num_skew_mcvs)
{int         tupsize;//元组大小double      inner_rel_bytes;//inner relation大小long        bucket_bytes;//桶大小long        hash_table_bytes;//hash table大小long        skew_table_bytes;//倾斜表大小long        max_pointers;//最大的指针数long        mppow2;//int         nbatch = 1;//批次int         nbuckets;//桶数double      dbuckets;///* Force a plausible relation size if no info *///如relation大小没有信息,则设定为默认值1000.0if (ntuples <= 0.0)ntuples = 1000.0;/** Estimate tupsize based on footprint of tuple in hashtable... note this* does not allow for any palloc overhead.  The manipulations of spaceUsed* don't count palloc overhead either.* 根据哈希表中tuple的占用空间估计tupsize…* 注意，这不允许任何palloc开销。使用的空间操作也不包括palloc开销。*/tupsize = HJTUPLE_OVERHEAD +MAXALIGN(SizeofMinimalTupleHeader) +MAXALIGN(tupwidth);//估算元组大小inner_rel_bytes = ntuples * tupsize;//inner relation大小/** Target in-memory hashtable size is work_mem kilobytes.* 目标内存中的散列表大小为work_mem KB。*/hash_table_bytes = work_mem * 1024L;/** Parallel Hash tries to use the combined work_mem of all workers to* avoid the need to batch.  If that won't work, it falls back to work_mem* per worker and tries to process batches in parallel.* 并行散列试图使用所有worker的所有work_mem来避免分批处理。* 如果这不起作用，它将返回到每个worker的work_mem，并尝试并行处理批处理。*/if (try_combined_work_mem)//尝试融合work_memhash_table_bytes += hash_table_bytes * parallel_workers;*space_allowed = hash_table_bytes;/** If skew optimization is possible, estimate the number of skew buckets* that will fit in the memory allowed, and decrement the assumed space* available for the main hash table accordingly.* 如果可以进行倾斜优化，估算允许内存中容纳的倾斜桶的数量，并相应地减少主哈希表的假定可用空间。** We make the optimistic assumption that each skew bucket will contain* one inner-relation tuple.  If that turns out to be low, we will recover* at runtime by reducing the number of skew buckets.* 我们乐观地假设，每个倾斜桶将包含一个内部关系元组。* 如果结果很低，将通过减少倾斜桶的数量在运行时进行恢复。** hashtable->skewBucket will have up to 8 times as many HashSkewBucket* pointers as the number of MCVs we allow, since ExecHashBuildSkewHash* will round up to the next power of 2 and then multiply by 4 to reduce* collisions.* hashtable->skewBucket的指针数量将是允许的mcv数量的8倍，*   因为ExecHashBuildSkewHash将四舍五入到下一个2次方，然后乘以4以减少冲突。*/if (useskew){//倾斜优化skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;/*----------* Divisor is:* size of a hash tuple +* worst-case size of skewBucket[] per MCV +* size of skewBucketNums[] entry +* size of skew bucket struct itself*----------*/*num_skew_mcvs = skew_table_bytes / (tupsize +(8 * sizeof(HashSkewBucket *)) +sizeof(int) +SKEW_BUCKET_OVERHEAD);if (*num_skew_mcvs > 0)hash_table_bytes -= skew_table_bytes;}else*num_skew_mcvs = 0;//不使用倾斜优化,默认为0/** Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when* memory is filled, assuming a single batch; but limit the value so that* the pointer arrays we'll try to allocate do not exceed work_mem nor* MaxAllocSize.* 设置nbuckets，假设为单批处理，当内存被填满时，实现NTUP_PER_BUCKET的平均桶负载;*   但是要限制这个值，以便试图分配的指针数组不会超过work_mem或MaxAllocSize。** Note that both nbuckets and nbatch must be powers of 2 to make* ExecHashGetBucketAndBatch fast.* 注意，nbucket和nbatch都必须是2的幂，才能使ExecHashGetBucketAndBatch更快。*/max_pointers = *space_allowed / sizeof(HashJoinTuple);//最大指针数max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));//控制上限/* If max_pointers isn't a power of 2, must round it down to one *///如果max_pointer不是2的幂，则必须四舍五入到符合规则的某个值(如110.1 --> 128)mppow2 = 1L << my_log2(max_pointers);if (max_pointers != mppow2)max_pointers = mppow2 / 2;/* Also ensure we avoid integer overflow in nbatch and nbuckets *//* (this step is redundant given the current value of MaxAllocSize) *///还要确保在nbatch和nbucket中避免整数溢出//(鉴于MaxAllocSize的当前值，此步骤是多余的)max_pointers = Min(max_pointers, INT_MAX / 2);//设定上限dbuckets = ceil(ntuples / NTUP_PER_BUCKET);//取整dbuckets = Min(dbuckets, max_pointers);//设定上限nbuckets = (int) dbuckets;//桶数/* don't let nbuckets be really small, though ... *///但是，不要让nbucket非常小……nbuckets = Max(nbuckets, 1024);//设定下限(1024)/* ... and force it to be a power of 2. *///2的幂nbuckets = 1 << my_log2(nbuckets);/** If there's not enough space to store the projected number of tuples and* the required bucket headers, we will need multiple batches.* 如果没有足够的空间来存储预计的元组数量和所需的bucket headers，将需要多个批处理。*/bucket_bytes = sizeof(HashJoinTuple) * nbuckets;if (inner_rel_bytes + bucket_bytes > hash_table_bytes)//inner relation大小 + 桶数大于可用空间{/* We'll need multiple batches *///需要多批次long        lbuckets;double      dbatch;int         minbatch;long        bucket_size;/** If Parallel Hash with combined work_mem would still need multiple* batches, we'll have to fall back to regular work_mem budget.* 如果合并了work_mem的并行散列仍然需要多个批处理，将不得不回到常规的work_mem预算。*/if (try_combined_work_mem){ExecChooseHashTableSize(ntuples, tupwidth, useskew,false, parallel_workers,space_allowed,numbuckets,numbatches,num_skew_mcvs);return;}/** Estimate the number of buckets we'll want to have when work_mem is* entirely full.  Each bucket will contain a bucket pointer plus* NTUP_PER_BUCKET tuples, whose projected size already includes* overhead for the hash code, pointer to the next tuple, etc.* 估计work_mem完全用完时需要的桶数。* 每个桶将包含一个桶指针和NTUP_PER_BUCKET元组，*   其投影大小已经包括哈希码的开销、指向下一个元组的指针等等。*/bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));//桶大小lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);lbuckets = Min(lbuckets, max_pointers);nbuckets = (int) lbuckets;nbuckets = 1 << my_log2(nbuckets);bucket_bytes = nbuckets * sizeof(HashJoinTuple);/** Buckets are simple pointers to hashjoin tuples, while tupsize* includes the pointer, hash code, and MinimalTupleData.  So buckets* should never really exceed 25% of work_mem (even for* NTUP_PER_BUCKET=1); except maybe for work_mem values that are not* 2^N bytes, where we might get more because of doubling. So let's* look for 50% here.* Buckets是指向hashjoin元组的简单指针，而tupsize包含指针、散列代码和MinimalTupleData。* 所以Buckets的实际大小不应该超过work_mem的25%(即使对于NTUP_PER_BUCKET=1);*   除了work_mem值不是2 ^ N个字节这个原因外,翻倍可能会得到更多的,这里试着使用50%*/Assert(bucket_bytes <= hash_table_bytes / 2);/* Calculate required number of batches. *///计算批次数dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));dbatch = Min(dbatch, max_pointers);minbatch = (int) dbatch;nbatch = 2;while (nbatch < minbatch)nbatch <<= 1;}Assert(nbuckets > 0);Assert(nbatch > 0);*numbuckets = nbuckets;*numbatches = nbatch;
}

三、跟踪分析

测试脚本如下

testdb=# set enable_nestloop=false;
SET
testdb=# set enable_mergejoin=false;
SET
testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je 
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je 
testdb(#                         from t_grxx gr inner join t_jfxx jf 
testdb(#                                        on gr.dwbh = dw.dwbh 
testdb(#                                           and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;QUERY PLAN                                           
-----------------------------------------------------------------------------------------------Sort  (cost=14828.83..15078.46 rows=99850 width=47)Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.jeSort Key: dw.dwbh->  Hash Join  (cost=3176.00..6537.55 rows=99850 width=47)Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.jeHash Cond: ((gr.grbh)::text = (jf.grbh)::text)->  Hash Join  (cost=289.00..2277.61 rows=99850 width=32)Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xmInner Unique: trueHash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)->  Seq Scan on public.t_grxx gr  (cost=0.00..1726.00 rows=100000 width=16)Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl->  Hash  (cost=164.00..164.00 rows=10000 width=20)Output: dw.dwmc, dw.dwbh, dw.dwdz->  Seq Scan on public.t_dwxx dw  (cost=0.00..164.00 rows=10000 width=20)Output: dw.dwmc, dw.dwbh, dw.dwdz->  Hash  (cost=1637.00..1637.00 rows=100000 width=20)Output: jf.ny, jf.je, jf.grbh->  Seq Scan on public.t_jfxx jf  (cost=0.00..1637.00 rows=100000 width=20)Output: jf.ny, jf.je, jf.grbh
(20 rows)

启动gdb,设置断点,进入ExecHashTableCreate

(gdb) b ExecHashTableCreate
Breakpoint 1 at 0x6fc75d: file nodeHash.c, line 449.
(gdb) c
Continuing.Breakpoint 1, ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:449
449     node = (Hash *) state->ps.plan;

获取相关信息

449     node = (Hash *) state->ps.plan;
(gdb) n
450     outerNode = outerPlan(node);
(gdb) 
457     rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;
(gdb) 
462                             state->parallel_state != NULL ?
(gdb) 
459     ExecChooseHashTableSize(rows, outerNode->plan_width,
(gdb)

获取Hash节点;
outer节点为顺序扫描SeqScan节点
inner(构造hash表的relation)行数为10000

(gdb) p *node
$1 = {plan = {type = T_Hash, startup_cost = 164, total_cost = 164, plan_rows = 10000, plan_width = 20, parallel_aware = false, parallel_safe = true, plan_node_id = 4, targetlist = 0x1e4bf90, qual = 0x0, lefttree = 0x1e493e8, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}, skewTable = 16977, skewColumn = 1, skewInherit = false, rows_total = 0}
(gdb) p *outerNode
$2 = {type = T_SeqScan, startup_cost = 0, total_cost = 164, plan_rows = 10000, plan_width = 20, parallel_aware = false, parallel_safe = true, plan_node_id = 5, targetlist = 0x1e492b0, qual = 0x0, lefttree = 0x0, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}
(gdb) p rows
$3 = 10000
(gdb)

进入ExecChooseHashTableSize函数

(gdb) step
ExecChooseHashTableSize (ntuples=10000, tupwidth=20, useskew=true, try_combined_work_mem=false, parallel_workers=0, space_allowed=0x7ffdcf148540, numbuckets=0x7ffdcf14853c, numbatches=0x7ffdcf148538, num_skew_mcvs=0x7ffdcf148534)at nodeHash.c:677
677     int         nbatch = 1;

ExecChooseHashTableSize->计算元组大小(56B)/inner relation大小(约560K)/hash表空间(16M)

(gdb) n
682     if (ntuples <= 0.0)
(gdb) 
690     tupsize = HJTUPLE_OVERHEAD +
(gdb) 
693     inner_rel_bytes = ntuples * tupsize;
(gdb) 
698     hash_table_bytes = work_mem * 1024L;
(gdb) 
705     if (try_combined_work_mem)
(gdb) p tupsize
$4 = 56
(gdb) p inner_rel_bytes
$5 = 560000
(gdb) p hash_table_bytes
$6 = 16777216

ExecChooseHashTableSize->使用数据倾斜优化(所需空间从Hash Table中获取)

(gdb) n
708     *space_allowed = hash_table_bytes;
(gdb) 
724     if (useskew)
(gdb) 
726         skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
(gdb) p useskew
$8 = true
(gdb) p hash_table_bytes
$9 = 16441672
(gdb) p skew_table_bytes
$10 = 335544
(gdb) p num_skew_mcvs
$11 = (int *) 0x7ffdcf148534
(gdb) p *num_skew_mcvs
$12 = 2396
(gdb)

ExecChooseHashTableSize->获取最大指针数目(2097152)

(gdb) n
756     max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
(gdb) 
758     mppow2 = 1L << my_log2(max_pointers);
(gdb) n
759     if (max_pointers != mppow2)
(gdb) p max_pointers
$13 = 2097152
(gdb) p mppow2
$15 = 2097152

ExecChooseHashTableSize->计算Hash桶数

(gdb) n
764     max_pointers = Min(max_pointers, INT_MAX / 2);
(gdb) 
766     dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
(gdb) 
767     dbuckets = Min(dbuckets, max_pointers);
(gdb) 
768     nbuckets = (int) dbuckets;
(gdb) 
770     nbuckets = Max(nbuckets, 1024);
(gdb) 
772     nbuckets = 1 << my_log2(nbuckets);
(gdb) 
778     bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
(gdb) n
779     if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
(gdb) 
834     Assert(nbuckets > 0);
(gdb) p dbuckets
$16 = 10000
(gdb) p nbuckets
$17 = 16384
(gdb) p bucket_bytes
$18 = 131072

ExecChooseHashTableSize->只需要一个批次,赋值,返回

835     Assert(nbatch > 0);
(gdb) 
837     *numbuckets = nbuckets;
(gdb) 
838     *numbatches = nbatch;
(gdb) 
839 }
(gdb) 
(gdb) 
ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:468
468     log2_nbuckets = my_log2(nbuckets);

初始化Hash表

468     log2_nbuckets = my_log2(nbuckets);
(gdb) p nbuckets
$19 = 16384
(gdb) n
469     Assert(nbuckets == (1 << log2_nbuckets));
(gdb) 
478     hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
(gdb) 
479     hashtable->nbuckets = nbuckets;
...

分配内存上下文

...
(gdb) 
522     hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
(gdb) 
526     hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
(gdb) 
532     oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb)

切换上下文,并初始化hash函数

(gdb) 
532     oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb) n
538     nkeys = list_length(hashOperators);
(gdb) 
540         (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb) p nkeys
$20 = 1
(gdb) n
539     hashtable->outer_hashfunctions =
(gdb) 
542         (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb) 
541     hashtable->inner_hashfunctions =
(gdb) 
543     hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
(gdb) 
544     i = 0;

初始化Hash操作符

(gdb) n
545     foreach(ho, hashOperators)
(gdb) 
547         Oid         hashop = lfirst_oid(ho);
(gdb) 
551         if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
(gdb) 
554         fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
(gdb) 
555         fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
(gdb) 
556         hashtable->hashStrict[i] = op_strict(hashop);
(gdb) 
557         i++;
(gdb) 
545     foreach(ho, hashOperators)
(gdb) p *hashtable->hashStrict
$21 = true
(gdb) n
560     if (nbatch > 1 && hashtable->parallel_state == NULL)

分配hash桶内存空间

gdb) n
575     MemoryContextSwitchTo(oldcxt);
(gdb) 
577     if (hashtable->parallel_state)
(gdb) 
631         MemoryContextSwitchTo(hashtable->batchCxt);
(gdb) 
634             palloc0(nbuckets * sizeof(HashJoinTuple));
(gdb) 
633         hashtable->buckets.unshared = (HashJoinTuple *)
(gdb) p nbuckets
$23 = 16384

构造完成,返回hash表

(gdb) n
641         if (nbatch > 1)
(gdb) 
644         MemoryContextSwitchTo(oldcxt);
(gdb) 
647     return hashtable;
(gdb) 
648 }
(gdb) 
ExecHashJoinImpl (pstate=0x1e3c048, parallel=false) at nodeHashjoin.c:282
282                 node->hj_HashTable = hashtable;
(gdb)

DONE!

四、参考资料

Hash Joins: Past, Present and Future/PGCon 2017
A Look at How Postgres Executes a Tiny Join - Part 1
A Look at How Postgres Executes a Tiny Join - Part 2
Assignment 2 Symmetric Hash Join

来自 “ ITPUB博客 ” ，链接：/，如需转载，请注明出处，否则将追究法律责任。

转载于:/

更多推荐

PostgreSQL 源码解读（91）

本文发布于:2024-03-04 10:43:05，感谢您对本站的认可！

本文链接:https://www.elefans.com/category/jswz/34/1709046.html