greenplumn bitmapsearch 源码

  • 2022-08-18
  • 浏览 (398)

greenplumn bitmapsearch 代码

文件路径:/src/backend/access/bitmap/bitmapsearch.c

/*-------------------------------------------------------------------------
 *
 * bitmapsearch.c
 *	  Search routines for on-disk bitmap index access method.
 *
 * Portions Copyright (c) 2007-2010 Greenplum Inc
 * Portions Copyright (c) 2010-2012 EMC Corporation
 * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates.
 * Porions Copyright (c) 2006-2008, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/bitmap/bitmapsearch.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/genam.h"
#include "access/tupdesc.h"
#include "access/bitmap.h"
#include "access/bitmap_private.h"
#include "access/relscan.h"
#include "storage/lmgr.h"
#include "parser/parse_oper.h"
#include "utils/lsyscache.h"
#include "utils/snapmgr.h"
#include "utils/faultinjector.h"


typedef struct ItemPos
{
	BlockNumber		blockNo;
	OffsetNumber	offset;
} ItemPos;

static void next_batch_words(IndexScanDesc scan);
static void read_words(Relation rel, Buffer lovBuffer,
					   OffsetNumber lovOffset,
					   bool lockLovBuffer,
					   BMBatchWords *bachWords /* out */,
					   BlockNumber *nextBlockNoP /* out */,
					   bool *readLastWords /* out */);
static void init_scanpos(IndexScanDesc scan, BMVector bmScanPos,
					 BlockNumber lovBlock, OffsetNumber lovOffset);

/*
 * _bitmap_first() -- find the first tuple that satisfies a given scan.
 */
bool
_bitmap_first(IndexScanDesc scan, ScanDirection dir)
{
	BMScanOpaque so;
	BMScanPosition scanpos;

	_bitmap_findbitmaps(scan, dir);
	so = (BMScanOpaque) scan->opaque;
	scanpos = (BMScanPosition) so->bm_currPos;
	if (scanpos->done)
		return false;

	/*
	 * Bitmap indexes don't currently support Index Only Scans.
	 * It would be pretty straightforward to return the index tuples from the
	 * LOV index, but we haven't implemented it.
	 *
	 * However, even though the 'amcanreturn' function is not implemented,
	 * the planner still chooses an Index Only Scan for some queries where
	 * no attribute from the index are needed. Be prepared for that, by
	 * filling xs_itup with a dummy IndexTuple with all NULL values.
	 */
	if (scan->xs_want_itup && !scan->xs_itup)
	{
		TupleDesc idesc = RelationGetDescr(scan->indexRelation);
		Datum	   *nulldatums;
		bool	   *isnulls;

		nulldatums = palloc(idesc->natts * sizeof(Datum));
		isnulls = palloc(idesc->natts * sizeof(bool));

		for (int i = 0; i < idesc->natts; i++)
		{
			nulldatums[i] = (Datum) 0;
			isnulls[i] = true;
		}
		scan->xs_itup = index_form_tuple(idesc, nulldatums, isnulls);
		pfree(nulldatums);
		pfree(isnulls);
	}

	return _bitmap_next(scan, dir);
}

/*
 * _bitmap_next() -- return the next tuple that satisfies a given scan.
 */
bool
_bitmap_next(IndexScanDesc scan, ScanDirection dir  pg_attribute_unused())
{
	BMScanOpaque	so;
	BMScanPosition	scanPos;
	uint64			nextTid;

	so = (BMScanOpaque) scan->opaque;
	scanPos = so->bm_currPos;

	if (scanPos->done)
		return false;

	for (;;)
	{
		/*
		 * If there are no more words left from the previous scan, we
		 * try to compute the next batch of words.
		 */
		if (scanPos->bm_batchWords->nwords == 0 &&
			scanPos->bm_result.nextTidLoc >= scanPos->bm_result.numOfTids)
		{
			_bitmap_reset_batchwords(scanPos->bm_batchWords);
			scanPos->bm_batchWords->firstTid = scanPos->bm_result.nextTid;

			elog(DEBUG2, "IndexScan next batch words start Tid: " INT64_FORMAT,
				 scanPos->bm_batchWords->firstTid);

			next_batch_words(scan);

			_bitmap_begin_iterate(scanPos->bm_batchWords, &(scanPos->bm_result));
		}

		/* If we can not find more words, then this scan is over. */
		if (scanPos->done)
			return false;

		nextTid = _bitmap_findnexttid(scanPos->bm_batchWords,
									  &(scanPos->bm_result));
		if (nextTid == 0)
			continue;
		else
			break;
	}

	Assert((nextTid % BM_MAX_TUPLES_PER_PAGE) + 1 > 0);

	ItemPointerSet(&(scan->xs_heaptid), BM_INT_GET_BLOCKNO(nextTid),
				   BM_INT_GET_OFFSET(nextTid));
	so->cur_pos_valid = true;

	return true;
}

/*
 * _bitmap_firstbatchwords() -- find the first batch of bitmap words
 *  in a bitmap vector for a given scan.
 */
bool
_bitmap_firstbatchwords(IndexScanDesc scan,
						ScanDirection dir)
{
	_bitmap_findbitmaps(scan, dir);

	return _bitmap_nextbatchwords(scan, dir);
}

/*
 * _bitmap_nextbatchwords() -- find the next batch of bitmap words
 *  in a bitmap vector for a given scan.
 */
bool
_bitmap_nextbatchwords(IndexScanDesc scan,
					   ScanDirection dir  pg_attribute_unused())
{
	BMScanOpaque	so;

	so = (BMScanOpaque) scan->opaque;

	/* check if this scan if over */
	if (so->bm_currPos->done)
		return false;

	/*
	 * Set firstTid to retrun for the remain batch words. tid < nextTid should
	 * already scanned. So move firstTid to nextTid.
	 * The firstTid may get updated when read new batch words if there only one
	 * bitmap vector matched, see read_words.
	 */
	so->bm_currPos->bm_batchWords->firstTid = so->bm_currPos->bm_result.nextTid;
	elog(DEBUG2, "BitmapIndexScan next batch words start Tid: " INT64_FORMAT,
		 so->bm_currPos->bm_batchWords->firstTid);

	/*
	 * If there are some leftover words from the previous scan, simply
	 * return them.
	 */
	if (so->bm_currPos->bm_batchWords->nwords > 0)
		return true;

	/*
	 * Compute the next list of batch words. Before that,
	 * reset the previous list of batch words, especially the
	 * content and header bitmap words.
	 */
	_bitmap_reset_batchwords(so->bm_currPos->bm_batchWords);
	next_batch_words(scan);

	return true;
}

/*
 * next_batch_words() -- compute the next batch of bitmap words
 * 	from a given scan position.
 */
static void
next_batch_words(IndexScanDesc scan)
{
	BMScanPosition			scanPos;
	BMVector	bmScanPos;
	int						i;
	BMBatchWords		  **batches;
	int						numBatches;

	scanPos = ((BMScanOpaque) scan->opaque)->bm_currPos;
	bmScanPos = scanPos->posvecs;

	/*
	 * Since we read a new batch of words, we need to reset 
	 * lastScanWordNo in result words. Otherwise, we may miss
	 * some words in this new batch.
	 */
	scanPos->bm_result.lastScanWordNo = 0;

	batches = (BMBatchWords **)
		palloc0(scanPos->nvec * sizeof(BMBatchWords *));

	numBatches = 0;
	/*
	 * Obtains the next batch of words for each bitmap vector.
	 * Ignores those bitmap vectors that contain no new words.
	 */
	for (i = 0; i < scanPos->nvec; i++)
	{
		BMBatchWords	*batchWords;
		batchWords = bmScanPos[i].bm_batchWords;

		/*
		 * If there are no words left from previous scan, read the next
		 * batch of words.
		 */
		if (bmScanPos[i].bm_batchWords->nwords == 0 &&
			!(bmScanPos[i].bm_readLastWords))
		{

			_bitmap_reset_batchwords(batchWords);
			read_words(scan->indexRelation,
					   bmScanPos[i].bm_lovBuffer,
					   bmScanPos[i].bm_lovOffset,
					   true /* lockLocBuffer */,
					   batchWords,
					   &(bmScanPos[i].bm_nextBlockNo),
					   &(bmScanPos[i].bm_readLastWords));
		}

		if (bmScanPos[i].bm_batchWords->nwords > 0)
		{
			batches[numBatches] = batchWords;
			numBatches++;
		}
	}

	/*
	 * We handle the case where only one bitmap vector contributes to
	 * the scan separately with other cases. This is because 
	 * bmScanPos->bm_batchWords and scanPos->bm_batchWords
	 * are the same.
	 */
	if (scanPos->nvec == 1)
	{
		if (bmScanPos->bm_batchWords->nwords == 0)
			scanPos->done = true;
		pfree(batches);
		scanPos->bm_batchWords = scanPos->posvecs->bm_batchWords;

		return;
	}

	/*
	 * At least two bitmap vectors contribute to this scan, we
	 * ORed these bitmap vectors.
	 */
	if (numBatches == 0)
	{
		scanPos->done = true;
		pfree(batches);
		return;
	}

	_bitmap_union(batches, numBatches, scanPos->bm_batchWords);
	pfree(batches);
}

/*
 * read_words() -- read one-block of bitmap words from
 *	the bitmap page.
 *
 * If nextBlockNo is an invalid block number, then the two last words
 * are stored in lovItem. Otherwise, read words from nextBlockNo.
 */
static void
read_words(Relation rel, Buffer lovBuffer, OffsetNumber lovOffset,
		   bool lockLovBuffer, BMBatchWords *batchWords /* out */,
		   BlockNumber *nextBlockNoP /* out */, bool *readLastWords /* out */)
{
	if (BlockNumberIsValid(*nextBlockNoP))
	{
		Buffer			bitmapBuffer;
		Page			bitmapPage;
		BMBitmap		bitmap;
		BMBitmapOpaque	bo;
		uint64			totalTidsInPage;
		bool			readLOV = false;

		if (lockLovBuffer)
			LockBuffer(lovBuffer, BM_READ);

		bitmapBuffer = _bitmap_getbuf(rel, *nextBlockNoP, BM_READ);
		bitmapPage = BufferGetPage(bitmapBuffer);

		bitmap = (BMBitmap) PageGetContentsMaxAligned(bitmapPage);
		bo = (BMBitmapOpaque)PageGetSpecialPointer(bitmapPage);

		*nextBlockNoP = bo->bm_bitmap_next;
		batchWords->nwords = bo->bm_hrl_words_used;

		/*
		 * If this is the last bitmap page and the total number of words
		 * in this page is less than or equal to
		 * BM_NUM_OF_HRL_WORDS_PER_PAGE - 2, we read the last two words from LOV
		 * and append them into 'batchWords->hwords' and 'batchWords->cwords'.
		 * This requires hold lock on the lovBuffer to avoid concurrent changes
		 * on it. Otherwise, release the lock ASAP.
		 */
		if ((!BlockNumberIsValid(*nextBlockNoP)) &&
			(batchWords->nwords <= BM_NUM_OF_HRL_WORDS_PER_PAGE - 2))
			readLOV = true;
		else
		{
			if (lockLovBuffer)
				LockBuffer(lovBuffer, BUFFER_LOCK_UNLOCK);
		}

		/*
		 * Get real next tid and nwordsread in uncompressed order for a
		 * bitmap index scan on a bitmap page.
		 * If current bitmap page get rearranged words from previous page
		 * after release the previous bitmap page and before acquire lock
		 * on it for read. The expected next tid for current bitmap scan
		 * will not equal to the current page's start tid. So set to correct
		 * value.
		 * The rearrange happens when doing insert on the table and it will
		 * update a full bitmap pages(except the last page) and generate
		 * new words.
		 * Since the page is full, so it'll rearrange the words and move
		 * the unfit words to next bitmap page.
		 * This related to issue: https://github.com/greenplum-db/gpdb/issues/11308.
		 */
		totalTidsInPage = GET_NUM_BITS(bitmap->cwords, bitmap->hwords,
									   bo->bm_hrl_words_used);
		batchWords->firstTid = bo->bm_last_tid_location - totalTidsInPage + 1;
		batchWords->nwordsread = batchWords->firstTid / BM_HRL_WORD_SIZE;

		memcpy(batchWords->hwords, bitmap->hwords,
			   BM_NUM_OF_HEADER_WORDS * sizeof(BM_HRL_WORD));
		memcpy(batchWords->cwords, bitmap->cwords,
			   sizeof(BM_HRL_WORD) * batchWords->nwords);

		_bitmap_relbuf(bitmapBuffer);
		SIMPLE_FAULT_INJECTOR("after_read_one_bitmap_idx_page");

		*readLastWords = false;

		if (readLOV)
		{
			BMBatchWords	tempBWord;
			BM_HRL_WORD		cwords[2];
			BM_HRL_WORD		hword;
			BM_HRL_WORD		tmp;
			int				offs;

			MemSet(&tempBWord, 0, sizeof(BMBatchWords));
			tempBWord.cwords = cwords;
			tempBWord.hwords = &hword;

			read_words(rel, lovBuffer, lovOffset, false /* lockLovBuffer */,
					   &tempBWord, nextBlockNoP, readLastWords);
			Assert(tempBWord.nwords > 0 && tempBWord.nwords <= 2);

			// release lock on lovBuffer once we read words from it.
			if (lockLovBuffer)
				LockBuffer(lovBuffer, BUFFER_LOCK_UNLOCK);

			memcpy(batchWords->cwords + batchWords->nwords, cwords,
				   tempBWord.nwords * sizeof(BM_HRL_WORD));

			offs = batchWords->nwords / BM_HRL_WORD_SIZE;
			tmp = hword >> batchWords->nwords % BM_HRL_WORD_SIZE;
			batchWords->hwords[offs] |= tmp;

			if (batchWords->nwords % BM_HRL_WORD_SIZE == BM_HRL_WORD_SIZE - 1)
			{
				offs = (batchWords->nwords + 1)/BM_HRL_WORD_SIZE;
				batchWords->hwords[offs] |= hword << 1;
			}
			batchWords->nwords += tempBWord.nwords;
		}
	}
	else
	{
		BMLOVItem	lovItem;
		Page		lovPage;

		if (lockLovBuffer)
			LockBuffer(lovBuffer, BM_READ);

		lovPage = BufferGetPage(lovBuffer);
		lovItem = (BMLOVItem) PageGetItem(lovPage, 
										  PageGetItemId(lovPage, lovOffset));

		if (lovItem->bm_last_compword != LITERAL_ALL_ONE)
		{
			batchWords->nwords = 2;
			batchWords->hwords[0] = (((BM_HRL_WORD)lovItem->lov_words_header) <<
							  (BM_HRL_WORD_SIZE-2));
			batchWords->cwords[0] = lovItem->bm_last_compword;
			batchWords->cwords[1] = lovItem->bm_last_word;
		}
		else
		{
			batchWords->nwords = 1;
			batchWords->hwords[0] = (((BM_HRL_WORD)lovItem->lov_words_header) <<
									(BM_HRL_WORD_SIZE-1));
			batchWords->cwords[0] = lovItem->bm_last_word;
		}

		if (lockLovBuffer)
			LockBuffer(lovBuffer, BUFFER_LOCK_UNLOCK);

		*readLastWords = true;
	}
}

/*
 * _bitmap_findbitmaps() -- find the bitmap vectors that satisfy the
 * index predicate.
 */
void
_bitmap_findbitmaps(IndexScanDesc scan, ScanDirection dir  pg_attribute_unused())
{
	BMScanOpaque			so;
	BMScanPosition			scanPos;
	Buffer					metabuf;
	BMMetaPage				metapage;
	BlockNumber				lovBlock;
	OffsetNumber			lovOffset;
	bool					blockNull, offsetNull;
	int						vectorNo, keyNo;

	so = (BMScanOpaque) scan->opaque;

	/* allocate space and initialize values for so->bm_currPos */
	if (so->bm_currPos == NULL)
		so->bm_currPos = (BMScanPosition) palloc0(sizeof(BMScanPositionData));

	scanPos = so->bm_currPos;
	scanPos->nvec = 0;
	scanPos->done = false;
	MemSet(&scanPos->bm_result, 0, sizeof(BMIterateResult));

	/*
	 * The tid to return always start from 1 which is the first tid of
	 * first uncompressed word.
	 */
	scanPos->bm_result.nextTid = 1;

	for (keyNo = 0; keyNo < scan->numberOfKeys; keyNo++)
	{
		if (scan->keyData[keyNo].sk_flags & SK_ISNULL)
		{
			/* Null key passed to bitmap index scan. Return empty result */
			Assert(scanPos->nvec == 0);
			scanPos->done = true;
			return;
		}
	}

	metabuf = _bitmap_getbuf(scan->indexRelation, BM_METAPAGE, BM_READ);
	metapage = _bitmap_get_metapage_data(scan->indexRelation, metabuf);

	{
		Relation		lovHeap, lovIndex;
		ScanKey			scanKeys;
		IndexScanDesc	scanDesc;
		List*			lovItemPoss = NIL;
		ListCell		*cell;

		/*
		 * We haven't locked the metapage but that's okay... if these
		 * values change underneath us there's something much more
		 * fundamentally wrong. This could change when we have VACUUM
		 * support, of course.
		 */
		_bitmap_open_lov_heapandindex(scan->indexRelation, metapage, 
				 &lovHeap, &lovIndex, AccessShareLock);

		scanKeys = palloc0(scan->numberOfKeys * sizeof(ScanKeyData));
		for (keyNo = 0; keyNo < scan->numberOfKeys; keyNo++)
		{
			ScanKey	scanKey = (ScanKey)(((char *)scanKeys) + 
										 keyNo * sizeof(ScanKeyData));

			ScanKeyEntryInitialize(scanKey,
								   scan->keyData[keyNo].sk_flags,
								   scan->keyData[keyNo].sk_attno,
								   scan->keyData[keyNo].sk_strategy,
								   scan->keyData[keyNo].sk_subtype,
								   scan->keyData[keyNo].sk_collation,
								   scan->keyData[keyNo].sk_func.fn_oid,
								   scan->keyData[keyNo].sk_argument);
		}

		/* When there are no scan keys, all bitmap vectors are included,
		 * including the one for the NULL value.
		 * This can happen when the index is created with predicates.
		 */
		if (scan->numberOfKeys == 0)
		{
			ItemPos *itemPos;
			itemPos = (ItemPos*)palloc0(sizeof(ItemPos));
			itemPos->blockNo = BM_LOV_STARTPAGE;;
			itemPos->offset = 1;
			lovItemPoss = lappend(lovItemPoss, itemPos);

			scanPos->nvec++;
		}

		scanDesc = index_beginscan(lovHeap, lovIndex, GetActiveSnapshot(),
								   scan->numberOfKeys, 0);
		index_rescan(scanDesc, scanKeys, scan->numberOfKeys, NULL, 0);

		/*
		 * finds all lov items for this scan through lovHeap and lovIndex.
		 */
		while (true)
		{
			ItemPos			*itemPos;

			bool res = _bitmap_findvalue(lovHeap, lovIndex, scanKeys, scanDesc,
							  	         &lovBlock, &blockNull, &lovOffset, 
										 &offsetNull);

			if(!res)
				break;

			/*
			 * We find the position for one LOV item. Append it into
			 * the list.
			 */
			itemPos = (ItemPos*)palloc0(sizeof(ItemPos));
			itemPos->blockNo = lovBlock;
			itemPos->offset = lovOffset;
			lovItemPoss = lappend(lovItemPoss, itemPos);

			scanPos->nvec++;
		}

		if (scanPos->nvec)
		{
			scanPos->posvecs =
				(BMVector)palloc0(sizeof(BMVectorData) * scanPos->nvec);
		}

		vectorNo = 0;
		foreach(cell, lovItemPoss)
		{
			ItemPos		*itemPos = (ItemPos*)lfirst(cell);

			BMVector bmScanPos = &(scanPos->posvecs[vectorNo]);
			init_scanpos(scan, bmScanPos, itemPos->blockNo, itemPos->offset);
			vectorNo++;
		}

		list_free_deep(lovItemPoss);

		index_endscan(scanDesc);
		_bitmap_close_lov_heapandindex(lovHeap, lovIndex, AccessShareLock);
		pfree(scanKeys);
	}

	_bitmap_relbuf(metabuf);

	if (scanPos->nvec == 0)
	{
		scanPos->done = true;
		return;
	}

	/*
	 * Since there is only one related bitmap vector, we have
	 * the scan position's batch words structure point directly to
	 * the vector's batch words.
	 */
	if (scanPos->nvec == 1)
		scanPos->bm_batchWords = scanPos->posvecs->bm_batchWords;
	else
	{
		scanPos->bm_batchWords = (BMBatchWords *) palloc0(sizeof(BMBatchWords));
		_bitmap_init_batchwords(scanPos->bm_batchWords,
								BM_NUM_OF_HRL_WORDS_PER_PAGE,
								CurrentMemoryContext);	
	}
}

/*
 * init_scanpos() -- initialize a BMScanPosition for a given
 *	bitmap vector.
 */
static void
init_scanpos(IndexScanDesc scan, BMVector bmScanPos, BlockNumber lovBlock,
			 OffsetNumber lovOffset)
{
	Page 					lovPage;
	BMLOVItem				lovItem;

	bmScanPos->bm_lovOffset = lovOffset;
	
	bmScanPos->bm_lovBuffer = _bitmap_getbuf(scan->indexRelation, lovBlock, 
										     BM_READ);

	lovPage	= BufferGetPage(bmScanPos->bm_lovBuffer);
	lovItem = (BMLOVItem) PageGetItem(lovPage, 
					PageGetItemId(lovPage, bmScanPos->bm_lovOffset));
			
	bmScanPos->bm_nextBlockNo = lovItem->bm_lov_head;
	bmScanPos->bm_readLastWords = false;
	bmScanPos->bm_batchWords = (BMBatchWords *) palloc0(sizeof(BMBatchWords));
	_bitmap_init_batchwords(bmScanPos->bm_batchWords,
							BM_NUM_OF_HRL_WORDS_PER_PAGE,
							CurrentMemoryContext);	

	LockBuffer(bmScanPos->bm_lovBuffer, BUFFER_LOCK_UNLOCK);
}

相关信息

greenplumn 源码目录

相关文章

greenplumn bitmap 源码

greenplumn bitmapattutil 源码

greenplumn bitmapinsert 源码

greenplumn bitmappages 源码

greenplumn bitmaputil 源码

greenplumn bitmapxlog 源码

0  赞