/*------------------------------------------------------------------------- * * gist_private.h * private declarations for GiST -- declarations related to the * internal implementation of GiST, not the public API * * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/access/gist_private.h * *------------------------------------------------------------------------- */ #ifndef GIST_PRIVATE_H #define GIST_PRIVATE_H #include "access/amapi.h" #include "access/gist.h" #include "access/itup.h" #include "fmgr.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" #include "storage/buffile.h" #include "utils/hsearch.h" #include "access/genam.h" /* * Maximum number of "halves" a page can be split into in one operation. * Typically a split produces 2 halves, but can be more if keys have very * different lengths, or when inserting multiple keys in one operation (as * when inserting downlinks to an internal node). There is no theoretical * limit on this, but in practice if you get more than a handful page halves * in one split, there's something wrong with the opclass implementation. * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some * local arrays used during split. Note that there is also a limit on the * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS, * so if you raise this higher than that limit, you'll just get a different * error. */ #define GIST_MAX_SPLIT_PAGES 75 /* Buffer lock modes */ #define GIST_SHARE BUFFER_LOCK_SHARE #define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE #define GIST_UNLOCK BUFFER_LOCK_UNLOCK typedef struct { BlockNumber prev; uint32 freespace; char tupledata[FLEXIBLE_ARRAY_MEMBER]; } GISTNodeBufferPage; #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) /* Returns free space in node buffer page */ #define PAGE_FREE_SPACE(nbp) (nbp->freespace) /* Checks if node buffer page is empty */ #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET) /* Checks if node buffers page don't contain sufficient space for index tuple */ #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \ MAXALIGN(IndexTupleSize(itup))) /* * GISTSTATE: information needed for any GiST index operation * * This struct retains call info for the index's opclass-specific support * functions (per index column), plus the index's tuple descriptor. * * scanCxt holds the GISTSTATE itself as well as any data that lives for the * lifetime of the index operation. We pass this to the support functions * via fn_mcxt, so that they can store scan-lifespan data in it. The * functions are invoked in tempCxt, which is typically short-lifespan * (that is, it's reset after each tuple). However, tempCxt can be the same * as scanCxt if we're not bothering with per-tuple context resets. */ typedef struct GISTSTATE { MemoryContext scanCxt; /* context for scan-lifespan data */ MemoryContext tempCxt; /* short-term context for calling functions */ TupleDesc leafTupdesc; /* index's tuple descriptor */ TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf * pages */ TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an * index-only scan */ FmgrInfo consistentFn[INDEX_MAX_KEYS]; FmgrInfo unionFn[INDEX_MAX_KEYS]; FmgrInfo compressFn[INDEX_MAX_KEYS]; FmgrInfo decompressFn[INDEX_MAX_KEYS]; FmgrInfo penaltyFn[INDEX_MAX_KEYS]; FmgrInfo picksplitFn[INDEX_MAX_KEYS]; FmgrInfo equalFn[INDEX_MAX_KEYS]; FmgrInfo distanceFn[INDEX_MAX_KEYS]; FmgrInfo fetchFn[INDEX_MAX_KEYS]; /* Collations to pass to the support functions */ Oid supportCollation[INDEX_MAX_KEYS]; } GISTSTATE; /* * During a GiST index search, we must maintain a queue of unvisited items, * which can be either individual heap tuples or whole index pages. If it * is an ordered search, the unvisited items should be visited in distance * order. Unvisited items at the same distance should be visited in * depth-first order, that is heap items first, then lower index pages, then * upper index pages; this rule avoids doing extra work during a search that * ends early due to LIMIT. * * To perform an ordered search, we use a pairing heap to manage the * distance-order queue. In a non-ordered search (no order-by operators), * we use it to return heap tuples before unvisited index pages, to * ensure depth-first order, but all entries are otherwise considered * equal. */ /* Individual heap tuple to be visited */ typedef struct GISTSearchHeapItem { ItemPointerData heapPtr; bool recheck; /* T if quals must be rechecked */ bool recheckDistances; /* T if distances must be rechecked */ HeapTuple recontup; /* data reconstructed from the index, used in * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ typedef struct GISTSearchItem { pairingheap_node phNode; BlockNumber blkno; /* index page number, or InvalidBlockNumber */ union { GistNSN parentlsn; /* parent page's LSN, if index page */ /* we must store parentlsn to detect whether a split occurred */ GISTSearchHeapItem heap; /* heap info, if heap tuple */ } data; /* numberOfOrderBys entries */ IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; } GISTSearchItem; #define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber) #define SizeOfGISTSearchItem(n_distances) \ (offsetof(GISTSearchItem, distances) + \ sizeof(IndexOrderByDistance) * (n_distances)) /* * GISTScanOpaqueData: private state for a scan of a GiST index */ typedef struct GISTScanOpaqueData { GISTSTATE *giststate; /* index information, see above */ Oid *orderByTypes; /* datatypes of ORDER BY expressions */ pairingheap *queue; /* queue of unvisited items */ MemoryContext queueCxt; /* context holding the queue */ bool qual_ok; /* false if qual can never be satisfied */ bool firstCall; /* true until first gistgettuple call */ /* pre-allocated workspace arrays */ IndexOrderByDistance *distances; /* output area for gistindex_keytest */ /* info about killed items if any (killedItems is NULL if never used) */ OffsetNumber *killedItems; /* offset numbers of killed items */ int numKilled; /* number of currently stored items */ BlockNumber curBlkno; /* current number of block */ GistNSN curPageLSN; /* pos in the WAL stream when page was read */ /* In a non-ordered search, returnable heap items are stored here: */ GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; OffsetNumber nPageData; /* number of valid items in array */ OffsetNumber curPageData; /* next item to return */ MemoryContext pageDataCxt; /* context holding the fetched tuples, for * index-only scans */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; /* despite the name, gistxlogPage is not part of any xlog record */ typedef struct gistxlogPage { BlockNumber blkno; int num; /* number of index tuples following */ } gistxlogPage; /* SplitedPageLayout - gistSplit function result */ typedef struct SplitedPageLayout { gistxlogPage block; IndexTupleData *list; int lenlist; IndexTuple itup; /* union key for page */ Page page; /* to operate */ Buffer buffer; /* to write after all proceed */ struct SplitedPageLayout *next; } SplitedPageLayout; /* * GISTInsertStack used for locking buffers and transfer arguments during * insertion */ typedef struct GISTInsertStack { /* current page */ BlockNumber blkno; Buffer buffer; Page page; /* * log sequence number from page->lsn to recognize page update and compare * it with page's nsn to recognize page split */ GistNSN lsn; /* * If set, we split the page while descending the tree to find an * insertion target. It means that we need to retry from the parent, * because the downlink of this page might no longer cover the new key. */ bool retry_from_parent; /* offset of the downlink in the parent page, that points to this page */ OffsetNumber downlinkoffnum; /* pointer to parent */ struct GISTInsertStack *parent; } GISTInsertStack; /* Working state and results for multi-column split logic in gistsplit.c */ typedef struct GistSplitVector { GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */ Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in * splitVector.spl_left */ bool spl_lisnull[INDEX_MAX_KEYS]; Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in * splitVector.spl_right */ bool spl_risnull[INDEX_MAX_KEYS]; bool *spl_dontcare; /* flags tuples which could go to either side * of the split for zero penalty */ } GistSplitVector; typedef struct { Relation r; Relation heapRel; Size freespace; /* free space to be left */ bool is_build; GISTInsertStack *stack; } GISTInsertState; /* root page of a gist index */ #define GIST_ROOT_BLKNO 0 /* * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on * inner pages to finish crash recovery of incomplete page splits. If a crash * happened in the middle of a page split, so that the downlink pointers were * not yet inserted, crash recovery inserted a special downlink pointer. The * semantics of an invalid tuple was that it if you encounter one in a scan, * it must always be followed, because we don't know if the tuples on the * child page match or not. * * We no longer create such invalid tuples, we now mark the left-half of such * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the * split properly the next time we need to insert on that page. To retain * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as * the offset number of all inner tuples. If we encounter any invalid tuples * with 0xfffe during insertion, we throw an error, though scans still handle * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1 * gist index which already has invalid tuples in it because of a crash. That * should be rare, and you are recommended to REINDEX anyway if you have any * invalid tuples in an index, so throwing an error is as far as we go with * supporting that. */ #define TUPLE_IS_VALID 0xffff #define TUPLE_IS_INVALID 0xfffe #define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID ) #define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID ) /* * A buffer attached to an internal node, used when building an index in * buffering mode. */ typedef struct { BlockNumber nodeBlocknum; /* index block # this buffer is for */ int32 blocksCount; /* current # of blocks occupied by buffer */ BlockNumber pageBlocknum; /* temporary file block # */ GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */ /* is this buffer queued for emptying? */ bool queuedForEmptying; /* is this a temporary copy, not in the hash table? */ bool isTemp; int level; /* 0 == leaf */ } GISTNodeBuffer; /* * Does specified level have buffers? (Beware of multiple evaluation of * arguments.) */ #define LEVEL_HAS_BUFFERS(nlevel, gfbb) \ ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \ (nlevel) != (gfbb)->rootlevel) /* Is specified buffer at least half-filled (should be queued for emptying)? */ #define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \ ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2) /* * Is specified buffer full? Our buffers can actually grow indefinitely, * beyond the "maximum" size, so this just means whether the buffer has grown * beyond the nominal maximum size. */ #define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \ ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer) /* * Data structure with general information about build buffers. */ typedef struct GISTBuildBuffers { /* Persistent memory context for the buffers and metadata. */ MemoryContext context; BufFile *pfile; /* Temporary file to store buffers in */ long nFileBlocks; /* Current size of the temporary file */ /* * resizable array of free blocks. */ long *freeBlocks; int nFreeBlocks; /* # of currently free blocks in the array */ int freeBlocksLen; /* current allocated length of the array */ /* Hash for buffers by block number */ HTAB *nodeBuffersTab; /* List of buffers scheduled for emptying */ List *bufferEmptyingQueue; /* * Parameters to the buffering build algorithm. levelStep determines which * levels in the tree have buffers, and pagesPerBuffer determines how * large each buffer is. */ int levelStep; int pagesPerBuffer; /* Array of lists of buffers on each level, for final emptying */ List **buffersOnLevels; int buffersOnLevelsLen; /* * Dynamically-sized array of buffers that currently have their last page * loaded in main memory. */ GISTNodeBuffer **loadedBuffers; int loadedBuffersCount; /* # of entries in loadedBuffers */ int loadedBuffersLen; /* allocated size of loadedBuffers */ /* Level of the current root node (= height of the index tree - 1) */ int rootlevel; } GISTBuildBuffers; /* * Storage type for GiST's reloptions */ typedef struct GiSTOptions { int32 vl_len_; /* varlena header (do not touch directly!) */ int fillfactor; /* page fill factor in percent (0..100) */ int bufferingModeOffset; /* use buffering build? */ } GiSTOptions; /* gist.c */ extern void gistbuildempty(Relation index); extern bool gistinsert(Relation r, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique, struct IndexInfo *indexInfo); extern MemoryContext createTempGistContext(void); extern GISTSTATE *initGISTstate(Relation index); extern void freeGISTstate(GISTSTATE *giststate); extern void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *GISTstate, Relation heapRel, bool is_build); /* A List of these is returned from gistplacetopage() in *splitinfo */ typedef struct { Buffer buf; /* the split page "half" */ IndexTuple downlink; /* downlink for this half. */ } GISTPageSplitInfo; extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markleftchild, Relation heapRel, bool is_build); extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate); /* gistxlog.c */ extern XLogRecPtr gistXLogPageDelete(Buffer buffer, FullTransactionId xid, Buffer parentBuffer, OffsetNumber downlinkOffset); extern void gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid); extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ntup, Buffer leftchild); extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete, TransactionId latestRemovedXid); extern XLogRecPtr gistXLogSplit(bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN oldnsn, Buffer leftchild, bool markfollowright); /* gistget.c */ extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern bool gistcanreturn(Relation index, int attno); /* gistvalidate.c */ extern bool gistvalidate(Oid opclassoid); /* gistutil.c */ #define GiSTPageSize \ ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) #define GIST_MIN_FILLFACTOR 10 #define GIST_DEFAULT_FILLFACTOR 90 extern bytea *gistoptions(Datum reloptions, bool validate); extern bool gistproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); extern bool gistfitpage(IndexTuple *itvec, int len); extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace); extern void gistcheckpage(Relation rel, Buffer buf); extern Buffer gistNewBuffer(Relation r); extern bool gistPageRecyclable(Page page); extern void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off); extern IndexTuple *gistextractpage(Page page, int *len /* out */ ); extern IndexTuple *gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen); extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen); extern IndexTuple gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate); extern IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate); extern IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum *attdata, bool *isnull, bool isleaf); extern OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate); extern void GISTInitBuffer(Buffer b, uint32 f); extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, Datum k, Relation r, Page pg, OffsetNumber o, bool l, bool isNull); extern float gistpenalty(GISTSTATE *giststate, int attno, GISTENTRY *key1, bool isNull1, GISTENTRY *key2, bool isNull2); extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, Datum *attr, bool *isnull); extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b); extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, OffsetNumber o, GISTENTRY *attdata, bool *isnull); extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple); extern void gistMakeUnionKey(GISTSTATE *giststate, int attno, GISTENTRY *entry1, bool isnull1, GISTENTRY *entry2, bool isnull2, Datum *dst, bool *dstisnull); extern XLogRecPtr gistGetFakeLSN(Relation rel); /* gistvacuum.c */ extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state); extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); /* gistsplit.c */ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate, GistSplitVector *v, int attno); /* gistbuild.c */ extern IndexBuildResult *gistbuild(Relation heap, Relation index, struct IndexInfo *indexInfo); extern void gistValidateBufferingOption(const char *value); /* gistbuildbuffers.c */ extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel); extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, BlockNumber blkno, int level); extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple item); extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, IndexTuple *item); extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb); extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, Relation r, int level, Buffer buffer, List *splitinfo); extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb); #endif /* GIST_PRIVATE_H */