greenplumn cluster 源码
greenplumn cluster 代码
文件路径:/src/backend/commands/cluster.c
/*-------------------------------------------------------------------------
*
* cluster.c
* CLUSTER a table on an index. This is now also used for VACUUM FULL.
*
* There is hardly anything left of Paul Brown's original implementation...
*
*
* Portions Copyright (c) 2006-2008, Greenplum inc
* Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates.
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/commands/cluster.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/amapi.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/reloptions.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_am.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/namespace.h"
#include "catalog/objectaccess.h"
#include "catalog/pg_appendonly.h"
#include "catalog/pg_attribute_encoding.h"
#include "catalog/pg_type.h"
#include "catalog/pg_namespace.h"
#include "catalog/pg_tablespace.h"
#include "catalog/toasting.h"
#include "commands/cluster.h"
#include "commands/progress.h"
#include "commands/tablecmds.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/acl.h"
#include "utils/faultinjector.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tuplesort.h"
#include "catalog/aocatalog.h"
#include "catalog/oid_dispatch.h"
#include "cdb/cdbvars.h"
#include "cdb/cdbdisp_query.h"
#include "cdb/cdboidsync.h"
#include "libpq/pqformat.h"
/*
* This struct is used to pass around the information on tables to be
* clustered. We need this so we can make a list of them when invoked without
* a specific table/index pair.
*/
typedef struct
{
Oid tableOid;
Oid indexOid;
} RelToCluster;
static void rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose);
static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
bool verbose, bool *pSwapToastByContent,
TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
static List *get_tables_to_cluster(MemoryContext cluster_context);
/*---------------------------------------------------------------------------
* This cluster code allows for clustering multiple tables at once. Because
* of this, we cannot just run everything on a single transaction, or we
* would be forced to acquire exclusive locks on all the tables being
* clustered, simultaneously --- very likely leading to deadlock.
*
* To solve this we follow a similar strategy to VACUUM code,
* clustering each relation in a separate transaction. For this to work,
* we need to:
* - provide a separate memory context so that we can pass information in
* a way that survives across transactions
* - start a new transaction every time a new relation is clustered
* - check for validity of the information on to-be-clustered relations,
* as someone might have deleted a relation behind our back, or
* clustered one on a different index
* - end the transaction
*
* The single-relation case does not have any such overhead.
*
* We also allow a relation to be specified without index. In that case,
* the indisclustered bit will be looked up, and an ERROR will be thrown
* if there is no index with the bit set.
*---------------------------------------------------------------------------
*/
void
cluster(ClusterStmt *stmt, bool isTopLevel)
{
if (stmt->relation != NULL)
{
/* This is the single-relation case. */
Oid tableOid,
indexOid = InvalidOid;
Relation rel;
/* Find, lock, and check permissions on the table */
tableOid = RangeVarGetRelidExtended(stmt->relation,
AccessExclusiveLock,
0,
RangeVarCallbackOwnsTable, NULL);
rel = table_open(tableOid, NoLock);
/*
* Reject clustering a remote temp table ... their local buffer
* manager is not going to cope.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster temporary tables of other sessions")));
/*
* Reject clustering a partitioned table.
*/
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster a partitioned table")));
if (stmt->indexname == NULL)
{
ListCell *index;
/* We need to find the index that has indisclustered set. */
foreach(index, RelationGetIndexList(rel))
{
HeapTuple idxtuple;
Form_pg_index indexForm;
indexOid = lfirst_oid(index);
idxtuple = SearchSysCache1(INDEXRELID,
ObjectIdGetDatum(indexOid));
if (!HeapTupleIsValid(idxtuple))
elog(ERROR, "cache lookup failed for index %u", indexOid);
indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
if (indexForm->indisclustered)
{
ReleaseSysCache(idxtuple);
break;
}
ReleaseSysCache(idxtuple);
indexOid = InvalidOid;
}
if (!OidIsValid(indexOid))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("there is no previously clustered index for table \"%s\"",
stmt->relation->relname)));
}
else
{
/*
* The index is expected to be in the same namespace as the
* relation.
*/
indexOid = get_relname_relid(stmt->indexname,
rel->rd_rel->relnamespace);
if (!OidIsValid(indexOid))
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("index \"%s\" for table \"%s\" does not exist",
stmt->indexname, stmt->relation->relname)));
}
/* close relation, keep lock till commit */
table_close(rel, NoLock);
/* Do the job. */
cluster_rel(tableOid, indexOid, stmt->options, true /* printError */);
if (Gp_role == GP_ROLE_DISPATCH)
{
CdbDispatchUtilityStatement((Node *) stmt,
DF_CANCEL_ON_ERROR|
DF_WITH_SNAPSHOT|
DF_NEED_TWO_PHASE,
GetAssignedOidsForDispatch(),
NULL);
}
}
else
{
/*
* This is the "multi relation" case. We need to cluster all tables
* that have some index with indisclustered set.
*/
MemoryContext cluster_context;
List *rvs;
ListCell *rv;
/*
* We cannot run this form of CLUSTER inside a user transaction block;
* we'd be holding locks way too long.
*/
PreventInTransactionBlock(isTopLevel, "CLUSTER");
/*
* Create special memory context for cross-transaction storage.
*
* Since it is a child of PortalContext, it will go away even in case
* of error.
*/
cluster_context = AllocSetContextCreate(PortalContext,
"Cluster",
ALLOCSET_DEFAULT_SIZES);
/*
* Build the list of relations to cluster. Note that this lives in
* cluster_context.
*/
rvs = get_tables_to_cluster(cluster_context);
/* Commit to get out of starting transaction */
PopActiveSnapshot();
CommitTransactionCommand();
/* Ok, now that we've got them all, cluster them one by one */
foreach(rv, rvs)
{
RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
bool dispatch;
/* Start a new transaction for each relation. */
StartTransactionCommand();
/* functions in indexes may want a snapshot set */
PushActiveSnapshot(GetTransactionSnapshot());
/* Do the job. */
dispatch = cluster_rel(rvtc->tableOid, rvtc->indexOid,
stmt->options | CLUOPT_RECHECK,
false /* printError */);
if (Gp_role == GP_ROLE_DISPATCH && dispatch)
{
stmt->relation = makeNode(RangeVar);
stmt->relation->schemaname = get_namespace_name(get_rel_namespace(rvtc->tableOid));
stmt->relation->relname = get_rel_name(rvtc->tableOid);
CdbDispatchUtilityStatement((Node *) stmt,
DF_CANCEL_ON_ERROR|
DF_WITH_SNAPSHOT,
GetAssignedOidsForDispatch(),
NULL);
}
PopActiveSnapshot();
CommitTransactionCommand();
}
/* Start a new transaction for the cleanup work. */
StartTransactionCommand();
/* Clean up working storage */
MemoryContextDelete(cluster_context);
}
}
/*
* cluster_rel
*
* This clusters the table by creating a new, clustered table and
* swapping the relfilenodes of the new table and the old table, so
* the OID of the original table is preserved. Thus we do not lose
* GRANT, inheritance nor references to this table (this was a bug
* in releases through 7.3).
*
* Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
* the new table, it's better to create the indexes afterwards than to fill
* them incrementally while we load the table.
*
* If indexOid is InvalidOid, the table will be rewritten in physical order
* instead of index order. This is the new implementation of VACUUM FULL,
* and error messages should refer to the operation as VACUUM not CLUSTER.
*
* Note that we don't support clustering on an AO table. If printError is true,
* this function errors out when the relation is an AO table. Otherwise, this
* functions prints out a warning message when the relation is an AO table.
*/
bool
cluster_rel(Oid tableOid, Oid indexOid, int options, bool printError)
{
Relation OldHeap;
bool verbose = ((options & CLUOPT_VERBOSE) != 0);
bool recheck = ((options & CLUOPT_RECHECK) != 0);
/* Check for user-requested abort. */
CHECK_FOR_INTERRUPTS();
pgstat_progress_start_command(PROGRESS_COMMAND_CLUSTER, tableOid);
if (OidIsValid(indexOid))
pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
PROGRESS_CLUSTER_COMMAND_CLUSTER);
else
pgstat_progress_update_param(PROGRESS_CLUSTER_COMMAND,
PROGRESS_CLUSTER_COMMAND_VACUUM_FULL);
/*
* We grab exclusive access to the target rel and index for the duration
* of the transaction. (This is redundant for the single-transaction
* case, since cluster() already did it.) The index lock is taken inside
* check_index_is_clusterable.
*/
OldHeap = try_relation_open(tableOid, AccessExclusiveLock, false);
/* If the table has gone away, we can skip processing it */
if (!OldHeap)
{
pgstat_progress_end_command();
return false;
}
/*
* Since we may open a new transaction for each relation, we have to check
* that the relation still is what we think it is.
*
* If this is a single-transaction CLUSTER, we can skip these tests. We
* *must* skip the one on indisclustered since it would reject an attempt
* to cluster a not-previously-clustered index.
*/
if (recheck)
{
HeapTuple tuple;
Form_pg_index indexForm;
/* Check that the user still owns the relation */
if (!pg_class_ownercheck(tableOid, GetUserId()))
{
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
/*
* Silently skip a temp table for a remote session. Only doing this
* check in the "recheck" case is appropriate (which currently means
* somebody is executing a database-wide CLUSTER), because there is
* another check in cluster() which will stop any attempt to cluster
* remote temp tables by name. There is another check in cluster_rel
* which is redundant, but we leave it for extra safety.
*/
if (RELATION_IS_OTHER_TEMP(OldHeap))
{
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
if (OidIsValid(indexOid))
{
/*
* Check that the index still exists
*/
if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
{
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
/*
* Check that the index is still the one with indisclustered set.
*/
tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
if (!HeapTupleIsValid(tuple)) /* probably can't happen */
{
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
indexForm = (Form_pg_index) GETSTRUCT(tuple);
if (!indexForm->indisclustered)
{
ReleaseSysCache(tuple);
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
ReleaseSysCache(tuple);
}
}
/*
* We allow VACUUM FULL, but not CLUSTER, on shared catalogs. CLUSTER
* would work in most respects, but the index would only get marked as
* indisclustered in the current database, leading to unexpected behavior
* if CLUSTER were later invoked in another database.
*/
if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster a shared catalog")));
/*
* Don't process temp tables of other backends ... their local buffer
* manager is not going to cope.
*/
if (RELATION_IS_OTHER_TEMP(OldHeap))
{
if (OidIsValid(indexOid))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster temporary tables of other sessions")));
else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot vacuum temporary tables of other sessions")));
}
/*
* Also check for active uses of the relation in the current transaction,
* including open scans and pending AFTER trigger events.
*/
CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
/* Check heap and index are valid to cluster on */
if (OidIsValid(indexOid))
check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
/*
* Quietly ignore the request if this is a materialized view which has not
* been populated from its query. No harm is done because there is no data
* to deal with, and we don't want to throw an error if this is part of a
* multi-relation request -- for example, CLUSTER was run on the entire
* database.
*/
if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
!RelationIsPopulated(OldHeap))
{
relation_close(OldHeap, AccessExclusiveLock);
pgstat_progress_end_command();
return false;
}
/*
* All predicate locks on the tuples or pages are about to be made
* invalid, because we move tuples around. Promote them to relation
* locks. Predicate locks on indexes will be promoted when they are
* reindexed.
*/
TransferPredicateLocksToHeapRelation(OldHeap);
/* rebuild_relation does all the dirty work */
rebuild_relation(OldHeap, indexOid, verbose);
/* NB: rebuild_relation does table_close() on OldHeap */
pgstat_progress_end_command();
return true;
}
/*
* Verify that the specified heap and index are valid to cluster on
*
* Side effect: obtains lock on the index. The caller may
* in some cases already have AccessExclusiveLock on the table, but
* not in all cases so we can't rely on the table-level lock for
* protection here.
*/
void
check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
{
Relation OldIndex;
OldIndex = index_open(indexOid, lockmode);
/*
* Check that index is in fact an index on the given relation
*/
if (OldIndex->rd_index == NULL ||
OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("\"%s\" is not an index for table \"%s\"",
RelationGetRelationName(OldIndex),
RelationGetRelationName(OldHeap))));
/* Index AM must allow clustering */
if (!OldIndex->rd_indam->amclusterable)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
RelationGetRelationName(OldIndex))));
/*
* Disallow clustering on incomplete indexes (those that might not index
* every row of the relation). We could relax this by making a separate
* seqscan pass over the table to copy the missing rows, but that seems
* expensive and tedious.
*/
if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred, NULL))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster on partial index \"%s\"",
RelationGetRelationName(OldIndex))));
/*
* Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
* it might well not contain entries for every heap row, or might not even
* be internally consistent. (But note that we don't check indcheckxmin;
* the worst consequence of following broken HOT chains would be that we
* might put recently-dead tuples out-of-order in the new table, and there
* is little harm in that.)
*/
if (!OldIndex->rd_index->indisvalid)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster on invalid index \"%s\"",
RelationGetRelationName(OldIndex))));
/* Drop relcache refcnt on OldIndex, but keep lock */
index_close(OldIndex, NoLock);
}
/*
* mark_index_clustered: mark the specified index as the one clustered on
*
* With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
*/
void
mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
{
HeapTuple indexTuple;
Form_pg_index indexForm;
Relation pg_index;
ListCell *index;
/* Disallow applying to a partitioned table */
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot mark index clustered in partitioned table")));
/*
* If the index is already marked clustered, no need to do anything.
*/
if (OidIsValid(indexOid))
{
indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
if (!HeapTupleIsValid(indexTuple))
elog(ERROR, "cache lookup failed for index %u", indexOid);
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
if (indexForm->indisclustered)
{
ReleaseSysCache(indexTuple);
return;
}
ReleaseSysCache(indexTuple);
}
/*
* Check each index of the relation and set/clear the bit as needed.
*/
pg_index = table_open(IndexRelationId, RowExclusiveLock);
foreach(index, RelationGetIndexList(rel))
{
Oid thisIndexOid = lfirst_oid(index);
indexTuple = SearchSysCacheCopy1(INDEXRELID,
ObjectIdGetDatum(thisIndexOid));
if (!HeapTupleIsValid(indexTuple))
elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
/*
* Unset the bit if set. We know it's wrong because we checked this
* earlier.
*/
if (indexForm->indisclustered)
{
indexForm->indisclustered = false;
CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
}
else if (thisIndexOid == indexOid)
{
/* this was checked earlier, but let's be real sure */
if (!indexForm->indisvalid)
elog(ERROR, "cannot cluster on invalid index %u", indexOid);
indexForm->indisclustered = true;
CatalogTupleUpdate(pg_index, &indexTuple->t_self, indexTuple);
}
InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
InvalidOid, is_internal);
heap_freetuple(indexTuple);
}
table_close(pg_index, RowExclusiveLock);
}
/*
* rebuild_relation: rebuild an existing relation in index or physical order
*
* OldHeap: table to rebuild --- must be opened and exclusive-locked!
* indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
*
* NB: this routine closes OldHeap at the right time; caller should not.
*/
static void
rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
{
Oid tableOid = RelationGetRelid(OldHeap);
Oid accessMethod = OldHeap->rd_rel->relam;
Oid tableSpace = OldHeap->rd_rel->reltablespace;
Oid OIDNewHeap;
char relpersistence;
bool is_system_catalog;
bool swap_toast_by_content;
TransactionId frozenXid;
MultiXactId cutoffMulti;
/*
* GPDB_12_MERGE_FIXME: We use specific bool in abstract code. This should
* be somehow hidden by table am api or necessity of this switch should be
* revisited.
*/
bool is_ao = RelationIsAppendOptimized(OldHeap);
/* Mark the correct index as clustered */
if (OidIsValid(indexOid))
mark_index_clustered(OldHeap, indexOid, true);
/* Remember info about rel before closing OldHeap */
relpersistence = OldHeap->rd_rel->relpersistence;
is_system_catalog = IsSystemRelation(OldHeap);
/* Close relcache entry, but keep lock until transaction commit */
table_close(OldHeap, NoLock);
/* Create the transient table that will receive the re-ordered data */
OIDNewHeap = make_new_heap(tableOid, tableSpace,
accessMethod,
relpersistence,
AccessExclusiveLock,
true /* createAoBlockDirectory */,
false);
/* Copy the heap data into the new table in the desired order */
copy_table_data(OIDNewHeap, tableOid, indexOid, verbose,
&swap_toast_by_content, &frozenXid, &cutoffMulti);
/*
* Swap the physical files of the target and transient tables, then
* rebuild the target's indexes and throw away the transient table.
*/
finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
swap_toast_by_content,
!is_ao /* swap_stats */,
false, true,
frozenXid, cutoffMulti,
relpersistence);
}
/*
* Create the transient table that will be filled with new data during
* CLUSTER, ALTER TABLE, and similar operations. The transient table
* duplicates the logical structure of the OldHeap; but will have the
* specified physical storage properties NewTableSpace, NewAccessMethod, and
* relpersistence.
*
* After this, the caller should load the new heap with transferred/modified
* data, then call finish_heap_swap to complete the operation.
*/
Oid
make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, Oid NewAccessMethod,
char relpersistence,
LOCKMODE lockmode,
bool createAoBlockDirectory,
bool makeCdbPolicy)
{
TupleDesc OldHeapDesc;
char NewHeapName[NAMEDATALEN];
Oid OIDNewHeap;
Oid toastid;
Relation OldHeap;
HeapTuple tuple;
Datum reloptions;
bool isNull;
Oid namespaceid;
OldHeap = table_open(OIDOldHeap, lockmode);
OldHeapDesc = RelationGetDescr(OldHeap);
/*
* Note that the NewHeap will not receive any of the defaults or
* constraints associated with the OldHeap; we don't need 'em, and there's
* no reason to spend cycles inserting them into the catalogs only to
* delete them.
*/
/*
* But we do want to use reloptions of the old heap for new heap.
*/
tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
&isNull);
if (isNull)
reloptions = (Datum) 0;
/*
* Unless we are changing access method between heap and AO/CO, look further.
*/
/*
* GPDB: some considerations when AM is going to change between heap and AO/CO:
*
* If user has also requested setting new reloptions, the new reloptions should have
* replaced the old ones at this point. We just need to reuse those on the new table.
*
* If user does NOT request new reloptions, we should discard the existing reloptions.
* And one more consideration if we are changing the table from heap to AO: we should
* also pick up options from gp_default_storage_options, just like CREATE TABLE does.
*/
if (RelationIsHeap(OldHeap) && IsAccessMethodAO(NewAccessMethod))
{
/*
* Heap to AO/CO: filter out any reloptions that belong to heap,
* and pick up from gp_default_storage_options.
*/
int numoptions;
relopt_value *options;
/*
* Process the reloptions as for AO tables. And validate=false will silently
* filter out any reloptions that belong to heap.
*/
StdRdOptions *stdRdOptions = (StdRdOptions *)default_reloptions(reloptions,
false, /* validate */
RELOPT_KIND_APPENDOPTIMIZED);
/* Pick up from gp_default_storage_options. */
options = parseRelOptions(reloptions, false, RELOPT_KIND_APPENDOPTIMIZED, &numoptions);
validate_and_refill_options(stdRdOptions, options, numoptions, RELOPT_KIND_APPENDOPTIMIZED, true);
/* Update the reloptions string. */
reloptions = transformAOStdRdOptions(stdRdOptions, reloptions);
}
else if (RelationIsAppendOptimized(OldHeap) && NewAccessMethod == HEAP_TABLE_AM_OID)
{
/*
* AO/CO to Heap: unfortunately we don't have a convenient routine to transform
* heap StdRdOptions back to reloption string. So we take a slightly different
* approach than the case of heap to AO/CO: we check if there is any AO reloptions:
*
* (1) If there is, just discard them (AO options do not apply to heap).
* (2) If there is none, that means we either have replaced it with heap reloptions
* or the reloptions field is just empty, and either way we will pass the existing
* reloptions on to the new table.
*
* This is possible because at this point we only have either AO/AOCO reloptions or
* heap reloptions, but we cannot have both (see ATExecSetRelOptions).
*/
Datum aoreloptions = (Datum) 0;
StdRdOptions *stdRdOptions = (StdRdOptions *)default_reloptions(reloptions,
false, /* validate */
RELOPT_KIND_APPENDOPTIMIZED);
/*
* Transform the stdRdOptions to get a reloptions string, from which we will
* know if there is any AO reloptions.
*/
aoreloptions = transformAOStdRdOptions(stdRdOptions, aoreloptions);
if (aoreloptions != (Datum) 0)
reloptions = (Datum) 0;
}
if (relpersistence == RELPERSISTENCE_TEMP)
namespaceid = LookupCreationNamespace("pg_temp");
else
namespaceid = RelationGetNamespace(OldHeap);
/*
* Create the new heap, using a temporary name in the same namespace as
* the existing table. NOTE: there is some risk of collision with user
* relnames. Working around this seems more trouble than it's worth; in
* particular, we can't create the new heap in a different namespace from
* the old, or we will have problems with the TEMP status of temp tables.
*
* Note: the new heap is not a shared relation, even if we are rebuilding
* a shared rel. However, we do make the new heap mapped if the source is
* mapped. This simplifies swap_relation_files, and is absolutely
* necessary for rebuilding pg_class, for reasons explained there.
*/
snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
OIDNewHeap = heap_create_with_catalog(NewHeapName,
namespaceid,
NewTableSpace,
InvalidOid,
InvalidOid,
InvalidOid,
OldHeap->rd_rel->relowner,
NewAccessMethod,
OldHeapDesc,
NIL,
RELKIND_RELATION,
relpersistence,
false,
RelationIsMapped(OldHeap),
ONCOMMIT_NOOP,
makeCdbPolicy? OldHeap->rd_cdbpolicy: NULL,/*CDB*/
reloptions,
false,
true,
true,
OIDOldHeap,
NULL,
true);
Assert(OIDNewHeap != InvalidOid);
ReleaseSysCache(tuple);
/*
* Advance command counter so that the newly-created relation's catalog
* tuples will be visible to table_open.
*/
CommandCounterIncrement();
/*
* If necessary, create a TOAST table for the new relation, or an Append
* Only segment table.
*
* If the relation doesn't have a TOAST table already, we can't need one
* for the new relation. The other way around is possible though: if some
* wide columns have been dropped, NewHeapCreateToastTable can decide that
* no TOAST table is needed for the new table.
*
* Note that NewHeapCreateToastTable ends with CommandCounterIncrement, so
* that the TOAST table will be visible for insertion.
*/
toastid = OldHeap->rd_rel->reltoastrelid;
if (OidIsValid(toastid))
{
/* keep the existing toast table's reloptions, if any */
tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for relation %u", toastid);
reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
&isNull);
if (isNull)
reloptions = (Datum) 0;
NewHeapCreateToastTable(OIDNewHeap, reloptions, lockmode);
ReleaseSysCache(tuple);
}
if (RelationIsAppendOptimized(OldHeap) || NewAccessMethod == AO_ROW_TABLE_AM_OID)
NewRelationCreateAOAuxTables(OIDNewHeap, createAoBlockDirectory);
CacheInvalidateRelcacheByRelid(OIDNewHeap);
/*
* Copy the pg_attribute_encoding entries over if new table needs them.
* Note that in the case of AM change from heap/ao to aoco, we still need
* to do this since we created those entries for the heap/ao table at the
* phase 2 of ATSETAM (see ATExecCmd).
*/
if (NewAccessMethod == AO_COLUMN_TABLE_AM_OID)
cloneAttributeEncoding(OIDOldHeap,
OIDNewHeap,
RelationGetNumberOfAttributes(OldHeap));
table_close(OldHeap, NoLock);
return OIDNewHeap;
}
/*
* Do the physical copying of table data.
*
* There are three output parameters:
* *pSwapToastByContent is set true if toast tables must be swapped by content.
* *pFreezeXid receives the TransactionId used as freeze cutoff point.
* *pCutoffMulti receives the MultiXactId used as a cutoff point.
*/
static void
copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
bool *pSwapToastByContent, TransactionId *pFreezeXid,
MultiXactId *pCutoffMulti)
{
Relation NewHeap,
OldHeap,
OldIndex;
Relation relRelation;
HeapTuple reltup;
Form_pg_class relform;
TupleDesc oldTupDesc PG_USED_FOR_ASSERTS_ONLY;
TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY;
TransactionId OldestXmin;
TransactionId FreezeXid;
MultiXactId MultiXactCutoff;
bool use_sort;
double num_tuples = 0,
tups_vacuumed = 0,
tups_recently_dead = 0;
BlockNumber num_pages;
int elevel = verbose ? INFO : DEBUG2;
PGRUsage ru0;
pg_rusage_init(&ru0);
/*
* Open the relations we need.
*/
NewHeap = table_open(OIDNewHeap, AccessExclusiveLock);
OldHeap = table_open(OIDOldHeap, AccessExclusiveLock);
if (OidIsValid(OIDOldIndex))
OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
else
OldIndex = NULL;
/*
* Their tuple descriptors should be exactly alike, but here we only need
* assume that they have the same number of columns.
*/
oldTupDesc = RelationGetDescr(OldHeap);
newTupDesc = RelationGetDescr(NewHeap);
Assert(newTupDesc->natts == oldTupDesc->natts);
/*
* If the OldHeap has a toast table, get lock on the toast table to keep
* it from being vacuumed. This is needed because autovacuum processes
* toast tables independently of their main tables, with no lock on the
* latter. If an autovacuum were to start on the toast table after we
* compute our OldestXmin below, it would use a later OldestXmin, and then
* possibly remove as DEAD toast tuples belonging to main tuples we think
* are only RECENTLY_DEAD. Then we'd fail while trying to copy those
* tuples.
*
* We don't need to open the toast relation here, just lock it. The lock
* will be held till end of transaction.
*/
if (OldHeap->rd_rel->reltoastrelid)
LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
/*
* If both tables have TOAST tables, perform toast swap by content. It is
* possible that the old table has a toast table but the new one doesn't,
* if toastable columns have been dropped. In that case we have to do
* swap by links. This is okay because swap by content is only essential
* for system catalogs, and we don't support schema changes for them.
*/
if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
{
*pSwapToastByContent = true;
/*
* When doing swap by content, any toast pointers written into NewHeap
* must use the old toast table's OID, because that's where the toast
* data will eventually be found. Set this up by setting rd_toastoid.
* This also tells toast_save_datum() to preserve the toast value
* OIDs, which we want so as not to invalidate toast pointers in
* system catalog caches, and to avoid making multiple copies of a
* single toast value.
*
* Note that we must hold NewHeap open until we are done writing data,
* since the relcache will not guarantee to remember this setting once
* the relation is closed. Also, this technique depends on the fact
* that no one will try to read from the NewHeap until after we've
* finished writing it and swapping the rels --- otherwise they could
* follow the toast pointers to the wrong place. (It would actually
* work for values copied over from the old toast table, but not for
* any values that we toast which were previously not toasted.)
*/
NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
}
else
*pSwapToastByContent = false;
/*
* Compute xids used to freeze and weed out dead tuples and multixacts.
* Since we're going to rewrite the whole table anyway, there's no reason
* not to be aggressive about this.
*/
vacuum_set_xid_limits(OldHeap, 0, 0, 0, 0,
&OldestXmin, &FreezeXid, NULL, &MultiXactCutoff,
NULL);
/*
* FreezeXid will become the table's new relfrozenxid, and that mustn't go
* backwards, so take the max.
*/
if (TransactionIdIsValid(OldHeap->rd_rel->relfrozenxid) &&
TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
FreezeXid = OldHeap->rd_rel->relfrozenxid;
/*
* MultiXactCutoff, similarly, shouldn't go backwards either.
*/
if (MultiXactIdIsValid(OldHeap->rd_rel->relminmxid) &&
MultiXactIdPrecedes(MultiXactCutoff, OldHeap->rd_rel->relminmxid))
MultiXactCutoff = OldHeap->rd_rel->relminmxid;
/*
* Decide whether to use an indexscan or seqscan-and-optional-sort to scan
* the OldHeap. We know how to use a sort to duplicate the ordering of a
* btree index, and will use seqscan-and-sort for that case if the planner
* tells us it's cheaper. Otherwise, always indexscan if an index is
* provided, else plain seqscan.
*/
if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
else
use_sort = false;
/* Log what we're doing */
if (OldIndex != NULL && !use_sort)
ereport(elevel,
(errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
get_namespace_name(RelationGetNamespace(OldHeap)),
RelationGetRelationName(OldHeap),
RelationGetRelationName(OldIndex))));
else if (use_sort)
ereport(elevel,
(errmsg("clustering \"%s.%s\" using sequential scan and sort",
get_namespace_name(RelationGetNamespace(OldHeap)),
RelationGetRelationName(OldHeap))));
else
ereport(elevel,
(errmsg("vacuuming \"%s.%s\"",
get_namespace_name(RelationGetNamespace(OldHeap)),
RelationGetRelationName(OldHeap))));
/*
* Hand of the actual copying to AM specific function, the generic code
* cannot know how to deal with visibility across AMs. Note that this
* routine is allowed to set FreezeXid / MultiXactCutoff to different
* values (e.g. because the AM doesn't use freezing).
*/
table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
OldestXmin, &FreezeXid, &MultiXactCutoff,
&num_tuples, &tups_vacuumed,
&tups_recently_dead);
/* return selected values to caller, get set as relfrozenxid/minmxid */
*pFreezeXid = FreezeXid;
*pCutoffMulti = MultiXactCutoff;
/* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
NewHeap->rd_toastoid = InvalidOid;
num_pages = RelationGetNumberOfBlocks(NewHeap);
/* Log what we did */
ereport(elevel,
(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
RelationGetRelationName(OldHeap),
tups_vacuumed, num_tuples,
RelationGetNumberOfBlocks(OldHeap)),
errdetail("%.0f dead row versions cannot be removed yet.\n"
"%s.",
tups_recently_dead,
pg_rusage_show(&ru0))));
if (OldIndex != NULL)
index_close(OldIndex, NoLock);
table_close(OldHeap, NoLock);
table_close(NewHeap, NoLock);
/* Update pg_class to reflect the correct values of pages and tuples. */
relRelation = table_open(RelationRelationId, RowExclusiveLock);
reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDNewHeap));
if (!HeapTupleIsValid(reltup))
elog(ERROR, "cache lookup failed for relation %u", OIDNewHeap);
relform = (Form_pg_class) GETSTRUCT(reltup);
relform->relpages = num_pages;
relform->reltuples = num_tuples;
/* Don't update the stats for pg_class. See swap_relation_files. */
if (OIDOldHeap != RelationRelationId)
CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
else
CacheInvalidateRelcacheByTuple(reltup);
/* Clean up. */
heap_freetuple(reltup);
table_close(relRelation, RowExclusiveLock);
/* Make the update visible */
CommandCounterIncrement();
}
/*
* Change dependency links for objects that are being swapped.
*
* 'tabletype' can be "TOAST table", "aoseg", "aoblkdir".
* It is used for printing error messages.
*/
static void
changeDependencyLinks(Oid baseOid1, Oid baseOid2, Oid oid1, Oid oid2,
const char *tabletype)
{
ObjectAddress baseobject, newobject;
long count;
/* Delete old dependencies */
if (oid1)
{
count = deleteDependencyRecordsFor(RelationRelationId, oid1, false);
if (count != 1)
elog(ERROR, "expected one dependency record for %s table, found %ld",
tabletype, count);
}
if (oid2)
{
count = deleteDependencyRecordsFor(RelationRelationId, oid2, false);
if (count != 1)
elog(ERROR, "expected one dependency record for %s table, found %ld",
tabletype, count);
}
/* Register new dependencies */
baseobject.classId = RelationRelationId;
baseobject.objectSubId = 0;
newobject.classId = RelationRelationId;
newobject.objectSubId = 0;
if (oid1)
{
baseobject.objectId = baseOid1;
newobject.objectId = oid1;
recordDependencyOn(&newobject, &baseobject, DEPENDENCY_INTERNAL);
}
if (oid2)
{
baseobject.objectId = baseOid2;
newobject.objectId = oid2;
recordDependencyOn(&newobject, &baseobject, DEPENDENCY_INTERNAL);
}
}
/*
* Swap the physical files of two given relations.
*
* We swap the physical identity (reltablespace, relfilenode) while keeping the
* same logical identities of the two relations. relpersistence is also
* swapped, which is critical since it determines where buffers live for each
* relation.
*
* We can swap associated TOAST data in either of two ways: recursively swap
* the physical content of the toast tables (and their indexes), or swap the
* TOAST links in the given relations' pg_class entries. The former is needed
* to manage rewrites of shared catalogs (where we cannot change the pg_class
* links) while the latter is the only way to handle cases in which a toast
* table is added or removed altogether.
*
* Additionally, the first relation is marked with relfrozenxid set to
* frozenXid. It seems a bit ugly to have this here, but the caller would
* have to do it anyway, so having it here saves a heap_update. Note: in
* the swap-toast-links case, we assume we don't need to change the toast
* table's relfrozenxid: the new version of the toast table should already
* have relfrozenxid set to RecentXmin, which is good enough.
*
* Lastly, if r2 and its toast table and toast index (if any) are mapped,
* their OIDs are emitted into mapped_tables[]. This is hacky but beats
* having to look the information up again later in finish_heap_swap.
*
* GPDB: also swap aoseg, aoblkdir links.
*/
void
swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
bool swap_toast_by_content,
bool swap_stats,
bool is_internal,
TransactionId frozenXid,
MultiXactId cutoffMulti,
Oid *mapped_tables)
{
Relation relRelation,
rel;
HeapTuple reltup1,
reltup2;
Form_pg_class relform1,
relform2;
Oid relfilenode1,
relfilenode2;
Oid swaptemp;
char swptmpchr;
/* We need writable copies of both pg_class tuples. */
relRelation = table_open(RelationRelationId, RowExclusiveLock);
reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
if (!HeapTupleIsValid(reltup1))
elog(ERROR, "cache lookup failed for relation %u", r1);
relform1 = (Form_pg_class) GETSTRUCT(reltup1);
reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
if (!HeapTupleIsValid(reltup2))
elog(ERROR, "cache lookup failed for relation %u", r2);
relform2 = (Form_pg_class) GETSTRUCT(reltup2);
if (relform1->relam == AO_ROW_TABLE_AM_OID || relform1->relam == AO_COLUMN_TABLE_AM_OID ||
relform2->relam == AO_ROW_TABLE_AM_OID || relform2->relam == AO_COLUMN_TABLE_AM_OID)
ATAOEntries(relform1, relform2);
/* Also swap reloptions if we are swaping between heap and AO/AOCO tables. */
if ((relform1->relam == HEAP_TABLE_AM_OID && IsAccessMethodAO(relform2->relam)) ||
(relform2->relam == HEAP_TABLE_AM_OID && IsAccessMethodAO(relform1->relam)))
{
Datum val[Natts_pg_class] = {0};
bool null[Natts_pg_class] = {0};
bool repl[Natts_pg_class] = {0};
bool isNull;
val[Anum_pg_class_reloptions - 1] = SysCacheGetAttr(RELOID, reltup2, Anum_pg_class_reloptions, &isNull);
null[Anum_pg_class_reloptions - 1] = isNull;
repl[Anum_pg_class_reloptions - 1] = true;
reltup1 = heap_modify_tuple(reltup1, RelationGetDescr(relRelation),
val, null, repl);
relform1 = (Form_pg_class) GETSTRUCT(reltup1);
}
relfilenode1 = relform1->relfilenode;
relfilenode2 = relform2->relfilenode;
if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
{
/*
* Normal non-mapped relations: swap relfilenodes, reltablespaces,
* relpersistence
*/
Assert(!target_is_pg_class);
swaptemp = relform1->relfilenode;
relform1->relfilenode = relform2->relfilenode;
relform2->relfilenode = swaptemp;
swaptemp = relform1->reltablespace;
relform1->reltablespace = relform2->reltablespace;
relform2->reltablespace = swaptemp;
swaptemp = relform1->relam;
relform1->relam = relform2->relam;
relform2->relam = swaptemp;
swptmpchr = relform1->relpersistence;
relform1->relpersistence = relform2->relpersistence;
relform2->relpersistence = swptmpchr;
/* Also swap toast links, if we're swapping by links */
if (!swap_toast_by_content)
{
swaptemp = relform1->reltoastrelid;
relform1->reltoastrelid = relform2->reltoastrelid;
relform2->reltoastrelid = swaptemp;
}
}
else
{
/*
* Mapped-relation case. Here we have to swap the relation mappings
* instead of modifying the pg_class columns. Both must be mapped.
*/
if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
NameStr(relform1->relname));
/*
* We can't change the tablespace nor persistence of a mapped rel, and
* we can't handle toast link swapping for one either, because we must
* not apply any critical changes to its pg_class row. These cases
* should be prevented by upstream permissions tests, so these checks
* are non-user-facing emergency backstop.
*/
if (relform1->reltablespace != relform2->reltablespace)
elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
NameStr(relform1->relname));
if (relform1->relpersistence != relform2->relpersistence)
elog(ERROR, "cannot change persistence of mapped relation \"%s\"",
NameStr(relform1->relname));
if (relform1->relam != relform2->relam)
elog(ERROR, "cannot change access method of mapped relation \"%s\"",
NameStr(relform1->relname));
if (!swap_toast_by_content &&
(relform1->reltoastrelid || relform2->reltoastrelid))
elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
NameStr(relform1->relname));
/*
* Fetch the mappings --- shouldn't fail, but be paranoid
*/
relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
if (!OidIsValid(relfilenode1))
elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
NameStr(relform1->relname), r1);
relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
if (!OidIsValid(relfilenode2))
elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
NameStr(relform2->relname), r2);
/*
* Send replacement mappings to relmapper. Note these won't actually
* take effect until CommandCounterIncrement.
*/
RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
/* Pass OIDs of mapped r2 tables back to caller */
*mapped_tables++ = r2;
}
/*
* In the case of a shared catalog, these next few steps will only affect
* our own database's pg_class row; but that's okay, because they are all
* noncritical updates. That's also an important fact for the case of a
* mapped catalog, because it's possible that we'll commit the map change
* and then fail to commit the pg_class update.
*/
/* set rel1's frozen Xid and minimum MultiXid */
if (relform1->relkind != RELKIND_INDEX)
{
Assert(!TransactionIdIsValid(frozenXid) ||
TransactionIdIsNormal(frozenXid));
relform1->relfrozenxid = frozenXid;
Assert(MultiXactIdIsValid(cutoffMulti));
relform1->relminmxid = cutoffMulti;
}
/*
* Greenplum: append-optimized tables do not have a valid relfrozenxid.
* Overwrite the entry for both relations.
*/
if (relform1->relkind != RELKIND_INDEX && IsAccessMethodAO(relform1->relam))
relform1->relfrozenxid = InvalidTransactionId;
if (relform2->relkind != RELKIND_INDEX && IsAccessMethodAO(relform2->relam))
relform2->relfrozenxid = InvalidTransactionId;
/* swap size statistics too, since new rel has freshly-updated stats */
if (swap_stats)
{
int32 swap_pages;
float4 swap_tuples;
int32 swap_allvisible;
swap_pages = relform1->relpages;
relform1->relpages = relform2->relpages;
relform2->relpages = swap_pages;
swap_tuples = relform1->reltuples;
relform1->reltuples = relform2->reltuples;
relform2->reltuples = swap_tuples;
swap_allvisible = relform1->relallvisible;
relform1->relallvisible = relform2->relallvisible;
relform2->relallvisible = swap_allvisible;
}
/*
* Update the tuples in pg_class --- unless the target relation of the
* swap is pg_class itself. In that case, there is zero point in making
* changes because we'd be updating the old data that we're about to throw
* away. Because the real work being done here for a mapped relation is
* just to change the relation map settings, it's all right to not update
* the pg_class rows in this case. The most important changes will instead
* performed later, in finish_heap_swap() itself.
*/
if (!target_is_pg_class)
{
CatalogIndexState indstate;
indstate = CatalogOpenIndexes(relRelation);
CatalogTupleUpdateWithInfo(relRelation, &reltup1->t_self, reltup1,
indstate);
CatalogTupleUpdateWithInfo(relRelation, &reltup2->t_self, reltup2,
indstate);
CatalogCloseIndexes(indstate);
}
else
{
/* no update ... but we do still need relcache inval */
CacheInvalidateRelcacheByTuple(reltup1);
CacheInvalidateRelcacheByTuple(reltup2);
}
/*
* Post alter hook for modified relations. The change to r2 is always
* internal, but r1 depends on the invocation context.
*/
InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
InvalidOid, is_internal);
InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
InvalidOid, true);
/*
* If we have toast tables associated with the relations being swapped,
* deal with them too.
*/
if (relform1->reltoastrelid || relform2->reltoastrelid)
{
if (swap_toast_by_content)
{
if (relform1->reltoastrelid && relform2->reltoastrelid)
{
/* Recursively swap the contents of the toast tables */
/*
* CLUSTER operation on append-optimized tables does not
* compute freeze limit (frozenXid) because AO tables do not
* have relfrozenxid. The toast tables need to keep existing
* relfrozenxid value unchanged in this case.
*
* GPDB_12_MERGE_FIXME: If the above argument is correct,
* figure out a way check it with an assert.
*/
swap_relation_files(relform1->reltoastrelid,
relform2->reltoastrelid,
target_is_pg_class,
swap_toast_by_content,
swap_stats,
is_internal,
frozenXid,
cutoffMulti,
mapped_tables);
}
else
{
/* caller messed up */
elog(ERROR, "cannot swap toast files by content when there's only one");
}
}
else
{
/*
* We swapped the ownership links, so we need to change dependency
* data to match.
*
* NOTE: it is possible that only one table has a toast table.
*
* NOTE: at present, a TOAST table's only dependency is the one on
* its owning table. If more are ever created, we'd need to use
* something more selective than deleteDependencyRecordsFor() to
* get rid of just the link we want.
*/
/*
* We disallow this case for system catalogs, to avoid the
* possibility that the catalog we're rebuilding is one of the
* ones the dependency changes would change. It's too late to be
* making any data changes to the target catalog.
*/
if (IsSystemClass(r1, relform1))
elog(ERROR, "cannot swap toast files by links for system catalogs");
/* Delete old dependencies */
changeDependencyLinks(r1, r2,
relform1->reltoastrelid, relform2->reltoastrelid,
"TOAST");
}
}
/*
* If we're swapping two toast tables by content, do the same for their
* valid index. The swap can actually be safely done only if the relations
* have indexes.
*/
if (swap_toast_by_content &&
relform1->relkind == RELKIND_TOASTVALUE &&
relform2->relkind == RELKIND_TOASTVALUE)
{
Oid toastIndex1,
toastIndex2;
/* Get valid index for each relation */
toastIndex1 = toast_get_valid_index(r1,
AccessExclusiveLock);
toastIndex2 = toast_get_valid_index(r2,
AccessExclusiveLock);
swap_relation_files(toastIndex1,
toastIndex2,
target_is_pg_class,
swap_toast_by_content,
swap_stats,
is_internal,
InvalidTransactionId,
InvalidMultiXactId,
mapped_tables);
}
/* Send statistics from QE to QD */
if (Gp_role == GP_ROLE_EXECUTE && swap_stats && !IsSystemClass(r1, relform1))
{
rel = relation_open(r1, AccessShareLock);
vac_send_relstats_to_qd(rel,
relform1->relpages,
relform1->reltuples,
relform1->relallvisible);
relation_close(rel, AccessShareLock);
}
/* Clean up. */
heap_freetuple(reltup1);
heap_freetuple(reltup2);
table_close(relRelation, RowExclusiveLock);
/*
* Close both relcache entries' smgr links. We need this kluge because
* both links will be invalidated during upcoming CommandCounterIncrement.
* Whichever of the rels is the second to be cleared will have a dangling
* reference to the other's smgr entry. Rather than trying to avoid this
* by ordering operations just so, it's easiest to close the links first.
* (Fortunately, since one of the entries is local in our transaction,
* it's sufficient to clear out our own relcache this way; the problem
* cannot arise for other backends when they see our update on the
* non-transient relation.)
*
* Caution: the placement of this step interacts with the decision to
* handle toast rels by recursion. When we are trying to rebuild pg_class
* itself, the smgr close on pg_class must happen after all accesses in
* this function.
*/
RelationCloseSmgrByOid(r1);
RelationCloseSmgrByOid(r2);
}
/*
* Remove the transient table that was built by make_new_heap, and finish
* cleaning up (including rebuilding all indexes on the old heap).
*/
void
finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
bool is_system_catalog,
bool swap_toast_by_content,
bool swap_stats,
bool check_constraints,
bool is_internal,
TransactionId frozenXid,
MultiXactId cutoffMulti,
char newrelpersistence)
{
ObjectAddress object;
Oid mapped_tables[4];
int reindex_flags;
int i;
/* Report that we are now swapping relation files */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
PROGRESS_CLUSTER_PHASE_SWAP_REL_FILES);
/* Zero out possible results from swapped_relation_files */
memset(mapped_tables, 0, sizeof(mapped_tables));
/*
* Swap the contents of the heap relations (including any toast tables).
* Also set old heap's relfrozenxid to frozenXid.
*/
swap_relation_files(OIDOldHeap, OIDNewHeap,
(OIDOldHeap == RelationRelationId),
swap_toast_by_content,
swap_stats,
is_internal,
frozenXid, cutoffMulti, mapped_tables);
SIMPLE_FAULT_INJECTOR("after_swap_relation_files");
/*
* If it's a system catalog, queue a sinval message to flush all catcaches
* on the catalog when we reach CommandCounterIncrement.
*/
if (is_system_catalog)
CacheInvalidateCatalog(OIDOldHeap);
/*
* Rebuild each index on the relation (but not the toast table, which is
* all-new at this point). It is important to do this before the DROP
* step because if we are processing a system catalog that will be used
* during DROP, we want to have its indexes available. There is no
* advantage to the other order anyway because this is all transactional,
* so no chance to reclaim disk space before commit. We do not need a
* final CommandCounterIncrement() because reindex_relation does it.
*
* Note: because index_build is called via reindex_relation, it will never
* set indcheckxmin true for the indexes. This is OK even though in some
* sense we are building new indexes rather than rebuilding existing ones,
* because the new heap won't contain any HOT chains at all, let alone
* broken ones, so it can't be necessary to set indcheckxmin.
*/
reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
if (check_constraints)
reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
/*
* Ensure that the indexes have the same persistence as the parent
* relation.
*/
if (newrelpersistence == RELPERSISTENCE_UNLOGGED)
reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED;
else if (newrelpersistence == RELPERSISTENCE_PERMANENT)
reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT;
/* Report that we are now reindexing relations */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
PROGRESS_CLUSTER_PHASE_REBUILD_INDEX);
reindex_relation(OIDOldHeap, reindex_flags, 0);
/* Report that we are now doing clean up */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
PROGRESS_CLUSTER_PHASE_FINAL_CLEANUP);
/*
* If the relation being rebuild is pg_class, swap_relation_files()
* couldn't update pg_class's own pg_class entry (check comments in
* swap_relation_files()), thus relfrozenxid was not updated. That's
* annoying because a potential reason for doing a VACUUM FULL is a
* imminent or actual anti-wraparound shutdown. So, now that we can
* access the new relation using its indices, update relfrozenxid.
* pg_class doesn't have a toast relation, so we don't need to update the
* corresponding toast relation. Not that there's little point moving all
* relfrozenxid updates here since swap_relation_files() needs to write to
* pg_class for non-mapped relations anyway.
*/
if (OIDOldHeap == RelationRelationId)
{
Relation relRelation;
HeapTuple reltup;
Form_pg_class relform;
relRelation = table_open(RelationRelationId, RowExclusiveLock);
reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(OIDOldHeap));
if (!HeapTupleIsValid(reltup))
elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
relform = (Form_pg_class) GETSTRUCT(reltup);
relform->relfrozenxid = frozenXid;
relform->relminmxid = cutoffMulti;
CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
table_close(relRelation, RowExclusiveLock);
}
/* Destroy new heap with old filenode */
object.classId = RelationRelationId;
object.objectId = OIDNewHeap;
object.objectSubId = 0;
/*
* The new relation is local to our transaction and we know nothing
* depends on it, so DROP_RESTRICT should be OK.
*/
performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
/* performDeletion does CommandCounterIncrement at end */
/*
* Now we must remove any relation mapping entries that we set up for the
* transient table, as well as its toast table and toast index if any. If
* we fail to do this before commit, the relmapper will complain about new
* permanent map entries being added post-bootstrap.
*/
for (i = 0; OidIsValid(mapped_tables[i]); i++)
RelationMapRemoveMapping(mapped_tables[i]);
/*
* At this point, everything is kosher except that, if we did toast swap
* by links, the toast table's name corresponds to the transient table.
* The name is irrelevant to the backend because it's referenced by OID,
* but users looking at the catalogs could be confused. Rename it to
* prevent this problem.
*
* Note no lock required on the relation, because we already hold an
* exclusive lock on it.
*/
if (!swap_toast_by_content)
{
Relation newrel;
newrel = table_open(OIDOldHeap, NoLock);
if (OidIsValid(newrel->rd_rel->reltoastrelid))
{
Oid toastidx;
char NewToastName[NAMEDATALEN];
/* Get the associated valid index to be renamed */
toastidx = toast_get_valid_index(newrel->rd_rel->reltoastrelid,
AccessShareLock);
/* rename the toast table ... */
snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
OIDOldHeap);
RenameRelationInternal(newrel->rd_rel->reltoastrelid,
NewToastName, true, false);
/* ... and its valid index too. */
snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
OIDOldHeap);
RenameRelationInternal(toastidx,
NewToastName, true, true);
}
relation_close(newrel, NoLock);
}
/* if it's not a catalog table, clear any missing attribute settings */
if (!is_system_catalog)
{
Relation newrel;
newrel = table_open(OIDOldHeap, NoLock);
RelationClearMissing(newrel);
relation_close(newrel, NoLock);
}
}
/*
* Get a list of tables that the current user owns and
* have indisclustered set. Return the list in a List * of rvsToCluster
* with the tableOid and the indexOid on which the table is already
* clustered.
*/
static List *
get_tables_to_cluster(MemoryContext cluster_context)
{
Relation indRelation;
TableScanDesc scan;
ScanKeyData entry;
HeapTuple indexTuple;
Form_pg_index index;
MemoryContext old_context;
RelToCluster *rvtc;
List *rvs = NIL;
/*
* Get all indexes that have indisclustered set and are owned by
* appropriate user. System relations or nailed-in relations cannot ever
* have indisclustered set, because CLUSTER will refuse to set it when
* called with one of them as argument.
*/
indRelation = table_open(IndexRelationId, AccessShareLock);
ScanKeyInit(&entry,
Anum_pg_index_indisclustered,
BTEqualStrategyNumber, F_BOOLEQ,
BoolGetDatum(true));
scan = table_beginscan_catalog(indRelation, 1, &entry);
while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
index = (Form_pg_index) GETSTRUCT(indexTuple);
if (!pg_class_ownercheck(index->indrelid, GetUserId()))
continue;
/*
* We have to build the list in a different memory context so it will
* survive the cross-transaction processing
*/
old_context = MemoryContextSwitchTo(cluster_context);
rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
rvtc->tableOid = index->indrelid;
rvtc->indexOid = index->indexrelid;
rvs = lcons(rvtc, rvs);
MemoryContextSwitchTo(old_context);
}
table_endscan(scan);
relation_close(indRelation, AccessShareLock);
return rvs;
}
相关信息
相关文章
0
赞
热门推荐
-
2、 - 优质文章
-
3、 gate.io
-
8、 golang
-
9、 openharmony
-
10、 Vue中input框自动聚焦