/*-------------------------------------------------------------------------
 *
 * nodeModifyTable.c
 *	  routines to handle ModifyTable nodes.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeModifyTable.c
 *
 *-------------------------------------------------------------------------
 */
/* INTERFACE ROUTINES
 *		ExecInitModifyTable - initialize the ModifyTable node
 *		ExecModifyTable		- retrieve the next tuple from the node
 *		ExecEndModifyTable	- shut down the ModifyTable node
 *		ExecReScanModifyTable - rescan the ModifyTable node
 *
 *	 NOTES
 *		Each ModifyTable node contains a list of one or more subplans,
 *		much like an Append node.  There is one subplan per result relation.
 *		The key reason for this is that in an inherited UPDATE command, each
 *		result relation could have a different schema (more or different
 *		columns) requiring a different plan tree to produce it.  In an
 *		inherited DELETE, all the subplans should produce the same output
 *		rowtype, but we might still find that different plans are appropriate
 *		for different child relations.
 *
 *		If the query specifies RETURNING, then the ModifyTable returns a
 *		RETURNING tuple after completing each row insert, update, or delete.
 *		It must be called again to continue the operation.  Without RETURNING,
 *		we just loop within the node until all the work is done, then
 *		return NULL.  This avoids useless call/return overhead.
 */

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/tableam.h"
#include "access/tupconvert.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "commands/trigger.h"
#include "executor/execPartition.h"
#include "executor/executor.h"
#include "executor/nodeModifyTable.h"
#include "foreign/fdwapi.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "rewrite/rewriteHandler.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "utils/rel.h"

#include "access/transam.h"
#include "cdb/cdbaocsam.h"
#include "cdb/cdbappendonlyam.h"
#include "cdb/cdbvars.h"
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
#include "utils/snapmgr.h"

/*
 * Prototypes for file-local (static) helpers.
 *
 * ExecOnConflictUpdate() is called from ExecInsert() before any definition
 * of it appears, so it has to be declared up front.  ExecInsert() treats a
 * true result as "done" and hands the slot stored through 'returning' back
 * to its own caller.
 *
 * NOTE(review): the remaining helpers are presumably declared here for the
 * same reason; their call sites should be checked against the rest of the
 * file.
 */
static bool ExecOnConflictUpdate(ModifyTableState *mtstate,
								 ResultRelInfo *resultRelInfo,
								 ItemPointer conflictTid,
								 TupleTableSlot *planSlot,
								 TupleTableSlot *excludedSlot,
								 EState *estate,
								 bool canSetTag,
								 TupleTableSlot **returning);
static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate,
											   EState *estate,
											   PartitionTupleRouting *proute,
											   ResultRelInfo *targetRelInfo,
											   TupleTableSlot *slot);
static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node);
static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate);
static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node,
												   int whichplan);

/*
 * Verify that the tuples to be produced by INSERT or UPDATE match the
 * target relation's rowtype.
 *
 * We do this to guard against stale plans.  If plan invalidation is
 * functioning properly then we should never get a failure here, but better
 * safe than sorry.  Note that this is called after we have obtained lock
 * on the target rel, so the rowtype can't change underneath us.
 *
 * The plan output is represented by its targetlist, because that makes
 * handling the dropped-column case easier.
 */
/*
 * ExecCheckPlanOutput
 *
 *	resultRel:	target relation; its tuple descriptor is the reference rowtype
 *	targetList: the plan's target list, a List of TargetEntry nodes
 *
 * Target entries flagged resjunk are ignored.  Every other entry is matched
 * by ordinal position (attno) against the descriptor:
 *	- attno reaching the descriptor's natts inside the loop is reported as
 *	  "Query has too many columns.";
 *	- for a column that is not dropped, the entry's expression type must
 *	  equal the column's atttypid;
 *	- attno differing from natts after the loop is reported as
 *	  "Query has too few columns.".
 * All of these share the primary message "table row type and
 * query-specified row type do not match".
 *
 * The stricter test for dropped columns (entry must be a NULL Const) is
 * compiled out with #if 0; see the GPDB_96_MERGE_FIXME remark in the body.
 */
static void
ExecCheckPlanOutput(Relation resultRel, List *targetList)
	TupleDesc	resultDesc = RelationGetDescr(resultRel);
	int			attno = 0;
	ListCell   *lc;

	foreach(lc, targetList)
		TargetEntry *tle = (TargetEntry *) lfirst(lc);
		Form_pg_attribute attr;

		if (tle->resjunk)
			continue;			/* ignore junk tlist items */

		if (attno >= resultDesc->natts)
					 errmsg("table row type and query-specified row type do not match"),
					 errdetail("Query has too many columns.")));
		attr = TupleDescAttr(resultDesc, attno);

		if (!attr->attisdropped)
			/* Normal case: demand type match */
			if (exprType((Node *) tle->expr) != attr->atttypid)
						 errmsg("table row type and query-specified row type do not match"),
						 errdetail("Table has type %s at ordinal position %d, but query expects %s.",
								   format_type_be(exprType((Node *) tle->expr)))));
			 * For a dropped column, we can't check atttypid (it's likely 0).
			 * In any case the planner has most likely inserted an INT4 null.
			 * What we insist on is just *some* NULL constant.
			/* GPDB_96_MERGE_FIXME: the subplan can be a Motion, so that the NULLs
			 * are transferred through the Motion node.
#if 0
			if (!IsA(tle->expr, Const) ||
				!((Const *) tle->expr)->constisnull)
						 errmsg("table row type and query-specified row type do not match"),
						 errdetail("Query provides a value for a dropped column at ordinal position %d.",
	if (attno != resultDesc->natts)
				 errmsg("table row type and query-specified row type do not match"),
				 errdetail("Query has too few columns.")));

/*
 * ExecProcessReturning --- evaluate a RETURNING list
 *
 * resultRelInfo: current result rel
 * tupleSlot: slot holding tuple actually inserted/updated/deleted
 * planSlot: slot holding tuple returned by top subplan node
 *
 * Note: If tupleSlot is NULL, the FDW should have already provided econtext's
 * scan tuple.
 *
 * Returns a slot holding the result tuple.
 */
/*
 * ExecProcessReturning
 *
 * Runs the RETURNING projection stored in resultRelInfo and returns the slot
 * that ExecProject() produces.
 *
 * The projection's expression context receives tupleSlot as its scan tuple,
 * but only when tupleSlot is non-NULL (otherwise whatever scan tuple is
 * already installed is kept), and planSlot as its outer tuple.  The scan
 * tuple's tts_tableOid is then reassigned before projecting.
 *
 * resultRelInfo->ri_projectReturning is dereferenced unconditionally, so the
 * caller has to make sure it is set; ExecInsert() and ExecDelete() both test
 * it before calling.
 */
static TupleTableSlot *
ExecProcessReturning(ResultRelInfo *resultRelInfo,
					 TupleTableSlot *tupleSlot,
					 TupleTableSlot *planSlot)
	ProjectionInfo *projectReturning = resultRelInfo->ri_projectReturning;
	ExprContext *econtext = projectReturning->pi_exprContext;

	/* Make tuple and any needed join variables available to ExecProject */
	if (tupleSlot)
		econtext->ecxt_scantuple = tupleSlot;
	econtext->ecxt_outertuple = planSlot;

	 * RETURNING expressions might reference the tableoid column, so
	 * reinitialize tts_tableOid before evaluating them.
	econtext->ecxt_scantuple->tts_tableOid =

	/* Compute the RETURNING expressions */
	return ExecProject(projectReturning);

/*
 * ExecCheckTupleVisible -- verify tuple is visible
 *
 * It would not be consistent with guarantees of the higher isolation levels to
 * proceed with avoiding insertion (taking speculative insertion's alternative
 * path) on the basis of another tuple that is not visible to MVCC snapshot.
 * Check for the need to raise a serialization failure, and do so as necessary.
 */
/*
 * ExecCheckTupleVisible
 *
 *	estate: supplies es_snapshot, the snapshot the tuple is tested against
 *	rel:	relation the tuple in 'slot' belongs to
 *	slot:	slot holding the tuple to test
 *
 * The isolation level is consulted first through IsolationUsesXactSnapshot().
 * When the tuple does not satisfy estate->es_snapshot, its xmin system
 * attribute is fetched; unless that xmin belongs to the current transaction,
 * the "could not serialize access due to concurrent update" error is raised.
 */
static void
ExecCheckTupleVisible(EState *estate,
					  Relation rel,
					  TupleTableSlot *slot)
	if (!IsolationUsesXactSnapshot())

	if (!table_tuple_satisfies_snapshot(rel, slot, estate->es_snapshot))
		Datum		xminDatum;
		TransactionId xmin;
		bool		isnull;

		xminDatum = slot_getsysattr(slot, MinTransactionIdAttributeNumber, &isnull);
		xmin = DatumGetTransactionId(xminDatum);

		 * We should not raise a serialization failure if the conflict is
		 * against a tuple inserted by our own transaction, even if it's not
		 * visible to our snapshot.  (This would happen, for example, if
		 * conflicting keys are proposed for insertion in a single command.)
		if (!TransactionIdIsCurrentTransactionId(xmin))
					 errmsg("could not serialize access due to concurrent update")));

/*
 * ExecCheckTIDVisible -- convenience variant of ExecCheckTupleVisible()
 */
/*
 * ExecCheckTIDVisible
 *
 *	estate:   executor state, passed through to ExecCheckTupleVisible()
 *	relinfo:  result relation; its ri_RelationDesc is the table searched
 *	tid:	  TID of the row version to check
 *	tempSlot: scratch slot that receives the fetched row version
 *
 * Fetches the row version at 'tid' using SnapshotAny, i.e. regardless of
 * visibility, and then runs ExecCheckTupleVisible() on it.  A failed fetch
 * is an internal error ("failed to fetch conflicting tuple for ON CONFLICT").
 * tempSlot is overwritten, so the caller must not need its prior contents.
 */
static void
ExecCheckTIDVisible(EState *estate,
					ResultRelInfo *relinfo,
					ItemPointer tid,
					TupleTableSlot *tempSlot)
	Relation	rel = relinfo->ri_RelationDesc;

	/* Redundantly check isolation level */
	if (!IsolationUsesXactSnapshot())

	if (!table_tuple_fetch_row_version(rel, tid, SnapshotAny, tempSlot))
		elog(ERROR, "failed to fetch conflicting tuple for ON CONFLICT");
	ExecCheckTupleVisible(estate, rel, tempSlot);

/*
 * Compute stored generated columns for a tuple
 */
/*
 * ExecComputeStoredGenerated
 *
 *	estate: executor state; es_result_relation_info selects the relation
 *			whose generated columns are computed
 *	slot:	tuple being inserted or updated; its tts_values / tts_isnull
 *			arrays are overwritten with the computed row
 *
 * The relation must have stored generated columns (asserted on entry).
 *
 * On the first call for a result relation, ri_GeneratedExprs is allocated in
 * the per-query context with one entry per attribute.  Only the entries of
 * STORED generated columns are filled in, from build_column_default(); the
 * remaining entries are not initialized here, and the evaluation loop reads
 * only the entries of generated columns.  A missing generation expression is
 * an internal error.
 *
 * The evaluation itself runs in the per-tuple memory context: scratch
 * values/nulls arrays are allocated there, each generated column's
 * expression is evaluated with 'slot' as the scan tuple, non-null values
 * taken over from the slot are duplicated with datumCopy(), and the arrays
 * are finally copied back into the slot.
 */
ExecComputeStoredGenerated(EState *estate, TupleTableSlot *slot)
	ResultRelInfo *resultRelInfo = estate->es_result_relation_info;
	Relation	rel = resultRelInfo->ri_RelationDesc;
	TupleDesc	tupdesc = RelationGetDescr(rel);
	int			natts = tupdesc->natts;
	MemoryContext oldContext;
	Datum	   *values;
	bool	   *nulls;

	Assert(tupdesc->constr && tupdesc->constr->has_generated_stored);

	 * If first time through for this result relation, build expression
	 * nodetrees for rel's stored generation expressions.  Keep them in the
	 * per-query memory context so they'll survive throughout the query.
	if (resultRelInfo->ri_GeneratedExprs == NULL)
		oldContext = MemoryContextSwitchTo(estate->es_query_cxt);

		resultRelInfo->ri_GeneratedExprs =
			(ExprState **) palloc(natts * sizeof(ExprState *));

		for (int i = 0; i < natts; i++)
			if (TupleDescAttr(tupdesc, i)->attgenerated == ATTRIBUTE_GENERATED_STORED)
				Expr	   *expr;

				expr = (Expr *) build_column_default(rel, i + 1);
				if (expr == NULL)
					elog(ERROR, "no generation expression found for column number %d of table \"%s\"",
						 i + 1, RelationGetRelationName(rel));

				resultRelInfo->ri_GeneratedExprs[i] = ExecPrepareExpr(expr, estate);


	oldContext = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate));

	values = palloc(sizeof(*values) * natts);
	nulls = palloc(sizeof(*nulls) * natts);

	memcpy(nulls, slot->tts_isnull, sizeof(*nulls) * natts);

	for (int i = 0; i < natts; i++)
		Form_pg_attribute attr = TupleDescAttr(tupdesc, i);

		if (attr->attgenerated == ATTRIBUTE_GENERATED_STORED)
			ExprContext *econtext;
			Datum		val;
			bool		isnull;

			econtext = GetPerTupleExprContext(estate);
			econtext->ecxt_scantuple = slot;

			val = ExecEvalExpr(resultRelInfo->ri_GeneratedExprs[i], econtext, &isnull);

			values[i] = val;
			nulls[i] = isnull;
			if (!nulls[i])
				values[i] = datumCopy(slot->tts_values[i], attr->attbyval, attr->attlen);

	memcpy(slot->tts_values, values, sizeof(*values) * natts);
	memcpy(slot->tts_isnull, nulls, sizeof(*nulls) * natts);


/* ----------------------------------------------------------------
 *		ExecInsert
 *
 *		For INSERT, we have to insert the tuple into the target relation
 *		and insert appropriate tuples into the index relations.
 *
 *		Returns RETURNING result if any, otherwise NULL.
 *
 * If the target table is partitioned, the input tuple in 'parentslot'
 * is in the shape required for the parent table. This function will
 * look up the ResultRelInfo of the target partition, and form a
 * tuple suitable for the target partition. (It can be different, if
 * there are dropped columns in the parent, but not the partition,
 * for example.)
 *
 * In GPDB, the INSERT can be part of an update operation when
 * there is a preceding SplitUpdate node. 'splitUpdate' is true in
 * that case.
 * ----------------------------------------------------------------
 */
/*
 * ExecInsert
 *
 *	mtstate:	 ModifyTable state; its plan node supplies onConflictAction,
 *				 and its operation / mt_transition_capture fields decide
 *				 whether the row is captured as an UPDATE transition row
 *	slot:		 tuple to insert; replaced by the slot the FDW hands back
 *				 when the target is a foreign table
 *	planSlot:	 subplan output, forwarded to ExecOnConflictUpdate() and
 *				 ExecProcessReturning()
 *	estate:		 executor state; es_result_relation_info names the target
 *	canSetTag:	 forwarded to ExecOnConflictUpdate() and tested once the
 *				 tuple has been stored
 *	splitUpdate: when true, AFTER ROW INSERT triggers are not fired
 *
 * Order of work for a plain table, as visible below: set tts_tableOid,
 * compute stored generated columns, check RLS WITH CHECK options (UPDATE
 * policies are used when mtstate->operation is CMD_UPDATE), check table
 * constraints, check the partition constraint, then store the tuple either
 * speculatively (ON CONFLICT with at least one index) or normally, adding
 * index entries in both cases.  Afterwards come transition capture, AFTER
 * ROW triggers, WCO_VIEW_CHECK options and finally RETURNING.
 *
 * Return value: NULL when a BEFORE ROW / INSTEAD OF trigger or the FDW says
 * "do nothing" and on the ON CONFLICT DO NOTHING path; the slot produced by
 * ExecOnConflictUpdate() when the DO UPDATE path succeeds; otherwise the
 * RETURNING projection if ri_projectReturning is set, else NULL.
 *
 * NOTE(review): further down, the remark "GPDB: Don't fire DELETE triggers
 * on a split UPDATE" sits directly above the ExecARInsertTriggers() call
 * guarded by !splitUpdate, so it presumably means INSERT triggers.
 */
static TupleTableSlot *
ExecInsert(ModifyTableState *mtstate,
		   TupleTableSlot *slot,
		   TupleTableSlot *planSlot,
		   EState *estate,
		   bool canSetTag,
		   bool splitUpdate)
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	List	   *recheckIndexes = NIL;
	TupleTableSlot *result = NULL;
	TransitionCaptureState *ar_insert_trig_tcs;
	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
	OnConflictAction onconflict = node->onConflictAction;


	 * get information on the (current) result relation
	resultRelInfo = estate->es_result_relation_info;
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	 * Note: We fire BEFORE ROW TRIGGERS for every attempted insertion in an
	 * INSERT ... ON CONFLICT statement.  We cannot check for constraint
	 * violations before firing these triggers, because they can change the
	 * values to insert.  Also, they can run arbitrary user-defined code with
	 * side-effects that we can't cancel by just not inserting the tuple.
	 * GPDB_12_MERGE_FIXME: PostgreSQL *does* fire INSERT and DELETE
	 * triggers on an UPDATE that moves tuples from one partition to another.
	 * Should we follow that example with cross-segment UPDATEs too?
	if (resultRelInfo->ri_TrigDesc &&
		resultRelInfo->ri_TrigDesc->trig_insert_before_row &&
		if (!ExecBRInsertTriggers(estate, resultRelInfo, slot))
			return NULL;		/* "do nothing" */

	if (resultRelInfo->ri_TrigDesc &&
		if (!ExecIRInsertTriggers(estate, resultRelInfo, slot))
			return NULL;		/* "do nothing" */
	else if (resultRelInfo->ri_FdwRoutine)
		 * Compute stored generated columns
		if (resultRelationDesc->rd_att->constr &&
			ExecComputeStoredGenerated(estate, slot);

		 * insert into foreign table: let the FDW do it
		slot = resultRelInfo->ri_FdwRoutine->ExecForeignInsert(estate,

		if (slot == NULL)		/* "do nothing" */
			return NULL;

		 * AFTER ROW Triggers or RETURNING expressions might reference the
		 * tableoid column, so (re-)initialize tts_tableOid before evaluating
		 * them.
		slot->tts_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc);

		WCOKind		wco_kind;

		 * Constraints might reference the tableoid column, so (re-)initialize
		 * tts_tableOid before evaluating them.
		slot->tts_tableOid = RelationGetRelid(resultRelationDesc);

		 * Compute stored generated columns
		if (resultRelationDesc->rd_att->constr &&
			ExecComputeStoredGenerated(estate, slot);

		 * Check any RLS WITH CHECK policies.
		 * Normally we should check INSERT policies. But if the insert is the
		 * result of a partition key update that moved the tuple to a new
		 * partition, we should instead check UPDATE policies, because we are
		 * executing policies defined on the target table, and not those
		 * defined on the child partitions.
		wco_kind = (mtstate->operation == CMD_UPDATE) ?

		 * ExecWithCheckOptions() will skip any WCOs which are not of the kind
		 * we are looking for at this point.
		if (resultRelInfo->ri_WithCheckOptions != NIL)
			ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate);

		 * Check the constraints of the tuple.
		if (resultRelationDesc->rd_att->constr)
			ExecConstraints(resultRelInfo, slot, estate);

		 * Also check the tuple against the partition constraint, if there is
		 * one; except that if we got here via tuple-routing, we don't need to
		 * if there's no BR trigger defined on the partition.
		if (resultRelInfo->ri_PartitionCheck &&
			(resultRelInfo->ri_PartitionRoot == NULL ||
			 (resultRelInfo->ri_TrigDesc &&
			ExecPartitionCheck(resultRelInfo, slot, estate, true);

		if (onconflict != ONCONFLICT_NONE && resultRelInfo->ri_NumIndices > 0)
			/* Perform a speculative insertion. */
			uint32		specToken;
			ItemPointerData conflictTid;
			bool		specConflict;
			List	   *arbiterIndexes;

			arbiterIndexes = resultRelInfo->ri_onConflictArbiterIndexes;

			 * Do a non-conclusive check for conflicts first.
			 * We're not holding any locks yet, so this doesn't guarantee that
			 * the later insert won't conflict.  But it avoids leaving behind
			 * a lot of canceled speculative insertions, if you run a lot of
			 * INSERT ON CONFLICT statements that do conflict.
			 * We loop back here if we find a conflict below, either during
			 * the pre-check, or when we re-check after inserting the tuple
			 * speculatively.
			specConflict = false;
			if (!ExecCheckIndexConstraints(slot, estate, &conflictTid,
				/* committed conflict tuple found */
				if (onconflict == ONCONFLICT_UPDATE)
					 * In case of ON CONFLICT DO UPDATE, execute the UPDATE
					 * part.  Be prepared to retry if the UPDATE fails because
					 * of another concurrent UPDATE/DELETE to the conflict
					 * tuple.
					TupleTableSlot *returning = NULL;

					if (ExecOnConflictUpdate(mtstate, resultRelInfo,
											 &conflictTid, planSlot, slot,
											 estate, canSetTag, &returning))
						InstrCountTuples2(&mtstate->ps, 1);
						return returning;
						goto vlock;
					 * In case of ON CONFLICT DO NOTHING, do nothing. However,
					 * verify that the tuple is visible to the executor's MVCC
					 * snapshot at higher isolation levels.
					 * Using ExecGetReturningSlot() to store the tuple for the
					 * recheck isn't that pretty, but we can't trivially use
					 * the input slot, because it might not be of a compatible
					 * type. As there's no conflicting usage of
					 * ExecGetReturningSlot() in the DO NOTHING case...
					Assert(onconflict == ONCONFLICT_NOTHING);
					ExecCheckTIDVisible(estate, resultRelInfo, &conflictTid,
										ExecGetReturningSlot(estate, resultRelInfo));
					InstrCountTuples2(&mtstate->ps, 1);
					return NULL;

			 * Before we start insertion proper, acquire our "speculative
			 * insertion lock".  Others can use that to wait for us to decide
			 * if we're going to go ahead with the insertion, instead of
			 * waiting for the whole transaction to complete.
			specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());

			/* insert the tuple, with the speculative token */
			table_tuple_insert_speculative(resultRelationDesc, slot,

			/* insert index entries for tuple */
			recheckIndexes = ExecInsertIndexTuples(slot, estate, true,

			/* adjust the tuple's state accordingly */
			table_tuple_complete_speculative(resultRelationDesc, slot,
											 specToken, !specConflict);

			 * Wake up anyone waiting for our decision.  They will re-check
			 * the tuple, see that it's no longer speculative, and wait on our
			 * XID as if this was a regularly inserted tuple all along.  Or if
			 * we killed the tuple, they will see it's dead, and proceed as if
			 * the tuple never existed.

			 * If there was a conflict, start from the beginning.  We'll do
			 * the pre-check again, which will now find the conflicting tuple
			 * (unless it aborts before we get there).
			if (specConflict)
				goto vlock;

			/* Since there was no insertion conflict, we're done */
			/* insert the tuple normally */
			table_tuple_insert(resultRelationDesc, slot,
							   0, NULL);

			/* insert index entries for tuple */
			if (resultRelInfo->ri_NumIndices > 0)
				recheckIndexes = ExecInsertIndexTuples(slot, estate, false, NULL,
	if (canSetTag)

	 * If this insert is the result of a partition key update that moved the
	 * tuple to a new partition, put this row into the transition NEW TABLE,
	 * if there is one. We need to do this separately for DELETE and INSERT
	 * because they happen on different tables.
	ar_insert_trig_tcs = mtstate->mt_transition_capture;
	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
		&& mtstate->mt_transition_capture->tcs_update_new_table)
		ExecARUpdateTriggers(estate, resultRelInfo, NULL,

		 * We've already captured the NEW TABLE row, so make sure any AR
		 * INSERT trigger fired below doesn't capture it again.
		ar_insert_trig_tcs = NULL;

	slot->tts_tableOid = RelationGetRelid(resultRelationDesc);

	/* AFTER ROW INSERT Triggers */
	 * GPDB: Don't fire DELETE triggers on a split UPDATE.
	 * GPDB_12_MERGE_FIXME: PostgreSQL *does* fire INSERT and DELETE
	 * triggers on an UPDATE that moves tuples from one partition to another.
	 * Should we follow that example with cross-segment UPDATEs too?
	if (!splitUpdate)
		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,


	 * Check any WITH CHECK OPTION constraints from parent views.  We are
	 * required to do this after testing all constraints and uniqueness
	 * violations per the SQL spec, so we do it after actually inserting the
	 * record into the heap and all indexes.
	 * ExecWithCheckOptions will elog(ERROR) if a violation is found, so the
	 * tuple will never be seen, if it violates the WITH CHECK OPTION.
	 * ExecWithCheckOptions() will skip any WCOs which are not of the kind we
	 * are looking for at this point.
	if (resultRelInfo->ri_WithCheckOptions != NIL)
		ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);

	/* Process RETURNING if present */
	if (resultRelInfo->ri_projectReturning)
		result = ExecProcessReturning(resultRelInfo, slot, planSlot);

	return result;

/* ----------------------------------------------------------------
 *		ExecDelete
 *
 *		DELETE is like UPDATE, except that we delete the tuple and no
 *		index modifications are needed.
 *
 *		When deleting from a table, tupleid identifies the tuple to
 *		delete and oldtuple is NULL.  When deleting from a view,
 *		oldtuple is passed to the INSTEAD OF triggers and identifies
 *		what to delete, and tupleid is invalid.  When deleting from a
 *		foreign table, tupleid is invalid; the FDW has to figure out
 *		which row to delete using data from the planSlot.  oldtuple is
 *		passed to foreign table triggers; it is NULL when the foreign
 *		table has no relevant triggers.  We use tupleDeleted to indicate
 *		whether the tuple is actually deleted; callers can use it to
 *		decide whether to continue the operation.  When this DELETE is
 *		part of an UPDATE of the partition key, the slot returned by
 *		EvalPlanQual() is passed back using output parameter
 *		epqreturnslot.
 *
 *		In GPDB, DELETE can be part of an update operation when
 *		there is a preceding SplitUpdate node.
 *
 *		Returns RETURNING result if any, otherwise NULL.
 * ----------------------------------------------------------------
 */
/*
 * ExecDelete
 *
 *	mtstate:		  ModifyTable state; operation and mt_transition_capture
 *					  decide whether the row is captured as an UPDATE
 *					  transition row
 *	tupleid:		  TID of the row, handed to table_tuple_delete()
 *	segid:			  segment the row came from; compared with
 *					  GpIdentity.segindex as a distribution sanity check
 *	oldtuple:		  old row image given to the row triggers; must be
 *					  non-NULL on the INSTEAD OF trigger path (asserted)
 *	planSlot:		  subplan output, forwarded to ExecProcessReturning()
 *	epqstate:		  EvalPlanQual state, used by BEFORE ROW triggers and
 *					  when the row turns out to be concurrently updated
 *	estate:			  executor state; es_result_relation_info names the
 *					  target, es_output_cid is compared with tmfd.cmax
 *	processReturning: RETURNING is projected only when this is true
 *	canSetTag:		  tested once the delete has gone through
 *	changingPart,
 *	splitUpdate:	  OR-ed together as the last table_tuple_delete()
 *					  argument; splitUpdate additionally turns a
 *					  TM_SelfModified result into the "multiple updates to a
 *					  row by the same query is not allowed" error and
 *					  suppresses AFTER ROW DELETE triggers
 *	tupleDeleted:	  if non-NULL, set to false on entry and to true only
 *					  after the delete has actually happened
 *	epqreturnslot:	  if non-NULL and the row was concurrently updated, it
 *					  receives the EvalPlanQual slot and the function returns
 *					  NULL without deleting
 *
 * Return value: NULL on every "nothing to do" path (trigger or FDW veto,
 * row already deleted or self-modified, row no longer passing the quals);
 * otherwise the RETURNING projection when processReturning is true and
 * ri_projectReturning is set, else NULL.
 *
 * AFTER ROW DELETE triggers are also skipped for append-optimized
 * relations, per the RelationIsAppendOptimized() test.
 *
 * NOTE(review): in the transition-capture section the remark says "We've
 * already captured the NEW TABLE row", but that branch is taken when
 * tcs_update_old_table is set, so the row captured there is presumably the
 * OLD TABLE row.
 */
static TupleTableSlot *
ExecDelete(ModifyTableState *mtstate,
		   ItemPointer tupleid,
		   int32 segid,
		   HeapTuple oldtuple,
		   TupleTableSlot *planSlot,
		   EPQState *epqstate,
		   EState *estate,
		   bool processReturning,
		   bool canSetTag,
		   bool changingPart,
		   bool splitUpdate,
		   bool *tupleDeleted,
		   TupleTableSlot **epqreturnslot)
	/* PlanGenerator planGen = estate->es_plannedstmt->planGen; */
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	TM_Result	result;
	TM_FailureData tmfd;
	TupleTableSlot *slot = NULL;
	TransitionCaptureState *ar_delete_trig_tcs;

	if (tupleDeleted)
		*tupleDeleted = false;

	 * Sanity check the distribution of the tuple to prevent
	 * potential data corruption in case users manipulate data
	 * incorrectly (e.g. insert data on incorrect segment through
	 * utility mode) or there is bug in code, etc.
	if (segid != GpIdentity.segindex)
			 "distribution key of the tuple (%u, %u) doesn't belong to "
			 "current segment (actually from seg%d)",

	 * get information on the (current) result relation
	// GPDB_12_MERGE_FIXME: How to do this in new partitioning implementation?
	// Can we change the way ORCA deletion plans work?
#if 0
	if (estate->es_result_partitions && planGen == PLANGEN_OPTIMIZER)

		Oid parent = estate->es_result_partitions->part->parrelid;

		/* Obtain part for current tuple. */
		resultRelInfo = slot_get_partition(planSlot, estate, true);
		estate->es_result_relation_info = resultRelInfo;

		Oid part = RelationGetRelid(resultRelInfo->ri_RelationDesc);

		Assert(parent != part);
		resultRelInfo = estate->es_result_relation_info;
	resultRelInfo = estate->es_result_relation_info;
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	/* BEFORE ROW DELETE Triggers */
	 * GPDB_12_MERGE_FIXME: PostgreSQL *does* fire INSERT and DELETE
	 * triggers on an UPDATE that moves tuples from one partition to another.
	 * Should we follow that example with cross-segment UPDATEs too?
	if (resultRelInfo->ri_TrigDesc &&
		resultRelInfo->ri_TrigDesc->trig_delete_before_row &&
		bool		dodelete;

		dodelete = ExecBRDeleteTriggers(estate, epqstate, resultRelInfo,
										tupleid, oldtuple, epqreturnslot);

		if (!dodelete)			/* "do nothing" */
			return NULL;

	if (resultRelInfo->ri_TrigDesc &&
		bool		dodelete;

		Assert(oldtuple != NULL);

		dodelete = ExecIRDeleteTriggers(estate, resultRelInfo, oldtuple);

		if (!dodelete)			/* "do nothing" */
			return NULL;
	else if (resultRelInfo->ri_FdwRoutine)
		 * delete from foreign table: let the FDW do it
		 * We offer the returning slot as a place to store RETURNING data,
		 * although the FDW can return some other slot if it wants.
		slot = ExecGetReturningSlot(estate, resultRelInfo);
		slot = resultRelInfo->ri_FdwRoutine->ExecForeignDelete(estate,

		if (slot == NULL)		/* "do nothing" */
			return NULL;

		 * RETURNING expressions might reference the tableoid column, so
		 * (re)initialize tts_tableOid before evaluating them.
		if (TTS_EMPTY(slot))

		slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
		 * delete the tuple
		 * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check
		 * that the row to be deleted is visible to that snapshot, and throw a
		 * can't-serialize error if not. This is a special-case behavior
		 * needed for referential integrity updates in transaction-snapshot
		 * mode transactions.
		result = table_tuple_delete(resultRelationDesc, tupleid,
									true /* wait for commit */ ,
									changingPart || splitUpdate);

		switch (result)
			case TM_SelfModified:

				 * The target tuple was already updated or deleted by the
				 * current command, or by a later command in the current
				 * transaction.  The former case is possible in a join DELETE
				 * where multiple tuples join to the same target tuple. This
				 * is somewhat questionable, but Postgres has always allowed
				 * it: we just ignore additional deletion attempts.
				 * The latter case arises if the tuple is modified by a
				 * command in a BEFORE trigger, or perhaps by a command in a
				 * volatile function used in the query.  In such situations we
				 * should not ignore the deletion, but it is equally unsafe to
				 * proceed.  We don't want to discard the original DELETE
				 * while keeping the triggered actions based on its deletion;
				 * and it would be no better to allow the original DELETE
				 * while discarding updates that it triggered.  The row update
				 * carries some information that might be important according
				 * to business rules; so throwing an error is the only safe
				 * course.
				 * If a trigger actually intends this type of interaction, it
				 * can re-execute the DELETE and then return NULL to cancel
				 * the outer delete.
				if (tmfd.cmax != estate->es_output_cid)
							 errmsg("tuple to be deleted was already modified by an operation triggered by the current command"),
							 errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));

				/* Else, already deleted by self; nothing to do */

				 * In an scenario in which R(a,b) and S(a,b) have
				 *        R               S
				 *    ________         ________
				 *     (1, 1)           (1, 2)
				 *                      (1, 7)
				 *  An update query such as:
				 *   UPDATE R SET a = S.b  FROM S WHERE R.b = S.a;
				 *  will have an non-deterministic output. The tuple in R
				 * can be updated to (2,1) or (7,1).
				 * Since the introduction of SplitUpdate, these queries will
				 * send multiple requests to delete the same tuple. One of them
				 * will pass, but others will not. But there will also be
				 * multiple requests to insert a new version of the tuple, and
				 * we cannot cancel out those if the Delete cannot be
				 * performed. An error is reported in such scenario; otherwise
				 * you end up with multiple copies of the same row.
				if (splitUpdate)
							 errmsg("multiple updates to a row by the same query is not allowed")));
				return NULL;

			case TM_Ok:

			case TM_Updated:
					TupleTableSlot *inputslot;
					TupleTableSlot *epqslot;

					if (IsolationUsesXactSnapshot())
								 errmsg("could not serialize access due to concurrent update")));

					 * Already know that we're going to need to do EPQ, so
					 * fetch tuple directly into the right slot.
					EvalPlanQualBegin(epqstate, estate);
					inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc,

					result = table_tuple_lock(resultRelationDesc, tupleid,
											  inputslot, estate->es_output_cid,
											  LockTupleExclusive, LockWaitBlock,

					switch (result)
						case TM_Ok:
							epqslot = EvalPlanQual(estate,
							if (TupIsNull(epqslot))
								/* Tuple not passing quals anymore, exiting... */
								return NULL;

							 * If requested, skip delete and pass back the
							 * updated row.
							if (epqreturnslot)
								*epqreturnslot = epqslot;
								return NULL;
								goto ldelete;

						case TM_SelfModified:

							 * This can be reached when following an update
							 * chain from a tuple updated by another session,
							 * reaching a tuple that was already updated in
							 * this transaction. If previously updated by this
							 * command, ignore the delete, otherwise error
							 * out.
							 * See also TM_SelfModified response to
							 * table_tuple_delete() above.
							if (tmfd.cmax != estate->es_output_cid)
										 errmsg("tuple to be deleted was already modified by an operation triggered by the current command"),
										 errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
							return NULL;

						case TM_Deleted:
							/* tuple already deleted; nothing to do */
							return NULL;


							 * TM_Invisible should be impossible because we're
							 * waiting for updated row versions, and would
							 * already have errored out if the first version
							 * is invisible.
							 * TM_Updated should be impossible, because we're
							 * locking the latest version via
							elog(ERROR, "unexpected table_tuple_lock status: %u",
							return NULL;


			case TM_Deleted:
				if (IsolationUsesXactSnapshot())
							 errmsg("could not serialize access due to concurrent delete")));
				/* tuple already deleted; nothing to do */
				return NULL;

				elog(ERROR, "unrecognized table_tuple_delete status: %u",
				return NULL;

		 * Note: Normally one would think that we have to delete index tuples
		 * associated with the heap tuple now...
		 * ... but in POSTGRES, we have no need to do this because VACUUM will
		 * take care of it later.  We can't delete index tuples immediately
		 * anyway, since the tuple is still visible to other transactions.

	if (canSetTag)

	/* Tell caller that the delete actually happened. */
	if (tupleDeleted)
		*tupleDeleted = true;

	 * If this delete is the result of a partition key update that moved the
	 * tuple to a new partition, put this row into the transition OLD TABLE,
	 * if there is one. We need to do this separately for DELETE and INSERT
	 * because they happen on different tables.
	ar_delete_trig_tcs = mtstate->mt_transition_capture;
	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
		&& mtstate->mt_transition_capture->tcs_update_old_table)
		ExecARUpdateTriggers(estate, resultRelInfo,

		 * We've already captured the NEW TABLE row, so make sure any AR
		 * DELETE trigger fired below doesn't capture it again.
		ar_delete_trig_tcs = NULL;

	/* AFTER ROW DELETE Triggers */
	 * GPDB_12_MERGE_FIXME: PostgreSQL *does* fire INSERT and DELETE
	 * triggers on an UPDATE that moves tuples from one partition to another.
	 * Should we follow that example with cross-segment UPDATEs too?
	if (!RelationIsAppendOptimized(resultRelationDesc) && !splitUpdate)
		ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,

	/* Process RETURNING if present and if requested */
	 * In a split update, the processed rows are returned by the INSERT
	 * of the new row, not the DELETE of the old one.
	if (processReturning && resultRelInfo->ri_projectReturning)
		 * We have to put the target tuple into a slot, which means first we
		 * gotta fetch it.  We can use the trigger tuple slot.
		TupleTableSlot *rslot;

		if (resultRelInfo->ri_FdwRoutine)
			/* FDW must have provided a slot containing the deleted row */
			slot = ExecGetReturningSlot(estate, resultRelInfo);
			if (oldtuple != NULL)
				ExecForceStoreHeapTuple(oldtuple, slot, false);
				if (!table_tuple_fetch_row_version(resultRelationDesc, tupleid,
												   SnapshotAny, slot))
					elog(ERROR, "failed to fetch deleted tuple for DELETE RETURNING");

		rslot = ExecProcessReturning(resultRelInfo, slot, planSlot);

		 * Before releasing the target tuple again, make sure rslot has a
		 * local copy of any pass-by-reference values.


		return rslot;

	return NULL;

/* ----------------------------------------------------------------
 *		ExecUpdate
 *
 *		note: we can't run UPDATE queries with transactions
 *		off because UPDATEs are actually INSERTs and our
 *		scan will mistakenly loop forever, updating the tuple
 *		it just inserted.  This should be fixed but until it
 *		is, we don't want to get stuck in an infinite loop
 *		which corrupts your database.
 *
 *		When updating a table, tupleid identifies the tuple to
 *		update and oldtuple is NULL.  When updating a view, oldtuple
 *		is passed to the INSTEAD OF triggers and identifies what to
 *		update, and tupleid is invalid.  When updating a foreign table,
 *		tupleid is invalid; the FDW has to figure out which row to
 *		update using data from the planSlot.  oldtuple is passed to
 *		foreign table triggers; it is NULL when the foreign table has
 *		no relevant triggers.
 *
 *		Returns RETURNING result if any, otherwise NULL.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
ExecUpdate(ModifyTableState *mtstate,
		   ItemPointer tupleid,
		   HeapTuple oldtuple,
		   TupleTableSlot *slot,
		   TupleTableSlot *planSlot,
		   int32 segid, /* gpdb specific parameter, check if tuple to update is from local */
		   EPQState *epqstate,
		   EState *estate,
		   bool canSetTag)
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	TM_Result	result;
	TM_FailureData tmfd;
	List	   *recheckIndexes = NIL;
	TupleConversionMap *saved_tcs_map = NULL;

	 * abort the operation if not running transactions
	if (IsBootstrapProcessingMode())
		elog(ERROR, "cannot UPDATE during bootstrap");

	 * Sanity check the distribution of the tuple to prevent
	 * potential data corruption in case users manipulate data
	 * incorrectly (e.g. insert data on incorrect segment through
	 * utility mode) or there is bug in code, etc.
	if (segid != GpIdentity.segindex)
			 "distribution key of the tuple (%u, %u) doesn't belong to "
			 "current segment (actually from seg%d)",


	 * get information on the (current) result relation
	resultRelInfo = estate->es_result_relation_info;
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	/* BEFORE ROW UPDATE Triggers */
	if (resultRelInfo->ri_TrigDesc &&
		if (!ExecBRUpdateTriggers(estate, epqstate, resultRelInfo,
								  tupleid, oldtuple, slot))
			return NULL;		/* "do nothing" */

	if (resultRelInfo->ri_TrigDesc &&
		if (!ExecIRUpdateTriggers(estate, resultRelInfo,
								  oldtuple, slot))
			return NULL;		/* "do nothing" */
	else if (resultRelInfo->ri_FdwRoutine)
		 * Compute stored generated columns
		if (resultRelationDesc->rd_att->constr &&
			ExecComputeStoredGenerated(estate, slot);

		 * update in foreign table: let the FDW do it
		slot = resultRelInfo->ri_FdwRoutine->ExecForeignUpdate(estate,

		if (slot == NULL)		/* "do nothing" */
			return NULL;

		 * AFTER ROW Triggers or RETURNING expressions might reference the
		 * tableoid column, so (re-)initialize tts_tableOid before evaluating
		 * them.
		slot->tts_tableOid = RelationGetRelid(resultRelationDesc);
		LockTupleMode lockmode;
		bool		partition_constraint_failed;
		bool		update_indexes;

		 * Constraints might reference the tableoid column, so (re-)initialize
		 * tts_tableOid before evaluating them.
		slot->tts_tableOid = RelationGetRelid(resultRelationDesc);

		 * Compute stored generated columns
		if (resultRelationDesc->rd_att->constr &&
			ExecComputeStoredGenerated(estate, slot);

		 * Check any RLS UPDATE WITH CHECK policies
		 * If we generate a new candidate tuple after EvalPlanQual testing, we
		 * must loop back here and recheck any RLS policies and constraints.
		 * (We don't need to redo triggers, however.  If there are any BEFORE
		 * triggers then trigger.c will have done table_tuple_lock to lock the
		 * correct tuple, so there's no need to do them again.)

		/* ensure slot is independent, consider e.g. EPQ */

		 * If partition constraint fails, this row might get moved to another
		 * partition, in which case we should check the RLS CHECK policy just
		 * before inserting into the new partition, rather than doing it here.
		 * This is because a trigger on that partition might again change the
		 * row.  So skip the WCO checks if the partition constraint fails.
		partition_constraint_failed =
			resultRelInfo->ri_PartitionCheck &&
			!ExecPartitionCheck(resultRelInfo, slot, estate, false);

		if (!partition_constraint_failed &&
			resultRelInfo->ri_WithCheckOptions != NIL)
			 * ExecWithCheckOptions() will skip any WCOs which are not of the
			 * kind we are looking for at this point.
								 resultRelInfo, slot, estate);

		 * If a partition check failed, try to move the row into the right
		 * partition.
		if (partition_constraint_failed)
			bool		tuple_deleted;
			TupleTableSlot *ret_slot;
			TupleTableSlot *epqslot = NULL;
			PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
			int			map_index;
			TupleConversionMap *tupconv_map;

			 * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
			 * original row to migrate to a different partition.  Maybe this
			 * can be implemented some day, but it seems a fringe feature with
			 * little redeeming value.
			if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
						 errmsg("invalid ON UPDATE specification"),
						 errdetail("The result tuple would appear in a different partition than the original tuple.")));

			 * When an UPDATE is run on a leaf partition, we will not have
			 * partition tuple routing set up. In that case, fail with
			 * partition constraint violation error.
			if (proute == NULL)
				ExecPartitionCheckEmitError(resultRelInfo, slot, estate);

			 * Row movement, part 1.  Delete the tuple, but skip RETURNING
			 * processing. We want to return rows from INSERT.
			ExecDelete(mtstate, tupleid, segid, oldtuple, planSlot, epqstate,
					   estate, false, false /* canSetTag */ ,
					   true /* changingPart */ ,
					   false /* splitUpdate */ ,
					   &tuple_deleted, &epqslot);

			 * For some reason if DELETE didn't happen (e.g. trigger prevented
			 * it, or it was already deleted by self, or it was concurrently
			 * deleted by another transaction), then we should skip the insert
			 * as well; otherwise, an UPDATE could cause an increase in the
			 * total number of rows across all partitions, which is clearly
			 * wrong.
			 * For a normal UPDATE, the case where the tuple has been the
			 * subject of a concurrent UPDATE or DELETE would be handled by
			 * the EvalPlanQual machinery, but for an UPDATE that we've
			 * translated into a DELETE from this partition and an INSERT into
			 * some other partition, that's not available, because CTID chains
			 * can't span relation boundaries.  We mimic the semantics to a
			 * limited extent by skipping the INSERT if the DELETE fails to
			 * find a tuple. This ensures that two concurrent attempts to
			 * UPDATE the same tuple at the same time can't turn one tuple
			 * into two, and that an UPDATE of a just-deleted tuple can't
			 * resurrect it.
			if (!tuple_deleted)
				 * epqslot will be typically NULL.  But when ExecDelete()
				 * finds that another transaction has concurrently updated the
				 * same row, it re-fetches the row, skips the delete, and
				 * epqslot is set to the re-fetched tuple slot. In that case,
				 * we need to do all the checks again.
				if (TupIsNull(epqslot))
					return NULL;
					slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
					goto lreplace;

			 * Updates set the transition capture map only when a new subplan
			 * is chosen.  But for inserts, it is set for each row. So after
			 * INSERT, we need to revert back to the map created for UPDATE;
			 * otherwise the next UPDATE will incorrectly use the one created
			 * for INSERT.  So first save the one created for UPDATE.
			if (mtstate->mt_transition_capture)
				saved_tcs_map = mtstate->mt_transition_capture->tcs_map;

			 * resultRelInfo is one of the per-subplan resultRelInfos.  So we
			 * should convert the tuple into root's tuple descriptor, since
			 * ExecInsert() starts the search from root.  The tuple conversion
			 * map list is in the order of mtstate->resultRelInfo[], so to
			 * retrieve the one for this resultRel, we need to know the
			 * position of the resultRel in mtstate->resultRelInfo[].
			map_index = resultRelInfo - mtstate->resultRelInfo;
			Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
			tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
			if (tupconv_map != NULL)
				slot = execute_attr_map_slot(tupconv_map->attrMap,

			 * Prepare for tuple routing, making it look like we're inserting
			 * into the root.
			Assert(mtstate->rootResultRelInfo != NULL);
			slot = ExecPrepareTupleRouting(mtstate, estate, proute,
										   mtstate->rootResultRelInfo, slot);

			ret_slot = ExecInsert(mtstate, slot, planSlot,
								  estate, canSetTag, false /* splitUpdate */);

			/* Revert ExecPrepareTupleRouting's node change. */
			estate->es_result_relation_info = resultRelInfo;
			if (mtstate->mt_transition_capture)
				mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
				mtstate->mt_transition_capture->tcs_map = saved_tcs_map;

			return ret_slot;

		 * Check the constraints of the tuple.  We've already checked the
		 * partition constraint above; however, we must still ensure the tuple
		 * passes all other constraints, so we will call ExecConstraints() and
		 * have it validate all remaining checks.
		if (resultRelationDesc->rd_att->constr)
			ExecConstraints(resultRelInfo, slot, estate);

		 * replace the heap tuple
		 * Note: if es_crosscheck_snapshot isn't InvalidSnapshot, we check
		 * that the row to be updated is visible to that snapshot, and throw a
		 * can't-serialize error if not. This is a special-case behavior
		 * needed for referential integrity updates in transaction-snapshot
		 * mode transactions.
		result = table_tuple_update(resultRelationDesc, tupleid, slot,
									true /* wait for commit */ ,
									&tmfd, &lockmode, &update_indexes);

		switch (result)
			case TM_SelfModified:

				 * The target tuple was already updated or deleted by the
				 * current command, or by a later command in the current
				 * transaction.  The former case is possible in a join UPDATE
				 * where multiple tuples join to the same target tuple. This
				 * is pretty questionable, but Postgres has always allowed it:
				 * we just execute the first update action and ignore
				 * additional update attempts.
				 * The latter case arises if the tuple is modified by a
				 * command in a BEFORE trigger, or perhaps by a command in a
				 * volatile function used in the query.  In such situations we
				 * should not ignore the update, but it is equally unsafe to
				 * proceed.  We don't want to discard the original UPDATE
				 * while keeping the triggered actions based on it; and we
				 * have no principled way to merge this update with the
				 * previous ones.  So throwing an error is the only safe
				 * course.
				 * If a trigger actually intends this type of interaction, it
				 * can re-execute the UPDATE (assuming it can figure out how)
				 * and then return NULL to cancel the outer update.
				 * In GPDB, for AO tables TM_SelfModified is returned only
				 * in case of same command tuple update based on visimap dirty
				 * list checking. Also, tmfd is not initialized and can't be for
				 * AO case, as visimap update within same command happens at end
				 * of command.
				if (!RelationIsAppendOptimized(resultRelationDesc) &&
					tmfd.cmax != estate->es_output_cid)
									errmsg("tuple to be updated was already modified by an operation triggered by the current command"),
									errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));

				/* Else, already updated by self; nothing to do */
				return NULL;

			case TM_Ok:

			case TM_Updated:
					TupleTableSlot *inputslot;
					TupleTableSlot *epqslot;

					if (IsolationUsesXactSnapshot())
								 errmsg("could not serialize access due to concurrent update")));

					 * Already know that we're going to need to do EPQ, so
					 * fetch tuple directly into the right slot.
					EvalPlanQualBegin(epqstate, estate);
					inputslot = EvalPlanQualSlot(epqstate, resultRelationDesc,

					result = table_tuple_lock(resultRelationDesc, tupleid,
											  inputslot, estate->es_output_cid,
											  lockmode, LockWaitBlock,

					switch (result)
						case TM_Ok:

							epqslot = EvalPlanQual(estate,
							if (TupIsNull(epqslot))
								/* Tuple not passing quals anymore, exiting... */
								return NULL;

							slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
							goto lreplace;

						case TM_Deleted:
							/* tuple already deleted; nothing to do */
							return NULL;

						case TM_SelfModified:

							 * This can be reached when following an update
							 * chain from a tuple updated by another session,
							 * reaching a tuple that was already updated in
							 * this transaction. If previously modified by
							 * this command, ignore the redundant update,
							 * otherwise error out.
							 * See also TM_SelfModified response to
							 * table_tuple_update() above.
							if (tmfd.cmax != estate->es_output_cid)
										 errmsg("tuple to be updated was already modified by an operation triggered by the current command"),
										 errhint("Consider using an AFTER trigger instead of a BEFORE trigger to propagate changes to other rows.")));
							return NULL;

							/* see table_tuple_lock call in ExecDelete() */
							elog(ERROR, "unexpected table_tuple_lock status: %u",
							return NULL;


			case TM_Deleted:
				if (IsolationUsesXactSnapshot())
							 errmsg("could not serialize access due to concurrent delete")));
				/* tuple already deleted; nothing to do */
				return NULL;

				elog(ERROR, "unrecognized table_tuple_update status: %u",
				return NULL;

		/* insert index entries for tuple if necessary */
		if (resultRelInfo->ri_NumIndices > 0 && update_indexes)
			recheckIndexes = ExecInsertIndexTuples(slot, estate, false, NULL, NIL);
	if (canSetTag)

	/* AFTER ROW UPDATE Triggers */
	/* GPDB: AO and AOCO tables don't support triggers */
	if (!RelationIsAppendOptimized(resultRelationDesc))
		ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple, slot,
						 mtstate->operation == CMD_INSERT ?
						 mtstate->mt_oc_transition_capture :


	 * Check any WITH CHECK OPTION constraints from parent views.  We are
	 * required to do this after testing all constraints and uniqueness
	 * violations per the SQL spec, so we do it after actually updating the
	 * record in the heap and all indexes.
	 * ExecWithCheckOptions() will skip any WCOs which are not of the kind we
	 * are looking for at this point.
	if (resultRelInfo->ri_WithCheckOptions != NIL)
		ExecWithCheckOptions(WCO_VIEW_CHECK, resultRelInfo, slot, estate);

	/* Process RETURNING if present */
	if (resultRelInfo->ri_projectReturning)
		return ExecProcessReturning(resultRelInfo, slot, planSlot);

	return NULL;

 * Insert the new tuple version of a Split Update
 * We have to check if this UPDATE also moves the row to
 * a different partition, much like ExecUpdate() does.
static TupleTableSlot *
ExecSplitUpdate_Insert(ModifyTableState *mtstate,
					   TupleTableSlot *slot,
					   TupleTableSlot *planSlot,
					   EState *estate,
					   bool canSetTag)
	ResultRelInfo *resultRelInfo;
	Relation	resultRelationDesc;
	bool		partition_constraint_failed;
	TupleConversionMap *saved_tcs_map = NULL;
	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
	int			map_index;
	TupleConversionMap *tupconv_map;

	 * get information on the (current) result relation
	resultRelInfo = estate->es_result_relation_info;
	resultRelationDesc = resultRelInfo->ri_RelationDesc;

	/* ensure slot is independent, consider e.g. EPQ */

	 * If partition constraint fails, this row might get moved to another
	 * partition, in which case we should check the RLS CHECK policy just
	 * before inserting into the new partition, rather than doing it here.
	 * This is because a trigger on that partition might again change the
	 * row.  So skip the WCO checks if the partition constraint fails.
	partition_constraint_failed =
		resultRelInfo->ri_PartitionCheck &&
		!ExecPartitionCheck(resultRelInfo, slot, estate, false);

	if (!partition_constraint_failed &&
		resultRelInfo->ri_WithCheckOptions != NIL)
		 * ExecWithCheckOptions() will skip any WCOs which are not of the
		 * kind we are looking for at this point.
							 resultRelInfo, slot, estate);

	 * Updates set the transition capture map only when a new subplan
	 * is chosen.  But for inserts, it is set for each row. So after
	 * INSERT, we need to revert back to the map created for UPDATE;
	 * otherwise the next UPDATE will incorrectly use the one created
	 * for INSERT.  So first save the one created for UPDATE.
	if (mtstate->mt_transition_capture)
		saved_tcs_map = mtstate->mt_transition_capture->tcs_map;

	if (partition_constraint_failed)
		 * When an UPDATE is run on a leaf partition, we will not have
		 * partition tuple routing set up. In that case, fail with
		 * partition constraint violation error.
		if (proute == NULL)
			ExecPartitionCheckEmitError(resultRelInfo, slot, estate);

		 * resultRelInfo is one of the per-subplan resultRelInfos.  So we
		 * should convert the tuple into root's tuple descriptor, since
		 * ExecInsert() starts the search from root.  The tuple conversion
		 * map list is in the order of mtstate->resultRelInfo[], so to
		 * retrieve the one for this resultRel, we need to know the
		 * position of the resultRel in mtstate->resultRelInfo[].
		map_index = resultRelInfo - mtstate->resultRelInfo;
		Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
		tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
		if (tupconv_map != NULL)
			slot = execute_attr_map_slot(tupconv_map->attrMap,

		 * Prepare for tuple routing, making it look like we're inserting
		 * into the root.
		Assert(mtstate->rootResultRelInfo != NULL);
		slot = ExecPrepareTupleRouting(mtstate, estate, proute,
									   mtstate->rootResultRelInfo, slot);

		slot = ExecInsert(mtstate, slot, planSlot,
						  estate, mtstate->canSetTag,
						  true /* splitUpdate */);

		/* Revert ExecPrepareTupleRouting's node change. */
		estate->es_result_relation_info = resultRelInfo;
		if (mtstate->mt_transition_capture)
			mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
			mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
		slot = ExecInsert(mtstate, slot, planSlot,
						  estate, mtstate->canSetTag,
						  true /* splitUpdate */);

	return slot;

 * ExecOnConflictUpdate --- execute UPDATE of INSERT ON CONFLICT DO UPDATE
 * Try to lock tuple for update as part of speculative insertion.  If
 * a qual originating from ON CONFLICT DO UPDATE is satisfied, update
 * (but still lock row, even though it may not satisfy estate's
 * snapshot).
 * Returns true if we're done (with or without an update), or false if
 * the caller must retry the INSERT from scratch.
static bool
ExecOnConflictUpdate(ModifyTableState *mtstate,
					 ResultRelInfo *resultRelInfo,
					 ItemPointer conflictTid,
					 TupleTableSlot *planSlot,
					 TupleTableSlot *excludedSlot,
					 EState *estate,
					 bool canSetTag,
					 TupleTableSlot **returning)
	ExprContext *econtext = mtstate->ps.ps_ExprContext;
	Relation	relation = resultRelInfo->ri_RelationDesc;
	ExprState  *onConflictSetWhere = resultRelInfo->ri_onConflict->oc_WhereClause;
	TupleTableSlot *existing = resultRelInfo->ri_onConflict->oc_Existing;
	TM_FailureData tmfd;
	LockTupleMode lockmode;
	TM_Result	test;
	Datum		xminDatum;
	TransactionId xmin;
	bool		isnull;

	/* Determine lock mode to use */
	lockmode = ExecUpdateLockMode(estate, resultRelInfo);

	 * Lock tuple for update.  Don't follow updates when tuple cannot be
	 * locked without doing so.  A row locking conflict here means our
	 * previous conclusion that the tuple is conclusively committed is not
	 * true anymore.
	test = table_tuple_lock(relation, conflictTid,
							existing, estate->es_output_cid,
							lockmode, LockWaitBlock, 0,
	switch (test)
		case TM_Ok:
			/* success! */

		case TM_Invisible:

			 * This can occur when a just inserted tuple is updated again in
			 * the same command. E.g. because multiple rows with the same
			 * conflicting key values are inserted.
			 * This is somewhat similar to the ExecUpdate() TM_SelfModified
			 * case.  We do not want to proceed because it would lead to the
			 * same row being updated a second time in some unspecified order,
			 * and in contrast to plain UPDATEs there's no historical behavior
			 * to break.
			 * It is the user's responsibility to prevent this situation from
			 * occurring.  These problems are why SQL-2003 similarly specifies
			 * that for SQL MERGE, an exception must be raised in the event of
			 * an attempt to update the same row twice.
			xminDatum = slot_getsysattr(existing,
			xmin = DatumGetTransactionId(xminDatum);

			if (TransactionIdIsCurrentTransactionId(xmin))
						 errmsg("ON CONFLICT DO UPDATE command cannot affect row a second time"),
						 errhint("Ensure that no rows proposed for insertion within the same command have duplicate constrained values.")));

			/* This shouldn't happen */
			elog(ERROR, "attempted to lock invisible tuple");

		case TM_SelfModified:

			 * This state should never be reached. As a dirty snapshot is used
			 * to find conflicting tuples, speculative insertion wouldn't have
			 * seen this row to conflict with.
			elog(ERROR, "unexpected self-updated tuple");

		case TM_Updated:
			if (IsolationUsesXactSnapshot())
						 errmsg("could not serialize access due to concurrent update")));

			 * As long as we don't support an UPDATE of INSERT ON CONFLICT for
			 * a partitioned table we shouldn't reach to a case where tuple to
			 * be lock is moved to another partition due to concurrent update
			 * of the partition key.

			 * Tell caller to try again from the very start.
			 * It does not make sense to use the usual EvalPlanQual() style
			 * loop here, as the new version of the row might not conflict
			 * anymore, or the conflicting tuple has actually been deleted.
			return false;

		case TM_Deleted:
			if (IsolationUsesXactSnapshot())
						 errmsg("could not serialize access due to concurrent delete")));

			/* see TM_Updated case */
			return false;

			elog(ERROR, "unrecognized table_tuple_lock status: %u", test);

	/* Success, the tuple is locked. */

	 * Verify that the tuple is visible to our MVCC snapshot if the current
	 * isolation level mandates that.
	 * It's not sufficient to rely on the check within ExecUpdate() as e.g.
	 * CONFLICT ... WHERE clause may prevent us from reaching that.
	 * This means we only ever continue when a new command in the current
	 * transaction could see the row, even though in READ COMMITTED mode the
	 * tuple will not be visible according to the current statement's
	 * snapshot.  This is in line with the way UPDATE deals with newer tuple
	 * versions.
	ExecCheckTupleVisible(estate, relation, existing);

	 * Make tuple and any needed join variables available to ExecQual and
	 * ExecProject.  The EXCLUDED tuple is installed in ecxt_innertuple, while
	 * the target's existing tuple is installed in the scantuple.  EXCLUDED
	 * has been made to reference INNER_VAR in setrefs.c, but there is no
	 * other redirection.
	econtext->ecxt_scantuple = existing;
	econtext->ecxt_innertuple = excludedSlot;
	econtext->ecxt_outertuple = NULL;

	if (!ExecQual(onConflictSetWhere, econtext))
		ExecClearTuple(existing);	/* see return below */
		InstrCountFiltered1(&mtstate->ps, 1);
		return true;			/* done with the tuple */

	if (resultRelInfo->ri_WithCheckOptions != NIL)
		 * Check target's existing tuple against UPDATE-applicable USING
		 * security barrier quals (if any), enforced here as RLS checks/WCOs.
		 * The rewriter creates UPDATE RLS checks/WCOs for UPDATE security
		 * quals, and stores them as WCOs of "kind" WCO_RLS_CONFLICT_CHECK,
		 * but that's almost the extent of its special handling for ON
		 * The rewriter will also have associated UPDATE applicable straight
		 * RLS checks/WCOs for the benefit of the ExecUpdate() call that
		 * follows.  INSERTs and UPDATEs naturally have mutually exclusive WCO
		 * kinds, so there is no danger of spurious over-enforcement in the
		 * INSERT or UPDATE path.
		ExecWithCheckOptions(WCO_RLS_CONFLICT_CHECK, resultRelInfo,

	/* Project the new tuple version */

	 * Note that it is possible that the target tuple has been modified in
	 * this session, after the above table_tuple_lock. We choose to not error
	 * out in that case, in line with ExecUpdate's treatment of similar cases.
	 * This can happen if an UPDATE is triggered from within ExecQual(),
	 * ExecWithCheckOptions() or ExecProject() above, e.g. by selecting from a
	 * wCTE in the ON CONFLICT's SET.

	/* Execute UPDATE with projection */
	*returning = ExecUpdate(mtstate, conflictTid, NULL,
							&mtstate->mt_epqstate, mtstate->ps.state,

	 * Clear out existing tuple, as there might not be another conflict among
	 * the next input rows. Don't want to hold resources till the end of the
	 * query.
	return true;

 * Process BEFORE EACH STATEMENT triggers
static void
fireBSTriggers(ModifyTableState *node)
	ModifyTable *plan = (ModifyTable *) node->ps.plan;
	ResultRelInfo *resultRelInfo = node->resultRelInfo;

	 * If the node modifies a partitioned table, we must fire its triggers.
	 * Note that in that case, node->resultRelInfo points to the first leaf
	 * partition, not the root table.
	if (node->rootResultRelInfo != NULL)
		resultRelInfo = node->rootResultRelInfo;

	switch (node->operation)
		case CMD_INSERT:
			ExecBSInsertTriggers(node->ps.state, resultRelInfo);
			if (plan->onConflictAction == ONCONFLICT_UPDATE)
		case CMD_UPDATE:
			ExecBSUpdateTriggers(node->ps.state, resultRelInfo);
		case CMD_DELETE:
			ExecBSDeleteTriggers(node->ps.state, resultRelInfo);
			elog(ERROR, "unknown operation");

 * Return the target rel ResultRelInfo.
 * This relation is the same as :
 * - the relation for which we will fire AFTER STATEMENT triggers.
 * - the relation into whose tuple format all captured transition tuples must
 *   be converted.
 * - the root partitioned table.
static ResultRelInfo *
getTargetResultRelInfo(ModifyTableState *node)
	 * Note that if the node modifies a partitioned table, node->resultRelInfo
	 * points to the first leaf partition, not the root table.
	if (node->rootResultRelInfo != NULL)
		return node->rootResultRelInfo;
		return node->resultRelInfo;

 * Process AFTER EACH STATEMENT triggers
static void
fireASTriggers(ModifyTableState *node)
	ModifyTable *plan = (ModifyTable *) node->ps.plan;
	ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node);

	switch (node->operation)
		case CMD_INSERT:
			if (plan->onConflictAction == ONCONFLICT_UPDATE)
			ExecASInsertTriggers(node->ps.state, resultRelInfo,
		case CMD_UPDATE:
			ExecASUpdateTriggers(node->ps.state, resultRelInfo,
		case CMD_DELETE:
			ExecASDeleteTriggers(node->ps.state, resultRelInfo,
			elog(ERROR, "unknown operation");

 * Set up the state needed for collecting transition tuples for AFTER
 * triggers.
static void
ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate)
	ModifyTable *plan = (ModifyTable *) mtstate->ps.plan;
	ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);

	/* Check for transition tables on the directly targeted relation. */
	mtstate->mt_transition_capture =
	if (plan->operation == CMD_INSERT &&
		plan->onConflictAction == ONCONFLICT_UPDATE)
		mtstate->mt_oc_transition_capture =

	 * If we found that we need to collect transition tuples then we may also
	 * need tuple conversion maps for any children that have TupleDescs that
	 * aren't compatible with the tuplestores.  (We can share these maps
	 * between the regular and ON CONFLICT cases.)
	if (mtstate->mt_transition_capture != NULL ||
		mtstate->mt_oc_transition_capture != NULL)

		 * Install the conversion map for the first plan for UPDATE and DELETE
		 * operations.  It will be advanced each time we switch to the next
		 * plan.  (INSERT operations set it every time, so we need not update
		 * mtstate->mt_oc_transition_capture here.)
		if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT)
			mtstate->mt_transition_capture->tcs_map =
				tupconv_map_for_subplan(mtstate, 0);

 * ExecPrepareTupleRouting --- prepare for routing one tuple
 * Determine the partition in which the tuple in slot is to be inserted,
 * and modify mtstate and estate to prepare for it.
 * Caller must revert the estate changes after executing the insertion!
 * In mtstate, transition capture changes may also need to be reverted.
 * Returns a slot holding the tuple of the partition rowtype.
static TupleTableSlot *
ExecPrepareTupleRouting(ModifyTableState *mtstate,
						EState *estate,
						PartitionTupleRouting *proute,
						ResultRelInfo *targetRelInfo,
						TupleTableSlot *slot)
	ResultRelInfo *partrel;
	PartitionRoutingInfo *partrouteinfo;
	TupleConversionMap *map;

	 * Lookup the target partition's ResultRelInfo.  If ExecFindPartition does
	 * not find a valid partition for the tuple in 'slot' then an error is
	 * raised.  An error may also be raised if the found partition is not a
	 * valid target for INSERTs.  This is required since a partitioned table
	 * UPDATE to another partition becomes a DELETE+INSERT.
	partrel = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate);
	partrouteinfo = partrel->ri_PartitionInfo;
	Assert(partrouteinfo != NULL);

	 * Make it look like we are inserting into the partition.
	estate->es_result_relation_info = partrel;

	 * If we're capturing transition tuples, we might need to convert from the
	 * partition rowtype to root partitioned table's rowtype.
	if (mtstate->mt_transition_capture != NULL)
		if (partrel->ri_TrigDesc &&
			 * If there are any BEFORE triggers on the partition, we'll have
			 * to be ready to convert their result back to tuplestore format.
			mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
			mtstate->mt_transition_capture->tcs_map =
			 * Otherwise, just remember the original unconverted tuple, to
			 * avoid a needless round trip conversion.
			mtstate->mt_transition_capture->tcs_original_insert_tuple = slot;
			mtstate->mt_transition_capture->tcs_map = NULL;
	if (mtstate->mt_oc_transition_capture != NULL)
		mtstate->mt_oc_transition_capture->tcs_map =

	 * Convert the tuple, if necessary.
	map = partrouteinfo->pi_RootToPartitionMap;
	if (map != NULL)
		TupleTableSlot *new_slot = partrouteinfo->pi_PartitionTupleSlot;

		slot = execute_attr_map_slot(map->attrMap, slot, new_slot);

	return slot;

 * Initialize the child-to-root tuple conversion map array for UPDATE subplans.
 * This map array is required to convert the tuple from the subplan result rel
 * to the target table descriptor. This requirement arises for two independent
 * scenarios:
 * 1. For update-tuple-routing.
 * 2. For capturing tuples in transition tables.
static void
ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate)
	ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate);
	ResultRelInfo *resultRelInfos = mtstate->resultRelInfo;
	TupleDesc	outdesc;
	int			numResultRelInfos = mtstate->mt_nplans;
	int			i;

	 * Build array of conversion maps from each child's TupleDesc to the one
	 * used in the target relation.  The map pointers may be NULL when no
	 * conversion is necessary, which is hopefully a common case.

	/* Get tuple descriptor of the target rel. */
	outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc);

	mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **)
		palloc(sizeof(TupleConversionMap *) * numResultRelInfos);

	for (i = 0; i < numResultRelInfos; ++i)
		mtstate->mt_per_subplan_tupconv_maps[i] =
								   gettext_noop("could not convert row type"));

 * For a given subplan index, get the tuple conversion map.
static TupleConversionMap *
tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan)
	/* If nobody else set the per-subplan array of maps, do so ourselves. */
	if (mtstate->mt_per_subplan_tupconv_maps == NULL)

	Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans);
	return mtstate->mt_per_subplan_tupconv_maps[whichplan];

/* ----------------------------------------------------------------
 *	   ExecModifyTable
 *		Perform table modifications as required, and return RETURNING results
 *		if needed.
 * ----------------------------------------------------------------
static TupleTableSlot *
ExecModifyTable(PlanState *pstate)
	ModifyTableState *node = castNode(ModifyTableState, pstate);
	PartitionTupleRouting *proute = node->mt_partition_tuple_routing;
	EState	   *estate = node->ps.state;
	CmdType		operation = node->operation;
	ResultRelInfo *saved_resultRelInfo;
	ResultRelInfo *resultRelInfo;
	PlanState  *subplanstate;
	JunkFilter *junkfilter;
	AttrNumber  action_attno;
	AttrNumber  segid_attno;
	TupleTableSlot *slot;
	TupleTableSlot *planSlot;
	ItemPointer tupleid;
	ItemPointerData tuple_ctid;
	HeapTupleData oldtupdata;
	HeapTuple	oldtuple;


	 * This should NOT get called during EvalPlanQual; we should have passed a
	 * subplan tree to EvalPlanQual, instead.  Use a runtime test not just
	 * Assert because this condition is easy to miss in testing.  (Note:
	 * although ModifyTable should not get executed within an EvalPlanQual
	 * operation, we do have to allow it to be initialized and shut down in
	 * case it is within a CTE subplan.  Hence this test must be here, not in
	 * ExecInitModifyTable.)
	if (estate->es_epqTupleSlot != NULL)
		elog(ERROR, "ModifyTable should not be called during EvalPlanQual");

	 * If we've already completed processing, don't try to do more.  We need
	 * this test because ExecPostprocessPlan might call us an extra time, and
	 * our subplan's nodes aren't necessarily robust against being called
	 * extra times.
	if (node->mt_done)
		return NULL;

	if (Gp_role == GP_ROLE_EXECUTE && !Gp_is_writer)
		 * Current Greenplum MPP architecture only support one writer gang, and
		 * only writer gang can execute DML nodes. There is no code path to reach
		 * here. For writable CTE case as below:
		 *   create table t(a int);
		 *   with wcte as (delete from t returning a)
		 *     insert into t select * from wcte;
		 * The above query will error out during parse-analyze so not reach here.
		 * If reaching here, some bugs must happen.
		elog(ERROR, "Reader Gang execute ModifyTable node, some bugs must happen");

	 * On first call, fire BEFORE STATEMENT triggers before proceeding.
	if (node->fireBSTriggers)
		node->fireBSTriggers = false;

	/* Preload local variables */
	resultRelInfo = node->resultRelInfo + node->mt_whichplan;
	subplanstate = node->mt_plans[node->mt_whichplan];
	junkfilter = resultRelInfo->ri_junkFilter;
	action_attno = resultRelInfo->ri_action_attno;
	segid_attno = resultRelInfo->ri_segid_attno;

	 * es_result_relation_info must point to the currently active result
	 * relation while we are within this ModifyTable node.  Even though
	 * ModifyTable nodes can't be nested statically, they can be nested
	 * dynamically (since our subplan could include a reference to a modifying
	 * CTE).  So we have to save and restore the caller's value.
	saved_resultRelInfo = estate->es_result_relation_info;

	estate->es_result_relation_info = resultRelInfo;

	 * Fetch rows from subplan(s), and execute the required table modification
	 * for each row.
	for (;;)
		 * Reset the per-output-tuple exprcontext.  This is needed because
		 * triggers expect to use that context as workspace.  It's a bit ugly
		 * to do this below the top level of the plan, however.  We might need
		 * to rethink this later.

		 * Reset per-tuple memory context used for processing on conflict and
		 * returning clauses, to free any expression evaluation storage
		 * allocated in the previous cycle.
		if (pstate->ps_ExprContext)

		planSlot = ExecProcNode(subplanstate);

		if (TupIsNull(planSlot))
			/* advance to next subplan if any */
			if (node->mt_whichplan < node->mt_nplans)
				estate->es_result_relation_info = estate->es_result_relations + node->mt_whichplan;
				resultRelInfo = estate->es_result_relation_info;
				subplanstate = node->mt_plans[node->mt_whichplan];
				junkfilter = estate->es_result_relation_info->ri_junkFilter;
				action_attno = estate->es_result_relation_info->ri_action_attno;
				segid_attno = estate->es_result_relation_info->ri_segid_attno;
				EvalPlanQualSetPlan(&node->mt_epqstate, subplanstate->plan,
				/* Prepare to convert transition tuples from this child. */
				if (node->mt_transition_capture != NULL)
					node->mt_transition_capture->tcs_map =
						tupconv_map_for_subplan(node, node->mt_whichplan);
				if (node->mt_oc_transition_capture != NULL)
					node->mt_oc_transition_capture->tcs_map =
						tupconv_map_for_subplan(node, node->mt_whichplan);

		 * Ensure input tuple is the right format for the target relation.
		if (node->mt_scans[node->mt_whichplan]->tts_ops != planSlot->tts_ops)
			ExecCopySlot(node->mt_scans[node->mt_whichplan], planSlot);
			planSlot = node->mt_scans[node->mt_whichplan];

		 * If resultRelInfo->ri_usesFdwDirectModify is true, all we need to do
		 * here is compute the RETURNING expressions.
		if (resultRelInfo->ri_usesFdwDirectModify)

			 * A scan slot containing the data that was actually inserted,
			 * updated or deleted has already been made available to
			 * ExecProcessReturning by IterateDirectModify, so no need to
			 * provide it here.
			slot = ExecProcessReturning(resultRelInfo, NULL, planSlot);

			estate->es_result_relation_info = saved_resultRelInfo;
			return slot;

		EvalPlanQualSetSlot(&node->mt_epqstate, planSlot);
		slot = planSlot;

		int32 segid = GpIdentity.segindex;
		int action = -1;

		tupleid = NULL;
		oldtuple = NULL;
		if (junkfilter != NULL)
			 * extract the 'ctid' or 'wholerow' junk attribute.
			if (operation == CMD_UPDATE || operation == CMD_DELETE)
				char		relkind;
				Datum		datum;
				bool		isNull;

				relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind;
				if (relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW)
					datum = ExecGetJunkAttribute(slot,
					/* shouldn't ever get a null result... */
					if (isNull)
						elog(ERROR, "ctid is NULL");

					tupleid = (ItemPointer) DatumGetPointer(datum);
					tuple_ctid = *tupleid;	/* be sure we don't free ctid!! */
					tupleid = &tuple_ctid;

				 * Use the wholerow attribute, when available, to reconstruct
				 * the old relation tuple.
				 * Foreign table updates have a wholerow attribute when the
				 * relation has a row-level trigger.  Note that the wholerow
				 * attribute does not carry system columns.  Foreign table
				 * triggers miss seeing those, except that we know enough here
				 * to set t_tableOid.  Quite separately from this, the FDW may
				 * fetch its own junk attrs to identify the row.
				 * Other relevant relkinds, currently limited to views, always
				 * have a wholerow attribute.
				else if (AttributeNumberIsValid(junkfilter->jf_junkAttNo))
					datum = ExecGetJunkAttribute(slot,
					/* shouldn't ever get a null result... */
					if (isNull)
						elog(ERROR, "wholerow is NULL");

					oldtupdata.t_data = DatumGetHeapTupleHeader(datum);
					oldtupdata.t_len =
					/* Historically, view triggers see invalid t_tableOid. */
					oldtupdata.t_tableOid =
						(relkind == RELKIND_VIEW) ? InvalidOid :
					oldtuple = &oldtupdata;
					Assert(relkind == RELKIND_FOREIGN_TABLE);

				 * Extract GPDB-specific junk attributes.
				if (AttributeNumberIsValid(segid_attno))
					datum = ExecGetJunkAttribute(slot,
					/* shouldn't ever get a null result... */
					if (isNull)
						elog(ERROR, "gp_segment_id is NULL");

					segid = DatumGetInt32(datum);
				if (AttributeNumberIsValid(action_attno))
					datum = ExecGetJunkAttribute(slot,
					/* shouldn't ever get a null result... */
					if (isNull)
						elog(ERROR, "action is NULL");

					action = DatumGetInt32(datum);

			 * apply the junkfilter if needed.
			if (operation != CMD_DELETE)
				slot = ExecFilterJunk(junkfilter, slot);

		switch (operation)
			case CMD_INSERT:
				/* Prepare for tuple routing if needed. */
				if (proute)
					slot = ExecPrepareTupleRouting(node, estate, proute,
												   resultRelInfo, slot);
				slot = ExecInsert(node, slot, planSlot,
								  estate, node->canSetTag, false /* splitUpdate */);
				/* Revert ExecPrepareTupleRouting's state change. */
				if (proute)
					estate->es_result_relation_info = resultRelInfo;
			case CMD_UPDATE:
				if (!AttributeNumberIsValid(action_attno))
					/* normal non-split UPDATE */
					slot = ExecUpdate(node, tupleid, oldtuple, slot, planSlot,
									  &node->mt_epqstate, estate, node->canSetTag);
				else if (DML_INSERT == action)
					slot = ExecSplitUpdate_Insert(node, slot, planSlot,
												  estate, node->canSetTag);
				else /* DML_DELETE */
					slot = ExecDelete(node, tupleid, segid, oldtuple, planSlot,
									  &node->mt_epqstate, estate,
									  false /* canSetTag */,
									  true /* changingPart */ ,
									  true /* splitUpdate */ ,
									  NULL, NULL);
			case CMD_DELETE:
				slot = ExecDelete(node, tupleid, segid, oldtuple, planSlot,
								  &node->mt_epqstate, estate,
								  true, node->canSetTag,
								  false /* changingPart */ ,
								  false /* splitUpdate */ ,
								  NULL, NULL);
				elog(ERROR, "unknown operation");

		 * If the target is a partitioned table, ExecInsert / ExecUpdate /
		 * ExecDelete might have changed es_result_relation_info to point to
		 * a partition, instead of the top-level table. Reset it. (It would
		 * be more tidy if those functions cleaned up after themselves, but
		 * it's more robust to do it here just once.)
		estate->es_result_relation_info = resultRelInfo;

		 * If we got a RETURNING result, return it to caller.  We'll continue
		 * the work on next call.
		if (slot)
			estate->es_result_relation_info = saved_resultRelInfo;
			return slot;

	/* Restore es_result_relation_info before exiting */
	estate->es_result_relation_info = saved_resultRelInfo;

	 * We're done, but fire AFTER STATEMENT triggers before exiting.
	/* In GPDB, don't fire statement triggers in reader processes */
	if (Gp_role != GP_ROLE_EXECUTE || Gp_is_writer)

	node->mt_done = true;

	return NULL;

/*
 * ExecInitModifyTable builds and returns the ModifyTableState for a
 * ModifyTable plan node.
 *
 *	node	- the ModifyTable plan node being initialized
 *	estate	- executor state; es_result_relation_info is saved before the
 *			  per-subplan setup and restored after it
 *	eflags	- executor flags; EXEC_FLAG_BACKWARD and EXEC_FLAG_MARK are
 *			  asserted to be clear, and EXEC_FLAG_EXPLAIN_ONLY suppresses the
 *			  transition-capture setup
 *
 * The visible steps are, in order: allocate the per-subplan arrays, set up
 * each subplan and its result relation, build tuple-routing state, set up
 * WITH CHECK OPTION, RETURNING and ON CONFLICT state, collect row marks,
 * and install junk filters.
 *
 * NOTE(review): in this copy several statements are visibly cut short
 * (trailing call arguments, loop increments, block delimiters).  Confirm the
 * code against the upstream file before relying on it.
 */
/* ----------------------------------------------------------------
 *		ExecInitModifyTable
 * ----------------------------------------------------------------
ModifyTableState *
ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
	ModifyTableState *mtstate;
	CmdType		operation = node->operation;
	int			nplans = list_length(node->plans);
	ResultRelInfo *saved_resultRelInfo;
	ResultRelInfo *resultRelInfo;
	Plan	   *subplan;
	ListCell   *l;
	int			i;
	Relation	rel;
	bool		update_tuple_routing_needed = node->partColsUpdated;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	 * create state structure
	mtstate = makeNode(ModifyTableState);
	mtstate->ps.plan = (Plan *) node;
	mtstate->ps.state = estate;
	mtstate->ps.ExecProcNode = ExecModifyTable;

	mtstate->operation = operation;
	mtstate->canSetTag = node->canSetTag;
	mtstate->mt_done = false;

	mtstate->mt_plans = (PlanState **) palloc0(sizeof(PlanState *) * nplans);
	mtstate->resultRelInfo = estate->es_result_relations + node->resultRelIndex;
	mtstate->mt_scans = (TupleTableSlot **) palloc0(sizeof(TupleTableSlot *) * nplans);

	/* If modifying a partitioned table, initialize the root table info */
	if (node->rootResultRelIndex >= 0)
		mtstate->rootResultRelInfo = estate->es_root_result_relations +

	mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
	mtstate->mt_nplans = nplans;

	/* set up epqstate with dummy subplan data for the moment */
	EvalPlanQualInit(&mtstate->mt_epqstate, estate, NULL, NIL, node->epqParam);

	/*
	 * GPDB: for UPDATE, copy the plan's per-subplan is-split-update flags
	 * into a zero-initialized bool array; when the list is present it must
	 * have exactly one entry per subplan.
	 */
	if (CMD_UPDATE == operation)
		mtstate->mt_isSplitUpdates = (bool *) palloc0(nplans * sizeof(bool));
		if (node->isSplitUpdates)
			if (list_length(node->isSplitUpdates) != nplans)
				elog(ERROR, "ModifyTable node is missing is-split-update information");

			i = 0;
			foreach(l, node->isSplitUpdates)
				mtstate->mt_isSplitUpdates[i++] = (bool) lfirst_int(l);

	/* GPDB: Don't fire statement-triggers in QE reader processes */
	if (Gp_role != GP_ROLE_EXECUTE || Gp_is_writer)
		mtstate->fireBSTriggers = true;

	 * call ExecInitNode on each of the plans to be executed and save the
	 * results into the array "mt_plans".  This is also a convenient place to
	 * verify that the proposed target relations are valid and open their
	 * indexes for insertion of new index entries.  Note we *must* set
	 * estate->es_result_relation_info correctly while we initialize each
	 * sub-plan; external modules such as FDWs may depend on that (see
	 * contrib/postgres_fdw/postgres_fdw.c: postgresBeginDirectModify() as one
	 * example).
	saved_resultRelInfo = estate->es_result_relation_info;

	resultRelInfo = mtstate->resultRelInfo;
	i = 0;
	foreach(l, node->plans)
		subplan = (Plan *) lfirst(l);

		/* Initialize the usesFdwDirectModify flag */
		resultRelInfo->ri_usesFdwDirectModify = bms_is_member(i,

		 * Verify result relation is a valid target for the current operation
		CheckValidResultRel(resultRelInfo, operation);

		 * If there are indices on the result relation, open them and save
		 * descriptors in the result relation info, so that we can add new
		 * index entries for the tuples we add/update.  We need not do this
		 * for a DELETE, however, since deletion doesn't affect indexes. Also,
		 * inside an EvalPlanQual operation, the indexes might be open
		 * already, since we share the resultrel state with the original
		 * query.
		if (resultRelInfo->ri_RelationDesc->rd_rel->relhasindex &&
			operation != CMD_DELETE &&
			resultRelInfo->ri_IndexRelationDescs == NULL)
							node->onConflictAction != ONCONFLICT_NONE);

		 * If this is an UPDATE and a BEFORE UPDATE trigger is present, the
		 * trigger itself might modify the partition-key values. So arrange
		 * for tuple routing.
		if (resultRelInfo->ri_TrigDesc &&
			resultRelInfo->ri_TrigDesc->trig_update_before_row &&
			operation == CMD_UPDATE)
			update_tuple_routing_needed = true;

		/* Now init the plan for this result rel */
		estate->es_result_relation_info = resultRelInfo;
		mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags);
		mtstate->mt_scans[i] =
			ExecInitExtraTupleSlot(mtstate->ps.state, ExecGetResultType(mtstate->mt_plans[i]),

		/*
		 * GPDB: append-optimized row-oriented and column-oriented result
		 * relations each get their own per-operation DML initialization.
		 */
		if (RelationIsAoRows(resultRelInfo->ri_RelationDesc))
			appendonly_dml_init(resultRelInfo->ri_RelationDesc, operation);
		else if (RelationIsAoCols(resultRelInfo->ri_RelationDesc))
			aoco_dml_init(resultRelInfo->ri_RelationDesc, operation);

		/* Also let FDWs init themselves for foreign-table result rels */
		if (!resultRelInfo->ri_usesFdwDirectModify &&
			resultRelInfo->ri_FdwRoutine != NULL &&
			resultRelInfo->ri_FdwRoutine->BeginForeignModify != NULL)
			List	   *fdw_private = (List *) list_nth(node->fdwPrivLists, i);



	estate->es_result_relation_info = saved_resultRelInfo;

	/* Get the target relation */
	rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc;

	 * If it's not a partitioned table after all, UPDATE tuple routing should
	 * not be attempted.
	if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
		update_tuple_routing_needed = false;

	 * Build state for tuple routing if it's an INSERT or if it's an UPDATE of
	 * partition key.
	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE &&
		(operation == CMD_INSERT || update_tuple_routing_needed))
		mtstate->mt_partition_tuple_routing =
			ExecSetupPartitionTupleRouting(estate, mtstate, rel);

	 * Build state for collecting transition tuples.  This requires having a
	 * valid trigger query context, so skip it in explain-only mode.
	if (!(eflags & EXEC_FLAG_EXPLAIN_ONLY))
		ExecSetupTransitionCaptureState(mtstate, estate);

	 * Construct mapping from each of the per-subplan partition attnos to the
	 * root attno.  This is required when during update row movement the tuple
	 * descriptor of a source partition does not match the root partitioned
	 * table descriptor.  In such a case we need to convert tuples to the root
	 * tuple descriptor, because the search for destination partition starts
	 * from the root.  We'll also need a slot to store these converted tuples.
	 * We can skip this setup if it's not a partition key update.
	if (update_tuple_routing_needed)
		mtstate->mt_root_tuple_slot = table_slot_create(rel, NULL);

	 * Initialize any WITH CHECK OPTION constraints if needed.
	resultRelInfo = mtstate->resultRelInfo;
	i = 0;
	foreach(l, node->withCheckOptionLists)
		List	   *wcoList = (List *) lfirst(l);
		List	   *wcoExprs = NIL;
		ListCell   *ll;

		foreach(ll, wcoList)
			WithCheckOption *wco = (WithCheckOption *) lfirst(ll);
			ExprState  *wcoExpr = ExecInitQual((List *) wco->qual,

			wcoExprs = lappend(wcoExprs, wcoExpr);

		resultRelInfo->ri_WithCheckOptions = wcoList;
		resultRelInfo->ri_WithCheckOptionExprs = wcoExprs;

	 * Initialize RETURNING projections if needed.
	if (node->returningLists)
		TupleTableSlot *slot;
		ExprContext *econtext;

		 * Initialize result tuple slot and assign its rowtype using the first
		 * RETURNING list.  We assume the rest will look the same.
		mtstate->ps.plan->targetlist = (List *) linitial(node->returningLists);

		/* Set up a slot for the output of the RETURNING projection(s) */
		ExecInitResultTupleSlotTL(&mtstate->ps, &TTSOpsVirtual);
		slot = mtstate->ps.ps_ResultTupleSlot;

		/* Need an econtext too */
		if (mtstate->ps.ps_ExprContext == NULL)
			ExecAssignExprContext(estate, &mtstate->ps);
		econtext = mtstate->ps.ps_ExprContext;

		 * Build a projection for each result rel.
		resultRelInfo = mtstate->resultRelInfo;
		foreach(l, node->returningLists)
			List	   *rlist = (List *) lfirst(l);

			resultRelInfo->ri_returningList = rlist;
			resultRelInfo->ri_projectReturning =
				ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
		 * We still must construct a dummy result tuple type, because InitPlan
		 * expects one (maybe should change that?).
		mtstate->ps.plan->targetlist = NIL;

		mtstate->ps.ps_ExprContext = NULL;

	/* Set the list of arbiter indexes if needed for ON CONFLICT */
	resultRelInfo = mtstate->resultRelInfo;
	if (node->onConflictAction != ONCONFLICT_NONE)
		resultRelInfo->ri_onConflictArbiterIndexes = node->arbiterIndexes;

	 * If needed, Initialize target list, projection and qual for ON CONFLICT
	if (node->onConflictAction == ONCONFLICT_UPDATE)
		ExprContext *econtext;
		TupleDesc	relationDesc;
		TupleDesc	tupDesc;

		/* insert may only have one plan, inheritance is not expanded */
		Assert(nplans == 1);

		/* already exists if created by RETURNING processing above */
		if (mtstate->ps.ps_ExprContext == NULL)
			ExecAssignExprContext(estate, &mtstate->ps);

		econtext = mtstate->ps.ps_ExprContext;
		relationDesc = resultRelInfo->ri_RelationDesc->rd_att;

		/* carried forward solely for the benefit of explain */
		mtstate->mt_excludedtlist = node->exclRelTlist;

		/* create state for DO UPDATE SET operation */
		resultRelInfo->ri_onConflict = makeNode(OnConflictSetState);

		/* initialize slot for the existing tuple */
		resultRelInfo->ri_onConflict->oc_Existing =

		/* create the tuple slot for the UPDATE SET projection */
		tupDesc = ExecTypeFromTL((List *) node->onConflictSet);
		resultRelInfo->ri_onConflict->oc_ProjSlot =
			ExecInitExtraTupleSlot(mtstate->ps.state, tupDesc,

		/* build UPDATE SET projection state */
		resultRelInfo->ri_onConflict->oc_ProjInfo =
			ExecBuildProjectionInfo(node->onConflictSet, econtext,

		/* initialize state to evaluate the WHERE clause, if any */
		if (node->onConflictWhere)
			ExprState  *qualexpr;

			qualexpr = ExecInitQual((List *) node->onConflictWhere,
			resultRelInfo->ri_onConflict->oc_WhereClause = qualexpr;

	 * If we have any secondary relations in an UPDATE or DELETE, they need to
	 * be treated like non-locked relations in SELECT FOR UPDATE, ie, the
	 * EvalPlanQual mechanism needs to be told about them.  Locate the
	 * relevant ExecRowMarks.
	foreach(l, node->rowMarks)
		PlanRowMark *rc = lfirst_node(PlanRowMark, l);
		ExecRowMark *erm;
		bool		isdistributed = false;

		/* ignore "parent" rowmarks; they are irrelevant at runtime */
		if (rc->isParent)

		 * Like in preprocess_targetlist, ignore distributed tables.
		 * GPDB_91_MERGE_FIXME: we are largely just ignoring locking altogether.
		 * Perhaps that's OK as long as we take a full table lock on any UPDATEs
		 * or DELETEs. But sure doesn't seem right. Can we do better?
			RangeTblEntry *rte = rt_fetch(rc->rti, estate->es_plannedstmt->rtable);

			if (rte->rtekind == RTE_RELATION)
				Relation relation = heap_open(rte->relid, NoLock);
				if (GpPolicyIsPartitioned(relation->rd_cdbpolicy))
					isdistributed = true;
				heap_close(relation, NoLock);
				if (isdistributed)
		if (Gp_role == GP_ROLE_EXECUTE)
			 * In the executor, we don't have information on which tables are
			 * distributed. Assume that everything is; we wouldn't be running this
			 * slice on an entry table otherwise.

		/* find ExecRowMark (same for all subplans) */
		erm = ExecFindRowMark(estate, rc->rti, false);

		/* build ExecAuxRowMark for each subplan */
		for (i = 0; i < nplans; i++)
			ExecAuxRowMark *aerm;

			subplan = mtstate->mt_plans[i]->plan;
			aerm = ExecBuildAuxRowMark(erm, subplan->targetlist);
			mtstate->mt_arowmarks[i] = lappend(mtstate->mt_arowmarks[i], aerm);

	/* select first subplan */
	mtstate->mt_whichplan = 0;
	subplan = (Plan *) linitial(node->plans);
	EvalPlanQualSetPlan(&mtstate->mt_epqstate, subplan,

	 * Initialize the junk filter(s) if needed.  INSERT queries need a filter
	 * if there are any junk attrs in the tlist.  UPDATE and DELETE always
	 * need a filter, since there's always at least one junk attribute present
	 * --- no need to look first.  Typically, this will be a 'ctid' or
	 * 'wholerow' attribute, but in the case of a foreign data wrapper it
	 * might be a set of junk attributes sufficient to identify the remote
	 * row.
	 * If there are multiple result relations, each one needs its own junk
	 * filter.  Note multiple rels are only possible for UPDATE/DELETE, so we
	 * can't be fooled by some needing a filter and some not.
	 * This section of code is also a convenient place to verify that the
	 * output of an INSERT or UPDATE matches the target table(s).
		bool		junk_filter_needed = false;

		switch (operation)
			case CMD_INSERT:
				foreach(l, subplan->targetlist)
					TargetEntry *tle = (TargetEntry *) lfirst(l);

					if (tle->resjunk)
						junk_filter_needed = true;
			case CMD_UPDATE:
			case CMD_DELETE:
				junk_filter_needed = true;
				elog(ERROR, "unknown operation");

		if (junk_filter_needed)
			resultRelInfo = mtstate->resultRelInfo;
			for (i = 0; i < nplans; i++)
				JunkFilter *j;
				TupleTableSlot *junkresslot;

				subplan = mtstate->mt_plans[i]->plan;
				if (operation == CMD_INSERT || operation == CMD_UPDATE)

				junkresslot =
					ExecInitExtraTupleSlot(estate, NULL,
				j = ExecInitJunkFilter(subplan->targetlist,

				if (operation == CMD_UPDATE || operation == CMD_DELETE)
					/* For UPDATE/DELETE, find the appropriate junk attr now */
					char		relkind;

					relkind = resultRelInfo->ri_RelationDesc->rd_rel->relkind;
					if (relkind == RELKIND_RELATION ||
						relkind == RELKIND_MATVIEW ||
						j->jf_junkAttNo = ExecFindJunkAttribute(j, "ctid");
						if (!AttributeNumberIsValid(j->jf_junkAttNo))
							elog(ERROR, "could not find junk ctid column");

						/* Extra GPDB junk columns */
						resultRelInfo->ri_segid_attno = ExecFindJunkAttribute(j, "gp_segment_id");
						if (!AttributeNumberIsValid(resultRelInfo->ri_segid_attno))
							elog(ERROR, "could not find junk gp_segment_id column");

						if (operation == CMD_UPDATE && mtstate->mt_isSplitUpdates[i])
							resultRelInfo->ri_action_attno = ExecFindJunkAttribute(j, "DMLAction");
							if (!AttributeNumberIsValid(resultRelInfo->ri_action_attno))
								elog(ERROR, "could not find junk action column");
					else if (relkind == RELKIND_FOREIGN_TABLE)
						 * When there is a row-level trigger, there should be
						 * a wholerow attribute.
						j->jf_junkAttNo = ExecFindJunkAttribute(j, "wholerow");
						/*
						 * NOTE(review): the identical lookup follows
						 * immediately; presumably the two belong to separate
						 * branches (only the second is followed by a validity
						 * check) whose delimiters are not visible here.
						 */
						j->jf_junkAttNo = ExecFindJunkAttribute(j, "wholerow");
						if (!AttributeNumberIsValid(j->jf_junkAttNo))
							elog(ERROR, "could not find junk wholerow column");

				resultRelInfo->ri_junkFilter = j;
			if (operation == CMD_INSERT)

	 * Lastly, if this is not the primary (canSetTag) ModifyTable node, add it
	 * to estate->es_auxmodifytables so that it will be run to completion by
	 * ExecPostprocessPlan.  (It'd actually work fine to add the primary
	 * ModifyTable node too, but there's no need.)  Note the use of lcons not
	 * lappend: we need later-initialized ModifyTable nodes to be shut down
	 * before earlier ones.  This ensures that we don't throw away RETURNING
	 * rows that need to be seen by a later CTE subplan.
	/*
	 * GPDB: the registration below is attempted only when running in the
	 * GP_ROLE_EXECUTE or GP_ROLE_UTILITY role.
	 */
	if (Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY)
		 * We do not need this unless in executor or with utility role. Note
		 * This was added for the data modifying CTE feature but there are other
		 * cases could run into this also.
		if (!mtstate->canSetTag)
			estate->es_auxmodifytables = lcons(mtstate,

	return mtstate;

/* ----------------------------------------------------------------
 *		ExecEndModifyTable
 *		Shuts down the plan.
 *		Returns nothing of interest.
 * ----------------------------------------------------------------
ExecEndModifyTable(ModifyTableState *node)
	int			i;

	 * Allow any FDWs to shut down
	for (i = 0; i < node->mt_nplans; i++)
		ResultRelInfo *resultRelInfo = node->resultRelInfo + i;

		if (!resultRelInfo->ri_usesFdwDirectModify &&
			resultRelInfo->ri_FdwRoutine != NULL &&
			resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL)
		if (RelationIsAoRows(resultRelInfo->ri_RelationDesc))
		else if(RelationIsAoCols(resultRelInfo->ri_RelationDesc))
			aoco_dml_finish(resultRelInfo->ri_RelationDesc, node->operation);

	 * Close all the partitioned tables, leaf partitions, and their indices
	 * and release the slot used for tuple routing, if set.
	if (node->mt_partition_tuple_routing)
		ExecCleanupTupleRouting(node, node->mt_partition_tuple_routing);

		if (node->mt_root_tuple_slot)

	 * Free the exprcontext

	 * clean out the tuple table
	if (node->ps.ps_ResultTupleSlot)

	 * Terminate EPQ execution if active

	 * shut down subplans
	for (i = 0; i < node->mt_nplans; i++)

/*
 * ExecReScanModifyTable
 *		Rescanning a ModifyTable node is not supported; this always raises
 *		an ERROR and never returns normally.
 */
void
ExecReScanModifyTable(ModifyTableState *node)
{
	/*
	 * Currently, we don't need to support rescan on ModifyTable nodes. The
	 * semantics of that would be a bit debatable anyway.
	 */
	elog(ERROR, "ExecReScanModifyTable is not implemented");
}

/*
 * ExecSquelchModifyTable
 *		Drive the ModifyTable node until it reports completion, discarding
 *		any tuples it returns along the way.
 */
void
ExecSquelchModifyTable(ModifyTableState *node)
{
	/*
	 * ModifyTable nodes must run to completion when asked to Squelch so
	 * that we don't risk losing modifications which should be performed
	 * regardless of any LIMIT's or other forms for projections which could
	 * end up causing a squelch to happen.
	 */
	for (;;)
	{
		TupleTableSlot *result;

		result = ExecModifyTable(&node->ps);

		/* A NULL result means the node has finished all of its work. */
		if (!result)
			break;
	}
}


