@@ -6450,13 +6450,18 @@ heap_inplace_update_and_unlock(Relation relation,
 	HeapTupleHeader htup = oldtup->t_data;
 	uint32		oldlen;
 	uint32		newlen;
+	char	   *dst;
+	char	   *src;
 
 	Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
 	oldlen = oldtup->t_len - htup->t_hoff;
 	newlen = tuple->t_len - tuple->t_data->t_hoff;
 	if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
 		elog(ERROR, "wrong tuple length");
 
+	dst = (char *) htup + htup->t_hoff;
+	src = (char *) tuple->t_data + tuple->t_data->t_hoff;
+
 	/*
 	 * Unlink relcache init files as needed.  If unlinking, acquire
 	 * RelCacheInitLock until after associated invalidations.  By doing this
@@ -6467,15 +6472,15 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	PreInplace_Inval();
 
-	/* NO EREPORT(ERROR) from here till changes are logged */
-	START_CRIT_SECTION();
-
-	memcpy((char *) htup + htup->t_hoff,
-		   (char *) tuple->t_data + tuple->t_data->t_hoff,
-		   newlen);
-
 	/*----------
-	 * XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
+	 * NO EREPORT(ERROR) from here till changes are complete
+	 *
+	 * Our buffer lock won't stop a reader having already pinned and checked
+	 * visibility for this tuple.  Hence, we write WAL first, then mutate the
+	 * buffer.  Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
+	 * checkpoint delay makes that acceptable.  With the usual order of
+	 * changes, a crash after memcpy() and before XLogInsert() could allow
+	 * datfrozenxid to overtake relfrozenxid:
 	 *
 	 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
 	 * ["R" is a VACUUM tbl]
@@ -6485,31 +6490,65 @@ heap_inplace_update_and_unlock(Relation relation,
 	 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
 	 * [crash]
 	 * [recovery restores datfrozenxid w/o relfrozenxid]
-	 */
-
-	MarkBufferDirty(buffer);
+	 *
+	 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
+	 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
+	 * The stack copy facilitates a FPI of the post-mutation block before we
+	 * accept other sessions seeing it.  DELAY_CHKPT_START allows us to
+	 * XLogInsert() before MarkBufferDirty().  Since XLogSaveBufferForHint()
+	 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
+	 * This function, however, likely could avoid it with the following order
+	 * of operations: MarkBufferDirty(), XLogInsert(), memcpy().  Opt to use
+	 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
+	 * patterns to analyze.  Inplace update isn't so frequent that it should
+	 * pursue the small optimization of skipping DELAY_CHKPT_START.
+	 */
+	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+	START_CRIT_SECTION();
+	MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 
 	/* XLOG stuff */
 	if (RelationNeedsWAL(relation))
 	{
 		xl_heap_inplace xlrec;
+		PGAlignedBlock copied_buffer;
+		char	   *origdata = (char *) BufferGetBlock(buffer);
+		Page		page = BufferGetPage(buffer);
+		uint16		lower = ((PageHeader) page)->pd_lower;
+		uint16		upper = ((PageHeader) page)->pd_upper;
+		uintptr_t	dst_offset_in_block;
+		RelFileLocator rlocator;
+		ForkNumber	forkno;
+		BlockNumber blkno;
 		XLogRecPtr	recptr;
 
 		xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
 
 		XLogBeginInsert();
 		XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
 
-		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-		XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+		/* register block matching what buffer will look like after changes */
+		memcpy(copied_buffer.data, origdata, lower);
+		memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
+		dst_offset_in_block = dst - origdata;
+		memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
+		BufferGetTag(buffer, &rlocator, &forkno, &blkno);
+		Assert(forkno == MAIN_FORKNUM);
+		XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
+						  REGBUF_STANDARD);
+		XLogRegisterBufData(0, src, newlen);
 
 		/* inplace updates aren't decoded atm, don't log the origin */
 
 		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
 
-		PageSetLSN(BufferGetPage(buffer), recptr);
+		PageSetLSN(page, recptr);
 	}
 
+	memcpy(dst, src, newlen);
+
+	MarkBufferDirty(buffer);
+
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 
 	/*
@@ -6518,6 +6557,7 @@ heap_inplace_update_and_unlock(Relation relation,
 	 */
 	AtInplace_Inval();
 
+	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
 	END_CRIT_SECTION();
 	UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
 
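As a reading aid, here is a condensed sketch of the ordering the hunks above establish: delay checkpoint start, write WAL describing the post-change block, and only then mutate the shared buffer. This is a simplification, not the committed code; it assumes the function's existing locals (relation, buffer, page, dst, src, newlen, recptr) and elides the WAL registration of the stack copy, the invalidation calls, and the unlock steps shown in the full diff.

	/* Simplified ordering sketch; see the full hunks above for the real code. */
	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
	START_CRIT_SECTION();
	MyProc->delayChkptFlags |= DELAY_CHKPT_START;	/* checkpoint start must wait for us */

	if (RelationNeedsWAL(relation))
	{
		/* XLogBeginInsert() and registration of the post-change image elided */
		recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
		PageSetLSN(page, recptr);	/* stamp the shared page, not the stack copy */
	}

	memcpy(dst, src, newlen);		/* mutate the shared buffer only after WAL */
	MarkBufferDirty(buffer);

	MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
	END_CRIT_SECTION();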
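The "register block matching what buffer will look like after changes" hunk builds that image with three memcpy() calls: everything up to pd_lower, everything from pd_upper to the end of the block, and the new tuple bytes patched in at the same offset they will have in the shared buffer. The hole between pd_lower and pd_upper is skipped, which REGBUF_STANDARD lets WAL insertion omit anyway. The standalone program below illustrates only that arithmetic with stand-in values; the 8192-byte size matches the default BLCKSZ, but the lower/upper/offset numbers are toy assumptions, not PostgreSQL's real page layout.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLCKSZ 8192				/* default PostgreSQL block size */

int
main(void)
{
	/* Toy stand-ins for a shared buffer with a hole between lower and upper. */
	char		origdata[BLCKSZ];
	char		copied_buffer[BLCKSZ];
	uint16		lower = 64;				/* end of "header + line pointers" (made up) */
	uint16		upper = 7000;			/* start of "tuple data" (made up) */
	char		src[] = "new-tuple-bytes";	/* stand-in post-update tuple contents */
	size_t		newlen = sizeof(src);
	char	   *dst = origdata + 7100;	/* where the old tuple's data lives (made up) */
	uintptr_t	dst_offset_in_block;

	memset(origdata, 'o', BLCKSZ);
	memset(copied_buffer, 0, BLCKSZ);	/* zeroed here for the demo; the real code
										 * leaves stack garbage in the hole, which is
										 * fine because REGBUF_STANDARD excludes it */

	/* Copy the used regions, skip the hole, then apply the mutation to the copy. */
	memcpy(copied_buffer, origdata, lower);
	memcpy(copied_buffer + upper, origdata + upper, BLCKSZ - upper);
	dst_offset_in_block = (uintptr_t) (dst - origdata);
	memcpy(copied_buffer + dst_offset_in_block, src, newlen);

	/* The copy now matches what the shared buffer will look like after
	 * memcpy(dst, src, newlen), apart from the ignorable hole. */
	assert(memcmp(copied_buffer + dst_offset_in_block, src, newlen) == 0);
	assert(copied_buffer[lower] == 0);	/* inside the hole: never copied */
	printf("post-mutation image assembled: offset %zu, %zu bytes\n",
		   (size_t) dst_offset_in_block, newlen);
	return 0;
}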