8484#include "pg_trace.h"
8585#include "pgstat.h"
8686#include "postmaster/autovacuum.h"
87- #include "storage/condition_variable.h"
8887#include "storage/pmsignal.h"
8988#include "storage/proc.h"
9089#include "storage/procarray.h"
@@ -276,12 +275,6 @@ typedef struct MultiXactStateData
276275 /* support for members anti-wraparound measures */
277276 MultiXactOffset offsetStopLimit ; /* known if oldestOffsetKnown */
278277
279- /*
280- * This is used to sleep until a multixact offset is written when we want
281- * to create the next one.
282- */
283- ConditionVariable nextoff_cv ;
284-
285278 /*
286279 * Per-backend data starts here. We have two arrays stored in the area
287280 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -386,6 +379,9 @@ static MemoryContext MXactContext = NULL;
386379#define debug_elog6 (a ,b ,c ,d ,e ,f )
387380#endif
388381
382+ /* hack to deal with WAL generated with older minor versions: the offsets page we already initialized while replaying a multixid creation record, or -1 if none */
383+ static int64 pre_initialized_offsets_page = -1 ;
384+
389385/* internal MultiXactId management */
390386static void MultiXactIdSetOldestVisible (void );
391387static void RecordNewMultiXact (MultiXactId multi , MultiXactOffset offset ,
@@ -922,13 +918,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
922918 int entryno ;
923919 int slotno ;
924920 MultiXactOffset * offptr ;
925- int i ;
921+ MultiXactId next ;
922+ int64 next_pageno ;
923+ int next_entryno ;
924+ MultiXactOffset * next_offptr ;
926925 LWLock * lock ;
927926 LWLock * prevlock = NULL ;
928927
928+ /* position of this multixid in the offsets SLRU area */
929929 pageno = MultiXactIdToOffsetPage (multi );
930930 entryno = MultiXactIdToOffsetEntry (multi );
931931
932+ /* position of the next multixid */
933+ next = multi + 1 ;
934+ if (next < FirstMultiXactId )
935+ next = FirstMultiXactId ;
936+ next_pageno = MultiXactIdToOffsetPage (next );
937+ next_entryno = MultiXactIdToOffsetEntry (next );
938+
939+ /*
940+ * Older minor versions didn't set the next multixid's offset in this
941+ * function, and therefore didn't initialize the next page until the next
942+ * multixid was assigned. If we're replaying WAL that was generated by
943+ * such a version, the next page might not be initialized yet. Initialize
944+ * it now.
945+ */
946+ if (InRecovery &&
947+ next_pageno != pageno &&
948+ pg_atomic_read_u64 (& MultiXactOffsetCtl -> shared -> latest_page_number ) == pageno )
949+ {
950+ elog (DEBUG1 , "next offsets page is not initialized, initializing it now" );
951+
952+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , next_pageno );
953+ LWLockAcquire (lock , LW_EXCLUSIVE );
954+
955+ /* Create and zero the page */
956+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , next_pageno );
957+
958+ /* Make sure it's written out */
959+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
960+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
961+
962+ LWLockRelease (lock );
963+
964+ /*
965+ * Remember that we initialized the page, so that we don't zero it
966+ * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
967+ */
968+ pre_initialized_offsets_page = next_pageno ;
969+ }
970+
971+ /*
972+ * Set the starting offset of this multixid's members.
973+ *
974+ * In the common case, it was already set by the previous
975+ * RecordNewMultiXact call, as this was the next multixid of the previous
976+ * multixid. But if multiple backends are generating multixids
977+ * concurrently, we might race ahead and get called before the previous
978+ * multixid.
979+ */
932980 lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
933981 LWLockAcquire (lock , LW_EXCLUSIVE );
934982
@@ -943,22 +991,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
943991 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
944992 offptr += entryno ;
945993
946- * offptr = offset ;
994+ if (* offptr != offset )
995+ {
996+ /* should already be set to the correct value, or not at all */
997+ Assert (* offptr == 0 );
998+ * offptr = offset ;
999+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
1000+ }
1001+
1002+ /*
1003+ * Set the next multixid's offset to the end of this multixid's members.
1004+ */
1005+ if (next_pageno == pageno )
1006+ {
1007+ next_offptr = offptr + 1 ;
1008+ }
1009+ else
1010+ {
1011+ /* must be the first entry on the page */
1012+ Assert (next_entryno == 0 || next == FirstMultiXactId );
1013+
1014+ /* Swap the lock for a lock on the next page */
1015+ LWLockRelease (lock );
1016+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , next_pageno );
1017+ LWLockAcquire (lock , LW_EXCLUSIVE );
1018+
1019+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , next_pageno , true, next );
1020+ next_offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
1021+ next_offptr += next_entryno ;
1022+ }
9471023
948- MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
1024+ if (* next_offptr != offset + nmembers )
1025+ {
1026+ /* should already be set to the correct value, or not at all */
1027+ Assert (* next_offptr == 0 );
1028+ * next_offptr = offset + nmembers ;
1029+ MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
1030+ }
9491031
9501032 /* Release MultiXactOffset SLRU lock. */
9511033 LWLockRelease (lock );
9521034
953- /*
954- * If anybody was waiting to know the offset of this multixact ID we just
955- * wrote, they can read it now, so wake them up.
956- */
957- ConditionVariableBroadcast (& MultiXactState -> nextoff_cv );
958-
9591035 prev_pageno = -1 ;
9601036
961- for (i = 0 ; i < nmembers ; i ++ , offset ++ )
1037+ for (int i = 0 ; i < nmembers ; i ++ , offset ++ )
9621038 {
9631039 TransactionId * memberptr ;
9641040 uint32 * flagsptr ;
@@ -1148,8 +1224,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11481224 result = FirstMultiXactId ;
11491225 }
11501226
1151- /* Make sure there is room for the MXID in the file. */
1152- ExtendMultiXactOffset (result );
1227+ /*
1228+ * Make sure there is room for the next MXID in the file. Assigning this
1229+ * MXID sets the next MXID's offset already.
1230+ */
1231+ ExtendMultiXactOffset (result + 1 );
11531232
11541233 /*
11551234 * Reserve the members space, similarly to above. Also, be careful not to
@@ -1314,7 +1393,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13141393 MultiXactOffset nextOffset ;
13151394 MultiXactMember * ptr ;
13161395 LWLock * lock ;
1317- bool slept = false;
13181396
13191397 debug_elog3 (DEBUG2 , "GetMembers: asked for %u" , multi );
13201398
@@ -1391,23 +1469,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13911469 * one's. However, there are some corner cases to worry about:
13921470 *
13931471 * 1. This multixact may be the latest one created, in which case there is
1394- * no next one to look at. In this case the nextOffset value we just
1395- * saved is the correct endpoint.
1472+ * no next one to look at. The next multixact's offset should be set
1473+ * already, as we set it in RecordNewMultiXact(), but we used to not do
1474+ * that in older minor versions. To cope with that case, if this
1475+ * multixact is the latest one created, use the nextOffset value we read
1476+ * above as the endpoint.
13961477 *
1397- * 2. The next multixact may still be in process of being filled in: that
1398- * is, another process may have done GetNewMultiXactId but not yet written
1399- * the offset entry for that ID. In that scenario, it is guaranteed that
1400- * the offset entry for that multixact exists (because GetNewMultiXactId
1401- * won't release MultiXactGenLock until it does) but contains zero
1402- * (because we are careful to pre-zero offset pages). Because
1403- * GetNewMultiXactId will never return zero as the starting offset for a
1404- * multixact, when we read zero as the next multixact's offset, we know we
1405- * have this case. We handle this by sleeping on the condition variable
1406- * we have just for this; the process in charge will signal the CV as soon
1407- * as it has finished writing the multixact offset.
1408- *
1409- * 3. Because GetNewMultiXactId increments offset zero to offset one to
1410- * handle case #2, there is an ambiguity near the point of offset
1478+ * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1479+ * to mean "unset", there is an ambiguity near the point of offset
14111480 * wraparound. If we see next multixact's offset is one, is that our
14121481 * multixact's actual endpoint, or did it end at zero with a subsequent
14131482 * increment? We handle this using the knowledge that if the zero'th
@@ -1419,7 +1488,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14191488 * cases, so it seems better than holding the MultiXactGenLock for a long
14201489 * time on every multixact creation.
14211490 */
1422- retry :
14231491 pageno = MultiXactIdToOffsetPage (multi );
14241492 entryno = MultiXactIdToOffsetEntry (multi );
14251493
@@ -1482,31 +1550,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14821550 nextMXOffset = * offptr ;
14831551
14841552 if (nextMXOffset == 0 )
1485- {
1486- /* Corner case 2: next multixact is still being filled in */
1487- LWLockRelease (lock );
1488- CHECK_FOR_INTERRUPTS ();
1489-
1490- INJECTION_POINT ("multixact-get-members-cv-sleep" , NULL );
1491-
1492- ConditionVariableSleep (& MultiXactState -> nextoff_cv ,
1493- WAIT_EVENT_MULTIXACT_CREATION );
1494- slept = true;
1495- goto retry ;
1496- }
1553+ ereport (ERROR ,
1554+ (errcode (ERRCODE_DATA_CORRUPTED ),
1555+ errmsg ("MultiXact %u has invalid next offset" ,
1556+ multi )));
14971557
14981558 length = nextMXOffset - offset ;
14991559 }
15001560
15011561 LWLockRelease (lock );
15021562 lock = NULL ;
15031563
1504- /*
1505- * If we slept above, clean up state; it's no longer needed.
1506- */
1507- if (slept )
1508- ConditionVariableCancelSleep ();
1509-
15101564 ptr = (MultiXactMember * ) palloc (length * sizeof (MultiXactMember ));
15111565
15121566 truelength = 0 ;
@@ -1549,7 +1603,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15491603
15501604 if (!TransactionIdIsValid (* xactptr ))
15511605 {
1552- /* Corner case 3 : we must be looking at unused slot zero */
1606+ /* Corner case 2 : we must be looking at unused slot zero */
15531607 Assert (offset == 0 );
15541608 continue ;
15551609 }
@@ -1996,7 +2050,6 @@ MultiXactShmemInit(void)
19962050
19972051 /* Make sure we zero out the per-backend state */
19982052 MemSet (MultiXactState , 0 , SHARED_MULTIXACT_STATE_SIZE );
1999- ConditionVariableInit (& MultiXactState -> nextoff_cv );
20002053 }
20012054 else
20022055 Assert (found );
@@ -2203,26 +2256,34 @@ TrimMultiXact(void)
22032256 pageno );
22042257
22052258 /*
2206- * Zero out the remainder of the current offsets page. See notes in
2207- * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2208- * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2209- * rule "write xlog before data," nextMXact successors may carry obsolete,
2210- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2211- * operates normally.
2259+ * Set the offset of nextMXact on the offsets page. This is normally done
2260+ * in RecordNewMultiXact() of the previous multixact, but we used to not
2261+ * do that in older minor versions. To ensure that the next offset is set
2262+ * if the binary was just upgraded from an older minor version, do it now.
2263+ *
2264+ * Zero out the remainder of the page. See notes in TrimCLOG() for
2265+ * background. Unlike CLOG, some WAL record covers every pg_multixact
2266+ * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2267+ * xlog before data," nextMXact successors may carry obsolete, nonzero
2268+ * offset values.
22122269 */
22132270 entryno = MultiXactIdToOffsetEntry (nextMXact );
2214- if (entryno != 0 )
22152271 {
22162272 int slotno ;
22172273 MultiXactOffset * offptr ;
22182274 LWLock * lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
22192275
22202276 LWLockAcquire (lock , LW_EXCLUSIVE );
2221- slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
2277+ if (entryno == 0 )
2278+ slotno = SimpleLruZeroPage (MultiXactOffsetCtl , pageno );
2279+ else
2280+ slotno = SimpleLruReadPage (MultiXactOffsetCtl , pageno , true, nextMXact );
22222281 offptr = (MultiXactOffset * ) MultiXactOffsetCtl -> shared -> page_buffer [slotno ];
22232282 offptr += entryno ;
22242283
2225- MemSet (offptr , 0 , BLCKSZ - (entryno * sizeof (MultiXactOffset )));
2284+ * offptr = offset ;
2285+ if (entryno != 0 && (entryno + 1 ) * sizeof (MultiXactOffset ) != BLCKSZ )
2286+ MemSet (offptr + 1 , 0 , BLCKSZ - (entryno + 1 ) * sizeof (MultiXactOffset ));
22262287
22272288 MultiXactOffsetCtl -> shared -> page_dirty [slotno ] = true;
22282289 LWLockRelease (lock );
@@ -3407,14 +3468,24 @@ multixact_redo(XLogReaderState *record)
34073468
34083469 memcpy (& pageno , XLogRecGetData (record ), sizeof (pageno ));
34093470
3410- lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
3411- LWLockAcquire (lock , LW_EXCLUSIVE );
3471+ /*
3472+ * Skip the record if we already initialized the page at the previous
3473+ * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3474+ */
3475+ if (pre_initialized_offsets_page != pageno )
3476+ {
3477+ lock = SimpleLruGetBankLock (MultiXactOffsetCtl , pageno );
3478+ LWLockAcquire (lock , LW_EXCLUSIVE );
34123479
3413- slotno = ZeroMultiXactOffsetPage (pageno , false);
3414- SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3415- Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
3480+ slotno = ZeroMultiXactOffsetPage (pageno , false);
3481+ SimpleLruWritePage (MultiXactOffsetCtl , slotno );
3482+ Assert (!MultiXactOffsetCtl -> shared -> page_dirty [slotno ]);
34163483
3417- LWLockRelease (lock );
3484+ LWLockRelease (lock );
3485+ }
3486+ else
3487+ elog (DEBUG1 , "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation" , pageno );
3488+ pre_initialized_offsets_page = -1 ;
34183489 }
34193490 else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE )
34203491 {
@@ -3440,6 +3511,22 @@ multixact_redo(XLogReaderState *record)
34403511 TransactionId max_xid ;
34413512 int i ;
34423513
3514+ if (pre_initialized_offsets_page != -1 )
3515+ {
3516+ /*
3517+ * If we implicitly initialized the next offsets page while
3518+ * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3519+ * with an older minor version, we still expect to see an
3520+ * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3521+ * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3522+ * not happen. If it does, we'll continue with the replay, but
3523+ * log a message to note that something's funny.
3524+ */
3525+ elog (LOG , "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier" ,
3526+ pre_initialized_offsets_page );
3527+ pre_initialized_offsets_page = -1 ;
3528+ }
3529+
34433530 /* Store the data back into the SLRU files */
34443531 RecordNewMultiXact (xlrec -> mid , xlrec -> moff , xlrec -> nmembers ,
34453532 xlrec -> members );
0 commit comments