🌐 AI搜索 & 代理 主页
Skip to content

Commit e46041f

Browse files
committed
Set next multixid's offset when creating a new multixid
With this commit, the next multixid's offset will always be set on the offsets page, by the time that a backend might try to read it, so we no longer need the waiting mechanism with the condition variable. In other words, this eliminates "corner case 2" mentioned in the comments.

The waiting mechanism was broken in a few scenarios:

- When nextMulti was advanced without WAL-logging the next multixid. For example, if a later multixid was already assigned and WAL-logged before the previous one was WAL-logged, and then the server crashed. In that case the next offset would never be set in the offsets SLRU, and a query trying to read it would get stuck waiting for it. Same thing could happen if pg_resetwal was used to forcibly advance nextMulti.

- In hot standby mode, a deadlock could happen where one backend waits for the next multixid assignment record, but WAL replay is not advancing because of a recovery conflict with the waiting backend.

The old TAP test used carefully placed injection points to exercise the old waiting code, but now that the waiting code is gone, much of the old test is no longer relevant. Rewrite the test to reproduce the IPC/MultixactCreation hang after crash recovery instead, and to verify that previously recorded multixids stay readable.

Backpatch to all supported versions. In back-branches, we still need to be able to read WAL that was generated before this fix, so in the back-branches this includes a hack to initialize the next offsets page when replaying XLOG_MULTIXACT_CREATE_ID for the last multixid on a page. On 'master', bump XLOG_PAGE_MAGIC instead to indicate that the WAL is not compatible.
Author: Andrey Borodin <amborodin@acm.org>
Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru>
Reviewed-by: Álvaro Herrera <alvherre@kurilemu.de>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://www.postgresql.org/message-id/172e5723-d65f-4eec-b512-14beacb326ce@yandex.ru
Backpatch-through: 14
1 parent 19e7867 commit e46041f

File tree

3 files changed

+191
-161
lines changed

3 files changed

+191
-161
lines changed

src/backend/access/transam/multixact.c

Lines changed: 159 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,6 @@
8484
#include "pg_trace.h"
8585
#include "pgstat.h"
8686
#include "postmaster/autovacuum.h"
87-
#include "storage/condition_variable.h"
8887
#include "storage/pmsignal.h"
8988
#include "storage/proc.h"
9089
#include "storage/procarray.h"
@@ -276,12 +275,6 @@ typedef struct MultiXactStateData
276275
/* support for members anti-wraparound measures */
277276
MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
278277

279-
/*
280-
* This is used to sleep until a multixact offset is written when we want
281-
* to create the next one.
282-
*/
283-
ConditionVariable nextoff_cv;
284-
285278
/*
286279
* Per-backend data starts here. We have two arrays stored in the area
287280
* immediately following the MultiXactStateData struct. Each is indexed by
@@ -386,6 +379,9 @@ static MemoryContext MXactContext = NULL;
386379
#define debug_elog6(a,b,c,d,e,f)
387380
#endif
388381

382+
/* hack to deal with WAL generated with older minor versions */
383+
static int64 pre_initialized_offsets_page = -1;
384+
389385
/* internal MultiXactId management */
390386
static void MultiXactIdSetOldestVisible(void);
391387
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
@@ -922,13 +918,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
922918
int entryno;
923919
int slotno;
924920
MultiXactOffset *offptr;
925-
int i;
921+
MultiXactId next;
922+
int64 next_pageno;
923+
int next_entryno;
924+
MultiXactOffset *next_offptr;
926925
LWLock *lock;
927926
LWLock *prevlock = NULL;
928927

928+
/* position of this multixid in the offsets SLRU area */
929929
pageno = MultiXactIdToOffsetPage(multi);
930930
entryno = MultiXactIdToOffsetEntry(multi);
931931

932+
/* position of the next multixid */
933+
next = multi + 1;
934+
if (next < FirstMultiXactId)
935+
next = FirstMultiXactId;
936+
next_pageno = MultiXactIdToOffsetPage(next);
937+
next_entryno = MultiXactIdToOffsetEntry(next);
938+
939+
/*
940+
* Older minor versions didn't set the next multixid's offset in this
941+
* function, and therefore didn't initialize the next page until the next
942+
* multixid was assigned. If we're replaying WAL that was generated by
943+
* such a version, the next page might not be initialized yet. Initialize
944+
* it now.
945+
*/
946+
if (InRecovery &&
947+
next_pageno != pageno &&
948+
pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno)
949+
{
950+
elog(DEBUG1, "next offsets page is not initialized, initializing it now");
951+
952+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
953+
LWLockAcquire(lock, LW_EXCLUSIVE);
954+
955+
/* Create and zero the page */
956+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno);
957+
958+
/* Make sure it's written out */
959+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
960+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
961+
962+
LWLockRelease(lock);
963+
964+
/*
965+
* Remember that we initialized the page, so that we don't zero it
966+
* again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record.
967+
*/
968+
pre_initialized_offsets_page = next_pageno;
969+
}
970+
971+
/*
972+
* Set the starting offset of this multixid's members.
973+
*
974+
* In the common case, it has already been set by the previous
975+
* RecordNewMultiXact call, as this was the next multixid of the previous
976+
* multixid. But if multiple backends are generating multixids
977+
* concurrently, we might race ahead and get called before the previous
978+
* multixid.
979+
*/
932980
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
933981
LWLockAcquire(lock, LW_EXCLUSIVE);
934982

@@ -943,22 +991,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
943991
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
944992
offptr += entryno;
945993

946-
*offptr = offset;
994+
if (*offptr != offset)
995+
{
996+
/* should already be set to the correct value, or not at all */
997+
Assert(*offptr == 0);
998+
*offptr = offset;
999+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
1000+
}
1001+
1002+
/*
1003+
* Set the next multixid's offset to the end of this multixid's members.
1004+
*/
1005+
if (next_pageno == pageno)
1006+
{
1007+
next_offptr = offptr + 1;
1008+
}
1009+
else
1010+
{
1011+
/* must be the first entry on the page */
1012+
Assert(next_entryno == 0 || next == FirstMultiXactId);
1013+
1014+
/* Swap the lock for a lock on the next page */
1015+
LWLockRelease(lock);
1016+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
1017+
LWLockAcquire(lock, LW_EXCLUSIVE);
1018+
1019+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
1020+
next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
1021+
next_offptr += next_entryno;
1022+
}
9471023

948-
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
1024+
if (*next_offptr != offset + nmembers)
1025+
{
1026+
/* should already be set to the correct value, or not at all */
1027+
Assert(*next_offptr == 0);
1028+
*next_offptr = offset + nmembers;
1029+
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
1030+
}
9491031

9501032
/* Release MultiXactOffset SLRU lock. */
9511033
LWLockRelease(lock);
9521034

953-
/*
954-
* If anybody was waiting to know the offset of this multixact ID we just
955-
* wrote, they can read it now, so wake them up.
956-
*/
957-
ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
958-
9591035
prev_pageno = -1;
9601036

961-
for (i = 0; i < nmembers; i++, offset++)
1037+
for (int i = 0; i < nmembers; i++, offset++)
9621038
{
9631039
TransactionId *memberptr;
9641040
uint32 *flagsptr;
@@ -1148,8 +1224,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
11481224
result = FirstMultiXactId;
11491225
}
11501226

1151-
/* Make sure there is room for the MXID in the file. */
1152-
ExtendMultiXactOffset(result);
1227+
/*
1228+
* Make sure there is room for the next MXID in the file. Assigning this
1229+
* MXID sets the next MXID's offset already.
1230+
*/
1231+
ExtendMultiXactOffset(result + 1);
11531232

11541233
/*
11551234
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1314,7 +1393,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13141393
MultiXactOffset nextOffset;
13151394
MultiXactMember *ptr;
13161395
LWLock *lock;
1317-
bool slept = false;
13181396

13191397
debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
13201398

@@ -1391,23 +1469,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
13911469
* one's. However, there are some corner cases to worry about:
13921470
*
13931471
* 1. This multixact may be the latest one created, in which case there is
1394-
* no next one to look at. In this case the nextOffset value we just
1395-
* saved is the correct endpoint.
1472+
* no next one to look at. The next multixact's offset should be set
1473+
* already, as we set it in RecordNewMultiXact(), but we used to not do
1474+
* that in older minor versions. To cope with that case, if this
1475+
* multixact is the latest one created, use the nextOffset value we read
1476+
* above as the endpoint.
13961477
*
1397-
* 2. The next multixact may still be in process of being filled in: that
1398-
* is, another process may have done GetNewMultiXactId but not yet written
1399-
* the offset entry for that ID. In that scenario, it is guaranteed that
1400-
* the offset entry for that multixact exists (because GetNewMultiXactId
1401-
* won't release MultiXactGenLock until it does) but contains zero
1402-
* (because we are careful to pre-zero offset pages). Because
1403-
* GetNewMultiXactId will never return zero as the starting offset for a
1404-
* multixact, when we read zero as the next multixact's offset, we know we
1405-
* have this case. We handle this by sleeping on the condition variable
1406-
* we have just for this; the process in charge will signal the CV as soon
1407-
* as it has finished writing the multixact offset.
1408-
*
1409-
* 3. Because GetNewMultiXactId increments offset zero to offset one to
1410-
* handle case #2, there is an ambiguity near the point of offset
1478+
* 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
1479+
* to mean "unset", there is an ambiguity near the point of offset
14111480
* wraparound. If we see next multixact's offset is one, is that our
14121481
* multixact's actual endpoint, or did it end at zero with a subsequent
14131482
* increment? We handle this using the knowledge that if the zero'th
@@ -1419,7 +1488,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14191488
* cases, so it seems better than holding the MultiXactGenLock for a long
14201489
* time on every multixact creation.
14211490
*/
1422-
retry:
14231491
pageno = MultiXactIdToOffsetPage(multi);
14241492
entryno = MultiXactIdToOffsetEntry(multi);
14251493

@@ -1482,31 +1550,17 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
14821550
nextMXOffset = *offptr;
14831551

14841552
if (nextMXOffset == 0)
1485-
{
1486-
/* Corner case 2: next multixact is still being filled in */
1487-
LWLockRelease(lock);
1488-
CHECK_FOR_INTERRUPTS();
1489-
1490-
INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
1491-
1492-
ConditionVariableSleep(&MultiXactState->nextoff_cv,
1493-
WAIT_EVENT_MULTIXACT_CREATION);
1494-
slept = true;
1495-
goto retry;
1496-
}
1553+
ereport(ERROR,
1554+
(errcode(ERRCODE_DATA_CORRUPTED),
1555+
errmsg("MultiXact %u has invalid next offset",
1556+
multi)));
14971557

14981558
length = nextMXOffset - offset;
14991559
}
15001560

15011561
LWLockRelease(lock);
15021562
lock = NULL;
15031563

1504-
/*
1505-
* If we slept above, clean up state; it's no longer needed.
1506-
*/
1507-
if (slept)
1508-
ConditionVariableCancelSleep();
1509-
15101564
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
15111565

15121566
truelength = 0;
@@ -1549,7 +1603,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
15491603

15501604
if (!TransactionIdIsValid(*xactptr))
15511605
{
1552-
/* Corner case 3: we must be looking at unused slot zero */
1606+
/* Corner case 2: we must be looking at unused slot zero */
15531607
Assert(offset == 0);
15541608
continue;
15551609
}
@@ -1996,7 +2050,6 @@ MultiXactShmemInit(void)
19962050

19972051
/* Make sure we zero out the per-backend state */
19982052
MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
1999-
ConditionVariableInit(&MultiXactState->nextoff_cv);
20002053
}
20012054
else
20022055
Assert(found);
@@ -2203,26 +2256,34 @@ TrimMultiXact(void)
22032256
pageno);
22042257

22052258
/*
2206-
* Zero out the remainder of the current offsets page. See notes in
2207-
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
2208-
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
2209-
* rule "write xlog before data," nextMXact successors may carry obsolete,
2210-
* nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
2211-
* operates normally.
2259+
* Set the offset of nextMXact on the offsets page. This is normally done
2260+
* in RecordNewMultiXact() of the previous multixact, but we used to not
2261+
* do that in older minor versions. To ensure that the next offset is set
2262+
* if the binary was just upgraded from an older minor version, do it now.
2263+
*
2264+
* Zero out the remainder of the page. See notes in TrimCLOG() for
2265+
* background. Unlike CLOG, some WAL record covers every pg_multixact
2266+
* SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write
2267+
* xlog before data," nextMXact successors may carry obsolete, nonzero
2268+
* offset values.
22122269
*/
22132270
entryno = MultiXactIdToOffsetEntry(nextMXact);
2214-
if (entryno != 0)
22152271
{
22162272
int slotno;
22172273
MultiXactOffset *offptr;
22182274
LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
22192275

22202276
LWLockAcquire(lock, LW_EXCLUSIVE);
2221-
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
2277+
if (entryno == 0)
2278+
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
2279+
else
2280+
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
22222281
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
22232282
offptr += entryno;
22242283

2225-
MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
2284+
*offptr = offset;
2285+
if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
2286+
MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
22262287

22272288
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
22282289
LWLockRelease(lock);
@@ -3407,14 +3468,24 @@ multixact_redo(XLogReaderState *record)
34073468

34083469
memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
34093470

3410-
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3411-
LWLockAcquire(lock, LW_EXCLUSIVE);
3471+
/*
3472+
* Skip the record if we already initialized the page at the previous
3473+
* XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact().
3474+
*/
3475+
if (pre_initialized_offsets_page != pageno)
3476+
{
3477+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
3478+
LWLockAcquire(lock, LW_EXCLUSIVE);
34123479

3413-
slotno = ZeroMultiXactOffsetPage(pageno, false);
3414-
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3415-
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
3480+
slotno = ZeroMultiXactOffsetPage(pageno, false);
3481+
SimpleLruWritePage(MultiXactOffsetCtl, slotno);
3482+
Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
34163483

3417-
LWLockRelease(lock);
3484+
LWLockRelease(lock);
3485+
}
3486+
else
3487+
elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno);
3488+
pre_initialized_offsets_page = -1;
34183489
}
34193490
else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
34203491
{
@@ -3440,6 +3511,22 @@ multixact_redo(XLogReaderState *record)
34403511
TransactionId max_xid;
34413512
int i;
34423513

3514+
if (pre_initialized_offsets_page != -1)
3515+
{
3516+
/*
3517+
* If we implicitly initialized the next offsets page while
3518+
* replaying an XLOG_MULTIXACT_CREATE_ID record that was generated
3519+
* with an older minor version, we still expect to see an
3520+
* XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other
3521+
* XLOG_MULTIXACT_CREATE_ID records. Therefore this case should
3522+
* not happen. If it does, we'll continue with the replay, but
3523+
* log a message to note that something's funny.
3524+
*/
3525+
elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier",
3526+
pre_initialized_offsets_page);
3527+
pre_initialized_offsets_page = -1;
3528+
}
3529+
34433530
/* Store the data back into the SLRU files */
34443531
RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
34453532
xlrec->members);

0 commit comments

Comments
 (0)