🌐 AI搜索 & 代理 主页
Skip to content

Commit 15f68ce

Browse files
committed
Add TAP test to check recovery when redo LSN is missing
This commit provides test coverage for dc7c77f, where the redo record and the checkpoint record finish on different WAL segments with the start of recovery able to detect that the redo record is missing.

This test uses a wait injection point done in the critical section of a checkpoint, a method that requires not one but actually two wait injection points to avoid any memory allocations within the critical section of the checkpoint:
- Checkpoint run with a background psql.
- One first wait point is run by the checkpointer before the critical section, allocating the shared memory required by the DSM registry for the wait machinery in the library injection_points.
- First point is woken up.
- Second wait point is loaded before the critical section, allocating the memory to build the path to the library loaded, then run in the critical section once the checkpoint redo record has been logged.
- WAL segment is switched while waiting on the second point.
- Checkpoint completes.
- Stop cluster with immediate mode.
- The segment that includes the redo record is removed.
- Start, recovery fails as the redo record cannot be found.

The error message introduced in dc7c77f is now reduced to a FATAL, meaning that the information is still provided while being able to use a test for it.

Nitin has provided a basic version of the test, that I have enhanced to make it portable with two points.

Without dc7c77f, the cluster crashes in this test, not on a PANIC but due to the pointer dereference at the beginning of recovery, failure mentioned in the other commit.

Author: Nitin Jadhav <nitinjadhavpostgres@gmail.com>
Co-authored-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/CAMm1aWaaJi2w49c0RiaDBfhdCL6ztbr9m=daGqiOuVdizYWYaA@mail.gmail.com
1 parent dc7c77f commit 15f68ce

File tree

4 files changed

+125
-1
lines changed

4 files changed

+125
-1
lines changed

src/backend/access/transam/xlog.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7001,6 +7001,10 @@ CreateCheckPoint(int flags)
70017001
*/
70027002
SyncPreCheckpoint();
70037003

7004+
/* Run these points outside the critical section. */
7005+
INJECTION_POINT("create-checkpoint-initial", NULL);
7006+
INJECTION_POINT_LOAD("create-checkpoint-run");
7007+
70047008
/*
70057009
* Use a critical section to force system panic if we have trouble.
70067010
*/
@@ -7151,6 +7155,8 @@ CreateCheckPoint(int flags)
71517155
if (log_checkpoints)
71527156
LogCheckpointStart(flags, false);
71537157

7158+
INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
7159+
71547160
/* Update the process title */
71557161
update_checkpoint_display(flags, false, false);
71567162

src/backend/access/transam/xlogrecovery.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -811,7 +811,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
811811
{
812812
XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
813813
if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
814-
ereport(PANIC,
814+
ereport(FATAL,
815815
errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
816816
LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
817817
}

src/test/recovery/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ tests += {
5858
't/047_checkpoint_physical_slot.pl',
5959
't/048_vacuum_horizon_floor.pl',
6060
't/049_wait_for_lsn.pl',
61+
't/050_redo_segment_missing.pl',
6162
],
6263
},
6364
}
src/test/recovery/t/050_redo_segment_missing.pl

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright (c) 2025, PostgreSQL Global Development Group
2+
#
3+
# Evaluates PostgreSQL's recovery behavior when a WAL segment containing the
4+
# redo record is missing, with a checkpoint record located in a different
5+
# segment.
6+
7+
use strict;
8+
use warnings FATAL => 'all';
9+
use PostgreSQL::Test::Cluster;
10+
use PostgreSQL::Test::Utils;
11+
use Test::More;
12+
13+
if ($ENV{enable_injection_points} ne 'yes')
14+
{
15+
plan skip_all => 'Injection points not supported by this build';
16+
}
17+
18+
my $node = PostgreSQL::Test::Cluster->new('testnode');
19+
$node->init;
20+
$node->append_conf('postgresql.conf', 'log_checkpoints = on');
21+
$node->start;
22+
23+
# Check if the extension injection_points is available, as it may be
24+
# possible that this script is run with installcheck, where the module
25+
# would not be installed by default.
26+
if (!$node->check_extension('injection_points'))
27+
{
28+
plan skip_all => 'Extension injection_points not installed';
29+
}
30+
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
31+
32+
# Note that this uses two injection points based on waits, not one. This
33+
# may look strange, but this works as a workaround to enforce all memory
34+
# allocations to happen outside the critical section of the checkpoint
35+
# required for this test.
36+
# First, "create-checkpoint-initial" is run outside the critical section
37+
# and is used as a way to initialize the shared memory required
38+
# for the wait machinery with its DSM registry.
39+
# Then, "create-checkpoint-run" is loaded outside the critical section of
40+
# a checkpoint to allocate any memory required by the library load, and
41+
# its callback is run inside the critical section.
42+
$node->safe_psql('postgres',
43+
q{select injection_points_attach('create-checkpoint-initial', 'wait')});
44+
$node->safe_psql('postgres',
45+
q{select injection_points_attach('create-checkpoint-run', 'wait')});
46+
47+
# Start a psql session to run the checkpoint in the background and make
48+
# the test wait on the injection point so the checkpoint stops just after
49+
# it starts.
50+
my $checkpoint = $node->background_psql('postgres');
51+
$checkpoint->query_until(
52+
qr/starting_checkpoint/,
53+
q(\echo starting_checkpoint
54+
checkpoint;
55+
));
56+
57+
# Wait for the checkpointer to reach the initial point, where it is still
58+
# outside its critical section. Then release to reach the second
59+
# point.
60+
$node->wait_for_event('checkpointer', 'create-checkpoint-initial');
61+
$node->safe_psql('postgres',
62+
q{select injection_points_wakeup('create-checkpoint-initial')});
63+
64+
# Wait until the checkpoint has reached the second injection point.
65+
# We are now in the middle of a checkpoint running, after the redo
66+
# record has been logged.
67+
$node->wait_for_event('checkpointer', 'create-checkpoint-run');
68+
69+
# Switch the WAL segment, ensuring that the redo record will be included
70+
# in a different segment than the checkpoint record.
71+
$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
72+
73+
# Continue the checkpoint and wait for its completion.
74+
my $log_offset = -s $node->logfile;
75+
$node->safe_psql('postgres',
76+
q{select injection_points_wakeup('create-checkpoint-run')});
77+
$node->wait_for_log(qr/checkpoint complete/, $log_offset);
78+
79+
$checkpoint->quit;
80+
81+
# Retrieve the WAL file names for the redo record and checkpoint record.
82+
my $redo_lsn = $node->safe_psql('postgres',
83+
"SELECT redo_lsn FROM pg_control_checkpoint()");
84+
my $redo_walfile_name =
85+
$node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
86+
my $checkpoint_lsn = $node->safe_psql('postgres',
87+
"SELECT checkpoint_lsn FROM pg_control_checkpoint()");
88+
my $checkpoint_walfile_name =
89+
$node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
90+
91+
# Redo record and checkpoint record should be on different segments.
92+
isnt($redo_walfile_name, $checkpoint_walfile_name,
93+
'redo and checkpoint records on different segments');
94+
95+
# Remove the WAL segment containing the redo record.
96+
unlink $node->data_dir . "/pg_wal/$redo_walfile_name"
97+
or die "could not remove WAL file: $!";
98+
99+
$node->stop('immediate');
100+
101+
# Use run_log instead of node->start because this test expects that
102+
# the server ends with an error during recovery.
103+
run_log(
104+
[
105+
'pg_ctl',
106+
'--pgdata' => $node->data_dir,
107+
'--log' => $node->logfile,
108+
'start',
109+
]);
110+
111+
# Confirm that recovery has failed, as expected.
112+
my $logfile = slurp_file($node->logfile());
113+
ok( $logfile =~
114+
qr/FATAL: .* could not find redo location .* referenced by checkpoint record at .*/,
115+
"ends with FATAL because it could not find redo location");
116+
117+
done_testing();

0 commit comments

Comments
 (0)