Add TAP test to check recovery when redo LSN is missing

michaelpq · michaelpq · commit 15f68cebdcec · 2025-12-16T14:28:05.000+09:00
This commit provides test coverage for dc7c77f, where the redo record and the checkpoint record finish on different WAL segments, with the start of recovery able to detect that the redo record is missing.

This test uses a wait injection point run in the critical section of a checkpoint, a method that requires not one but actually two wait injection points to avoid any memory allocations within the critical section of the checkpoint:
- Checkpoint run with a background psql.
- A first wait point is run by the checkpointer before the critical section, allocating the shared memory required by the DSM registry for the wait machinery in the library injection_points.
- First point is woken up.
- Second wait point is loaded before the critical section, allocating the memory to build the path to the library loaded, then run in the critical section once the checkpoint redo record has been logged.
- WAL segment is switched while waiting on the second point.
- Checkpoint completes.
- Stop cluster with immediate mode.
- The segment that includes the redo record is removed.
- Start, recovery fails as the redo record cannot be found.

The error message introduced in dc7c77f is now reduced to a FATAL, meaning that the information is still provided while making it possible to write a test for it. Nitin has provided a basic version of the test, which I have enhanced to make it portable with two points.

Without dc7c77f, the cluster crashes in this test, not on a PANIC but due to the pointer dereference at the beginning of recovery, the failure mentioned in the other commit.

Author: Nitin Jadhav <nitinjadhavpostgres@gmail.com>
Co-authored-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/CAMm1aWaaJi2w49c0RiaDBfhdCL6ztbr9m=daGqiOuVdizYWYaA@mail.gmail.com
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
@@ -7001,6 +7001,10 @@ CreateCheckPoint(int flags)
 	 */
 	SyncPreCheckpoint();
 
+	/* Run these points outside the critical section. */
+	INJECTION_POINT("create-checkpoint-initial", NULL);
+	INJECTION_POINT_LOAD("create-checkpoint-run");
+
 	/*
 	 * Use a critical section to force system panic if we have trouble.
 	 */
@@ -7151,6 +7155,8 @@ CreateCheckPoint(int flags)
 	if (log_checkpoints)
 		LogCheckpointStart(flags, false);
 
+	INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
+
 	/* Update the process title */
 	update_checkpoint_display(flags, false, false);
 
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
@@ -811,7 +811,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		{
 			XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
 			if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
-				ereport(PANIC,
+				ereport(FATAL,
 						errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
 							   LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
 		}
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
@@ -58,6 +58,7 @@ tests += {
       't/047_checkpoint_physical_slot.pl',
       't/048_vacuum_horizon_floor.pl',
       't/049_wait_for_lsn.pl',
+      't/050_redo_segment_missing.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/050_redo_segment_missing.pl b/src/test/recovery/t/050_redo_segment_missing.pl
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Evaluates PostgreSQL's recovery behavior when a WAL segment containing the
+# redo record is missing, with a checkpoint record located in a different
+# segment.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Skip if the variable is unset too: undef in "ne" dies with FATAL warnings.
+if (($ENV{enable_injection_points} // '') ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Note that this uses two injection points based on waits, not one.  This
+# may look strange, but this works as a workaround to enforce all memory
+# allocations to happen outside the critical section of the checkpoint
+# required for this test.
+# First, "create-checkpoint-initial" is run outside the critical section,
+# and is used as a way to initialize the shared memory required for the
+# wait machinery with its DSM registry.
+# Then, "create-checkpoint-run" is loaded outside the critical section of
+# a checkpoint to allocate any memory required by the library load, and
+# its callback is run inside the critical section.
+$node->safe_psql('postgres',
+	q{select injection_points_attach('create-checkpoint-initial', 'wait')});
+$node->safe_psql('postgres',
+	q{select injection_points_attach('create-checkpoint-run', 'wait')});
+
+# Start a psql session to run the checkpoint in the background and make
+# the test wait on the injection point so the checkpoint stops just after
+# it starts.
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_until(
+	qr/starting_checkpoint/,
+	q(\echo starting_checkpoint
+checkpoint;
+));
+
+# Wait for the checkpointer to reach the initial point, still outside its
+# critical section, then release it so that it reaches the second point.
+$node->wait_for_event('checkpointer', 'create-checkpoint-initial');
+$node->safe_psql('postgres',
+	q{select injection_points_wakeup('create-checkpoint-initial')});
+
+# Wait until the checkpoint has reached the second injection point.
+# We are now in the middle of a checkpoint running, after the redo
+# record has been logged.
+$node->wait_for_event('checkpointer', 'create-checkpoint-run');
+
+# Switch the WAL segment, ensuring that the redo record will be included
+# in a different segment than the checkpoint record.
+$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Continue the checkpoint and wait for its completion.
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres',
+	q{select injection_points_wakeup('create-checkpoint-run')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+$checkpoint->quit;
+
+# Retrieve the WAL file names for the redo record and checkpoint record.
+my $redo_lsn = $node->safe_psql('postgres',
+	"SELECT redo_lsn FROM pg_control_checkpoint()");
+my $redo_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
+my $checkpoint_lsn = $node->safe_psql('postgres',
+	"SELECT checkpoint_lsn FROM pg_control_checkpoint()");
+my $checkpoint_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+
+# Redo record and checkpoint record should be on different segments.
+isnt($redo_walfile_name, $checkpoint_walfile_name,
+	'redo and checkpoint records on different segments');
+
+# Stop the server, then remove the WAL segment containing the redo record,
+# so that pg_wal is never modified under a running cluster.
+$node->stop('immediate');
+unlink $node->data_dir . "/pg_wal/$redo_walfile_name"
+  or die "could not remove WAL file: $!";
+
+# Use run_log instead of node->start because this test expects that
+# the server ends with an error during recovery.
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $node->data_dir,
+		'--log' => $node->logfile,
+		'start',
+	]);
+
+# Confirm that recovery has failed, as expected.
+my $logfile = slurp_file($node->logfile());
+ok( $logfile =~
+	  qr/FATAL: .* could not find redo location .* referenced by checkpoint record at .*/,
+	"ends with FATAL because it could not find redo location");
+
+done_testing();

Original file line number	Diff line number	Diff line change
`@@ -811,7 +811,7 @@ InitWalRecovery(ControlFileData ControlFile, bool wasShutdown_ptr,`
`811`	`811`	`{`
`812`	`812`	`XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);`
`813`	`813`	`if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))`
`814`		`- ereport(PANIC,`
	`814`	`+ ereport(FATAL,`
`815`	`815`	`errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",`
`816`	`816`	`LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));`
`817`	`817`	`}`
Original file line number	Diff line number	Diff line change
`@@ -58,6 +58,7 @@ tests += {`
`58`	`58`	`'t/047_checkpoint_physical_slot.pl',`
`59`	`59`	`'t/048_vacuum_horizon_floor.pl',`
`60`	`60`	`'t/049_wait_for_lsn.pl',`
	`61`	`+ 't/050_redo_segment_missing.pl',`
`61`	`62`	`],`
`62`	`63`	`},`
`63`	`64`	`}`