Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/storage/ipc/standby.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * standby.c
4
 *    Misc functions used in Hot Standby mode.
5
 *
6
 *  All functions for handling RM_STANDBY_ID, which relate to
7
 *  AccessExclusiveLocks and starting snapshots for Hot Standby mode.
8
 *  Plus conflict recovery processing.
9
 *
10
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
11
 * Portions Copyright (c) 1994, Regents of the University of California
12
 *
13
 * IDENTIFICATION
14
 *    src/backend/storage/ipc/standby.c
15
 *
16
 *-------------------------------------------------------------------------
17
 */
18
#include "postgres.h"
19
#include "access/transam.h"
20
#include "access/twophase.h"
21
#include "access/xact.h"
22
#include "access/xloginsert.h"
23
#include "access/xlogrecovery.h"
24
#include "access/xlogutils.h"
25
#include "miscadmin.h"
26
#include "pgstat.h"
27
#include "replication/slot.h"
28
#include "storage/bufmgr.h"
29
#include "storage/proc.h"
30
#include "storage/procarray.h"
31
#include "storage/sinvaladt.h"
32
#include "storage/standby.h"
33
#include "utils/hsearch.h"
34
#include "utils/injection_point.h"
35
#include "utils/ps_status.h"
36
#include "utils/timeout.h"
37
#include "utils/timestamp.h"
38
39
/* User-settable GUC parameters */
40
int     max_standby_archive_delay = 30 * 1000;
41
int     max_standby_streaming_delay = 30 * 1000;
42
bool    log_recovery_conflict_waits = false;
43
44
/*
45
 * Keep track of all the exclusive locks owned by original transactions.
46
 * For each known exclusive lock, there is a RecoveryLockEntry in the
47
 * RecoveryLockHash hash table.  All RecoveryLockEntrys belonging to a
48
 * given XID are chained together so that we can find them easily.
49
 * For each original transaction that is known to have any such locks,
50
 * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
51
 * which stores the head of the chain of its locks.
52
 */
53
typedef struct RecoveryLockEntry
54
{
55
  xl_standby_lock key;    /* hash key: xid, dbOid, relOid */
56
  struct RecoveryLockEntry *next; /* chain link */
57
} RecoveryLockEntry;
58
59
typedef struct RecoveryLockXidEntry
60
{
61
  TransactionId xid;      /* hash key -- must be first */
62
  struct RecoveryLockEntry *head; /* chain head */
63
} RecoveryLockXidEntry;
64
65
static HTAB *RecoveryLockHash = NULL;
66
static HTAB *RecoveryLockXidHash = NULL;
67
68
/* Flags set by timeout handlers */
69
static volatile sig_atomic_t got_standby_deadlock_timeout = false;
70
static volatile sig_atomic_t got_standby_delay_timeout = false;
71
static volatile sig_atomic_t got_standby_lock_timeout = false;
72
73
static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
74
                           ProcSignalReason reason,
75
                           uint32 wait_event_info,
76
                           bool report_waiting);
77
static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
78
static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
79
static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
80
static const char *get_recovery_conflict_desc(ProcSignalReason reason);
81
82
/*
83
 * InitRecoveryTransactionEnvironment
84
 *    Initialize tracking of our primary's in-progress transactions.
85
 *
86
 * We need to issue shared invalidations and hold locks. Holding locks
87
 * means others may want to wait on us, so we need to make a lock table
88
 * vxact entry like a real transaction. We could create and delete
89
 *  lock table entries for each transaction but it's simpler just to create
90
 * one permanent entry and leave it there all the time. Locks are then
91
 * acquired and released as needed. Yes, this means you can see the
92
 * Startup process in pg_locks once we have run this.
93
 */
94
void
95
InitRecoveryTransactionEnvironment(void)
96
0
{
97
0
  VirtualTransactionId vxid;
98
0
  HASHCTL   hash_ctl;
99
100
0
  Assert(RecoveryLockHash == NULL); /* don't run this twice */
101
102
  /*
103
   * Initialize the hash tables for tracking the locks held by each
104
   * transaction.
105
   */
106
0
  hash_ctl.keysize = sizeof(xl_standby_lock);
107
0
  hash_ctl.entrysize = sizeof(RecoveryLockEntry);
108
0
  RecoveryLockHash = hash_create("RecoveryLockHash",
109
0
                   64,
110
0
                   &hash_ctl,
111
0
                   HASH_ELEM | HASH_BLOBS);
112
0
  hash_ctl.keysize = sizeof(TransactionId);
113
0
  hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
114
0
  RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
115
0
                    64,
116
0
                    &hash_ctl,
117
0
                    HASH_ELEM | HASH_BLOBS);
118
119
  /*
120
   * Initialize shared invalidation management for Startup process, being
121
   * careful to register ourselves as a sendOnly process so we don't need to
122
   * read messages, nor will we get signaled when the queue starts filling
123
   * up.
124
   */
125
0
  SharedInvalBackendInit(true);
126
127
  /*
128
   * Lock a virtual transaction id for Startup process.
129
   *
130
   * We need to do GetNextLocalTransactionId() because
131
   * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
132
   * manager doesn't like that at all.
133
   *
134
   * Note that we don't need to run XactLockTableInsert() because nobody
135
   * needs to wait on xids. That sounds a little strange, but table locks
136
   * are held by vxids and row level locks are held by xids. All queries
137
   * hold AccessShareLocks so never block while we write or lock new rows.
138
   */
139
0
  MyProc->vxid.procNumber = MyProcNumber;
140
0
  vxid.procNumber = MyProcNumber;
141
0
  vxid.localTransactionId = GetNextLocalTransactionId();
142
0
  VirtualXactLockTableInsert(vxid);
143
144
0
  standbyState = STANDBY_INITIALIZED;
145
0
}
146
147
/*
148
 * ShutdownRecoveryTransactionEnvironment
149
 *    Shut down transaction tracking
150
 *
151
 * Prepare to switch from hot standby mode to normal operation. Shut down
152
 * recovery-time transaction tracking.
153
 *
154
 * This must be called even in shutdown of startup process if transaction
155
 * tracking has been initialized. Otherwise some locks the tracked
156
 * transactions were holding will not be released and may interfere with
157
 * the processes still running (but will exit soon later) at the exit of
158
 * startup process.
159
 */
160
void
161
ShutdownRecoveryTransactionEnvironment(void)
162
0
{
163
  /*
164
   * Do nothing if RecoveryLockHash is NULL because that means that
165
   * transaction tracking has not yet been initialized or has already been
166
   * shut down.  This makes it safe to have possibly-redundant calls of this
167
   * function during process exit.
168
   */
169
0
  if (RecoveryLockHash == NULL)
170
0
    return;
171
172
  /* Mark all tracked in-progress transactions as finished. */
173
0
  ExpireAllKnownAssignedTransactionIds();
174
175
  /* Release all locks the tracked transactions were holding */
176
0
  StandbyReleaseAllLocks();
177
178
  /* Destroy the lock hash tables. */
179
0
  hash_destroy(RecoveryLockHash);
180
0
  hash_destroy(RecoveryLockXidHash);
181
0
  RecoveryLockHash = NULL;
182
0
  RecoveryLockXidHash = NULL;
183
184
  /* Cleanup our VirtualTransaction */
185
0
  VirtualXactLockTableCleanup();
186
0
}
187
188
189
/*
190
 * -----------------------------------------------------
191
 *    Standby wait timers and backend cancel logic
192
 * -----------------------------------------------------
193
 */
194
195
/*
196
 * Determine the cutoff time at which we want to start canceling conflicting
197
 * transactions.  Returns zero (a time safely in the past) if we are willing
198
 * to wait forever.
199
 */
200
static TimestampTz
201
GetStandbyLimitTime(void)
202
0
{
203
0
  TimestampTz rtime;
204
0
  bool    fromStream;
205
206
  /*
207
   * The cutoff time is the last WAL data receipt time plus the appropriate
208
   * delay variable.  Delay of -1 means wait forever.
209
   */
210
0
  GetXLogReceiptTime(&rtime, &fromStream);
211
0
  if (fromStream)
212
0
  {
213
0
    if (max_standby_streaming_delay < 0)
214
0
      return 0;     /* wait forever */
215
0
    return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
216
0
  }
217
0
  else
218
0
  {
219
0
    if (max_standby_archive_delay < 0)
220
0
      return 0;     /* wait forever */
221
0
    return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
222
0
  }
223
0
}
224
225
0
#define STANDBY_INITIAL_WAIT_US  1000
226
static int  standbyWait_us = STANDBY_INITIAL_WAIT_US;
227
228
/*
229
 * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
230
 * We wait here for a while then return. If we decide we can't wait any
231
 * more then we return true, if we can wait some more return false.
232
 */
233
static bool
234
WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
235
0
{
236
0
  TimestampTz ltime;
237
238
0
  CHECK_FOR_INTERRUPTS();
239
240
  /* Are we past the limit time? */
241
0
  ltime = GetStandbyLimitTime();
242
0
  if (ltime && GetCurrentTimestamp() >= ltime)
243
0
    return true;
244
245
  /*
246
   * Sleep a bit (this is essential to avoid busy-waiting).
247
   */
248
0
  pgstat_report_wait_start(wait_event_info);
249
0
  pg_usleep(standbyWait_us);
250
0
  pgstat_report_wait_end();
251
252
  /*
253
   * Progressively increase the sleep times, but not to more than 1s, since
254
   * pg_usleep isn't interruptible on some platforms.
255
   */
256
0
  standbyWait_us *= 2;
257
0
  if (standbyWait_us > 1000000)
258
0
    standbyWait_us = 1000000;
259
260
0
  return false;
261
0
}
262
263
/*
264
 * Log the recovery conflict.
265
 *
266
 * wait_start is the timestamp when the caller started to wait.
267
 * now is the timestamp when this function has been called.
268
 * wait_list is the list of virtual transaction ids assigned to
269
 * conflicting processes. still_waiting indicates whether
270
 * the startup process is still waiting for the recovery conflict
271
 * to be resolved or not.
272
 */
273
void
274
LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
275
          TimestampTz now, VirtualTransactionId *wait_list,
276
          bool still_waiting)
277
0
{
278
0
  long    secs;
279
0
  int     usecs;
280
0
  long    msecs;
281
0
  StringInfoData buf;
282
0
  int     nprocs = 0;
283
284
  /*
285
   * There must be no conflicting processes when the recovery conflict has
286
   * already been resolved.
287
   */
288
0
  Assert(still_waiting || wait_list == NULL);
289
290
0
  TimestampDifference(wait_start, now, &secs, &usecs);
291
0
  msecs = secs * 1000 + usecs / 1000;
292
0
  usecs = usecs % 1000;
293
294
0
  if (wait_list)
295
0
  {
296
0
    VirtualTransactionId *vxids;
297
298
    /* Construct a string of list of the conflicting processes */
299
0
    vxids = wait_list;
300
0
    while (VirtualTransactionIdIsValid(*vxids))
301
0
    {
302
0
      PGPROC     *proc = ProcNumberGetProc(vxids->procNumber);
303
304
      /* proc can be NULL if the target backend is not active */
305
0
      if (proc)
306
0
      {
307
0
        if (nprocs == 0)
308
0
        {
309
0
          initStringInfo(&buf);
310
0
          appendStringInfo(&buf, "%d", proc->pid);
311
0
        }
312
0
        else
313
0
          appendStringInfo(&buf, ", %d", proc->pid);
314
315
0
        nprocs++;
316
0
      }
317
318
0
      vxids++;
319
0
    }
320
0
  }
321
322
  /*
323
   * If wait_list is specified, report the list of PIDs of active
324
   * conflicting backends in a detail message. Note that if all the backends
325
   * in the list are not active, no detail message is logged.
326
   */
327
0
  if (still_waiting)
328
0
  {
329
0
    ereport(LOG,
330
0
        errmsg("recovery still waiting after %ld.%03d ms: %s",
331
0
             msecs, usecs, get_recovery_conflict_desc(reason)),
332
0
        nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
333
0
                          "Conflicting processes: %s.",
334
0
                          nprocs, buf.data) : 0);
335
0
  }
336
0
  else
337
0
  {
338
0
    ereport(LOG,
339
0
        errmsg("recovery finished waiting after %ld.%03d ms: %s",
340
0
             msecs, usecs, get_recovery_conflict_desc(reason)));
341
0
  }
342
343
0
  if (nprocs > 0)
344
0
    pfree(buf.data);
345
0
}
346
347
/*
348
 * This is the main executioner for any query backend that conflicts with
349
 * recovery processing. Judgement has already been passed on it within
350
 * a specific rmgr. Here we just issue the orders to the procs. The procs
351
 * then throw the required error as instructed.
352
 *
353
 * If report_waiting is true, "waiting" is reported in PS display and the
354
 * wait for recovery conflict is reported in the log, if necessary. If
355
 * the caller is responsible for reporting them, report_waiting should be
356
 * false. Otherwise, both the caller and this function report the same
357
 * thing unexpectedly.
358
 */
359
static void
360
ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
361
                     ProcSignalReason reason, uint32 wait_event_info,
362
                     bool report_waiting)
363
0
{
364
0
  TimestampTz waitStart = 0;
365
0
  bool    waiting = false;
366
0
  bool    logged_recovery_conflict = false;
367
368
  /* Fast exit, to avoid a kernel call if there's no work to be done. */
369
0
  if (!VirtualTransactionIdIsValid(*waitlist))
370
0
    return;
371
372
  /* Set the wait start timestamp for reporting */
373
0
  if (report_waiting && (log_recovery_conflict_waits || update_process_title))
374
0
    waitStart = GetCurrentTimestamp();
375
376
0
  while (VirtualTransactionIdIsValid(*waitlist))
377
0
  {
378
    /* reset standbyWait_us for each xact we wait for */
379
0
    standbyWait_us = STANDBY_INITIAL_WAIT_US;
380
381
    /* wait until the virtual xid is gone */
382
0
    while (!VirtualXactLock(*waitlist, false))
383
0
    {
384
      /* Is it time to kill it? */
385
0
      if (WaitExceedsMaxStandbyDelay(wait_event_info))
386
0
      {
387
0
        pid_t   pid;
388
389
        /*
390
         * Now find out who to throw out of the balloon.
391
         */
392
0
        Assert(VirtualTransactionIdIsValid(*waitlist));
393
0
        pid = CancelVirtualTransaction(*waitlist, reason);
394
395
        /*
396
         * Wait a little bit for it to die so that we avoid flooding
397
         * an unresponsive backend when system is heavily loaded.
398
         */
399
0
        if (pid != 0)
400
0
          pg_usleep(5000L);
401
0
      }
402
403
0
      if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
404
0
      {
405
0
        TimestampTz now = 0;
406
0
        bool    maybe_log_conflict;
407
0
        bool    maybe_update_title;
408
409
0
        maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
410
0
        maybe_update_title = (update_process_title && !waiting);
411
412
        /* Get the current timestamp if not reported yet */
413
0
        if (maybe_log_conflict || maybe_update_title)
414
0
          now = GetCurrentTimestamp();
415
416
        /*
417
         * Report via ps if we have been waiting for more than 500
418
         * msec (should that be configurable?)
419
         */
420
0
        if (maybe_update_title &&
421
0
          TimestampDifferenceExceeds(waitStart, now, 500))
422
0
        {
423
0
          set_ps_display_suffix("waiting");
424
0
          waiting = true;
425
0
        }
426
427
        /*
428
         * Emit the log message if the startup process is waiting
429
         * longer than deadlock_timeout for recovery conflict.
430
         */
431
0
        if (maybe_log_conflict &&
432
0
          TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
433
0
        {
434
0
          LogRecoveryConflict(reason, waitStart, now, waitlist, true);
435
0
          logged_recovery_conflict = true;
436
0
        }
437
0
      }
438
0
    }
439
440
    /* The virtual transaction is gone now, wait for the next one */
441
0
    waitlist++;
442
0
  }
443
444
  /*
445
   * Emit the log message if recovery conflict was resolved but the startup
446
   * process waited longer than deadlock_timeout for it.
447
   */
448
0
  if (logged_recovery_conflict)
449
0
    LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
450
0
              NULL, false);
451
452
  /* reset ps display to remove the suffix if we added one */
453
0
  if (waiting)
454
0
    set_ps_display_remove_suffix();
455
456
0
}
457
458
/*
459
 * Generate whatever recovery conflicts are needed to eliminate snapshots that
460
 * might see XIDs <= snapshotConflictHorizon as still running.
461
 *
462
 * snapshotConflictHorizon cutoffs are our standard approach to generating
463
 * granular recovery conflicts.  Note that InvalidTransactionId values are
464
 * interpreted as "definitely don't need any conflicts" here, which is a
465
 * general convention that WAL records can (and often do) depend on.
466
 */
467
void
468
ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
469
                  bool isCatalogRel,
470
                  RelFileLocator locator)
471
0
{
472
0
  VirtualTransactionId *backends;
473
474
  /*
475
   * If we get passed InvalidTransactionId then we do nothing (no conflict).
476
   *
477
   * This can happen when replaying already-applied WAL records after a
478
   * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
479
   * record that marks as frozen a page which was already all-visible.  It's
480
   * also quite common with records generated during index deletion
481
   * (original execution of the deletion can reason that a recovery conflict
482
   * which is sufficient for the deletion operation must take place before
483
   * replay of the deletion record itself).
484
   */
485
0
  if (!TransactionIdIsValid(snapshotConflictHorizon))
486
0
    return;
487
488
0
  Assert(TransactionIdIsNormal(snapshotConflictHorizon));
489
0
  backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
490
0
                     locator.dbOid);
491
0
  ResolveRecoveryConflictWithVirtualXIDs(backends,
492
0
                       PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
493
0
                       WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
494
0
                       true);
495
496
  /*
497
   * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
498
   * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
499
   * seems OK, given that this kind of conflict should not normally be
500
   * reached, e.g. due to using a physical replication slot.
501
   */
502
0
  if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
503
0
    InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
504
0
                       snapshotConflictHorizon);
505
0
}
506
507
/*
508
 * Variant of ResolveRecoveryConflictWithSnapshot that works with
509
 * FullTransactionId values
510
 */
511
void
512
ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
513
                       bool isCatalogRel,
514
                       RelFileLocator locator)
515
0
{
516
  /*
517
   * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
518
   * so truncate the logged FullTransactionId.  If the logged value is very
519
   * old, so that XID wrap-around already happened on it, there can't be any
520
   * snapshots that still see it.
521
   */
522
0
  FullTransactionId nextXid = ReadNextFullTransactionId();
523
0
  uint64    diff;
524
525
0
  diff = U64FromFullTransactionId(nextXid) -
526
0
    U64FromFullTransactionId(snapshotConflictHorizon);
527
0
  if (diff < MaxTransactionId / 2)
528
0
  {
529
0
    TransactionId truncated;
530
531
0
    truncated = XidFromFullTransactionId(snapshotConflictHorizon);
532
0
    ResolveRecoveryConflictWithSnapshot(truncated,
533
0
                      isCatalogRel,
534
0
                      locator);
535
0
  }
536
0
}
537
538
void
539
ResolveRecoveryConflictWithTablespace(Oid tsid)
540
0
{
541
0
  VirtualTransactionId *temp_file_users;
542
543
  /*
544
   * Standby users may be currently using this tablespace for their
545
   * temporary files. We only care about current users because
546
   * temp_tablespace parameter will just ignore tablespaces that no longer
547
   * exist.
548
   *
549
   * Ask everybody to cancel their queries immediately so we can ensure no
550
   * temp files remain and we can remove the tablespace. Nuke the entire
551
   * site from orbit, it's the only way to be sure.
552
   *
553
   * XXX: We could work out the pids of active backends using this
554
   * tablespace by examining the temp filenames in the directory. We would
555
   * then convert the pids into VirtualXIDs before attempting to cancel
556
   * them.
557
   *
558
   * We don't wait for commit because drop tablespace is non-transactional.
559
   */
560
0
  temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
561
0
                        InvalidOid);
562
0
  ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
563
0
                       PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
564
0
                       WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
565
0
                       true);
566
0
}
567
568
void
569
ResolveRecoveryConflictWithDatabase(Oid dbid)
570
0
{
571
  /*
572
   * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
573
   * only waits for transactions and completely idle sessions would block
574
   * us. This is rare enough that we do this as simply as possible: no wait,
575
   * just force them off immediately.
576
   *
577
   * No locking is required here because we already acquired
578
   * AccessExclusiveLock. Anybody trying to connect while we do this will
579
   * block during InitPostgres() and then disconnect when they see the
580
   * database has been removed.
581
   */
582
0
  while (CountDBBackends(dbid) > 0)
583
0
  {
584
0
    CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
585
586
    /*
587
     * Wait awhile for them to die so that we avoid flooding an
588
     * unresponsive backend when system is heavily loaded.
589
     */
590
0
    pg_usleep(10000);
591
0
  }
592
0
}
593
594
/*
595
 * ResolveRecoveryConflictWithLock is called from ProcSleep()
596
 * to resolve conflicts with other backends holding relation locks.
597
 *
598
 * The WaitLatch sleep normally done in ProcSleep()
599
 * (when not InHotStandby) is performed here, for code clarity.
600
 *
601
 * We either resolve conflicts immediately or set a timeout to wake us at
602
 * the limit of our patience.
603
 *
604
 * Resolve conflicts by canceling to all backends holding a conflicting
605
 * lock.  As we are already queued to be granted the lock, no new lock
606
 * requests conflicting with ours will be granted in the meantime.
607
 *
608
 * We also must check for deadlocks involving the Startup process and
609
 * hot-standby backend processes. If deadlock_timeout is reached in
610
 * this function, all the backends holding the conflicting locks are
611
 * requested to check themselves for deadlocks.
612
 *
613
 * logging_conflict should be true if the recovery conflict has not been
614
 * logged yet even though logging is enabled. After deadlock_timeout is
615
 * reached and the request for deadlock check is sent, we wait again to
616
 * be signaled by the release of the lock if logging_conflict is false.
617
 * Otherwise we return without waiting again so that the caller can report
618
 * the recovery conflict. In this case, then, this function is called again
619
 * with logging_conflict=false (because the recovery conflict has already
620
 * been logged) and we will wait again for the lock to be released.
621
 */
622
void
623
ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
624
0
{
625
0
  TimestampTz ltime;
626
0
  TimestampTz now;
627
628
0
  Assert(InHotStandby);
629
630
0
  ltime = GetStandbyLimitTime();
631
0
  now = GetCurrentTimestamp();
632
633
  /*
634
   * Update waitStart if first time through after the startup process
635
   * started waiting for the lock. It should not be updated every time
636
   * ResolveRecoveryConflictWithLock() is called during the wait.
637
   *
638
   * Use the current time obtained for comparison with ltime as waitStart
639
   * (i.e., the time when this process started waiting for the lock). Since
640
   * getting the current time newly can cause overhead, we reuse the
641
   * already-obtained time to avoid that overhead.
642
   *
643
   * Note that waitStart is updated without holding the lock table's
644
   * partition lock, to avoid the overhead by additional lock acquisition.
645
   * This can cause "waitstart" in pg_locks to become NULL for a very short
646
   * period of time after the wait started even though "granted" is false.
647
   * This is OK in practice because we can assume that users are likely to
648
   * look at "waitstart" when waiting for the lock for a long time.
649
   */
650
0
  if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
651
0
    pg_atomic_write_u64(&MyProc->waitStart, now);
652
653
0
  if (now >= ltime && ltime != 0)
654
0
  {
655
    /*
656
     * We're already behind, so clear a path as quickly as possible.
657
     */
658
0
    VirtualTransactionId *backends;
659
660
0
    backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
661
662
    /*
663
     * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
664
     * "waiting" in PS display by disabling its argument report_waiting
665
     * because the caller, WaitOnLock(), has already reported that.
666
     */
667
0
    ResolveRecoveryConflictWithVirtualXIDs(backends,
668
0
                         PROCSIG_RECOVERY_CONFLICT_LOCK,
669
0
                         PG_WAIT_LOCK | locktag.locktag_type,
670
0
                         false);
671
0
  }
672
0
  else
673
0
  {
674
    /*
675
     * Wait (or wait again) until ltime, and check for deadlocks as well
676
     * if we will be waiting longer than deadlock_timeout
677
     */
678
0
    EnableTimeoutParams timeouts[2];
679
0
    int     cnt = 0;
680
681
0
    if (ltime != 0)
682
0
    {
683
0
      got_standby_lock_timeout = false;
684
0
      timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
685
0
      timeouts[cnt].type = TMPARAM_AT;
686
0
      timeouts[cnt].fin_time = ltime;
687
0
      cnt++;
688
0
    }
689
690
0
    got_standby_deadlock_timeout = false;
691
0
    timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
692
0
    timeouts[cnt].type = TMPARAM_AFTER;
693
0
    timeouts[cnt].delay_ms = DeadlockTimeout;
694
0
    cnt++;
695
696
0
    enable_timeouts(timeouts, cnt);
697
0
  }
698
699
  /* Wait to be signaled by the release of the Relation Lock */
700
0
  ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
701
702
  /*
703
   * Exit if ltime is reached. Then all the backends holding conflicting
704
   * locks will be canceled in the next ResolveRecoveryConflictWithLock()
705
   * call.
706
   */
707
0
  if (got_standby_lock_timeout)
708
0
    goto cleanup;
709
710
0
  if (got_standby_deadlock_timeout)
711
0
  {
712
0
    VirtualTransactionId *backends;
713
714
0
    backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
715
716
    /* Quick exit if there's no work to be done */
717
0
    if (!VirtualTransactionIdIsValid(*backends))
718
0
      goto cleanup;
719
720
    /*
721
     * Send signals to all the backends holding the conflicting locks, to
722
     * ask them to check themselves for deadlocks.
723
     */
724
0
    while (VirtualTransactionIdIsValid(*backends))
725
0
    {
726
0
      SignalVirtualTransaction(*backends,
727
0
                   PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
728
0
                   false);
729
0
      backends++;
730
0
    }
731
732
    /*
733
     * Exit if the recovery conflict has not been logged yet even though
734
     * logging is enabled, so that the caller can log that. Then
735
     * ResolveRecoveryConflictWithLock() is called again and we will wait again
736
     * for the lock to be released.
737
     */
738
0
    if (logging_conflict)
739
0
      goto cleanup;
740
741
    /*
742
     * Wait again here to be signaled by the release of the Relation Lock,
743
     * to prevent the subsequent ResolveRecoveryConflictWithLock() from causing
744
     * deadlock_timeout and sending a request for deadlocks check again.
745
     * Otherwise the request continues to be sent every deadlock_timeout
746
     * until the relation locks are released or ltime is reached.
747
     */
748
0
    got_standby_deadlock_timeout = false;
749
0
    ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
750
0
  }
751
752
0
cleanup:
753
754
  /*
755
   * Clear any timeout requests established above.  We assume here that the
756
   * Startup process doesn't have any other outstanding timeouts than those
757
   * used by this function. If that stops being true, we could cancel the
758
   * timeouts individually, but that'd be slower.
759
   */
760
0
  disable_all_timeouts(false);
761
0
  got_standby_lock_timeout = false;
762
0
  got_standby_deadlock_timeout = false;
763
0
}
764
765
/*
766
 * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
767
 * to resolve conflicts with other backends holding buffer pins.
768
 *
769
 * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
770
 * (when not InHotStandby) is performed here, for code clarity.
771
 *
772
 * We either resolve conflicts immediately or set a timeout to wake us at
773
 * the limit of our patience.
774
 *
775
 * Resolve conflicts by sending a PROCSIG signal to all backends to check if
776
 * they hold one of the buffer pins that is blocking Startup process. If so,
777
 * those backends will take an appropriate error action, ERROR or FATAL.
778
 *
779
 * We also must check for deadlocks.  Deadlocks occur because if queries
780
 * wait on a lock, that must be behind an AccessExclusiveLock, which can only
781
 * be cleared if the Startup process replays a transaction completion record.
782
 * If Startup process is also waiting then that is a deadlock. The deadlock
783
 * can occur if the query is waiting and then the Startup sleeps, or if
784
 * Startup is sleeping and the query waits on a lock. We protect against
785
 * only the former sequence here, the latter sequence is checked prior to
786
 * the query sleeping, in CheckRecoveryConflictDeadlock().
787
 *
788
 * Deadlocks are extremely rare, and relatively expensive to check for,
789
 * so we don't do a deadlock check right away ... only if we have had to wait
790
 * at least deadlock_timeout.
791
 */
792
void
793
ResolveRecoveryConflictWithBufferPin(void)
794
0
{
795
0
  TimestampTz ltime;
796
797
0
  Assert(InHotStandby);
798
799
0
  ltime = GetStandbyLimitTime();
800
801
0
  if (GetCurrentTimestamp() >= ltime && ltime != 0)
802
0
  {
803
    /*
804
     * We're already behind, so clear a path as quickly as possible.
805
     */
806
0
    SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
807
0
  }
808
0
  else
809
0
  {
810
    /*
811
     * Wake up at ltime, and check for deadlocks as well if we will be
812
     * waiting longer than deadlock_timeout
813
     */
814
0
    EnableTimeoutParams timeouts[2];
815
0
    int     cnt = 0;
816
817
0
    if (ltime != 0)
818
0
    {
819
0
      timeouts[cnt].id = STANDBY_TIMEOUT;
820
0
      timeouts[cnt].type = TMPARAM_AT;
821
0
      timeouts[cnt].fin_time = ltime;
822
0
      cnt++;
823
0
    }
824
825
0
    got_standby_deadlock_timeout = false;
826
0
    timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
827
0
    timeouts[cnt].type = TMPARAM_AFTER;
828
0
    timeouts[cnt].delay_ms = DeadlockTimeout;
829
0
    cnt++;
830
831
0
    enable_timeouts(timeouts, cnt);
832
0
  }
833
834
  /*
835
   * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
836
   * by one of the timeouts established above.
837
   *
838
   * We assume that only UnpinBuffer() and the timeout requests established
839
   * above can wake us up here. WakeupRecovery() called by walreceiver or
840
   * SIGHUP signal handler, etc cannot do that because it uses the different
841
   * latch from that ProcWaitForSignal() waits on.
842
   */
843
0
  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
844
845
0
  if (got_standby_delay_timeout)
846
0
    SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
847
0
  else if (got_standby_deadlock_timeout)
848
0
  {
849
    /*
850
     * Send out a request for hot-standby backends to check themselves for
851
     * deadlocks.
852
     *
853
     * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
854
     * to be signaled by UnpinBuffer() again and send a request for
855
     * deadlocks check if deadlock_timeout happens. This causes the
856
     * request to continue to be sent every deadlock_timeout until the
857
     * buffer is unpinned or ltime is reached. This would increase the
858
     * workload in the startup process and backends. In practice it may
859
     * not be so harmful because the period that the buffer is kept pinned
860
     * is basically not so long. But we should fix this?
861
     */
862
0
    SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
863
0
  }
864
865
  /*
866
   * Clear any timeout requests established above.  We assume here that the
867
   * Startup process doesn't have any other timeouts than what this function
868
   * uses.  If that stops being true, we could cancel the timeouts
869
   * individually, but that'd be slower.
870
   */
871
0
  disable_all_timeouts(false);
872
0
  got_standby_delay_timeout = false;
873
0
  got_standby_deadlock_timeout = false;
874
0
}
875
876
/*
 * Broadcast a recovery-conflict signal for a buffer pin (or possible
 * buffer-pin deadlock) to all backends.  Called by the Startup process
 * when its wait for a pinned buffer has timed out.
 */
static void
SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
{
  Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
       reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);

  /*
   * We send signal to all backends to ask them if they are holding the
   * buffer pin which is delaying the Startup process. We must not set the
   * conflict flag yet, since most backends will be innocent. Let the
   * SIGUSR1 handling in each backend decide their own fate.
   */
  CancelDBBackends(InvalidOid, reason, false);
}
890
891
/*
 * In Hot Standby perform early deadlock detection.  We abort the lock
 * wait if we are about to sleep while holding the buffer pin that Startup
 * process is waiting for.
 *
 * Note: this code is pessimistic, because there is no way for it to
 * determine whether an actual deadlock condition is present: the lock we
 * need to wait for might be unrelated to any held by the Startup process.
 * Sooner or later, this mechanism should get ripped out in favor of somehow
 * accounting for buffer locks in DeadLockCheck().  However, errors here
 * seem to be very low-probability in practice, so for now it's not worth
 * the trouble.
 */
void
CheckRecoveryConflictDeadlock(void)
{
  Assert(!InRecovery);    /* do not call in Startup process */

  /* No conflict unless we hold a pin the Startup process is blocked on. */
  if (!HoldingBufferPinThatDelaysRecovery())
    return;

  /*
   * Error message should match ProcessInterrupts() but we avoid calling
   * that because we aren't handling an interrupt at this point. Note that
   * we only cancel the current transaction here, so if we are in a
   * subtransaction and the pin is held by a parent, then the Startup
   * process will continue to wait even though we have avoided deadlock.
   */
  ereport(ERROR,
      (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
       errmsg("canceling statement due to conflict with recovery"),
       errdetail("User transaction caused buffer deadlock with recovery.")));
}
924
925
926
/* --------------------------------
927
 *    timeout handler routines
928
 * --------------------------------
929
 */
930
931
/*
 * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
 * exceeded.
 *
 * Runs in timeout-interrupt context, so it only sets a flag that the
 * Startup process's main wait loop inspects afterwards.
 */
void
StandbyDeadLockHandler(void)
{
  got_standby_deadlock_timeout = true;
}
940
941
/*
 * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
 *
 * Runs in timeout-interrupt context; just records that the max standby
 * delay has elapsed so the wait loop can send a buffer-pin conflict.
 */
void
StandbyTimeoutHandler(void)
{
  got_standby_delay_timeout = true;
}
949
950
/*
 * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
 *
 * Runs in timeout-interrupt context; just records the timeout so the
 * lock-conflict wait loop can react.
 */
void
StandbyLockTimeoutHandler(void)
{
  got_standby_lock_timeout = true;
}
958
959
/*
960
 * -----------------------------------------------------
961
 * Locking in Recovery Mode
962
 * -----------------------------------------------------
963
 *
964
 * All locks are held by the Startup process using a single virtual
965
 * transaction. This implementation is both simpler and in some senses,
966
 * more correct. The locks held mean "some original transaction held
967
 * this lock, so query access is not allowed at this time". So the Startup
968
 * process is the proxy by which the original locks are implemented.
969
 *
970
 * We only keep track of AccessExclusiveLocks, which are only ever held by
971
 * one transaction on one relation.
972
 *
973
 * We keep a table of known locks in the RecoveryLockHash hash table.
974
 * The point of that table is to let us efficiently de-duplicate locks,
975
 * which is important because checkpoints will re-report the same locks
976
 * already held.  There is also a RecoveryLockXidHash table with one entry
977
 * per xid, which allows us to efficiently find all the locks held by a
978
 * given original transaction.
979
 *
980
 * We use session locks rather than normal locks so we don't need
981
 * ResourceOwners.
982
 */
983
984
985
/*
 * Register that original transaction "xid" holds an AccessExclusiveLock on
 * relation (dbOid, relOid), and acquire that lock locally in the Startup
 * process on its behalf.
 *
 * Re-reports of a lock we already hold (checkpoints re-log existing locks)
 * are de-duplicated via RecoveryLockHash.  Locks for transactions that are
 * already known committed or aborted are ignored.
 */
void
StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
{
  RecoveryLockXidEntry *xidentry;
  RecoveryLockEntry *lockentry;
  xl_standby_lock key;
  LOCKTAG   locktag;
  bool    found;

  /* Already processed? */
  if (!TransactionIdIsValid(xid) ||
    TransactionIdDidCommit(xid) ||
    TransactionIdDidAbort(xid))
    return;

  elog(DEBUG4, "adding recovery lock: db %u rel %u", dbOid, relOid);

  /* dbOid is InvalidOid when we are locking a shared relation. */
  Assert(OidIsValid(relOid));

  /* Create a hash entry for this xid, if we don't have one already. */
  xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
  if (!found)
  {
    Assert(xidentry->xid == xid); /* dynahash should have set this */
    xidentry->head = NULL;
  }

  /* Create a hash entry for this lock, unless we have one already. */
  key.xid = xid;
  key.dbOid = dbOid;
  key.relOid = relOid;
  lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
  if (!found)
  {
    /* It's new, so link it into the XID's list ... */
    lockentry->next = xidentry->head;
    xidentry->head = lockentry;

    /* ... and acquire the lock locally. */
    SET_LOCKTAG_RELATION(locktag, dbOid, relOid);

    /* Session lock (sessionLock = true), so no ResourceOwner is needed. */
    (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
  }
}
1030
1031
/*
 * Release all the locks associated with this RecoveryLockXidEntry.
 *
 * Walks the entry's lock chain, releasing each session lock locally and
 * removing its RecoveryLockHash entry.  The RecoveryLockXidEntry itself
 * is left in place; callers remove it from RecoveryLockXidHash.
 */
static void
StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
{
  RecoveryLockEntry *entry;
  RecoveryLockEntry *next;

  for (entry = xidentry->head; entry != NULL; entry = next)
  {
    LOCKTAG   locktag;

    elog(DEBUG4,
       "releasing recovery lock: xid %u db %u rel %u",
       entry->key.xid, entry->key.dbOid, entry->key.relOid);
    /* Release the lock ... */
    SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
    if (!LockRelease(&locktag, AccessExclusiveLock, true))
    {
      /* Shouldn't happen; the hash table and lock manager disagree. */
      elog(LOG,
         "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
         entry->key.xid, entry->key.dbOid, entry->key.relOid);
      Assert(false);
    }
    /* ... and remove the per-lock hash entry */
    next = entry->next;
    hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
  }

  xidentry->head = NULL;    /* just for paranoia */
}
1063
1064
/*
1065
 * Release locks for specific XID, or all locks if it's InvalidXid.
1066
 */
1067
static void
1068
StandbyReleaseLocks(TransactionId xid)
1069
0
{
1070
0
  RecoveryLockXidEntry *entry;
1071
1072
0
  if (TransactionIdIsValid(xid))
1073
0
  {
1074
0
    if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
1075
0
    {
1076
0
      StandbyReleaseXidEntryLocks(entry);
1077
0
      hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
1078
0
    }
1079
0
  }
1080
0
  else
1081
0
    StandbyReleaseAllLocks();
1082
0
}
1083
1084
/*
1085
 * Release locks for a transaction tree, starting at xid down, from
1086
 * RecoveryLockXidHash.
1087
 *
1088
 * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
1089
 * to remove any AccessExclusiveLocks requested by a transaction.
1090
 */
1091
void
1092
StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
1093
0
{
1094
0
  int     i;
1095
1096
0
  StandbyReleaseLocks(xid);
1097
1098
0
  for (i = 0; i < nsubxids; i++)
1099
0
    StandbyReleaseLocks(subxids[i]);
1100
0
}
1101
1102
/*
 * Called at end of recovery and when we see a shutdown checkpoint.
 *
 * Releases every recovery lock we hold and empties both hash tables.
 */
void
StandbyReleaseAllLocks(void)
{
  HASH_SEQ_STATUS status;
  RecoveryLockXidEntry *entry;

  elog(DEBUG2, "release all standby locks");

  /* Removing entries during a seq scan is safe with dynahash. */
  hash_seq_init(&status, RecoveryLockXidHash);
  while ((entry = hash_seq_search(&status)))
  {
    StandbyReleaseXidEntryLocks(entry);
    hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
  }
}
1120
1121
/*
 * StandbyReleaseOldLocks
 *    Release standby locks held by top-level XIDs that aren't running,
 *    as long as they're not prepared transactions.
 *
 * This is needed to prune the locks of crashed transactions, which didn't
 * write an ABORT/COMMIT record.
 */
void
StandbyReleaseOldLocks(TransactionId oldxid)
{
  HASH_SEQ_STATUS status;
  RecoveryLockXidEntry *entry;

  hash_seq_init(&status, RecoveryLockXidHash);
  while ((entry = hash_seq_search(&status)))
  {
    Assert(TransactionIdIsValid(entry->xid));

    /* Skip if prepared transaction. */
    if (StandbyTransactionIdIsPrepared(entry->xid))
      continue;

    /* Skip if >= oldxid. */
    if (!TransactionIdPrecedes(entry->xid, oldxid))
      continue;

    /* Remove all locks and hash table entry. */
    StandbyReleaseXidEntryLocks(entry);
    hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
  }
}
1153
1154
/*
1155
 * --------------------------------------------------------------------
1156
 *    Recovery handling for Rmgr RM_STANDBY_ID
1157
 *
1158
 * These record types will only be created if XLogStandbyInfoActive()
1159
 * --------------------------------------------------------------------
1160
 */
1161
1162
/*
 * Redo handler for RM_STANDBY_ID records.
 *
 * Dispatches on the record type: standby locks are re-acquired, running-xacts
 * snapshots are applied to the recovery snapshot machinery, and standalone
 * invalidation messages are processed.  All of it is a no-op unless hot
 * standby is active.
 */
void
standby_redo(XLogReaderState *record)
{
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;

  /* Backup blocks are not used in standby records */
  Assert(!XLogRecHasAnyBlockRefs(record));

  /* Do nothing if we're not in hot standby mode */
  if (standbyState == STANDBY_DISABLED)
    return;

  if (info == XLOG_STANDBY_LOCK)
  {
    xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
    int     i;

    /* Re-acquire each logged AccessExclusiveLock on behalf of its xid. */
    for (i = 0; i < xlrec->nlocks; i++)
      StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
                        xlrec->locks[i].dbOid,
                        xlrec->locks[i].relOid);
  }
  else if (info == XLOG_RUNNING_XACTS)
  {
    xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
    RunningTransactionsData running;

    /* Unpack the WAL record into the in-memory representation. */
    running.xcnt = xlrec->xcnt;
    running.subxcnt = xlrec->subxcnt;
    running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
    running.nextXid = xlrec->nextXid;
    running.latestCompletedXid = xlrec->latestCompletedXid;
    running.oldestRunningXid = xlrec->oldestRunningXid;
    running.xids = xlrec->xids;

    ProcArrayApplyRecoveryInfo(&running);

    /*
     * The startup process currently has no convenient way to schedule
     * stats to be reported. XLOG_RUNNING_XACTS records issued at a
     * regular cadence, making this a convenient location to report stats.
     * While these records aren't generated with wal_level=minimal, stats
     * also cannot be accessed during WAL replay.
     */
    pgstat_report_stat(true);
  }
  else if (info == XLOG_INVALIDATIONS)
  {
    xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);

    ProcessCommittedInvalidationMessages(xlrec->msgs,
                       xlrec->nmsgs,
                       xlrec->relcacheInitFileInval,
                       xlrec->dbId,
                       xlrec->tsId);
  }
  else
    elog(PANIC, "standby_redo: unknown op code %u", info);
}
1221
1222
/*
1223
 * Log details of the current snapshot to WAL. This allows the snapshot state
1224
 * to be reconstructed on the standby and for logical decoding.
1225
 *
1226
 * This is used for Hot Standby as follows:
1227
 *
1228
 * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
1229
 * start from a shutdown checkpoint because we know nothing was running
1230
 * at that time and our recovery snapshot is known empty. In the more
1231
 * typical case of an online checkpoint we need to jump through a few
1232
 * hoops to get a correct recovery snapshot and this requires a two or
1233
 * sometimes a three stage process.
1234
 *
1235
 * The initial snapshot must contain all running xids and all current
1236
 * AccessExclusiveLocks at a point in time on the standby. Assembling
1237
 * that information while the server is running requires many and
1238
 * various LWLocks, so we choose to derive that information piece by
1239
 * piece and then re-assemble that info on the standby. When that
1240
 * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
1241
 *
1242
 * Since locking on the primary when we derive the information is not
1243
 * strict, we note that there is a time window between the derivation and
1244
 * writing to WAL of the derived information. That allows race conditions
1245
 * that we must resolve, since xids and locks may enter or leave the
1246
 * snapshot during that window. This creates the issue that an xid or
1247
 * lock may start *after* the snapshot has been derived yet *before* the
1248
 * snapshot is logged in the running xacts WAL record. We resolve this by
1249
 * starting to accumulate changes at a point just prior to when we derive
1250
 * the snapshot on the primary, then ignore duplicates when we later apply
1251
 * the snapshot from the running xacts record. This is implemented during
1252
 * CreateCheckPoint() where we use the logical checkpoint location as
1253
 * our starting point and then write the running xacts record immediately
1254
 * before writing the main checkpoint WAL record. Since we always start
1255
 * up from a checkpoint and are immediately at our starting point, we
1256
 * unconditionally move to STANDBY_INITIALIZED. After this point we
1257
 * must do 4 things:
1258
 *  * move shared nextXid forwards as we see new xids
1259
 *  * extend the clog and subtrans with each new xid
1260
 *  * keep track of uncommitted known assigned xids
1261
 *  * keep track of uncommitted AccessExclusiveLocks
1262
 *
1263
 * When we see a commit/abort we must remove known assigned xids and locks
1264
 * from the completing transaction. Attempted removals that cannot locate
1265
 * an entry are expected and must not cause an error when we are in state
1266
 * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
1267
 * KnownAssignedXidsRemove().
1268
 *
1269
 * Later, when we apply the running xact data we must be careful to ignore
1270
 * transactions already committed, since those commits raced ahead when
1271
 * making WAL entries.
1272
 *
1273
 * For logical decoding only the running xacts information is needed;
1274
 * there's no need to look at the locking information, but it's logged anyway,
1275
 * as there's no independent knob to just enable logical decoding. For
1276
 * details of how this is used, check snapbuild.c's introductory comment.
1277
 *
1278
 *
1279
 * Returns the RecPtr of the last inserted record.
1280
 */
1281
/*
 * Log the current AccessExclusiveLocks and running-xacts snapshot to WAL.
 * See the long comment above for how this fits into Hot Standby startup
 * and logical decoding.  Returns the LSN of the last record inserted.
 */
XLogRecPtr
LogStandbySnapshot(void)
{
  XLogRecPtr  recptr;
  RunningTransactions running;
  xl_standby_lock *locks;
  int     nlocks;

  Assert(XLogStandbyInfoActive());

#ifdef USE_INJECTION_POINTS
  if (IS_INJECTION_POINT_ATTACHED("skip-log-running-xacts"))
  {
    /*
     * This record could move slot's xmin forward during decoding, leading
     * to unpredictable results, so skip it when requested by the test.
     */
    return GetInsertRecPtr();
  }
#endif

  /*
   * Get details of any AccessExclusiveLocks being held at the moment.
   */
  locks = GetRunningTransactionLocks(&nlocks);
  if (nlocks > 0)
    LogAccessExclusiveLocks(nlocks, locks);
  pfree(locks);

  /*
   * Log details of all in-progress transactions. This should be the last
   * record we write, because standby will open up when it sees this.
   */
  running = GetRunningTransactionData();

  /*
   * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
   * For Hot Standby this can be done before inserting the WAL record
   * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
   * the clog. For logical decoding, though, the lock can't be released
   * early because the clog might be "in the future" from the POV of the
   * historic snapshot. This would allow for situations where we're waiting
   * for the end of a transaction listed in the xl_running_xacts record
   * which, according to the WAL, has committed before the xl_running_xacts
   * record. Fortunately this routine isn't executed frequently, and it's
   * only a shared lock.
   */
  if (wal_level < WAL_LEVEL_LOGICAL)
    LWLockRelease(ProcArrayLock);

  recptr = LogCurrentRunningXacts(running);

  /* Release lock if we kept it longer ... */
  if (wal_level >= WAL_LEVEL_LOGICAL)
    LWLockRelease(ProcArrayLock);

  /* GetRunningTransactionData() acquired XidGenLock, we must release it */
  LWLockRelease(XidGenLock);

  return recptr;
}
1342
1343
/*
 * Record an enhanced snapshot of running transactions into WAL.
 *
 * The definitions of RunningTransactionsData and xl_running_xacts are
 * similar. We keep them separate because xl_running_xacts is a contiguous
 * chunk of memory and never exists fully until it is assembled in WAL.
 * The inserted records are marked as not being important for durability,
 * to avoid triggering superfluous checkpoint / archiving activity.
 *
 * Returns the LSN of the inserted XLOG_RUNNING_XACTS record.
 */
static XLogRecPtr
LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
{
  xl_running_xacts xlrec;
  XLogRecPtr  recptr;

  /* Copy the fixed-size header fields into the WAL record image. */
  xlrec.xcnt = CurrRunningXacts->xcnt;
  xlrec.subxcnt = CurrRunningXacts->subxcnt;
  xlrec.subxid_overflow = (CurrRunningXacts->subxid_status != SUBXIDS_IN_ARRAY);
  xlrec.nextXid = CurrRunningXacts->nextXid;
  xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
  xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;

  /* Header */
  XLogBeginInsert();
  XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
  XLogRegisterData(&xlrec, MinSizeOfXactRunningXacts);

  /* array of TransactionIds */
  if (xlrec.xcnt > 0)
    XLogRegisterData(CurrRunningXacts->xids,
             (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));

  recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);

  if (xlrec.subxid_overflow)
    elog(DEBUG2,
       "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
       CurrRunningXacts->xcnt,
       LSN_FORMAT_ARGS(recptr),
       CurrRunningXacts->oldestRunningXid,
       CurrRunningXacts->latestCompletedXid,
       CurrRunningXacts->nextXid);
  else
    elog(DEBUG2,
       "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
       CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
       LSN_FORMAT_ARGS(recptr),
       CurrRunningXacts->oldestRunningXid,
       CurrRunningXacts->latestCompletedXid,
       CurrRunningXacts->nextXid);

  /*
   * Ensure running_xacts information is synced to disk not too far in the
   * future. We don't want to stall anything though (i.e. use XLogFlush()),
   * so we let the wal writer do it during normal operation.
   * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
   * and nudge the WALWriter into action if sleeping. Check
   * XLogBackgroundFlush() for details why a record might not be flushed
   * without it.
   */
  XLogSetAsyncXactLSN(recptr);

  return recptr;
}
1407
1408
/*
 * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
 * logged, as described in backend/storage/lmgr/README.
 *
 * Emits one XLOG_STANDBY_LOCK record containing all nlocks entries; the
 * record is marked unimportant so it does not force checkpoints/archiving.
 */
static void
LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
{
  xl_standby_locks xlrec;

  xlrec.nlocks = nlocks;

  XLogBeginInsert();
  XLogRegisterData(&xlrec, offsetof(xl_standby_locks, locks));
  XLogRegisterData(locks, nlocks * sizeof(xl_standby_lock));
  XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);

  (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
}
1426
1427
/*
 * Individual logging of AccessExclusiveLocks for use during LockAcquire()
 *
 * Logs a single lock on (dbOid, relOid) under the current transaction's
 * xid, and flags the transaction as having taken such a lock.
 */
void
LogAccessExclusiveLock(Oid dbOid, Oid relOid)
{
  xl_standby_lock xlrec;

  /* xid was already assigned by LogAccessExclusiveLockPrepare(). */
  xlrec.xid = GetCurrentTransactionId();

  xlrec.dbOid = dbOid;
  xlrec.relOid = relOid;

  LogAccessExclusiveLocks(1, &xlrec);
  MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
}
1443
1444
/*
 * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
 */
void
LogAccessExclusiveLockPrepare(void)
{
  /*
   * Ensure that a TransactionId has been assigned to this transaction, for
   * two reasons, both related to lock release on the standby. First, we
   * must assign an xid so that RecordTransactionCommit() and
   * RecordTransactionAbort() do not optimise away the transaction
   * completion record which recovery relies upon to release locks. It's a
   * hack, but for a corner case not worth adding code for into the main
   * commit path. Second, we must assign an xid before the lock is recorded
   * in shared memory, otherwise a concurrently executing
   * GetRunningTransactionLocks() might see a lock associated with an
   * InvalidTransactionId which we later assert cannot happen.
   */
  (void) GetCurrentTransactionId();
}
1464
1465
/*
 * Emit WAL for invalidations. This currently is only used for commits without
 * an xid but which contain invalidations.
 *
 * nmsgs/msgs is the array of shared-invalidation messages to log;
 * relcacheInitFileInval says whether the relcache init file must be
 * invalidated on the standby as well.
 */
void
LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
            bool relcacheInitFileInval)
{
  xl_invalidations xlrec;

  /* prepare record */
  memset(&xlrec, 0, sizeof(xlrec));
  xlrec.dbId = MyDatabaseId;
  xlrec.tsId = MyDatabaseTableSpace;
  xlrec.relcacheInitFileInval = relcacheInitFileInval;
  xlrec.nmsgs = nmsgs;

  /* perform insertion */
  XLogBeginInsert();
  XLogRegisterData(&xlrec, MinSizeOfInvalidations);
  XLogRegisterData(msgs,
           nmsgs * sizeof(SharedInvalidationMessage));
  XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
}
1489
1490
/* Return the description of recovery conflict */
1491
static const char *
1492
get_recovery_conflict_desc(ProcSignalReason reason)
1493
0
{
1494
0
  const char *reasonDesc = _("unknown reason");
1495
1496
0
  switch (reason)
1497
0
  {
1498
0
    case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
1499
0
      reasonDesc = _("recovery conflict on buffer pin");
1500
0
      break;
1501
0
    case PROCSIG_RECOVERY_CONFLICT_LOCK:
1502
0
      reasonDesc = _("recovery conflict on lock");
1503
0
      break;
1504
0
    case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
1505
0
      reasonDesc = _("recovery conflict on tablespace");
1506
0
      break;
1507
0
    case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
1508
0
      reasonDesc = _("recovery conflict on snapshot");
1509
0
      break;
1510
0
    case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
1511
0
      reasonDesc = _("recovery conflict on replication slot");
1512
0
      break;
1513
0
    case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
1514
0
      reasonDesc = _("recovery conflict on buffer deadlock");
1515
0
      break;
1516
0
    case PROCSIG_RECOVERY_CONFLICT_DATABASE:
1517
0
      reasonDesc = _("recovery conflict on database");
1518
0
      break;
1519
0
    default:
1520
0
      break;
1521
0
  }
1522
1523
0
  return reasonDesc;
1524
0
}