Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/utils/activity/pgstat_io.c
Line
Count
Source (jump to first uncovered line)
1
/* -------------------------------------------------------------------------
2
 *
3
 * pgstat_io.c
4
 *    Implementation of IO statistics.
5
 *
6
 * This file contains the implementation of IO statistics. It is kept separate
7
 * from pgstat.c to enforce the line between the statistics access / storage
8
 * implementation and the details about individual types of statistics.
9
 *
10
 * Copyright (c) 2021-2025, PostgreSQL Global Development Group
11
 *
12
 * IDENTIFICATION
13
 *    src/backend/utils/activity/pgstat_io.c
14
 * -------------------------------------------------------------------------
15
 */
16
17
#include "postgres.h"
18
19
#include "executor/instrument.h"
20
#include "storage/bufmgr.h"
21
#include "utils/pgstat_internal.h"
22
23
static PgStat_PendingIO PendingIOStats;
24
static bool have_iostats = false;
25
26
/*
27
 * Check that stats have not been counted for any combination of IOObject,
28
 * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
29
 * stats are tracked for this combination and IO times are non-zero, counts
30
 * should be non-zero.
31
 *
32
 * The passed-in PgStat_BktypeIO must contain stats from the BackendType
33
 * specified by the second parameter. Caller is responsible for locking the
34
 * passed-in PgStat_BktypeIO, if needed.
35
 */
36
bool
37
pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
38
               BackendType bktype)
39
0
{
40
0
  for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
41
0
  {
42
0
    for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
43
0
    {
44
0
      for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
45
0
      {
46
        /* we do track it */
47
0
        if (pgstat_tracks_io_op(bktype, io_object, io_context, io_op))
48
0
        {
49
          /* ensure that if IO times are non-zero, counts are > 0 */
50
0
          if (backend_io->times[io_object][io_context][io_op] != 0 &&
51
0
            backend_io->counts[io_object][io_context][io_op] <= 0)
52
0
            return false;
53
54
0
          continue;
55
0
        }
56
57
        /* we don't track it, and it is not 0 */
58
0
        if (backend_io->counts[io_object][io_context][io_op] != 0)
59
0
          return false;
60
0
      }
61
0
    }
62
0
  }
63
64
0
  return true;
65
0
}
66
67
void
68
pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
69
           uint32 cnt, uint64 bytes)
70
0
{
71
0
  Assert((unsigned int) io_object < IOOBJECT_NUM_TYPES);
72
0
  Assert((unsigned int) io_context < IOCONTEXT_NUM_TYPES);
73
0
  Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
74
0
  Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
75
76
0
  PendingIOStats.counts[io_object][io_context][io_op] += cnt;
77
0
  PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
78
79
  /* Add the per-backend counts */
80
0
  pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
81
82
0
  have_iostats = true;
83
0
}
84
85
/*
86
 * Initialize the internal timing for an IO operation, depending on an
87
 * IO timing GUC.
88
 */
89
instr_time
90
pgstat_prepare_io_time(bool track_io_guc)
91
0
{
92
0
  instr_time  io_start;
93
94
0
  if (track_io_guc)
95
0
    INSTR_TIME_SET_CURRENT(io_start);
96
0
  else
97
0
  {
98
    /*
99
     * There is no need to set io_start when an IO timing GUC is disabled.
100
     * Initialize it to zero to avoid compiler warnings and to let
101
     * pgstat_count_io_op_time() know that timings should be ignored.
102
     */
103
0
    INSTR_TIME_SET_ZERO(io_start);
104
0
  }
105
106
0
  return io_start;
107
0
}
108
109
/*
110
 * Like pgstat_count_io_op() except it also accumulates time.
111
 *
112
 * The calls related to pgstat_count_buffer_*() are for pgstat_database.  As
113
 * pg_stat_database only counts block read and write times, these are done for
114
 * IOOP_READ, IOOP_WRITE and IOOP_EXTEND.
115
 *
116
 * pgBufferUsage is used for EXPLAIN.  pgBufferUsage has write and read stats
117
 * for shared, local and temporary blocks.  pg_stat_io does not track the
118
 * activity of temporary blocks, so these are ignored here.
119
 */
120
void
121
pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
122
            instr_time start_time, uint32 cnt, uint64 bytes)
123
0
{
124
0
  if (!INSTR_TIME_IS_ZERO(start_time))
125
0
  {
126
0
    instr_time  io_time;
127
128
0
    INSTR_TIME_SET_CURRENT(io_time);
129
0
    INSTR_TIME_SUBTRACT(io_time, start_time);
130
131
0
    if (io_object != IOOBJECT_WAL)
132
0
    {
133
0
      if (io_op == IOOP_WRITE || io_op == IOOP_EXTEND)
134
0
      {
135
0
        pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
136
0
        if (io_object == IOOBJECT_RELATION)
137
0
          INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
138
0
        else if (io_object == IOOBJECT_TEMP_RELATION)
139
0
          INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
140
0
      }
141
0
      else if (io_op == IOOP_READ)
142
0
      {
143
0
        pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
144
0
        if (io_object == IOOBJECT_RELATION)
145
0
          INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
146
0
        else if (io_object == IOOBJECT_TEMP_RELATION)
147
0
          INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
148
0
      }
149
0
    }
150
151
0
    INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
152
0
             io_time);
153
154
    /* Add the per-backend count */
155
0
    pgstat_count_backend_io_op_time(io_object, io_context, io_op,
156
0
                    io_time);
157
0
  }
158
159
0
  pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
160
0
}
161
162
PgStat_IO *
163
pgstat_fetch_stat_io(void)
164
0
{
165
0
  pgstat_snapshot_fixed(PGSTAT_KIND_IO);
166
167
0
  return &pgStatLocal.snapshot.io;
168
0
}
169
170
/*
171
 * Check if there are any IO stats waiting for flush.
172
 */
173
bool
174
pgstat_io_have_pending_cb(void)
175
0
{
176
0
  return have_iostats;
177
0
}
178
179
/*
180
 * Simpler wrapper of pgstat_io_flush_cb()
181
 */
182
void
183
pgstat_flush_io(bool nowait)
184
0
{
185
0
  (void) pgstat_io_flush_cb(nowait);
186
0
}
187
188
/*
189
 * Flush out locally pending IO statistics
190
 *
191
 * If no stats have been recorded, this function returns false.
192
 *
193
 * If nowait is true, this function returns true if the lock could not be
194
 * acquired. Otherwise, return false.
195
 */
196
bool
197
pgstat_io_flush_cb(bool nowait)
198
0
{
199
0
  LWLock     *bktype_lock;
200
0
  PgStat_BktypeIO *bktype_shstats;
201
202
0
  if (!have_iostats)
203
0
    return false;
204
205
0
  bktype_lock = &pgStatLocal.shmem->io.locks[MyBackendType];
206
0
  bktype_shstats =
207
0
    &pgStatLocal.shmem->io.stats.stats[MyBackendType];
208
209
0
  if (!nowait)
210
0
    LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
211
0
  else if (!LWLockConditionalAcquire(bktype_lock, LW_EXCLUSIVE))
212
0
    return true;
213
214
0
  for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
215
0
  {
216
0
    for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
217
0
    {
218
0
      for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
219
0
      {
220
0
        instr_time  time;
221
222
0
        bktype_shstats->counts[io_object][io_context][io_op] +=
223
0
          PendingIOStats.counts[io_object][io_context][io_op];
224
225
0
        bktype_shstats->bytes[io_object][io_context][io_op] +=
226
0
          PendingIOStats.bytes[io_object][io_context][io_op];
227
228
0
        time = PendingIOStats.pending_times[io_object][io_context][io_op];
229
230
0
        bktype_shstats->times[io_object][io_context][io_op] +=
231
0
          INSTR_TIME_GET_MICROSEC(time);
232
0
      }
233
0
    }
234
0
  }
235
236
0
  Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
237
238
0
  LWLockRelease(bktype_lock);
239
240
0
  memset(&PendingIOStats, 0, sizeof(PendingIOStats));
241
242
0
  have_iostats = false;
243
244
0
  return false;
245
0
}
246
247
const char *
248
pgstat_get_io_context_name(IOContext io_context)
249
0
{
250
0
  switch (io_context)
251
0
  {
252
0
    case IOCONTEXT_BULKREAD:
253
0
      return "bulkread";
254
0
    case IOCONTEXT_BULKWRITE:
255
0
      return "bulkwrite";
256
0
    case IOCONTEXT_INIT:
257
0
      return "init";
258
0
    case IOCONTEXT_NORMAL:
259
0
      return "normal";
260
0
    case IOCONTEXT_VACUUM:
261
0
      return "vacuum";
262
0
  }
263
264
0
  elog(ERROR, "unrecognized IOContext value: %d", io_context);
265
0
  pg_unreachable();
266
0
}
267
268
const char *
269
pgstat_get_io_object_name(IOObject io_object)
270
0
{
271
0
  switch (io_object)
272
0
  {
273
0
    case IOOBJECT_RELATION:
274
0
      return "relation";
275
0
    case IOOBJECT_TEMP_RELATION:
276
0
      return "temp relation";
277
0
    case IOOBJECT_WAL:
278
0
      return "wal";
279
0
  }
280
281
0
  elog(ERROR, "unrecognized IOObject value: %d", io_object);
282
0
  pg_unreachable();
283
0
}
284
285
void
286
pgstat_io_init_shmem_cb(void *stats)
287
0
{
288
0
  PgStatShared_IO *stat_shmem = (PgStatShared_IO *) stats;
289
290
0
  for (int i = 0; i < BACKEND_NUM_TYPES; i++)
291
0
    LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
292
0
}
293
294
void
295
pgstat_io_reset_all_cb(TimestampTz ts)
296
0
{
297
0
  for (int i = 0; i < BACKEND_NUM_TYPES; i++)
298
0
  {
299
0
    LWLock     *bktype_lock = &pgStatLocal.shmem->io.locks[i];
300
0
    PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
301
302
0
    LWLockAcquire(bktype_lock, LW_EXCLUSIVE);
303
304
    /*
305
     * Use the lock in the first BackendType's PgStat_BktypeIO to protect
306
     * the reset timestamp as well.
307
     */
308
0
    if (i == 0)
309
0
      pgStatLocal.shmem->io.stats.stat_reset_timestamp = ts;
310
311
0
    memset(bktype_shstats, 0, sizeof(*bktype_shstats));
312
0
    LWLockRelease(bktype_lock);
313
0
  }
314
0
}
315
316
void
317
pgstat_io_snapshot_cb(void)
318
0
{
319
0
  for (int i = 0; i < BACKEND_NUM_TYPES; i++)
320
0
  {
321
0
    LWLock     *bktype_lock = &pgStatLocal.shmem->io.locks[i];
322
0
    PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
323
0
    PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
324
325
0
    LWLockAcquire(bktype_lock, LW_SHARED);
326
327
    /*
328
     * Use the lock in the first BackendType's PgStat_BktypeIO to protect
329
     * the reset timestamp as well.
330
     */
331
0
    if (i == 0)
332
0
      pgStatLocal.snapshot.io.stat_reset_timestamp =
333
0
        pgStatLocal.shmem->io.stats.stat_reset_timestamp;
334
335
    /* using struct assignment due to better type safety */
336
0
    *bktype_snap = *bktype_shstats;
337
0
    LWLockRelease(bktype_lock);
338
0
  }
339
0
}
340
341
/*
342
* IO statistics are not collected for all BackendTypes.
343
*
344
* The following BackendTypes do not participate in the cumulative stats
345
* subsystem or do not perform IO that we currently track:
346
* - Dead-end backend because it is not connected to shared memory and
347
*   doesn't do any IO
348
* - Syslogger because it is not connected to shared memory
349
* - Archiver because most relevant archiving IO is delegated to a
350
*   specialized command or module
351
*
352
* Function returns true if BackendType participates in the cumulative stats
353
* subsystem for IO and false if it does not.
354
*
355
* When adding a new BackendType, also consider adding relevant restrictions to
356
* pgstat_tracks_io_object() and pgstat_tracks_io_op().
357
*/
358
bool
359
pgstat_tracks_io_bktype(BackendType bktype)
360
0
{
361
  /*
362
   * List every type so that new backend types trigger a warning about
363
   * needing to adjust this switch.
364
   */
365
0
  switch (bktype)
366
0
  {
367
0
    case B_INVALID:
368
0
    case B_DEAD_END_BACKEND:
369
0
    case B_ARCHIVER:
370
0
    case B_LOGGER:
371
0
      return false;
372
373
0
    case B_AUTOVAC_LAUNCHER:
374
0
    case B_AUTOVAC_WORKER:
375
0
    case B_BACKEND:
376
0
    case B_BG_WORKER:
377
0
    case B_BG_WRITER:
378
0
    case B_CHECKPOINTER:
379
0
    case B_IO_WORKER:
380
0
    case B_SLOTSYNC_WORKER:
381
0
    case B_STANDALONE_BACKEND:
382
0
    case B_STARTUP:
383
0
    case B_WAL_RECEIVER:
384
0
    case B_WAL_SENDER:
385
0
    case B_WAL_SUMMARIZER:
386
0
    case B_WAL_WRITER:
387
0
      return true;
388
0
  }
389
390
0
  return false;
391
0
}
392
393
/*
394
 * Some BackendTypes do not perform IO on certain IOObjects or in certain
395
 * IOContexts. Some IOObjects are never operated on in some IOContexts. Check
396
 * that the given BackendType is expected to do IO in the given IOContext and
397
 * on the given IOObject and that the given IOObject is expected to be operated
398
 * on in the given IOContext.
399
 */
400
bool
401
pgstat_tracks_io_object(BackendType bktype, IOObject io_object,
402
            IOContext io_context)
403
0
{
404
0
  bool    no_temp_rel;
405
406
  /*
407
   * Some BackendTypes should never track IO statistics.
408
   */
409
0
  if (!pgstat_tracks_io_bktype(bktype))
410
0
    return false;
411
412
  /*
413
   * Currently, IO on IOOBJECT_WAL objects can only occur in the
414
   * IOCONTEXT_NORMAL and IOCONTEXT_INIT IOContexts.
415
   */
416
0
  if (io_object == IOOBJECT_WAL &&
417
0
    (io_context != IOCONTEXT_NORMAL &&
418
0
     io_context != IOCONTEXT_INIT))
419
0
    return false;
420
421
  /*
422
   * Currently, IO on temporary relations can only occur in the
423
   * IOCONTEXT_NORMAL IOContext.
424
   */
425
0
  if (io_context != IOCONTEXT_NORMAL &&
426
0
    io_object == IOOBJECT_TEMP_RELATION)
427
0
    return false;
428
429
  /*
430
   * In core Postgres, only regular backends and WAL Sender processes
431
   * executing queries will use local buffers and operate on temporary
432
   * relations. Parallel workers will not use local buffers (see
433
   * InitLocalBuffers()); however, extensions leveraging background workers
434
   * have no such limitation, so track IO on IOOBJECT_TEMP_RELATION for
435
   * BackendType B_BG_WORKER.
436
   */
437
0
  no_temp_rel = bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
438
0
    bktype == B_CHECKPOINTER || bktype == B_AUTOVAC_WORKER ||
439
0
    bktype == B_STANDALONE_BACKEND || bktype == B_STARTUP ||
440
0
    bktype == B_WAL_SUMMARIZER || bktype == B_WAL_WRITER ||
441
0
    bktype == B_WAL_RECEIVER;
442
443
0
  if (no_temp_rel && io_context == IOCONTEXT_NORMAL &&
444
0
    io_object == IOOBJECT_TEMP_RELATION)
445
0
    return false;
446
447
  /*
448
   * Some BackendTypes only perform IO under IOOBJECT_WAL, hence exclude all
449
   * rows for all the other objects for these.
450
   */
451
0
  if ((bktype == B_WAL_SUMMARIZER || bktype == B_WAL_RECEIVER ||
452
0
     bktype == B_WAL_WRITER) && io_object != IOOBJECT_WAL)
453
0
    return false;
454
455
  /*
456
   * Some BackendTypes do not currently perform any IO in certain
457
   * IOContexts, and, while it may not be inherently incorrect for them to
458
   * do so, excluding those rows from the view makes the view easier to use.
459
   */
460
0
  if ((bktype == B_CHECKPOINTER || bktype == B_BG_WRITER) &&
461
0
    (io_context == IOCONTEXT_BULKREAD ||
462
0
     io_context == IOCONTEXT_BULKWRITE ||
463
0
     io_context == IOCONTEXT_VACUUM))
464
0
    return false;
465
466
0
  if (bktype == B_AUTOVAC_LAUNCHER && io_context == IOCONTEXT_VACUUM)
467
0
    return false;
468
469
0
  if ((bktype == B_AUTOVAC_WORKER || bktype == B_AUTOVAC_LAUNCHER) &&
470
0
    io_context == IOCONTEXT_BULKWRITE)
471
0
    return false;
472
473
0
  return true;
474
0
}
475
476
/*
477
 * Some BackendTypes will never do certain IOOps and some IOOps should not
478
 * occur in certain IOContexts or on certain IOObjects. Check that the given
479
 * IOOp is valid for the given BackendType in the given IOContext and on the
480
 * given IOObject. Note that there are currently no cases of an IOOp being
481
 * invalid for a particular BackendType only within a certain IOContext and/or
482
 * only on a certain IOObject.
483
 */
484
bool
485
pgstat_tracks_io_op(BackendType bktype, IOObject io_object,
486
          IOContext io_context, IOOp io_op)
487
0
{
488
0
  bool    strategy_io_context;
489
490
  /* if (io_context, io_object) will never collect stats, we're done */
491
0
  if (!pgstat_tracks_io_object(bktype, io_object, io_context))
492
0
    return false;
493
494
  /*
495
   * Some BackendTypes will not do certain IOOps.
496
   */
497
0
  if (bktype == B_BG_WRITER &&
498
0
    (io_op == IOOP_READ || io_op == IOOP_EVICT || io_op == IOOP_HIT))
499
0
    return false;
500
501
0
  if (bktype == B_CHECKPOINTER &&
502
0
    ((io_object != IOOBJECT_WAL && io_op == IOOP_READ) ||
503
0
     (io_op == IOOP_EVICT || io_op == IOOP_HIT)))
504
0
    return false;
505
506
0
  if ((bktype == B_AUTOVAC_LAUNCHER || bktype == B_BG_WRITER ||
507
0
     bktype == B_CHECKPOINTER) && io_op == IOOP_EXTEND)
508
0
    return false;
509
510
  /*
511
   * Some BackendTypes do not perform reads with IOOBJECT_WAL.
512
   */
513
0
  if (io_object == IOOBJECT_WAL && io_op == IOOP_READ &&
514
0
    (bktype == B_WAL_RECEIVER || bktype == B_BG_WRITER ||
515
0
     bktype == B_AUTOVAC_LAUNCHER || bktype == B_AUTOVAC_WORKER ||
516
0
     bktype == B_WAL_WRITER))
517
0
    return false;
518
519
  /*
520
   * Temporary tables are not logged and thus do not require fsync'ing.
521
   * Writeback is not requested for temporary tables.
522
   */
523
0
  if (io_object == IOOBJECT_TEMP_RELATION &&
524
0
    (io_op == IOOP_FSYNC || io_op == IOOP_WRITEBACK))
525
0
    return false;
526
527
  /*
528
   * Some IOOps are not valid in certain IOContexts and some IOOps are only
529
   * valid in certain contexts.
530
   */
531
0
  if (io_context == IOCONTEXT_BULKREAD && io_op == IOOP_EXTEND)
532
0
    return false;
533
534
0
  strategy_io_context = io_context == IOCONTEXT_BULKREAD ||
535
0
    io_context == IOCONTEXT_BULKWRITE || io_context == IOCONTEXT_VACUUM;
536
537
  /*
538
   * IOOP_REUSE is only relevant when a BufferAccessStrategy is in use.
539
   */
540
0
  if (!strategy_io_context && io_op == IOOP_REUSE)
541
0
    return false;
542
543
  /*
544
   * IOOBJECT_WAL IOObject will not do certain IOOps depending on IOContext.
545
   */
546
0
  if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_INIT &&
547
0
    !(io_op == IOOP_WRITE || io_op == IOOP_FSYNC))
548
0
    return false;
549
550
0
  if (io_object == IOOBJECT_WAL && io_context == IOCONTEXT_NORMAL &&
551
0
    !(io_op == IOOP_WRITE || io_op == IOOP_READ || io_op == IOOP_FSYNC))
552
0
    return false;
553
554
  /*
555
   * IOOP_FSYNC IOOps done by a backend using a BufferAccessStrategy are
556
   * counted in the IOCONTEXT_NORMAL IOContext. See comment in
557
   * register_dirty_segment() for more details.
558
   */
559
0
  if (strategy_io_context && io_op == IOOP_FSYNC)
560
0
    return false;
561
562
563
0
  return true;
564
0
}