Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/backend/storage/smgr/smgr.c
Line | Count | Source (every executable line in this file reports a count of 0, i.e. uncovered)
1
/*-------------------------------------------------------------------------
2
 *
3
 * smgr.c
4
 *    public interface routines to storage manager switch.
5
 *
6
 * All file system operations on relations dispatch through these routines.
7
 * An SMgrRelation represents physical on-disk relation files that are open
8
 * for reading and writing.
9
 *
10
 * When a relation is first accessed through the relation cache, the
11
 * corresponding SMgrRelation entry is opened by calling smgropen(), and the
12
 * reference is stored in the relation cache entry.
13
 *
14
 * Accesses that don't go through the relation cache open the SMgrRelation
15
 * directly.  That includes flushing buffers from the buffer cache, as well as
16
 * all accesses in auxiliary processes like the checkpointer or the WAL redo
17
 * in the startup process.
18
 *
19
 * Operations like CREATE, DROP, ALTER TABLE also hold SMgrRelation references
20
 * independent of the relation cache.  They need to prepare the physical files
21
 * before updating the relation cache.
22
 *
23
 * There is a hash table that holds all the SMgrRelation entries in the
24
 * backend.  If you call smgropen() twice for the same rel locator, you get a
25
 * reference to the same SMgrRelation. The reference is valid until the end of
26
 * transaction.  This makes repeated access to the same relation efficient,
27
 * and allows caching things like the relation size in the SMgrRelation entry.
28
 *
29
 * At end of transaction, all SMgrRelation entries that haven't been pinned
30
 * are removed.  An SMgrRelation can hold kernel file system descriptors for
31
 * the underlying files, and we'd like to close those reasonably soon if the
32
 * file gets deleted.  The SMgrRelation references held by the relcache are
33
 * pinned to prevent them from being closed.
34
 *
35
 * There is another mechanism to close file descriptors early:
36
 * PROCSIGNAL_BARRIER_SMGRRELEASE.  It is a request to immediately close all
37
 * file descriptors.  Upon receiving that signal, the backend closes all file
38
 * descriptors held open by SMgrRelations, but because it can happen in the
39
 * middle of a transaction, we cannot destroy the SMgrRelation objects
40
 * themselves, as there could be pointers to them in active use.  See
41
 * smgrrelease() and smgrreleaseall().
42
 *
43
 * NB: We need to hold interrupts across most of the functions in this file,
44
 * as otherwise interrupt processing, e.g. due to an elog/ereport at a level below ERROR, can
45
 * trigger procsignal processing, which in turn can trigger
46
 * smgrreleaseall(). Most of the relevant code is not reentrant.  It seems
47
 * better to put the HOLD_INTERRUPTS()/RESUME_INTERRUPTS() here, instead of
48
 * trying to push them down to md.c where possible: For one, every smgr
49
 * implementation would be vulnerable, for another, a good bit of smgr.c code
50
 * itself is affected too.  Eventually we might want a more targeted solution,
51
 * allowing e.g. a networked smgr implementation to be interrupted, but many
52
 * other, more complicated, problems would need to be fixed for that to be
53
 * viable (e.g. smgr.c is often called with interrupts already held).
54
 *
55
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
56
 * Portions Copyright (c) 1994, Regents of the University of California
57
 *
58
 *
59
 * IDENTIFICATION
60
 *    src/backend/storage/smgr/smgr.c
61
 *
62
 *-------------------------------------------------------------------------
63
 */
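To make the lifecycle described in the header comment concrete, here is a minimal caller-side sketch (an editor's illustration, not part of smgr.c); the helper name and the idea of sizing the main fork are assumptions:

#include "postgres.h"

#include "storage/smgr.h"

/* Hypothetical helper: report the main-fork size of an arbitrary relation. */
static BlockNumber
sketch_main_fork_size(RelFileLocator rlocator)
{
  /* Repeated smgropen() calls with the same locator return the same entry. */
  SMgrRelation reln = smgropen(rlocator, INVALID_PROC_NUMBER);

  /*
   * A long-lived holder (as the relcache does) would smgrpin() the entry
   * here and smgrunpin() it when done; unpinned entries are destroyed by
   * AtEOXact_SMgr() at end of transaction.
   */
  if (!smgrexists(reln, MAIN_FORKNUM))
    return 0;

  return smgrnblocks(reln, MAIN_FORKNUM);
}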
64
#include "postgres.h"
65
66
#include "access/xlogutils.h"
67
#include "lib/ilist.h"
68
#include "miscadmin.h"
69
#include "storage/aio.h"
70
#include "storage/bufmgr.h"
71
#include "storage/ipc.h"
72
#include "storage/md.h"
73
#include "storage/smgr.h"
74
#include "utils/hsearch.h"
75
#include "utils/inval.h"
76
77
78
/*
79
 * This struct of function pointers defines the API between smgr.c and
80
 * any individual storage manager module.  Note that smgr subfunctions are
81
 * generally expected to report problems via elog(ERROR).  An exception is
82
 * that smgr_unlink should use elog(WARNING), rather than erroring out,
83
 * because we normally unlink relations during post-commit/abort cleanup,
84
 * and so it's too late to raise an error.  Also, various conditions that
85
 * would normally be errors should be allowed during bootstrap and/or WAL
86
 * recovery --- see comments in md.c for details.
87
 */
88
typedef struct f_smgr
89
{
90
  void    (*smgr_init) (void);  /* may be NULL */
91
  void    (*smgr_shutdown) (void);  /* may be NULL */
92
  void    (*smgr_open) (SMgrRelation reln);
93
  void    (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
94
  void    (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
95
                bool isRedo);
96
  bool    (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
97
  void    (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
98
                bool isRedo);
99
  void    (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
100
                BlockNumber blocknum, const void *buffer, bool skipFsync);
101
  void    (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
102
                  BlockNumber blocknum, int nblocks, bool skipFsync);
103
  bool    (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
104
                  BlockNumber blocknum, int nblocks);
105
  uint32    (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
106
                  BlockNumber blocknum);
107
  void    (*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
108
                 BlockNumber blocknum,
109
                 void **buffers, BlockNumber nblocks);
110
  void    (*smgr_startreadv) (PgAioHandle *ioh,
111
                  SMgrRelation reln, ForkNumber forknum,
112
                  BlockNumber blocknum,
113
                  void **buffers, BlockNumber nblocks);
114
  void    (*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
115
                BlockNumber blocknum,
116
                const void **buffers, BlockNumber nblocks,
117
                bool skipFsync);
118
  void    (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
119
                   BlockNumber blocknum, BlockNumber nblocks);
120
  BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
121
  void    (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
122
                  BlockNumber old_blocks, BlockNumber nblocks);
123
  void    (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
124
  void    (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
125
  int     (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
126
} f_smgr;
127
128
static const f_smgr smgrsw[] = {
129
  /* magnetic disk */
130
  {
131
    .smgr_init = mdinit,
132
    .smgr_shutdown = NULL,
133
    .smgr_open = mdopen,
134
    .smgr_close = mdclose,
135
    .smgr_create = mdcreate,
136
    .smgr_exists = mdexists,
137
    .smgr_unlink = mdunlink,
138
    .smgr_extend = mdextend,
139
    .smgr_zeroextend = mdzeroextend,
140
    .smgr_prefetch = mdprefetch,
141
    .smgr_maxcombine = mdmaxcombine,
142
    .smgr_readv = mdreadv,
143
    .smgr_startreadv = mdstartreadv,
144
    .smgr_writev = mdwritev,
145
    .smgr_writeback = mdwriteback,
146
    .smgr_nblocks = mdnblocks,
147
    .smgr_truncate = mdtruncate,
148
    .smgr_immedsync = mdimmedsync,
149
    .smgr_registersync = mdregistersync,
150
    .smgr_fd = mdfd,
151
  }
152
};
153
154
static const int NSmgr = lengthof(smgrsw);
155
156
/*
157
 * Each backend has a hashtable that stores all extant SMgrRelation objects.
158
 * In addition, "unpinned" SMgrRelation objects are chained together in a list.
159
 */
160
static HTAB *SMgrRelationHash = NULL;
161
162
static dlist_head unpinned_relns;
163
164
/* local function prototypes */
165
static void smgrshutdown(int code, Datum arg);
166
static void smgrdestroy(SMgrRelation reln);
167
168
static void smgr_aio_reopen(PgAioHandle *ioh);
169
static char *smgr_aio_describe_identity(const PgAioTargetData *sd);
170
171
172
const PgAioTargetInfo aio_smgr_target_info = {
173
  .name = "smgr",
174
  .reopen = smgr_aio_reopen,
175
  .describe_identity = smgr_aio_describe_identity,
176
};
177
178
179
/*
180
 * smgrinit(), smgrshutdown() -- Initialize or shut down storage
181
 *                 managers.
182
 *
183
 * Note: smgrinit is called during backend startup (normal or standalone
184
 * case), *not* during postmaster start.  Therefore, any resources created
185
 * here or destroyed in smgrshutdown are backend-local.
186
 */
187
void
188
smgrinit(void)
189
0
{
190
0
  int     i;
191
192
0
  HOLD_INTERRUPTS();
193
194
0
  for (i = 0; i < NSmgr; i++)
195
0
  {
196
0
    if (smgrsw[i].smgr_init)
197
0
      smgrsw[i].smgr_init();
198
0
  }
199
200
0
  RESUME_INTERRUPTS();
201
202
  /* register the shutdown proc */
203
0
  on_proc_exit(smgrshutdown, 0);
204
0
}
205
206
/*
207
 * on_proc_exit hook for smgr cleanup during backend shutdown
208
 */
209
static void
210
smgrshutdown(int code, Datum arg)
211
0
{
212
0
  int     i;
213
214
0
  HOLD_INTERRUPTS();
215
216
0
  for (i = 0; i < NSmgr; i++)
217
0
  {
218
0
    if (smgrsw[i].smgr_shutdown)
219
0
      smgrsw[i].smgr_shutdown();
220
0
  }
221
222
0
  RESUME_INTERRUPTS();
223
0
}
224
225
/*
226
 * smgropen() -- Return an SMgrRelation object, creating it if need be.
227
 *
228
 * In versions of PostgreSQL prior to 17, this function returned an object
229
 * with no defined lifetime.  Now, however, the object remains valid for the
230
 * lifetime of the transaction, up to the point where AtEOXact_SMgr() is
231
 * called, making it much easier for callers to know for how long they can
232
 * hold on to a pointer to the returned object.  If this function is called
233
 * outside of a transaction, the object remains valid until smgrdestroy() or
234
 * smgrdestroyall() is called.  Background processes that use smgr but not
235
 * transactions typically do this once per checkpoint cycle.
236
 *
237
 * This does not attempt to actually open the underlying files.
238
 */
239
SMgrRelation
240
smgropen(RelFileLocator rlocator, ProcNumber backend)
241
0
{
242
0
  RelFileLocatorBackend brlocator;
243
0
  SMgrRelation reln;
244
0
  bool    found;
245
246
0
  Assert(RelFileNumberIsValid(rlocator.relNumber));
247
248
0
  HOLD_INTERRUPTS();
249
250
0
  if (SMgrRelationHash == NULL)
251
0
  {
252
    /* First time through: initialize the hash table */
253
0
    HASHCTL   ctl;
254
255
0
    ctl.keysize = sizeof(RelFileLocatorBackend);
256
0
    ctl.entrysize = sizeof(SMgrRelationData);
257
0
    SMgrRelationHash = hash_create("smgr relation table", 400,
258
0
                     &ctl, HASH_ELEM | HASH_BLOBS);
259
0
    dlist_init(&unpinned_relns);
260
0
  }
261
262
  /* Look up or create an entry */
263
0
  brlocator.locator = rlocator;
264
0
  brlocator.backend = backend;
265
0
  reln = (SMgrRelation) hash_search(SMgrRelationHash,
266
0
                    &brlocator,
267
0
                    HASH_ENTER, &found);
268
269
  /* Initialize it if not present before */
270
0
  if (!found)
271
0
  {
272
    /* hash_search already filled in the lookup key */
273
0
    reln->smgr_targblock = InvalidBlockNumber;
274
0
    for (int i = 0; i <= MAX_FORKNUM; ++i)
275
0
      reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
276
0
    reln->smgr_which = 0; /* we only have md.c at present */
277
278
    /* it is not pinned yet */
279
0
    reln->pincount = 0;
280
0
    dlist_push_tail(&unpinned_relns, &reln->node);
281
282
    /* implementation-specific initialization */
283
0
    smgrsw[reln->smgr_which].smgr_open(reln);
284
0
  }
285
286
0
  RESUME_INTERRUPTS();
287
288
0
  return reln;
289
0
}
290
291
/*
292
 * smgrpin() -- Prevent an SMgrRelation object from being destroyed at end of
293
 *        transaction
294
 */
295
void
296
smgrpin(SMgrRelation reln)
297
0
{
298
0
  if (reln->pincount == 0)
299
0
    dlist_delete(&reln->node);
300
0
  reln->pincount++;
301
0
}
302
303
/*
304
 * smgrunpin() -- Allow an SMgrRelation object to be destroyed at end of
305
 *          transaction
306
 *
307
 * The object remains valid, but if there are no other pins on it, it is moved
308
 * to the unpinned list where it will be destroyed by AtEOXact_SMgr().
309
 */
310
void
311
smgrunpin(SMgrRelation reln)
312
0
{
313
0
  Assert(reln->pincount > 0);
314
0
  reln->pincount--;
315
0
  if (reln->pincount == 0)
316
0
    dlist_push_tail(&unpinned_relns, &reln->node);
317
0
}
318
319
/*
320
 * smgrdestroy() -- Delete an SMgrRelation object.
321
 */
322
static void
323
smgrdestroy(SMgrRelation reln)
324
0
{
325
0
  ForkNumber  forknum;
326
327
0
  Assert(reln->pincount == 0);
328
329
0
  HOLD_INTERRUPTS();
330
331
0
  for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
332
0
    smgrsw[reln->smgr_which].smgr_close(reln, forknum);
333
334
0
  dlist_delete(&reln->node);
335
336
0
  if (hash_search(SMgrRelationHash,
337
0
          &(reln->smgr_rlocator),
338
0
          HASH_REMOVE, NULL) == NULL)
339
0
    elog(ERROR, "SMgrRelation hashtable corrupted");
340
341
0
  RESUME_INTERRUPTS();
342
0
}
343
344
/*
345
 * smgrrelease() -- Release all resources used by this object.
346
 *
347
 * The object remains valid.
348
 */
349
void
350
smgrrelease(SMgrRelation reln)
351
0
{
352
0
  HOLD_INTERRUPTS();
353
354
0
  for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
355
0
  {
356
0
    smgrsw[reln->smgr_which].smgr_close(reln, forknum);
357
0
    reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
358
0
  }
359
0
  reln->smgr_targblock = InvalidBlockNumber;
360
361
0
  RESUME_INTERRUPTS();
362
0
}
363
364
/*
365
 * smgrclose() -- Close an SMgrRelation object.
366
 *
367
 * The SMgrRelation reference should not be used after this call.  However,
368
 * because we don't keep track of the references returned by smgropen(), we
369
 * don't know if there are other references still pointing to the same object,
370
 * so we cannot remove the SMgrRelation object yet.  Therefore, this is just a
371
 * synonym for smgrrelease() at the moment.
372
 */
373
void
374
smgrclose(SMgrRelation reln)
375
0
{
376
0
  smgrrelease(reln);
377
0
}
378
379
/*
380
 * smgrdestroyall() -- Release resources used by all unpinned objects.
381
 *
382
 * It must be known that there are no pointers to SMgrRelations, other than
383
 * those pinned with smgrpin().
384
 */
385
void
386
smgrdestroyall(void)
387
0
{
388
0
  dlist_mutable_iter iter;
389
390
  /* seems unsafe to accept interrupts while in a dlist_foreach_modify() */
391
0
  HOLD_INTERRUPTS();
392
393
  /*
394
   * Zap all unpinned SMgrRelations.  We rely on smgrdestroy() to remove
395
   * each one from the list.
396
   */
397
0
  dlist_foreach_modify(iter, &unpinned_relns)
398
0
  {
399
0
    SMgrRelation rel = dlist_container(SMgrRelationData, node,
400
0
                       iter.cur);
401
402
0
    smgrdestroy(rel);
403
0
  }
404
405
0
  RESUME_INTERRUPTS();
406
0
}
407
408
/*
409
 * smgrreleaseall() -- Release resources used by all objects.
410
 */
411
void
412
smgrreleaseall(void)
413
0
{
414
0
  HASH_SEQ_STATUS status;
415
0
  SMgrRelation reln;
416
417
  /* Nothing to do if hashtable not set up */
418
0
  if (SMgrRelationHash == NULL)
419
0
    return;
420
421
  /* seems unsafe to accept interrupts while iterating */
422
0
  HOLD_INTERRUPTS();
423
424
0
  hash_seq_init(&status, SMgrRelationHash);
425
426
0
  while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
427
0
  {
428
0
    smgrrelease(reln);
429
0
  }
430
431
0
  RESUME_INTERRUPTS();
432
0
}
433
434
/*
435
 * smgrreleaserellocator() -- Release resources for given RelFileLocator, if
436
 *                it's open.
437
 *
438
 * This has the same effects as smgrrelease(smgropen(rlocator)), but avoids
439
 * uselessly creating a hashtable entry only to drop it again when no
440
 * such entry exists already.
441
 */
442
void
443
smgrreleaserellocator(RelFileLocatorBackend rlocator)
444
0
{
445
0
  SMgrRelation reln;
446
447
  /* Nothing to do if hashtable not set up */
448
0
  if (SMgrRelationHash == NULL)
449
0
    return;
450
451
0
  reln = (SMgrRelation) hash_search(SMgrRelationHash,
452
0
                    &rlocator,
453
0
                    HASH_FIND, NULL);
454
0
  if (reln != NULL)
455
0
    smgrrelease(reln);
456
0
}
457
458
/*
459
 * smgrexists() -- Does the underlying file for a fork exist?
460
 */
461
bool
462
smgrexists(SMgrRelation reln, ForkNumber forknum)
463
0
{
464
0
  bool    ret;
465
466
0
  HOLD_INTERRUPTS();
467
0
  ret = smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
468
0
  RESUME_INTERRUPTS();
469
470
0
  return ret;
471
0
}
472
473
/*
474
 * smgrcreate() -- Create a new relation.
475
 *
476
 * Given an already-created (but presumably unused) SMgrRelation,
477
 * cause the underlying disk file or other storage for the fork
478
 * to be created.
479
 */
480
void
481
smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
482
0
{
483
0
  HOLD_INTERRUPTS();
484
0
  smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
485
0
  RESUME_INTERRUPTS();
486
0
}
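A short usage sketch for the create path (editor's illustration; `rlocator` is assumed to come from the caller), mirroring how DDL code prepares physical storage before updating the relation cache:

  /* Sketch: ensure on-disk storage exists for a new relation's main fork. */
  SMgrRelation srel = smgropen(rlocator, INVALID_PROC_NUMBER);

  if (!smgrexists(srel, MAIN_FORKNUM))
    smgrcreate(srel, MAIN_FORKNUM, false);  /* isRedo = false outside WAL replay */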
487
488
/*
489
 * smgrdosyncall() -- Immediately sync all forks of all given relations
490
 *
491
 * All forks of all given relations are synced out to the store.
492
 *
493
 * This is equivalent to FlushRelationBuffers() for each smgr relation,
494
 * then calling smgrimmedsync() for all forks of each relation, but it's
495
 * significantly quicker so should be preferred when possible.
496
 */
497
void
498
smgrdosyncall(SMgrRelation *rels, int nrels)
499
0
{
500
0
  int     i = 0;
501
0
  ForkNumber  forknum;
502
503
0
  if (nrels == 0)
504
0
    return;
505
506
0
  FlushRelationsAllBuffers(rels, nrels);
507
508
0
  HOLD_INTERRUPTS();
509
510
  /*
511
   * Sync the physical file(s).
512
   */
513
0
  for (i = 0; i < nrels; i++)
514
0
  {
515
0
    int     which = rels[i]->smgr_which;
516
517
0
    for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
518
0
    {
519
0
      if (smgrsw[which].smgr_exists(rels[i], forknum))
520
0
        smgrsw[which].smgr_immedsync(rels[i], forknum);
521
0
    }
522
0
  }
523
524
0
  RESUME_INTERRUPTS();
525
0
}
526
527
/*
528
 * smgrdounlinkall() -- Immediately unlink all forks of all given relations
529
 *
530
 * All forks of all given relations are removed from the store.  This
531
 * should not be used during transactional operations, since it can't be
532
 * undone.
533
 *
534
 * If isRedo is true, it is okay for the underlying file(s) to be gone
535
 * already.
536
 */
537
void
538
smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
539
0
{
540
0
  int     i = 0;
541
0
  RelFileLocatorBackend *rlocators;
542
0
  ForkNumber  forknum;
543
544
0
  if (nrels == 0)
545
0
    return;
546
547
  /*
548
   * It would be unsafe to process interrupts between DropRelationBuffers()
549
   * and unlinking the underlying files. This probably should be a critical
550
   * section, but we're not there yet.
551
   */
552
0
  HOLD_INTERRUPTS();
553
554
  /*
555
   * Get rid of any remaining buffers for the relations.  bufmgr will just
556
   * drop them without bothering to write the contents.
557
   */
558
0
  DropRelationsAllBuffers(rels, nrels);
559
560
  /*
561
   * create an array which contains all relations to be dropped, and close
562
   * each relation's forks at the smgr level while at it
563
   */
564
0
  rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
565
0
  for (i = 0; i < nrels; i++)
566
0
  {
567
0
    RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
568
0
    int     which = rels[i]->smgr_which;
569
570
0
    rlocators[i] = rlocator;
571
572
    /* Close the forks at smgr level */
573
0
    for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
574
0
      smgrsw[which].smgr_close(rels[i], forknum);
575
0
  }
576
577
  /*
578
   * Send a shared-inval message to force other backends to close any
579
   * dangling smgr references they may have for these rels.  We should do
580
   * this before starting the actual unlinking, in case we fail partway
581
   * through that step.  Note that the sinval messages will eventually come
582
   * back to this backend, too, and thereby provide a backstop that we
583
   * closed our own smgr rel.
584
   */
585
0
  for (i = 0; i < nrels; i++)
586
0
    CacheInvalidateSmgr(rlocators[i]);
587
588
  /*
589
   * Delete the physical file(s).
590
   *
591
   * Note: smgr_unlink must treat deletion failure as a WARNING, not an
592
   * ERROR, because we've already decided to commit or abort the current
593
   * xact.
594
   */
595
596
0
  for (i = 0; i < nrels; i++)
597
0
  {
598
0
    int     which = rels[i]->smgr_which;
599
600
0
    for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
601
0
      smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
602
0
  }
603
604
0
  pfree(rlocators);
605
606
0
  RESUME_INTERRUPTS();
607
0
}
608
609
610
/*
611
 * smgrextend() -- Add a new block to a file.
612
 *
613
 * The semantics are nearly the same as smgrwrite(): write at the
614
 * specified position.  However, this is to be used for the case of
615
 * extending a relation (i.e., blocknum is at or beyond the current
616
 * EOF).  Note that we assume writing a block beyond current EOF
617
 * causes intervening file space to become filled with zeroes.
618
 */
619
void
620
smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
621
       const void *buffer, bool skipFsync)
622
0
{
623
0
  HOLD_INTERRUPTS();
624
625
0
  smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
626
0
                     buffer, skipFsync);
627
628
  /*
629
   * Normally we expect this to increase nblocks by one, but if the cached
630
   * value isn't as expected, just invalidate it so the next call asks the
631
   * kernel.
632
   */
633
0
  if (reln->smgr_cached_nblocks[forknum] == blocknum)
634
0
    reln->smgr_cached_nblocks[forknum] = blocknum + 1;
635
0
  else
636
0
    reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
637
638
0
  RESUME_INTERRUPTS();
639
0
}
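A minimal sketch of extending a relation by one zero-filled block (editor's illustration; `reln` is assumed to be an open SMgrRelation):

  PGAlignedBlock zbuf = {0};
  BlockNumber blkno = smgrnblocks(reln, MAIN_FORKNUM); /* current EOF */

  /* Append one all-zeros page; with skipFsync = false, md.c registers the sync. */
  smgrextend(reln, MAIN_FORKNUM, blkno, zbuf.data, false);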
640
641
/*
642
 * smgrzeroextend() -- Add new zeroed out blocks to a file.
643
 *
644
 * Similar to smgrextend(), except the relation can be extended by
645
 * multiple blocks at once and the added blocks will be filled with
646
 * zeroes.
647
 */
648
void
649
smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
650
         int nblocks, bool skipFsync)
651
0
{
652
0
  HOLD_INTERRUPTS();
653
654
0
  smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
655
0
                       nblocks, skipFsync);
656
657
  /*
658
   * Normally we expect this to increase the fork size by nblocks, but if
659
   * the cached value isn't as expected, just invalidate it so the next call
660
   * asks the kernel.
661
   */
662
0
  if (reln->smgr_cached_nblocks[forknum] == blocknum)
663
0
    reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
664
0
  else
665
0
    reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
666
667
0
  RESUME_INTERRUPTS();
668
0
}
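The bulk counterpart, as a sketch (editor's illustration; `reln` is assumed, and the block count of 8 is arbitrary):

  /* Sketch: grow the main fork by 8 zeroed blocks in a single call. */
  BlockNumber cur = smgrnblocks(reln, MAIN_FORKNUM);

  smgrzeroextend(reln, MAIN_FORKNUM, cur, 8, false);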
669
670
/*
671
 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
672
 *
673
 * In recovery only, this can return false to indicate that a file
674
 * doesn't exist (presumably it has been dropped by a later WAL
675
 * record).
676
 */
677
bool
678
smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
679
       int nblocks)
680
0
{
681
0
  bool    ret;
682
683
0
  HOLD_INTERRUPTS();
684
0
  ret = smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum, nblocks);
685
0
  RESUME_INTERRUPTS();
686
687
0
  return ret;
688
0
}
689
690
/*
691
 * smgrmaxcombine() - Return the maximum number of total blocks that can be
692
 *         combined with an IO starting at blocknum.
693
 *
694
 * The returned value includes the IO for blocknum itself.
695
 */
696
uint32
697
smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
698
         BlockNumber blocknum)
699
0
{
700
0
  uint32    ret;
701
702
0
  HOLD_INTERRUPTS();
703
0
  ret = smgrsw[reln->smgr_which].smgr_maxcombine(reln, forknum, blocknum);
704
0
  RESUME_INTERRUPTS();
705
706
0
  return ret;
707
0
}
708
709
/*
710
 * smgrreadv() -- read a particular block range from a relation into the
711
 *         supplied buffers.
712
 *
713
 * This routine is called from the buffer manager in order to
714
 * instantiate pages in the shared buffer cache.  All storage managers
715
 * return pages in the format that POSTGRES expects.
716
 *
717
 * If more than one block is intended to be read, callers need to use
718
 * smgrmaxcombine() to check how many blocks can be combined into one IO.
719
 */
720
void
721
smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
722
      void **buffers, BlockNumber nblocks)
723
0
{
724
0
  HOLD_INTERRUPTS();
725
0
  smgrsw[reln->smgr_which].smgr_readv(reln, forknum, blocknum, buffers,
726
0
                    nblocks);
727
0
  RESUME_INTERRUPTS();
728
0
}
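A sketch of reading a single block through the vectored interface (editor's illustration; `reln` and `blkno` are assumptions):

  PGAlignedBlock page;
  void     *bufp = page.data;

  /* Read exactly one block; multi-block reads must first consult smgrmaxcombine(). */
  smgrreadv(reln, MAIN_FORKNUM, blkno, &bufp, 1);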
729
730
/*
731
 * smgrstartreadv() -- asynchronous version of smgrreadv()
732
 *
733
 * This starts an asynchronous readv IO using the IO handle `ioh`. Other than
734
 * `ioh`, all parameters are the same as for smgrreadv().
735
 *
736
 * Completion callbacks above smgr will be passed the result as the number of
737
 * successfully read blocks if the read [partially] succeeds (Buffers for
738
 * blocks not successfully read might bear unspecified modifications, up to
739
 * the full nblocks). This maintains the abstraction that smgr operates on the
740
 * level of blocks, rather than bytes.
741
 *
742
 * Compared to smgrreadv(), more responsibilities fall on the caller:
743
 * - Partial reads need to be handled by the caller re-issuing IO for the
744
 *   unread blocks
745
 * - smgr will ereport(LOG_SERVER_ONLY) some problems, but higher layers are
746
 *   responsible for pgaio_result_report() to mirror that news to the user (if
747
 *   the IO results in PGAIO_RS_WARNING) or abort the (sub)transaction (if
748
 *   PGAIO_RS_ERROR).
749
 * - Under Valgrind, the "buffers" memory may or may not change status to
750
 *   DEFINED, depending on io_method and concurrent activity.
751
 */
752
void
753
smgrstartreadv(PgAioHandle *ioh,
754
         SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
755
         void **buffers, BlockNumber nblocks)
756
0
{
757
0
  HOLD_INTERRUPTS();
758
0
  smgrsw[reln->smgr_which].smgr_startreadv(ioh,
759
0
                       reln, forknum, blocknum, buffers,
760
0
                       nblocks);
761
0
  RESUME_INTERRUPTS();
762
0
}
763
764
/*
765
 * smgrwritev() -- Write the supplied buffers out.
766
 *
767
 * This is to be used only for updating already-existing blocks of a
768
 * relation (ie, those before the current EOF).  To extend a relation,
769
 * use smgrextend().
770
 *
771
 * This is not a synchronous write -- the block is not necessarily
772
 * on disk at return, only dumped out to the kernel.  However,
773
 * provisions will be made to fsync the write before the next checkpoint.
774
 *
775
 * NB: The mechanism to ensure fsync at next checkpoint assumes that there is
776
 * something that prevents a concurrent checkpoint from "racing ahead" of the
777
 * write.  One way to prevent that is by holding a lock on the buffer; the
778
 * buffer manager's writes are protected by that.  The bulk writer facility
779
 * in bulk_write.c checks the redo pointer and calls smgrimmedsync() if a
780
 * checkpoint happened; that relies on the fact that no other backend can be
781
 * concurrently modifying the page.
782
 *
783
 * skipFsync indicates that the caller will make other provisions to
784
 * fsync the relation, so we needn't bother.  Temporary relations also
785
 * do not require fsync.
786
 *
787
 * If more than one block is intended to be written, callers need to use
788
 * smgrmaxcombine() to check how many blocks can be combined into one IO.
789
 */
790
void
791
smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
792
       const void **buffers, BlockNumber nblocks, bool skipFsync)
793
0
{
794
0
  HOLD_INTERRUPTS();
795
0
  smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
796
0
                     buffers, nblocks, skipFsync);
797
0
  RESUME_INTERRUPTS();
798
0
}
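A matching single-block write sketch (editor's illustration; `reln`, `blkno` and the page contents are assumptions; real callers hold a buffer lock or use bulk_write.c, per the NB above):

  PGAlignedBlock page;      /* assumed to already hold a valid page image */
  const void *bufp = page.data;

  /* Overwrite one existing block (i.e. one before the current EOF). */
  smgrwritev(reln, MAIN_FORKNUM, blkno, &bufp, 1, false);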
799
800
/*
801
 * smgrwriteback() -- Trigger kernel writeback for the supplied range of
802
 *             blocks.
803
 */
804
void
805
smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
806
        BlockNumber nblocks)
807
0
{
808
0
  HOLD_INTERRUPTS();
809
0
  smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
810
0
                      nblocks);
811
0
  RESUME_INTERRUPTS();
812
0
}
813
814
/*
815
 * smgrnblocks() -- Calculate the number of blocks in the
816
 *          supplied relation.
817
 */
818
BlockNumber
819
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
820
0
{
821
0
  BlockNumber result;
822
823
  /* If a cached value for the number of blocks is available, return it. */
824
0
  result = smgrnblocks_cached(reln, forknum);
825
0
  if (result != InvalidBlockNumber)
826
0
    return result;
827
828
0
  HOLD_INTERRUPTS();
829
830
0
  result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
831
832
0
  reln->smgr_cached_nblocks[forknum] = result;
833
834
0
  RESUME_INTERRUPTS();
835
836
0
  return result;
837
0
}
838
839
/*
840
 * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
841
 *               relation.
842
 *
843
 * Returns an InvalidBlockNumber when not in recovery and when the relation
844
 * fork size is not cached.
845
 */
846
BlockNumber
847
smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
848
0
{
849
  /*
850
   * For now, this function uses cached values only in recovery due to lack
851
   * of a shared invalidation mechanism for changes in file size.  Code
852
   * elsewhere reads smgr_cached_nblocks and copes with stale data.
853
   */
854
0
  if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
855
0
    return reln->smgr_cached_nblocks[forknum];
856
857
0
  return InvalidBlockNumber;
858
0
}
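The cheap-first pattern the two functions above support, as a caller-side sketch (editor's illustration; `reln` is assumed):

  /* Try the cached size first; fall back to asking the storage manager. */
  BlockNumber nblocks = smgrnblocks_cached(reln, MAIN_FORKNUM);

  if (nblocks == InvalidBlockNumber)
    nblocks = smgrnblocks(reln, MAIN_FORKNUM);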
859
860
/*
861
 * smgrtruncate() -- Truncate the given forks of the supplied relation to
862
 *           the specified numbers of blocks
863
 *
864
 * The truncation is done immediately, so this can't be rolled back.
865
 *
866
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
867
 * other backends receive the smgr invalidation event that this function sends
868
 * before they access any forks of the relation again.  The current size of
869
 * the forks should be provided in old_nblocks.  This function should normally
870
 * be called in a critical section, but the current size must be checked
871
 * outside the critical section, and no interrupts or smgr functions relating
872
 * to this relation should be called in between.
873
 */
874
void
875
smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
876
       BlockNumber *old_nblocks, BlockNumber *nblocks)
877
0
{
878
0
  int     i;
879
880
  /*
881
   * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
882
   * just drop them without bothering to write the contents.
883
   */
884
0
  DropRelationBuffers(reln, forknum, nforks, nblocks);
885
886
  /*
887
   * Send a shared-inval message to force other backends to close any smgr
888
   * references they may have for this rel.  This is useful because they
889
   * might have open file pointers to segments that got removed, and/or
890
   * smgr_targblock variables pointing past the new rel end.  (The inval
891
   * message will come back to our backend, too, causing a
892
   * probably-unnecessary local smgr flush.  But we don't expect that this
893
   * is a performance-critical path.)  As in the unlink code, we want to be
894
   * sure the message is sent before we start changing things on-disk.
895
   */
896
0
  CacheInvalidateSmgr(reln->smgr_rlocator);
897
898
  /* Do the truncation */
899
0
  for (i = 0; i < nforks; i++)
900
0
  {
901
    /* Mark the cached size invalid in case we encounter an error. */
902
0
    reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
903
904
0
    smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i],
905
0
                         old_nblocks[i], nblocks[i]);
906
907
    /*
908
     * We might as well update the local smgr_cached_nblocks values. The
909
     * smgr cache inval message that this function sent will cause other
910
     * backends to invalidate their copies of smgr_cached_nblocks, and
911
     * these ones too at the next command boundary. But ensure they aren't
912
     * outright wrong until then.
913
     */
914
0
    reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
915
0
  }
916
0
}
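A hedged sketch of the calling pattern required by the comment above (size check outside the critical section, truncation inside it); the single-fork, truncate-to-empty setup and `reln` are assumptions:

  ForkNumber  forknum = MAIN_FORKNUM;
  BlockNumber old_nblocks;
  BlockNumber new_nblocks = 0;  /* truncate to empty, for illustration */

  /* Caller already holds AccessExclusiveLock on the relation. */
  old_nblocks = smgrnblocks(reln, forknum);

  START_CRIT_SECTION();
  smgrtruncate(reln, &forknum, 1, &old_nblocks, &new_nblocks);
  END_CRIT_SECTION();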
917
918
/*
919
 * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
920
 *
921
 * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
922
 * true, to register the fsyncs that were skipped earlier.
923
 *
924
 * Note: be mindful that a checkpoint could already have happened between the
925
 * smgrwrite or smgrextend calls and this!  In that case, the checkpoint
926
 * already missed fsyncing this relation, and you should use smgrimmedsync
927
 * instead.  Most callers should use the bulk loading facility in bulk_write.c
928
 * which handles all that.
929
 */
930
void
931
smgrregistersync(SMgrRelation reln, ForkNumber forknum)
932
0
{
933
0
  HOLD_INTERRUPTS();
934
0
  smgrsw[reln->smgr_which].smgr_registersync(reln, forknum);
935
0
  RESUME_INTERRUPTS();
936
0
}
937
938
/*
939
 * smgrimmedsync() -- Force the specified relation to stable storage.
940
 *
941
 * Synchronously force all previous writes to the specified relation
942
 * down to disk.
943
 *
944
 * This is useful for building completely new relations (eg, new
945
 * indexes).  Instead of incrementally WAL-logging the index build
946
 * steps, we can just write completed index pages to disk with smgrwrite
947
 * or smgrextend, and then fsync the completed index file before
948
 * committing the transaction.  (This is sufficient for purposes of
949
 * crash recovery, since it effectively duplicates forcing a checkpoint
950
 * for the completed index.  But it is *not* sufficient if one wishes
951
 * to use the WAL log for PITR or replication purposes: in that case
952
 * we have to make WAL entries as well.)
953
 *
954
 * The preceding writes should specify skipFsync = true to avoid
955
 * duplicative fsyncs.
956
 *
957
 * Note that you need to do FlushRelationBuffers() first if there is
958
 * any possibility that there are dirty buffers for the relation;
959
 * otherwise the sync is not very meaningful.
960
 *
961
 * Most callers should use the bulk loading facility in bulk_write.c
962
 * instead of calling this directly.
963
 */
964
void
965
smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
966
0
{
967
0
  HOLD_INTERRUPTS();
968
0
  smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
969
0
  RESUME_INTERRUPTS();
970
0
}
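The bulk-build pattern described above, as a sketch (editor's illustration; `srel`, `npages` and `pages[]` are assumptions; bulk_write.c remains the preferred interface):

  /* Write all pages with skipFsync = true, then fsync the fork once before commit. */
  for (BlockNumber blkno = 0; blkno < npages; blkno++)
    smgrextend(srel, MAIN_FORKNUM, blkno, pages[blkno], true);

  smgrimmedsync(srel, MAIN_FORKNUM);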
971
972
/*
973
 * Return fd for the specified block number and update *off to the appropriate
974
 * position.
975
 *
976
 * This is only to be used when AIO needs to perform the IO in a different
977
 * process than where it was issued (e.g. in an IO worker).
978
 */
979
static int
980
smgrfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
981
0
{
982
0
  int     fd;
983
984
  /*
985
   * The caller needs to prevent interrupts from being processed, otherwise
986
   * the FD could be closed prematurely.
987
   */
988
0
  Assert(!INTERRUPTS_CAN_BE_PROCESSED());
989
990
0
  fd = smgrsw[reln->smgr_which].smgr_fd(reln, forknum, blocknum, off);
991
992
0
  return fd;
993
0
}
994
995
/*
996
 * AtEOXact_SMgr
997
 *
998
 * This routine is called during transaction commit or abort (it doesn't
999
 * particularly care which).  All unpinned SMgrRelation objects are destroyed.
1000
 *
1001
 * We do this as a compromise between wanting transient SMgrRelations to
1002
 * live awhile (to amortize the costs of blind writes of multiple blocks)
1003
 * and needing them to not live forever (since we're probably holding open
1004
 * a kernel file descriptor for the underlying file, and we need to ensure
1005
 * that gets closed reasonably soon if the file gets deleted).
1006
 */
1007
void
1008
AtEOXact_SMgr(void)
1009
0
{
1010
0
  smgrdestroyall();
1011
0
}
1012
1013
/*
1014
 * This routine is called when we are ordered to release all open files by a
1015
 * ProcSignalBarrier.
1016
 */
1017
bool
1018
ProcessBarrierSmgrRelease(void)
1019
0
{
1020
0
  smgrreleaseall();
1021
0
  return true;
1022
0
}
1023
1024
/*
1025
 * Set target of the IO handle to be smgr and initialize all the relevant
1026
 * pieces of data.
1027
 */
1028
void
1029
pgaio_io_set_target_smgr(PgAioHandle *ioh,
1030
             SMgrRelationData *smgr,
1031
             ForkNumber forknum,
1032
             BlockNumber blocknum,
1033
             int nblocks,
1034
             bool skip_fsync)
1035
0
{
1036
0
  PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1037
1038
0
  pgaio_io_set_target(ioh, PGAIO_TID_SMGR);
1039
1040
  /* backend is implied via IO owner */
1041
0
  sd->smgr.rlocator = smgr->smgr_rlocator.locator;
1042
0
  sd->smgr.forkNum = forknum;
1043
0
  sd->smgr.blockNum = blocknum;
1044
0
  sd->smgr.nblocks = nblocks;
1045
0
  sd->smgr.is_temp = SmgrIsTemp(smgr);
1046
  /* Temp relations should never be fsync'd */
1047
0
  sd->smgr.skip_fsync = skip_fsync && !SmgrIsTemp(smgr);
1048
0
}
1049
1050
/*
1051
 * Callback for the smgr AIO target, to reopen the file (e.g. because the IO
1052
 * is executed in a worker).
1053
 */
1054
static void
1055
smgr_aio_reopen(PgAioHandle *ioh)
1056
0
{
1057
0
  PgAioTargetData *sd = pgaio_io_get_target_data(ioh);
1058
0
  PgAioOpData *od = pgaio_io_get_op_data(ioh);
1059
0
  SMgrRelation reln;
1060
0
  ProcNumber  procno;
1061
0
  uint32    off;
1062
1063
  /*
1064
   * The caller needs to prevent interrupts from being processed, otherwise
1065
   * the FD could be closed again before we get to executing the IO.
1066
   */
1067
0
  Assert(!INTERRUPTS_CAN_BE_PROCESSED());
1068
1069
0
  if (sd->smgr.is_temp)
1070
0
    procno = pgaio_io_get_owner(ioh);
1071
0
  else
1072
0
    procno = INVALID_PROC_NUMBER;
1073
1074
0
  reln = smgropen(sd->smgr.rlocator, procno);
1075
0
  switch (pgaio_io_get_op(ioh))
1076
0
  {
1077
0
    case PGAIO_OP_INVALID:
1078
0
      pg_unreachable();
1079
0
      break;
1080
0
    case PGAIO_OP_READV:
1081
0
      od->read.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1082
0
      Assert(off == od->read.offset);
1083
0
      break;
1084
0
    case PGAIO_OP_WRITEV:
1085
0
      od->write.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
1086
0
      Assert(off == od->write.offset);
1087
0
      break;
1088
0
  }
1089
0
}
1090
1091
/*
1092
 * Callback for the smgr AIO target, describing the target of the IO.
1093
 */
1094
static char *
1095
smgr_aio_describe_identity(const PgAioTargetData *sd)
1096
0
{
1097
0
  RelPathStr  path;
1098
0
  char     *desc;
1099
1100
0
  path = relpathbackend(sd->smgr.rlocator,
1101
0
              sd->smgr.is_temp ?
1102
0
              MyProcNumber : INVALID_PROC_NUMBER,
1103
0
              sd->smgr.forkNum);
1104
1105
0
  if (sd->smgr.nblocks == 0)
1106
0
    desc = psprintf(_("file \"%s\""), path.str);
1107
0
  else if (sd->smgr.nblocks == 1)
1108
0
    desc = psprintf(_("block %u in file \"%s\""),
1109
0
            sd->smgr.blockNum,
1110
0
            path.str);
1111
0
  else
1112
0
    desc = psprintf(_("blocks %u..%u in file \"%s\""),
1113
0
            sd->smgr.blockNum,
1114
0
            sd->smgr.blockNum + sd->smgr.nblocks - 1,
1115
0
            path.str);
1116
1117
0
  return desc;
1118
0
}