/src/postgres/src/common/blkreftable.c
/*-------------------------------------------------------------------------
 *
 * blkreftable.c
 *	  Block reference tables.
 *
 * A block reference table is used to keep track of which blocks have
 * been modified by WAL records within a certain LSN range.
 *
 * For each relation fork, we keep track of all blocks that have appeared
 * in block references in the WAL. We also keep track of the "limit block",
 * which is the smallest relation length in blocks known to have occurred
 * during that range of WAL records. This should be set to 0 if the relation
 * fork is created or destroyed, and to the post-truncation length if
 * truncated.
 *
 * Whenever we set the limit block, we also forget about any modified blocks
 * beyond that point. Those blocks don't exist any more. Such blocks can
 * later be marked as modified again; if that happens, it means the relation
 * was re-extended.
 *
 * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
 *
 * src/common/blkreftable.c
 *
 *-------------------------------------------------------------------------
 */


#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#ifdef FRONTEND
#include "common/logging.h"
#endif

#include "common/blkreftable.h"
#include "common/hashfn.h"
#include "port/pg_crc32c.h"

/*
 * A block reference table keeps track of the status of each relation
 * fork individually.
 */
typedef struct BlockRefTableKey
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
} BlockRefTableKey;

/*
 * We could need to store data either for a relation in which only a
 * tiny fraction of the blocks have been modified or for a relation in
 * which nearly every block has been modified, and we want a
 * space-efficient representation in both cases. To accomplish this,
 * we divide the relation into chunks of 2^16 blocks and choose between
 * an array representation and a bitmap representation for each chunk.
 *
 * When the number of modified blocks in a given chunk is small, we
 * essentially store an array of block numbers, but we need not store the
 * entire block number: instead, we store each block number as a 2-byte
 * offset from the start of the chunk.
 *
 * When the number of modified blocks in a given chunk is large, we switch
 * to a bitmap representation.
 *
 * These same basic representational choices are used both when a block
 * reference table is stored in memory and when it is serialized to disk.
 *
 * In the in-memory representation, we initially allocate each chunk with
 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
 * to a bitmap, and thus never needs to grow further.
 */
#define BLOCKS_PER_CHUNK	(1 << 16)
#define BLOCKS_PER_ENTRY	(BITS_PER_BYTE * sizeof(uint16))
#define MAX_ENTRIES_PER_CHUNK	(BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
#define INITIAL_ENTRIES_PER_CHUNK	16
typedef uint16 *BlockRefTableChunk;
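
/*
 * Illustrative only (not part of the original file): how a block number
 * decomposes under the macros above. The function name is hypothetical and
 * exists purely to document the arithmetic; it is compiled out by default.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_locate_block(BlockNumber blknum)
{
	uint32		chunkno = blknum / BLOCKS_PER_CHUNK;	/* which chunk */
	uint16		chunkoffset = blknum % BLOCKS_PER_CHUNK;	/* offset in chunk */

	/* In bitmap form, the offset further selects a uint16 word and a bit. */
	uint32		wordno = chunkoffset / BLOCKS_PER_ENTRY;
	uint32		bitno = chunkoffset % BLOCKS_PER_ENTRY;

	(void) chunkno;
	(void) wordno;
	(void) bitno;
}
#endif							/* BLKREFTABLE_EXAMPLES */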

/*
 * State for one relation fork.
 *
 * 'rlocator' and 'forknum' identify the relation fork to which this entry
 * pertains.
 *
 * 'limit_block' is the shortest known length of the relation in blocks
 * within the LSN range covered by a particular block reference table.
 * It should be set to 0 if the relation fork is created or dropped. If the
 * relation fork is truncated, it should be set to the number of blocks that
 * remain after truncation.
 *
 * 'nchunks' is the allocated length of each of the three arrays that follow.
 * We can only represent the status of block numbers less than nchunks *
 * BLOCKS_PER_CHUNK.
 *
 * 'chunk_size' is an array storing the allocated size of each chunk.
 *
 * 'chunk_usage' is an array storing the number of elements used in each
 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
 * When used as a bitmap, the least significant bit of the first array element
 * is the status of the lowest-numbered block covered by this chunk.
 *
 * 'chunk_data' is the array of chunks.
 */
struct BlockRefTableEntry
{
	BlockRefTableKey key;
	BlockNumber limit_block;
	char		status;
	uint32		nchunks;
	uint16	   *chunk_size;
	uint16	   *chunk_usage;
	BlockRefTableChunk *chunk_data;
};

/* Declare and define a hash table over type BlockRefTableEntry. */
#define SH_PREFIX blockreftable
#define SH_ELEMENT_TYPE BlockRefTableEntry
#define SH_KEY_TYPE BlockRefTableKey
#define SH_KEY key
#define SH_HASH_KEY(tb, key) \
	hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
#define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
#define SH_SCOPE static inline
#ifdef FRONTEND
#define SH_RAW_ALLOCATOR pg_malloc0
#endif
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"

/*
 * A block reference table is basically just the hash table, but we don't
 * want to expose that to outside callers.
 *
 * We keep track of the memory context in use explicitly too, so that it's
 * easy to place all of our allocations in the same context.
 */
struct BlockRefTable
{
	blockreftable_hash *hash;
#ifndef FRONTEND
	MemoryContext mcxt;
#endif
};

/*
 * On-disk serialization format for block reference table entries.
 */
typedef struct BlockRefTableSerializedEntry
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber limit_block;
	uint32		nchunks;
} BlockRefTableSerializedEntry;

/*
 * Buffer size, so that we avoid doing many small I/Os.
 */
#define BUFSIZE 65536

/*
 * Ad-hoc buffer for file I/O.
 */
typedef struct BlockRefTableBuffer
{
	io_callback_fn io_callback;
	void	   *io_callback_arg;
	char		data[BUFSIZE];
	int			used;
	int			cursor;
	pg_crc32c	crc;
} BlockRefTableBuffer;
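
/*
 * Illustrative only (not part of the original file): a minimal pair of I/O
 * callbacks layered over read()/write(), assuming the io_callback_fn
 * signature declared in common/blkreftable.h. The function names and the
 * use of a plain file descriptor as the callback argument are hypothetical;
 * real callers would loop on partial transfers and report errors.
 */
#ifdef BLKREFTABLE_EXAMPLES
#include <unistd.h>

static int
example_write_callback(void *callback_arg, void *data, int length)
{
	int			fd = *(int *) callback_arg;

	return (int) write(fd, data, length);
}

static int
example_read_callback(void *callback_arg, void *data, int length)
{
	int			fd = *(int *) callback_arg;

	/* Returning 0 signals unexpected EOF to BlockRefTableRead(). */
	return (int) read(fd, data, length);
}
#endif							/* BLKREFTABLE_EXAMPLES */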

/*
 * State for keeping track of progress while incrementally reading a block
 * reference table file from disk.
 *
 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
 * combination that is currently being read, and consumed_chunks is the number
 * of those that have been read. (We always read all the information for
 * a single chunk at one time, so we don't need to be able to represent the
 * state where a chunk has been partially read.)
 *
 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
 *
 * chunk_data holds the current chunk.
 *
 * chunk_position helps us figure out how much progress we've made in returning
 * the block numbers for the current chunk to the caller. If the chunk is a
 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
 * of chunk entries we've scanned.
 */
struct BlockRefTableReader
{
	BlockRefTableBuffer buffer;
	char	   *error_filename;
	report_error_fn error_callback;
	void	   *error_callback_arg;
	uint32		total_chunks;
	uint32		consumed_chunks;
	uint16	   *chunk_size;
	uint16		chunk_data[MAX_ENTRIES_PER_CHUNK];
	uint32		chunk_position;
};

/*
 * State for keeping track of progress while incrementally writing a block
 * reference table file to disk.
 */
struct BlockRefTableWriter
{
	BlockRefTableBuffer buffer;
};

/* Function prototypes. */
static int	BlockRefTableComparator(const void *a, const void *b);
static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
							  int length);
static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
							   int length);
static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);

/*
 * Create an empty block reference table.
 */
BlockRefTable *
CreateEmptyBlockRefTable(void)
{
	BlockRefTable *brtab = palloc(sizeof(BlockRefTable));

	/*
	 * Even a completely empty database has a few hundred relation forks, so
	 * it seems best to size the hash on the assumption that we're going to
	 * have at least a few thousand entries.
	 */
#ifdef FRONTEND
	brtab->hash = blockreftable_create(4096, NULL);
#else
	brtab->mcxt = CurrentMemoryContext;
	brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
#endif

	return brtab;
}
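
/*
 * Illustrative only (not part of the original file): a minimal sketch of the
 * in-memory API in this file. 'my_rlocator', 'fd', and the write callback
 * (defined in an earlier example) are hypothetical.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_build_and_write(RelFileLocator my_rlocator, int fd)
{
	BlockRefTable *brtab = CreateEmptyBlockRefTable();

	/* Record that block 42 of the main fork was modified. */
	BlockRefTableMarkBlockModified(brtab, &my_rlocator, MAIN_FORKNUM, 42);

	/* Record a truncation to 10 blocks; this forgets block 42 again. */
	BlockRefTableSetLimitBlock(brtab, &my_rlocator, MAIN_FORKNUM, 10);

	/* Serialize everything through the caller-supplied I/O callback. */
	WriteBlockRefTable(brtab, example_write_callback, &fd);
}
#endif							/* BLKREFTABLE_EXAMPLES */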

/*
 * Set the "limit block" for a relation fork and forget any modified blocks
 * with equal or higher block numbers.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableSetLimitBlock(BlockRefTable *brtab,
						   const RelFileLocator *rlocator,
						   ForkNumber forknum,
						   BlockNumber limit_block)
{
	BlockRefTableEntry *brtentry;
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	bool		found;

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	brtentry = blockreftable_insert(brtab->hash, key, &found);

	if (!found)
	{
		/*
		 * We have no existing data about this relation fork, so just record
		 * the limit_block value supplied by the caller, and make sure other
		 * parts of the entry are properly initialized.
		 */
		brtentry->limit_block = limit_block;
		brtentry->nchunks = 0;
		brtentry->chunk_size = NULL;
		brtentry->chunk_usage = NULL;
		brtentry->chunk_data = NULL;
		return;
	}

	BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
}

/*
 * Mark a block in a given relation fork as known to have been modified.
 */
void
BlockRefTableMarkBlockModified(BlockRefTable *brtab,
							   const RelFileLocator *rlocator,
							   ForkNumber forknum,
							   BlockNumber blknum)
{
	BlockRefTableEntry *brtentry;
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	bool		found;
#ifndef FRONTEND
	MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
#endif

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	brtentry = blockreftable_insert(brtab->hash, key, &found);

	if (!found)
	{
		/*
		 * We want to set the initial limit block value to something higher
		 * than any legal block number. InvalidBlockNumber fits the bill.
		 */
		brtentry->limit_block = InvalidBlockNumber;
		brtentry->nchunks = 0;
		brtentry->chunk_size = NULL;
		brtentry->chunk_usage = NULL;
		brtentry->chunk_data = NULL;
	}

	BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);

#ifndef FRONTEND
	MemoryContextSwitchTo(oldcontext);
#endif
}

/*
 * Get an entry from a block reference table.
 *
 * If the entry does not exist, this function returns NULL. Otherwise, it
 * returns the entry and sets *limit_block to the value from the entry.
 */
BlockRefTableEntry *
BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
					  ForkNumber forknum, BlockNumber *limit_block)
{
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	BlockRefTableEntry *entry;

	Assert(limit_block != NULL);

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	entry = blockreftable_lookup(brtab->hash, key);

	if (entry != NULL)
		*limit_block = entry->limit_block;

	return entry;
}

/*
 * Get block numbers from a table entry.
 *
 * 'blocks' must point to enough space to hold at least 'nblocks' block
 * numbers, and any block numbers we manage to get will be written there.
 * The return value is the number of block numbers actually written.
 *
 * We do not return block numbers unless they are greater than or equal to
 * start_blkno and strictly less than stop_blkno.
 */
int
BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
							BlockNumber start_blkno,
							BlockNumber stop_blkno,
							BlockNumber *blocks,
							int nblocks)
{
	uint32		start_chunkno;
	uint32		stop_chunkno;
	uint32		chunkno;
	int			nresults = 0;

	Assert(entry != NULL);

	/*
	 * Figure out which chunks could potentially contain blocks of interest.
	 *
	 * We need to be careful about overflow here, because stop_blkno could be
	 * InvalidBlockNumber or something very close to it.
	 */
	start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
	stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
	if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
		++stop_chunkno;
	if (stop_chunkno > entry->nchunks)
		stop_chunkno = entry->nchunks;

	/*
	 * Loop over chunks.
	 */
	for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
	{
		uint16		chunk_usage = entry->chunk_usage[chunkno];
		BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
		unsigned	start_offset = 0;
		unsigned	stop_offset = BLOCKS_PER_CHUNK;

		/*
		 * If the start and/or stop block number falls within this chunk, the
		 * whole chunk may not be of interest. Figure out which portion we
		 * care about, if it's not the whole thing.
		 */
		if (chunkno == start_chunkno)
			start_offset = start_blkno % BLOCKS_PER_CHUNK;
		if (chunkno == stop_chunkno - 1)
		{
			Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
			stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
			Assert(stop_offset <= BLOCKS_PER_CHUNK);
		}

		/*
		 * Handling differs depending on whether this is an array of offsets
		 * or a bitmap.
		 */
		if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
		{
			unsigned	i;

			/* It's a bitmap, so test every relevant bit. */
			for (i = start_offset; i < stop_offset; ++i)
			{
				uint16		w = chunk_data[i / BLOCKS_PER_ENTRY];

				if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
				{
					BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;

					blocks[nresults++] = blkno;

					/* Early exit if we run out of output space. */
					if (nresults == nblocks)
						return nresults;
				}
			}
		}
		else
		{
			unsigned	i;

			/* It's an array of offsets, so check each one. */
			for (i = 0; i < chunk_usage; ++i)
			{
				uint16		offset = chunk_data[i];

				if (offset >= start_offset && offset < stop_offset)
				{
					BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;

					blocks[nresults++] = blkno;

					/* Early exit if we run out of output space. */
					if (nresults == nblocks)
						return nresults;
				}
			}
		}
	}

	return nresults;
}
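
/*
 * Illustrative only (not part of the original file): fetching modified
 * blocks from an entry in fixed-size batches. The names are hypothetical;
 * the stopping rule follows from the contract documented above.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_scan_entry(BlockRefTable *brtab, const RelFileLocator *rlocator)
{
	BlockNumber limit_block;
	BlockRefTableEntry *entry;

	entry = BlockRefTableGetEntry(brtab, rlocator, MAIN_FORKNUM,
								  &limit_block);
	if (entry != NULL)
	{
		BlockNumber batch[16];
		BlockNumber next_blkno = 0;
		int			n;

		/* A short batch means we've seen every modified block. */
		while ((n = BlockRefTableEntryGetBlocks(entry, next_blkno,
												InvalidBlockNumber,
												batch, 16)) > 0)
		{
			/* ... process batch[0 .. n-1] here ... */
			next_blkno = batch[n - 1] + 1;
			if (n < 16)
				break;
		}
	}
}
#endif							/* BLKREFTABLE_EXAMPLES */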

/*
 * Serialize a block reference table to a file.
 */
void
WriteBlockRefTable(BlockRefTable *brtab,
				   io_callback_fn write_callback,
				   void *write_callback_arg)
{
	BlockRefTableSerializedEntry *sdata = NULL;
	BlockRefTableBuffer buffer;
	uint32		magic = BLOCKREFTABLE_MAGIC;

	/* Prepare buffer. */
	memset(&buffer, 0, sizeof(BlockRefTableBuffer));
	buffer.io_callback = write_callback;
	buffer.io_callback_arg = write_callback_arg;
	INIT_CRC32C(buffer.crc);

	/* Write magic number. */
	BlockRefTableWrite(&buffer, &magic, sizeof(uint32));

	/* Write the entries, assuming there are some. */
	if (brtab->hash->members > 0)
	{
		unsigned	i = 0;
		blockreftable_iterator it;
		BlockRefTableEntry *brtentry;

		/* Extract entries into serializable format and sort them. */
		sdata =
			palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
		blockreftable_start_iterate(brtab->hash, &it);
		while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
		{
			BlockRefTableSerializedEntry *sentry = &sdata[i++];

			sentry->rlocator = brtentry->key.rlocator;
			sentry->forknum = brtentry->key.forknum;
			sentry->limit_block = brtentry->limit_block;
			sentry->nchunks = brtentry->nchunks;

			/* trim trailing zero entries */
			while (sentry->nchunks > 0 &&
				   brtentry->chunk_usage[sentry->nchunks - 1] == 0)
				sentry->nchunks--;
		}
		Assert(i == brtab->hash->members);
		qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
			  BlockRefTableComparator);

		/* Loop over entries in sorted order and serialize each one. */
		for (i = 0; i < brtab->hash->members; ++i)
		{
			BlockRefTableSerializedEntry *sentry = &sdata[i];
			BlockRefTableKey key = {0};	/* make sure any padding is zero */
			unsigned	j;

			/* Write the serialized entry itself. */
			BlockRefTableWrite(&buffer, sentry,
							   sizeof(BlockRefTableSerializedEntry));

			/* Look up the original entry so we can access the chunks. */
			memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
			key.forknum = sentry->forknum;
			brtentry = blockreftable_lookup(brtab->hash, key);
			Assert(brtentry != NULL);

			/* Write the untruncated portion of the chunk length array. */
			if (sentry->nchunks != 0)
				BlockRefTableWrite(&buffer, brtentry->chunk_usage,
								   sentry->nchunks * sizeof(uint16));

			/* Write the contents of each chunk. */
			for (j = 0; j < brtentry->nchunks; ++j)
			{
				if (brtentry->chunk_usage[j] == 0)
					continue;
				BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
								   brtentry->chunk_usage[j] * sizeof(uint16));
			}
		}
	}

	/* Write out appropriate terminator and CRC and flush buffer. */
	BlockRefTableFileTerminate(&buffer);
}
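
/*
 * Sketch of the resulting on-disk layout, inferred from the write path above
 * and the read path below (not itself part of the original file):
 *
 *   uint32 magic (BLOCKREFTABLE_MAGIC)
 *   for each relation fork, in sorted order:
 *     BlockRefTableSerializedEntry (rlocator, forknum, limit_block, nchunks)
 *     uint16 chunk_usage[nchunks]         (omitted when nchunks == 0)
 *     for each chunk with nonzero usage: uint16 data[chunk_usage]
 *   all-zeroes BlockRefTableSerializedEntry (sentinel)
 *   pg_crc32c covering everything that precedes it
 */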

/*
 * Prepare to incrementally read a block reference table file.
 *
 * 'read_callback' is a function that can be called to read data from the
 * underlying file (or other data source) into our internal buffer.
 *
 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
 *
 * 'error_filename' is the filename that should be included in error messages
 * if the file is found to be malformed. The value is not copied, so the
 * caller should ensure that it remains valid until done with this
 * BlockRefTableReader.
 *
 * 'error_callback' is a function to be called if the file is found to be
 * malformed. This is not used for I/O errors, which must be handled
 * internally by read_callback.
 *
 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
 */
BlockRefTableReader *
CreateBlockRefTableReader(io_callback_fn read_callback,
						  void *read_callback_arg,
						  char *error_filename,
						  report_error_fn error_callback,
						  void *error_callback_arg)
{
	BlockRefTableReader *reader;
	uint32		magic;

	/* Initialize data structure. */
	reader = palloc0(sizeof(BlockRefTableReader));
	reader->buffer.io_callback = read_callback;
	reader->buffer.io_callback_arg = read_callback_arg;
	reader->error_filename = error_filename;
	reader->error_callback = error_callback;
	reader->error_callback_arg = error_callback_arg;
	INIT_CRC32C(reader->buffer.crc);

	/* Verify magic number. */
	BlockRefTableRead(reader, &magic, sizeof(uint32));
	if (magic != BLOCKREFTABLE_MAGIC)
		error_callback(error_callback_arg,
					   "file \"%s\" has wrong magic number: expected %u, found %u",
					   error_filename,
					   BLOCKREFTABLE_MAGIC, magic);

	return reader;
}

/*
 * Read next relation fork covered by this block reference table file.
 *
 * After calling this function, you must call BlockRefTableReaderGetBlocks
 * until it returns 0 before calling it again.
 */
bool
BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
								RelFileLocator *rlocator,
								ForkNumber *forknum,
								BlockNumber *limit_block)
{
	BlockRefTableSerializedEntry sentry;
	BlockRefTableSerializedEntry zentry = {0};

	/*
	 * Sanity check: caller must read all blocks from all chunks before
	 * moving on to the next relation.
	 */
	Assert(reader->total_chunks == reader->consumed_chunks);

	/* Read serialized entry. */
	BlockRefTableRead(reader, &sentry,
					  sizeof(BlockRefTableSerializedEntry));

	/*
	 * If we just read the sentinel entry indicating that we've reached the
	 * end, read and check the CRC.
	 */
	if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
	{
		pg_crc32c	expected_crc;
		pg_crc32c	actual_crc;

		/*
		 * We want to know the CRC of the file excluding the 4-byte CRC
		 * itself, so copy the current value of the CRC accumulator before
		 * reading those bytes, and use the copy to finalize the calculation.
		 */
		expected_crc = reader->buffer.crc;
		FIN_CRC32C(expected_crc);

		/* Now we can read the actual value. */
		BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));

		/* Throw an error if there is a mismatch. */
		if (!EQ_CRC32C(expected_crc, actual_crc))
			reader->error_callback(reader->error_callback_arg,
								   "file \"%s\" has wrong checksum: expected %08X, found %08X",
								   reader->error_filename, expected_crc, actual_crc);

		return false;
	}

	/* Read chunk size array. */
	if (reader->chunk_size != NULL)
		pfree(reader->chunk_size);
	reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
	BlockRefTableRead(reader, reader->chunk_size,
					  sentry.nchunks * sizeof(uint16));

	/* Set up for chunk scan. */
	reader->total_chunks = sentry.nchunks;
	reader->consumed_chunks = 0;

	/* Return data to caller. */
	memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
	*forknum = sentry.forknum;
	*limit_block = sentry.limit_block;
	return true;
}

/*
 * Get modified blocks associated with the relation fork returned by
 * the most recent call to BlockRefTableReaderNextRelation.
 *
 * On return, block numbers will be written into the 'blocks' array, whose
 * length should be passed via 'nblocks'. The return value is the number of
 * entries actually written into the 'blocks' array, which may be less than
 * 'nblocks' if we run out of modified blocks in the relation fork before
 * we run out of room in the array.
 */
unsigned
BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
							 BlockNumber *blocks,
							 int nblocks)
{
	unsigned	blocks_found = 0;

	/* Must provide space for at least one block number to be returned. */
	Assert(nblocks > 0);

	/* Loop collecting blocks to return to caller. */
	for (;;)
	{
		uint16		next_chunk_size;

		/*
		 * If we've read at least one chunk, maybe it contains some block
		 * numbers that could satisfy caller's request.
		 */
		if (reader->consumed_chunks > 0)
		{
			uint32		chunkno = reader->consumed_chunks - 1;
			uint16		chunk_size = reader->chunk_size[chunkno];

			if (chunk_size == MAX_ENTRIES_PER_CHUNK)
			{
				/* Bitmap format, so search for bits that are set. */
				while (reader->chunk_position < BLOCKS_PER_CHUNK &&
					   blocks_found < nblocks)
				{
					uint16		chunkoffset = reader->chunk_position;
					uint16		w;

					w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
					if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
						blocks[blocks_found++] =
							chunkno * BLOCKS_PER_CHUNK + chunkoffset;
					++reader->chunk_position;
				}
			}
			else
			{
				/* Not in bitmap format, so each entry is a 2-byte offset. */
				while (reader->chunk_position < chunk_size &&
					   blocks_found < nblocks)
				{
					blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
						+ reader->chunk_data[reader->chunk_position];
					++reader->chunk_position;
				}
			}
		}

		/* We found enough blocks, so we're done. */
		if (blocks_found >= nblocks)
			break;

		/*
		 * We didn't find enough blocks, so we must need the next chunk. If
		 * there are none left, though, then we're done anyway.
		 */
		if (reader->consumed_chunks == reader->total_chunks)
			break;

		/*
		 * Read data for next chunk and reset scan position to beginning of
		 * chunk. Note that the next chunk might be empty, in which case we
		 * consume the chunk without actually consuming any bytes from the
		 * underlying file.
		 */
		next_chunk_size = reader->chunk_size[reader->consumed_chunks];
		if (next_chunk_size > 0)
			BlockRefTableRead(reader, reader->chunk_data,
							  next_chunk_size * sizeof(uint16));
		++reader->consumed_chunks;
		reader->chunk_position = 0;
	}

	return blocks_found;
}
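
/*
 * Illustrative only (not part of the original file): the canonical read loop
 * implied by the contract above. The 'fd' argument and the read callback
 * (defined in an earlier example) are hypothetical; the error callback is
 * whatever printf-style reporter the caller supplies.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_read_file(int fd, char *filename, report_error_fn report_error)
{
	BlockRefTableReader *reader;
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber limit_block;

	reader = CreateBlockRefTableReader(example_read_callback, &fd,
									   filename, report_error, NULL);
	while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
										   &limit_block))
	{
		BlockNumber blocks[16];
		unsigned	n;

		/* Per the contract, drain every block before the next relation. */
		while ((n = BlockRefTableReaderGetBlocks(reader, blocks, 16)) > 0)
		{
			/* ... process blocks[0 .. n-1] here ... */
		}
	}
	DestroyBlockRefTableReader(reader);
}
#endif							/* BLKREFTABLE_EXAMPLES */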

/*
 * Release memory used while reading a block reference table from a file.
 */
void
DestroyBlockRefTableReader(BlockRefTableReader *reader)
{
	if (reader->chunk_size != NULL)
	{
		pfree(reader->chunk_size);
		reader->chunk_size = NULL;
	}
	pfree(reader);
}

/*
 * Prepare to write a block reference table file incrementally.
 *
 * Caller must be able to supply BlockRefTableEntry objects sorted in the
 * appropriate order.
 */
BlockRefTableWriter *
CreateBlockRefTableWriter(io_callback_fn write_callback,
						  void *write_callback_arg)
{
	BlockRefTableWriter *writer;
	uint32		magic = BLOCKREFTABLE_MAGIC;

	/* Prepare buffer and CRC check and save callbacks. */
	writer = palloc0(sizeof(BlockRefTableWriter));
	writer->buffer.io_callback = write_callback;
	writer->buffer.io_callback_arg = write_callback_arg;
	INIT_CRC32C(writer->buffer.crc);

	/* Write magic number. */
	BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));

	return writer;
}

/*
 * Append one entry to a block reference table file.
 *
 * Note that entries must be written in the proper order, that is, sorted by
 * tablespace, then database, then relfilenumber, then fork number. Caller
 * is responsible for supplying data in the correct order. If that seems
 * hard, use an in-memory BlockRefTable instead.
 */
void
BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
{
	BlockRefTableSerializedEntry sentry;
	unsigned	j;

	/* Convert to serialized entry format. */
	sentry.rlocator = entry->key.rlocator;
	sentry.forknum = entry->key.forknum;
	sentry.limit_block = entry->limit_block;
	sentry.nchunks = entry->nchunks;

	/* Trim trailing zero entries. */
	while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
		sentry.nchunks--;

	/* Write the serialized entry itself. */
	BlockRefTableWrite(&writer->buffer, &sentry,
					   sizeof(BlockRefTableSerializedEntry));

	/* Write the untruncated portion of the chunk length array. */
	if (sentry.nchunks != 0)
		BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
						   sentry.nchunks * sizeof(uint16));

	/* Write the contents of each chunk. */
	for (j = 0; j < entry->nchunks; ++j)
	{
		if (entry->chunk_usage[j] == 0)
			continue;
		BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
						   entry->chunk_usage[j] * sizeof(uint16));
	}
}

/*
 * Finalize an incremental write of a block reference table file.
 */
void
DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
{
	BlockRefTableFileTerminate(&writer->buffer);
	pfree(writer);
}

/*
 * Allocate a standalone BlockRefTableEntry.
 *
 * When we're manipulating a full in-memory BlockRefTable, the entries are
 * part of the hash table and are allocated by simplehash. This routine is
 * used by callers that want to write out a BlockRefTable to a file without
 * needing to store the whole thing in memory at once.
 *
 * Entries allocated by this function can be manipulated using the functions
 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
 * and then written using BlockRefTableWriteEntry and freed using
 * BlockRefTableFreeEntry.
 */
BlockRefTableEntry *
CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
{
	BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));

	memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
	entry->key.forknum = forknum;
	entry->limit_block = InvalidBlockNumber;

	return entry;
}
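
/*
 * Illustrative only (not part of the original file): the standalone-entry
 * flow described in the comment above, for one relation fork. Entries for
 * additional forks would have to be emitted in sorted order; the write
 * callback is the hypothetical one from an earlier example.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_incremental_write(RelFileLocator rlocator, int fd)
{
	BlockRefTableWriter *writer;
	BlockRefTableEntry *entry;

	writer = CreateBlockRefTableWriter(example_write_callback, &fd);

	entry = CreateBlockRefTableEntry(rlocator, MAIN_FORKNUM);
	BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 42);
	BlockRefTableEntrySetLimitBlock(entry, 100);
	BlockRefTableWriteEntry(writer, entry);
	BlockRefTableFreeEntry(entry);

	DestroyBlockRefTableWriter(writer); /* writes sentinel and CRC */
}
#endif							/* BLKREFTABLE_EXAMPLES */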

/*
 * Update a BlockRefTableEntry with a new value for the "limit block" and
 * forget any equal-or-higher-numbered modified blocks.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
								BlockNumber limit_block)
{
	unsigned	chunkno;
	unsigned	limit_chunkno;
	unsigned	limit_chunkoffset;
	BlockRefTableChunk limit_chunk;

	/* If we already have an equal or lower limit block, do nothing. */
	if (limit_block >= entry->limit_block)
		return;

	/* Record the new limit block value. */
	entry->limit_block = limit_block;

	/*
	 * Figure out which chunk would store the state of the new limit block,
	 * and which offset within that chunk.
	 */
	limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
	limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;

	/*
	 * If the number of chunks is not large enough for any blocks with equal
	 * or higher block numbers to exist, then there is nothing further to do.
	 */
	if (limit_chunkno >= entry->nchunks)
		return;

	/* Discard entire contents of any higher-numbered chunks. */
	for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
		entry->chunk_usage[chunkno] = 0;

	/*
	 * Next, we need to discard any offsets within the chunk that would
	 * contain the limit_block. We must handle this differently depending on
	 * whether the chunk that would contain limit_block is a bitmap or an
	 * array of offsets.
	 */
	limit_chunk = entry->chunk_data[limit_chunkno];
	if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
	{
		unsigned	chunkoffset;

		/* It's a bitmap. Unset bits. */
		for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
			 ++chunkoffset)
			limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
				~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
	}
	else
	{
		unsigned	i,
					j = 0;

		/* It's an offset array. Filter out large offsets. */
		for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
		{
			Assert(j <= i);
			if (limit_chunk[i] < limit_chunkoffset)
				limit_chunk[j++] = limit_chunk[i];
		}
		Assert(j <= entry->chunk_usage[limit_chunkno]);
		entry->chunk_usage[limit_chunkno] = j;
	}
}

/*
 * Mark a block in a given BlockRefTableEntry as known to have been modified.
 */
void
BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
									ForkNumber forknum,
									BlockNumber blknum)
{
	unsigned	chunkno;
	unsigned	chunkoffset;
	unsigned	i;

	/*
	 * Which chunk should store the state of this block? And what is the
	 * offset of this block relative to the start of that chunk?
	 */
	chunkno = blknum / BLOCKS_PER_CHUNK;
	chunkoffset = blknum % BLOCKS_PER_CHUNK;

	/*
	 * If 'nchunks' isn't big enough for us to be able to represent the state
	 * of this block, we need to enlarge our arrays.
	 */
	if (chunkno >= entry->nchunks)
	{
		unsigned	max_chunks;
		unsigned	extra_chunks;

		/*
		 * New array size is a power of 2, at least 16, big enough so that
		 * chunkno will be a valid array index.
		 */
		max_chunks = Max(16, entry->nchunks);
		while (max_chunks < chunkno + 1)
			max_chunks *= 2;
		extra_chunks = max_chunks - entry->nchunks;

		if (entry->nchunks == 0)
		{
			entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
			entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
			entry->chunk_data =
				palloc0(sizeof(BlockRefTableChunk) * max_chunks);
		}
		else
		{
			entry->chunk_size = repalloc(entry->chunk_size,
										 sizeof(uint16) * max_chunks);
			memset(&entry->chunk_size[entry->nchunks], 0,
				   extra_chunks * sizeof(uint16));
			entry->chunk_usage = repalloc(entry->chunk_usage,
										  sizeof(uint16) * max_chunks);
			memset(&entry->chunk_usage[entry->nchunks], 0,
				   extra_chunks * sizeof(uint16));
			entry->chunk_data = repalloc(entry->chunk_data,
										 sizeof(BlockRefTableChunk) * max_chunks);
			memset(&entry->chunk_data[entry->nchunks], 0,
				   extra_chunks * sizeof(BlockRefTableChunk));
		}
		entry->nchunks = max_chunks;
	}

	/*
	 * If the chunk that covers this block number doesn't exist yet, create
	 * it as an array and add the appropriate offset to it. We make it pretty
	 * small initially, because there might only be 1 or a few block
	 * references in this chunk and we don't want to use up too much memory.
	 */
	if (entry->chunk_size[chunkno] == 0)
	{
		entry->chunk_data[chunkno] =
			palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
		entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
		entry->chunk_data[chunkno][0] = chunkoffset;
		entry->chunk_usage[chunkno] = 1;
		return;
	}

	/*
	 * If the number of entries in this chunk is already maximum, it must be
	 * a bitmap. Just set the appropriate bit.
	 */
	if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
	{
		BlockRefTableChunk chunk = entry->chunk_data[chunkno];

		chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
			1 << (chunkoffset % BLOCKS_PER_ENTRY);
		return;
	}

	/*
	 * There is an existing chunk and it's in array format. Let's find out
	 * whether it already has an entry for this block. If so, we do not need
	 * to do anything.
	 */
	for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
	{
		if (entry->chunk_data[chunkno][i] == chunkoffset)
			return;
	}

	/*
	 * If the number of entries currently used is one less than the maximum,
	 * it's time to convert to bitmap format.
	 */
	if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
	{
		BlockRefTableChunk newchunk;
		unsigned	j;

		/* Allocate a new chunk. */
		newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));

		/* Set the bit for each existing entry. */
		for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
		{
			unsigned	coff = entry->chunk_data[chunkno][j];

			newchunk[coff / BLOCKS_PER_ENTRY] |=
				1 << (coff % BLOCKS_PER_ENTRY);
		}

		/* Set the bit for the new entry. */
		newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
			1 << (chunkoffset % BLOCKS_PER_ENTRY);

		/* Swap the new chunk into place and update metadata. */
		pfree(entry->chunk_data[chunkno]);
		entry->chunk_data[chunkno] = newchunk;
		entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
		entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
		return;
	}

	/*
	 * OK, we currently have an array, and we don't need to convert to a
	 * bitmap, but we do need to add a new element. If there's not enough
	 * room, we'll have to expand the array.
	 */
	if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
	{
		unsigned	newsize = entry->chunk_size[chunkno] * 2;

		Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
		entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
											  newsize * sizeof(uint16));
		entry->chunk_size[chunkno] = newsize;
	}

	/* Now we can add the new entry. */
	entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] = chunkoffset;
	entry->chunk_usage[chunkno]++;
}

/*
 * Release memory for a BlockRefTableEntry that was created by
 * CreateBlockRefTableEntry.
 */
void
BlockRefTableFreeEntry(BlockRefTableEntry *entry)
{
	if (entry->chunk_size != NULL)
	{
		pfree(entry->chunk_size);
		entry->chunk_size = NULL;
	}

	if (entry->chunk_usage != NULL)
	{
		pfree(entry->chunk_usage);
		entry->chunk_usage = NULL;
	}

	if (entry->chunk_data != NULL)
	{
		pfree(entry->chunk_data);
		entry->chunk_data = NULL;
	}

	pfree(entry);
}

/*
 * Comparator for BlockRefTableSerializedEntry objects.
 *
 * We make the tablespace OID the first column of the sort key to match
 * the on-disk tree structure.
 */
static int
BlockRefTableComparator(const void *a, const void *b)
{
	const BlockRefTableSerializedEntry *sa = a;
	const BlockRefTableSerializedEntry *sb = b;

	if (sa->rlocator.spcOid > sb->rlocator.spcOid)
		return 1;
	if (sa->rlocator.spcOid < sb->rlocator.spcOid)
		return -1;

	if (sa->rlocator.dbOid > sb->rlocator.dbOid)
		return 1;
	if (sa->rlocator.dbOid < sb->rlocator.dbOid)
		return -1;

	if (sa->rlocator.relNumber > sb->rlocator.relNumber)
		return 1;
	if (sa->rlocator.relNumber < sb->rlocator.relNumber)
		return -1;

	if (sa->forknum > sb->forknum)
		return 1;
	if (sa->forknum < sb->forknum)
		return -1;

	return 0;
}

/*
 * Flush any buffered data out of a BlockRefTableBuffer.
 */
static void
BlockRefTableFlush(BlockRefTableBuffer *buffer)
{
	buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
	buffer->used = 0;
}

/*
 * Read data from a BlockRefTableBuffer, and update the running CRC
 * calculation for the returned data (but not any data that we may have
 * buffered but not yet actually returned).
 */
static void
BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
{
	BlockRefTableBuffer *buffer = &reader->buffer;

	/* Loop until read is fully satisfied. */
	while (length > 0)
	{
		if (buffer->cursor < buffer->used)
		{
			/*
			 * If any buffered data is available, use that to satisfy as much
			 * of the request as possible.
			 */
			int			bytes_to_copy = Min(length, buffer->used - buffer->cursor);

			memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
			COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
						bytes_to_copy);
			buffer->cursor += bytes_to_copy;
			data = ((char *) data) + bytes_to_copy;
			length -= bytes_to_copy;
		}
		else if (length >= BUFSIZE)
		{
			/*
			 * If the request length is long, read directly into caller's
			 * buffer.
			 */
			int			bytes_read;

			bytes_read = buffer->io_callback(buffer->io_callback_arg,
											 data, length);
			COMP_CRC32C(buffer->crc, data, bytes_read);
			data = ((char *) data) + bytes_read;
			length -= bytes_read;

			/* If we didn't get anything, that's bad. */
			if (bytes_read == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
		else
		{
			/*
			 * Refill our buffer.
			 */
			buffer->used = buffer->io_callback(buffer->io_callback_arg,
											   buffer->data, BUFSIZE);
			buffer->cursor = 0;

			/* If we didn't get anything, that's bad. */
			if (buffer->used == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
	}
}

/*
 * Supply data to a BlockRefTableBuffer for writing to the underlying file,
 * and update the running CRC calculation for that data.
 */
static void
BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
{
	/* Update running CRC calculation. */
	COMP_CRC32C(buffer->crc, data, length);

	/* If the new data can't fit into the buffer, flush the buffer. */
	if (buffer->used + length > BUFSIZE)
	{
		buffer->io_callback(buffer->io_callback_arg, buffer->data,
							buffer->used);
		buffer->used = 0;
	}

	/* If the new data would fill the buffer, or more, write it directly. */
	if (length >= BUFSIZE)
	{
		buffer->io_callback(buffer->io_callback_arg, data, length);
		return;
	}

	/* Otherwise, copy the new data into the buffer. */
	memcpy(&buffer->data[buffer->used], data, length);
	buffer->used += length;
	Assert(buffer->used <= BUFSIZE);
}

/*
 * Generate the sentinel and CRC required at the end of a block reference
 * table file and flush them out of our internal buffer.
 */
static void
BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
{
	BlockRefTableSerializedEntry zentry = {0};
	pg_crc32c	crc;

	/* Write a sentinel indicating that there are no more entries. */
	BlockRefTableWrite(buffer, &zentry,
					   sizeof(BlockRefTableSerializedEntry));

	/*
	 * Writing the checksum will perturb the ongoing checksum calculation, so
	 * copy the state first and finalize the computation using the copy.
	 */
	crc = buffer->crc;
	FIN_CRC32C(crc);
	BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));

	/* Flush any leftover data out of our buffer. */
	BlockRefTableFlush(buffer);
}