/src/postgres/src/backend/access/heap/pruneheap.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * pruneheap.c |
4 | | * heap page pruning and HOT-chain management code |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/heap/pruneheap.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include "access/heapam.h" |
18 | | #include "access/heapam_xlog.h" |
19 | | #include "access/htup_details.h" |
20 | | #include "access/multixact.h" |
21 | | #include "access/transam.h" |
22 | | #include "access/xlog.h" |
23 | | #include "access/xloginsert.h" |
24 | | #include "commands/vacuum.h" |
25 | | #include "executor/instrument.h" |
26 | | #include "miscadmin.h" |
27 | | #include "pgstat.h" |
28 | | #include "storage/bufmgr.h" |
29 | | #include "utils/rel.h" |
30 | | #include "utils/snapmgr.h" |
31 | | |
32 | | /* Working data for heap_page_prune_and_freeze() and subroutines */ |
33 | | typedef struct |
34 | | { |
35 | | /*------------------------------------------------------- |
36 | | * Arguments passed to heap_page_prune_and_freeze() |
37 | | *------------------------------------------------------- |
38 | | */ |
39 | | |
40 | | /* tuple visibility test, initialized for the relation */ |
41 | | GlobalVisState *vistest; |
42 | | /* whether or not dead items can be set LP_UNUSED during pruning */ |
43 | | bool mark_unused_now; |
44 | | /* whether to attempt freezing tuples */ |
45 | | bool freeze; |
46 | | struct VacuumCutoffs *cutoffs; |
47 | | |
48 | | /*------------------------------------------------------- |
49 | | * Fields describing what to do to the page |
50 | | *------------------------------------------------------- |
51 | | */ |
52 | | TransactionId new_prune_xid; /* new prune hint value */ |
53 | | TransactionId latest_xid_removed; |
54 | | int nredirected; /* numbers of entries in arrays below */ |
55 | | int ndead; |
56 | | int nunused; |
57 | | int nfrozen; |
58 | | /* arrays that accumulate indexes of items to be changed */ |
59 | | OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; |
60 | | OffsetNumber nowdead[MaxHeapTuplesPerPage]; |
61 | | OffsetNumber nowunused[MaxHeapTuplesPerPage]; |
62 | | HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; |
63 | | |
64 | | /*------------------------------------------------------- |
65 | | * Working state for HOT chain processing |
66 | | *------------------------------------------------------- |
67 | | */ |
68 | | |
69 | | /* |
70 | | * 'root_items' contains offsets of all LP_REDIRECT line pointers and |
71 | | * normal non-HOT tuples. They can be stand-alone items or the first item |
72 | | * in a HOT chain. 'heaponly_items' contains heap-only tuples which can |
73 | | * only be removed as part of a HOT chain. |
74 | | */ |
75 | | int nroot_items; |
76 | | OffsetNumber root_items[MaxHeapTuplesPerPage]; |
77 | | int nheaponly_items; |
78 | | OffsetNumber heaponly_items[MaxHeapTuplesPerPage]; |
79 | | |
80 | | /* |
81 | | * processed[offnum] is true if item at offnum has been processed. |
82 | | * |
83 | | * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is |
84 | | * 1. Otherwise every access would need to subtract 1. |
85 | | */ |
86 | | bool processed[MaxHeapTuplesPerPage + 1]; |
87 | | |
88 | | /* |
89 | | * Tuple visibility is only computed once for each tuple, for correctness |
90 | | * and efficiency reasons; see comment in heap_page_prune_and_freeze() for |
91 | | * details. This is of type int8[], instead of HTSV_Result[], so we can |
92 | | * use -1 to indicate no visibility has been computed, e.g. for LP_DEAD |
93 | | * items. |
94 | | * |
95 | | * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is |
96 | | * 1. Otherwise every access would need to subtract 1. |
97 | | */ |
98 | | int8 htsv[MaxHeapTuplesPerPage + 1]; |
99 | | |
100 | | /* |
101 | | * Freezing-related state. |
102 | | */ |
103 | | HeapPageFreeze pagefrz; |
104 | | |
105 | | /*------------------------------------------------------- |
106 | | * Information about what was done |
107 | | * |
108 | | * These fields are not used by pruning itself for the most part, but are |
109 | | * used to collect information about what was pruned and what state the |
110 | | * page is in after pruning, for the benefit of the caller. They are |
111 | | * copied to the caller's PruneFreezeResult at the end. |
112 | | * ------------------------------------------------------- |
113 | | */ |
114 | | |
115 | | int ndeleted; /* Number of tuples deleted from the page */ |
116 | | |
117 | | /* Number of live and recently dead tuples, after pruning */ |
118 | | int live_tuples; |
119 | | int recently_dead_tuples; |
120 | | |
121 | | /* Whether or not the page makes rel truncation unsafe */ |
122 | | bool hastup; |
123 | | |
124 | | /* |
125 | | * LP_DEAD items on the page after pruning. Includes existing LP_DEAD |
126 | | * items |
127 | | */ |
128 | | int lpdead_items; /* number of items in the array */ |
129 | | OffsetNumber *deadoffsets; /* points directly to presult->deadoffsets */ |
130 | | |
131 | | /* |
132 | | * all_visible and all_frozen indicate if the all-visible and all-frozen |
133 | | * bits in the visibility map can be set for this page after pruning. |
134 | | * |
135 | | * visibility_cutoff_xid is the newest xmin of live tuples on the page. |
136 | | * The caller can use it as the conflict horizon, when setting the VM |
137 | | * bits. It is only valid if we froze some tuples, and all_frozen is |
138 | | * true. |
139 | | * |
140 | | * NOTE: all_visible and all_frozen don't include LP_DEAD items. That's |
141 | | * convenient for heap_page_prune_and_freeze(), to use them to decide |
142 | | * whether to freeze the page or not. The all_visible and all_frozen |
143 | | * values returned to the caller are adjusted to include LP_DEAD items at |
144 | | * the end. |
145 | | * |
146 | | * all_frozen should only be considered valid if all_visible is also set; |
147 | | * we don't bother to clear the all_frozen flag every time we clear the |
148 | | * all_visible flag. |
149 | | */ |
150 | | bool all_visible; |
151 | | bool all_frozen; |
152 | | TransactionId visibility_cutoff_xid; |
153 | | } PruneState; |
154 | | |
155 | | /* Local functions */ |
156 | | static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, |
157 | | HeapTuple tup, |
158 | | Buffer buffer); |
159 | | static inline HTSV_Result htsv_get_valid_status(int status); |
160 | | static void heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, |
161 | | OffsetNumber rootoffnum, PruneState *prstate); |
162 | | static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); |
163 | | static void heap_prune_record_redirect(PruneState *prstate, |
164 | | OffsetNumber offnum, OffsetNumber rdoffnum, |
165 | | bool was_normal); |
166 | | static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, |
167 | | bool was_normal); |
168 | | static void heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, |
169 | | bool was_normal); |
170 | | static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal); |
171 | | |
172 | | static void heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum); |
173 | | static void heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum); |
174 | | static void heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum); |
175 | | static void heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum); |
176 | | |
177 | | static void page_verify_redirects(Page page); |
178 | | |
179 | | |
180 | | /* |
181 | | * Optionally prune and repair fragmentation in the specified page. |
182 | | * |
183 | | * This is an opportunistic function. It will perform housekeeping |
184 | | * only if the page heuristically looks like a candidate for pruning and we |
185 | | * can acquire buffer cleanup lock without blocking. |
186 | | * |
187 | | * Note: this is called quite often. It's important that it fall out quickly |
188 | | * if there's not any use in pruning. |
189 | | * |
190 | | * Caller must have pin on the buffer, and must *not* have a lock on it. |
191 | | */ |
192 | | void |
193 | | heap_page_prune_opt(Relation relation, Buffer buffer) |
194 | 0 | { |
195 | 0 | Page page = BufferGetPage(buffer); |
196 | 0 | TransactionId prune_xid; |
197 | 0 | GlobalVisState *vistest; |
198 | 0 | Size minfree; |
199 | | |
200 | | /* |
201 | | * We can't write WAL in recovery mode, so there's no point trying to |
202 | | * clean the page. The primary will likely issue a cleaning WAL record |
203 | | * soon anyway, so this is no particular loss. |
204 | | */ |
205 | 0 | if (RecoveryInProgress()) |
206 | 0 | return; |
207 | | |
208 | | /* |
209 | | * First check whether there's any chance there's something to prune, |
210 | | * determining the appropriate horizon is a waste if there's no prune_xid |
211 | | * (i.e. no updates/deletes left potentially dead tuples around). |
212 | | */ |
213 | 0 | prune_xid = ((PageHeader) page)->pd_prune_xid; |
214 | 0 | if (!TransactionIdIsValid(prune_xid)) |
215 | 0 | return; |
216 | | |
217 | | /* |
218 | | * Check whether prune_xid indicates that there may be dead rows that can |
219 | | * be cleaned up. |
220 | | */ |
221 | 0 | vistest = GlobalVisTestFor(relation); |
222 | |
|
223 | 0 | if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) |
224 | 0 | return; |
225 | | |
226 | | /* |
227 | | * We prune when a previous UPDATE failed to find enough space on the page |
228 | | * for a new tuple version, or when free space falls below the relation's |
229 | | * fill-factor target (but not less than 10%). |
230 | | * |
231 | | * Checking free space here is questionable since we aren't holding any |
232 | | * lock on the buffer; in the worst case we could get a bogus answer. It's |
233 | | * unlikely to be *seriously* wrong, though, since reading either pd_lower |
234 | | * or pd_upper is probably atomic. Avoiding taking a lock seems more |
235 | | * important than sometimes getting a wrong answer in what is after all |
236 | | * just a heuristic estimate. |
237 | | */ |
238 | 0 | minfree = RelationGetTargetPageFreeSpace(relation, |
239 | 0 | HEAP_DEFAULT_FILLFACTOR); |
240 | 0 | minfree = Max(minfree, BLCKSZ / 10); |
241 | |
|
242 | 0 | if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) |
243 | 0 | { |
244 | | /* OK, try to get exclusive buffer lock */ |
245 | 0 | if (!ConditionalLockBufferForCleanup(buffer)) |
246 | 0 | return; |
247 | | |
248 | | /* |
249 | | * Now that we have buffer lock, get accurate information about the |
250 | | * page's free space, and recheck the heuristic about whether to |
251 | | * prune. |
252 | | */ |
253 | 0 | if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) |
254 | 0 | { |
255 | 0 | OffsetNumber dummy_off_loc; |
256 | 0 | PruneFreezeResult presult; |
257 | | |
258 | | /* |
259 | | * For now, pass mark_unused_now as false regardless of whether or |
260 | | * not the relation has indexes, since we cannot safely determine |
261 | | * that during on-access pruning with the current implementation. |
262 | | */ |
263 | 0 | heap_page_prune_and_freeze(relation, buffer, vistest, 0, |
264 | 0 | NULL, &presult, PRUNE_ON_ACCESS, &dummy_off_loc, NULL, NULL); |
265 | | |
266 | | /* |
267 | | * Report the number of tuples reclaimed to pgstats. This is |
268 | | * presult.ndeleted minus the number of newly-LP_DEAD-set items. |
269 | | * |
270 | | * We derive the number of dead tuples like this to avoid totally |
271 | | * forgetting about items that were set to LP_DEAD, since they |
272 | | * still need to be cleaned up by VACUUM. We only want to count |
273 | | * heap-only tuples that just became LP_UNUSED in our report, |
274 | | * which don't. |
275 | | * |
276 | | * VACUUM doesn't have to compensate in the same way when it |
277 | | * tracks ndeleted, since it will set the same LP_DEAD items to |
278 | | * LP_UNUSED separately. |
279 | | */ |
280 | 0 | if (presult.ndeleted > presult.nnewlpdead) |
281 | 0 | pgstat_update_heap_dead_tuples(relation, |
282 | 0 | presult.ndeleted - presult.nnewlpdead); |
283 | 0 | } |
284 | | |
285 | | /* And release buffer lock */ |
286 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
287 | | |
288 | | /* |
289 | | * We avoid reuse of any free space created on the page by unrelated |
290 | | * UPDATEs/INSERTs by opting to not update the FSM at this point. The |
291 | | * free space should be reused by UPDATEs to *this* page. |
292 | | */ |
293 | 0 | } |
294 | 0 | } |
295 | | |
296 | | |
297 | | /* |
298 | | * Prune and repair fragmentation and potentially freeze tuples on the |
299 | | * specified page. |
300 | | * |
301 | | * Caller must have pin and buffer cleanup lock on the page. Note that we |
302 | | * don't update the FSM information for page on caller's behalf. Caller might |
303 | | * also need to account for a reduction in the length of the line pointer |
304 | | * array following array truncation by us. |
305 | | * |
306 | | * If the HEAP_PRUNE_FREEZE option is set, we will also freeze tuples if it's |
307 | | * required in order to advance relfrozenxid / relminmxid, or if it's |
308 | | * considered advantageous for overall system performance to do so now. The |
309 | | * 'cutoffs', 'presult', 'new_relfrozen_xid' and 'new_relmin_mxid' arguments |
310 | | * are required when freezing. When HEAP_PRUNE_FREEZE option is set, we also |
311 | | * set presult->all_visible and presult->all_frozen on exit, to indicate if |
312 | | * the VM bits can be set. They are always set to false when the |
313 | | * HEAP_PRUNE_FREEZE option is not set, because at the moment only callers |
314 | | * that also freeze need that information. |
315 | | * |
316 | | * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD |
317 | | * (see heap_prune_satisfies_vacuum). |
318 | | * |
319 | | * options: |
320 | | * MARK_UNUSED_NOW indicates that dead items can be set LP_UNUSED during |
321 | | * pruning. |
322 | | * |
323 | | * FREEZE indicates that we will also freeze tuples, and will return |
324 | | * 'all_visible', 'all_frozen' flags to the caller. |
325 | | * |
326 | | * cutoffs contains the freeze cutoffs, established by VACUUM at the beginning |
327 | | * of vacuuming the relation. Required if HEAP_PRUNE_FREEZE option is set. |
328 | | * cutoffs->OldestXmin is also used to determine if dead tuples are |
329 | | * HEAPTUPLE_RECENTLY_DEAD or HEAPTUPLE_DEAD. |
330 | | * |
331 | | * presult contains output parameters needed by callers, such as the number of |
332 | | * tuples removed and the offsets of dead items on the page after pruning. |
333 | | * heap_page_prune_and_freeze() is responsible for initializing it. Required |
334 | | * by all callers. |
335 | | * |
336 | | * reason indicates why the pruning is performed. It is included in the WAL |
337 | | * record for debugging and analysis purposes, but otherwise has no effect. |
338 | | * |
339 | | * off_loc is the offset location required by the caller to use in error |
340 | | * callback. |
341 | | * |
342 | | * new_relfrozen_xid and new_relmin_mxid must provided by the caller if the |
343 | | * HEAP_PRUNE_FREEZE option is set. On entry, they contain the oldest XID and |
344 | | * multi-XID seen on the relation so far. They will be updated with oldest |
345 | | * values present on the page after pruning. After processing the whole |
346 | | * relation, VACUUM can use these values as the new relfrozenxid/relminmxid |
347 | | * for the relation. |
348 | | */ |
349 | | void |
350 | | heap_page_prune_and_freeze(Relation relation, Buffer buffer, |
351 | | GlobalVisState *vistest, |
352 | | int options, |
353 | | struct VacuumCutoffs *cutoffs, |
354 | | PruneFreezeResult *presult, |
355 | | PruneReason reason, |
356 | | OffsetNumber *off_loc, |
357 | | TransactionId *new_relfrozen_xid, |
358 | | MultiXactId *new_relmin_mxid) |
359 | 0 | { |
360 | 0 | Page page = BufferGetPage(buffer); |
361 | 0 | BlockNumber blockno = BufferGetBlockNumber(buffer); |
362 | 0 | OffsetNumber offnum, |
363 | 0 | maxoff; |
364 | 0 | PruneState prstate; |
365 | 0 | HeapTupleData tup; |
366 | 0 | bool do_freeze; |
367 | 0 | bool do_prune; |
368 | 0 | bool do_hint; |
369 | 0 | bool hint_bit_fpi; |
370 | 0 | int64 fpi_before = pgWalUsage.wal_fpi; |
371 | | |
372 | | /* Copy parameters to prstate */ |
373 | 0 | prstate.vistest = vistest; |
374 | 0 | prstate.mark_unused_now = (options & HEAP_PAGE_PRUNE_MARK_UNUSED_NOW) != 0; |
375 | 0 | prstate.freeze = (options & HEAP_PAGE_PRUNE_FREEZE) != 0; |
376 | 0 | prstate.cutoffs = cutoffs; |
377 | | |
378 | | /* |
379 | | * Our strategy is to scan the page and make lists of items to change, |
380 | | * then apply the changes within a critical section. This keeps as much |
381 | | * logic as possible out of the critical section, and also ensures that |
382 | | * WAL replay will work the same as the normal case. |
383 | | * |
384 | | * First, initialize the new pd_prune_xid value to zero (indicating no |
385 | | * prunable tuples). If we find any tuples which may soon become |
386 | | * prunable, we will save the lowest relevant XID in new_prune_xid. Also |
387 | | * initialize the rest of our working state. |
388 | | */ |
389 | 0 | prstate.new_prune_xid = InvalidTransactionId; |
390 | 0 | prstate.latest_xid_removed = InvalidTransactionId; |
391 | 0 | prstate.nredirected = prstate.ndead = prstate.nunused = prstate.nfrozen = 0; |
392 | 0 | prstate.nroot_items = 0; |
393 | 0 | prstate.nheaponly_items = 0; |
394 | | |
395 | | /* initialize page freezing working state */ |
396 | 0 | prstate.pagefrz.freeze_required = false; |
397 | 0 | if (prstate.freeze) |
398 | 0 | { |
399 | 0 | Assert(new_relfrozen_xid && new_relmin_mxid); |
400 | 0 | prstate.pagefrz.FreezePageRelfrozenXid = *new_relfrozen_xid; |
401 | 0 | prstate.pagefrz.NoFreezePageRelfrozenXid = *new_relfrozen_xid; |
402 | 0 | prstate.pagefrz.FreezePageRelminMxid = *new_relmin_mxid; |
403 | 0 | prstate.pagefrz.NoFreezePageRelminMxid = *new_relmin_mxid; |
404 | 0 | } |
405 | 0 | else |
406 | 0 | { |
407 | 0 | Assert(new_relfrozen_xid == NULL && new_relmin_mxid == NULL); |
408 | 0 | prstate.pagefrz.FreezePageRelminMxid = InvalidMultiXactId; |
409 | 0 | prstate.pagefrz.NoFreezePageRelminMxid = InvalidMultiXactId; |
410 | 0 | prstate.pagefrz.FreezePageRelfrozenXid = InvalidTransactionId; |
411 | 0 | prstate.pagefrz.NoFreezePageRelfrozenXid = InvalidTransactionId; |
412 | 0 | } |
413 | |
|
414 | 0 | prstate.ndeleted = 0; |
415 | 0 | prstate.live_tuples = 0; |
416 | 0 | prstate.recently_dead_tuples = 0; |
417 | 0 | prstate.hastup = false; |
418 | 0 | prstate.lpdead_items = 0; |
419 | 0 | prstate.deadoffsets = presult->deadoffsets; |
420 | | |
421 | | /* |
422 | | * Caller may update the VM after we're done. We can keep track of |
423 | | * whether the page will be all-visible and all-frozen after pruning and |
424 | | * freezing to help the caller to do that. |
425 | | * |
426 | | * Currently, only VACUUM sets the VM bits. To save the effort, only do |
427 | | * the bookkeeping if the caller needs it. Currently, that's tied to |
428 | | * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted |
429 | | * to update the VM bits without also freezing or freeze without also |
430 | | * setting the VM bits. |
431 | | * |
432 | | * In addition to telling the caller whether it can set the VM bit, we |
433 | | * also use 'all_visible' and 'all_frozen' for our own decision-making. If |
434 | | * the whole page would become frozen, we consider opportunistically |
435 | | * freezing tuples. We will not be able to freeze the whole page if there |
436 | | * are tuples present that are not visible to everyone or if there are |
437 | | * dead tuples which are not yet removable. However, dead tuples which |
438 | | * will be removed by the end of vacuuming should not preclude us from |
439 | | * opportunistically freezing. Because of that, we do not clear |
440 | | * all_visible when we see LP_DEAD items. We fix that at the end of the |
441 | | * function, when we return the value to the caller, so that the caller |
442 | | * doesn't set the VM bit incorrectly. |
443 | | */ |
444 | 0 | if (prstate.freeze) |
445 | 0 | { |
446 | 0 | prstate.all_visible = true; |
447 | 0 | prstate.all_frozen = true; |
448 | 0 | } |
449 | 0 | else |
450 | 0 | { |
451 | | /* |
452 | | * Initializing to false allows skipping the work to update them in |
453 | | * heap_prune_record_unchanged_lp_normal(). |
454 | | */ |
455 | 0 | prstate.all_visible = false; |
456 | 0 | prstate.all_frozen = false; |
457 | 0 | } |
458 | | |
459 | | /* |
460 | | * The visibility cutoff xid is the newest xmin of live tuples on the |
461 | | * page. In the common case, this will be set as the conflict horizon the |
462 | | * caller can use for updating the VM. If, at the end of freezing and |
463 | | * pruning, the page is all-frozen, there is no possibility that any |
464 | | * running transaction on the standby does not see tuples on the page as |
465 | | * all-visible, so the conflict horizon remains InvalidTransactionId. |
466 | | */ |
467 | 0 | prstate.visibility_cutoff_xid = InvalidTransactionId; |
468 | |
|
469 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
470 | 0 | tup.t_tableOid = RelationGetRelid(relation); |
471 | | |
472 | | /* |
473 | | * Determine HTSV for all tuples, and queue them up for processing as HOT |
474 | | * chain roots or as heap-only items. |
475 | | * |
476 | | * Determining HTSV only once for each tuple is required for correctness, |
477 | | * to deal with cases where running HTSV twice could result in different |
478 | | * results. For example, RECENTLY_DEAD can turn to DEAD if another |
479 | | * checked item causes GlobalVisTestIsRemovableFullXid() to update the |
480 | | * horizon, or INSERT_IN_PROGRESS can change to DEAD if the inserting |
481 | | * transaction aborts. |
482 | | * |
483 | | * It's also good for performance. Most commonly tuples within a page are |
484 | | * stored at decreasing offsets (while the items are stored at increasing |
485 | | * offsets). When processing all tuples on a page this leads to reading |
486 | | * memory at decreasing offsets within a page, with a variable stride. |
487 | | * That's hard for CPU prefetchers to deal with. Processing the items in |
488 | | * reverse order (and thus the tuples in increasing order) increases |
489 | | * prefetching efficiency significantly / decreases the number of cache |
490 | | * misses. |
491 | | */ |
492 | 0 | for (offnum = maxoff; |
493 | 0 | offnum >= FirstOffsetNumber; |
494 | 0 | offnum = OffsetNumberPrev(offnum)) |
495 | 0 | { |
496 | 0 | ItemId itemid = PageGetItemId(page, offnum); |
497 | 0 | HeapTupleHeader htup; |
498 | | |
499 | | /* |
500 | | * Set the offset number so that we can display it along with any |
501 | | * error that occurred while processing this tuple. |
502 | | */ |
503 | 0 | *off_loc = offnum; |
504 | |
|
505 | 0 | prstate.processed[offnum] = false; |
506 | 0 | prstate.htsv[offnum] = -1; |
507 | | |
508 | | /* Nothing to do if slot doesn't contain a tuple */ |
509 | 0 | if (!ItemIdIsUsed(itemid)) |
510 | 0 | { |
511 | 0 | heap_prune_record_unchanged_lp_unused(page, &prstate, offnum); |
512 | 0 | continue; |
513 | 0 | } |
514 | | |
515 | 0 | if (ItemIdIsDead(itemid)) |
516 | 0 | { |
517 | | /* |
518 | | * If the caller set mark_unused_now true, we can set dead line |
519 | | * pointers LP_UNUSED now. |
520 | | */ |
521 | 0 | if (unlikely(prstate.mark_unused_now)) |
522 | 0 | heap_prune_record_unused(&prstate, offnum, false); |
523 | 0 | else |
524 | 0 | heap_prune_record_unchanged_lp_dead(page, &prstate, offnum); |
525 | 0 | continue; |
526 | 0 | } |
527 | | |
528 | 0 | if (ItemIdIsRedirected(itemid)) |
529 | 0 | { |
530 | | /* This is the start of a HOT chain */ |
531 | 0 | prstate.root_items[prstate.nroot_items++] = offnum; |
532 | 0 | continue; |
533 | 0 | } |
534 | | |
535 | 0 | Assert(ItemIdIsNormal(itemid)); |
536 | | |
537 | | /* |
538 | | * Get the tuple's visibility status and queue it up for processing. |
539 | | */ |
540 | 0 | htup = (HeapTupleHeader) PageGetItem(page, itemid); |
541 | 0 | tup.t_data = htup; |
542 | 0 | tup.t_len = ItemIdGetLength(itemid); |
543 | 0 | ItemPointerSet(&tup.t_self, blockno, offnum); |
544 | |
|
545 | 0 | prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, |
546 | 0 | buffer); |
547 | |
|
548 | 0 | if (!HeapTupleHeaderIsHeapOnly(htup)) |
549 | 0 | prstate.root_items[prstate.nroot_items++] = offnum; |
550 | 0 | else |
551 | 0 | prstate.heaponly_items[prstate.nheaponly_items++] = offnum; |
552 | 0 | } |
553 | | |
554 | | /* |
555 | | * If checksums are enabled, heap_prune_satisfies_vacuum() may have caused |
556 | | * an FPI to be emitted. |
557 | | */ |
558 | 0 | hint_bit_fpi = fpi_before != pgWalUsage.wal_fpi; |
559 | | |
560 | | /* |
561 | | * Process HOT chains. |
562 | | * |
563 | | * We added the items to the array starting from 'maxoff', so by |
564 | | * processing the array in reverse order, we process the items in |
565 | | * ascending offset number order. The order doesn't matter for |
566 | | * correctness, but some quick micro-benchmarking suggests that this is |
567 | | * faster. (Earlier PostgreSQL versions, which scanned all the items on |
568 | | * the page instead of using the root_items array, also did it in |
569 | | * ascending offset number order.) |
570 | | */ |
571 | 0 | for (int i = prstate.nroot_items - 1; i >= 0; i--) |
572 | 0 | { |
573 | 0 | offnum = prstate.root_items[i]; |
574 | | |
575 | | /* Ignore items already processed as part of an earlier chain */ |
576 | 0 | if (prstate.processed[offnum]) |
577 | 0 | continue; |
578 | | |
579 | | /* see preceding loop */ |
580 | 0 | *off_loc = offnum; |
581 | | |
582 | | /* Process this item or chain of items */ |
583 | 0 | heap_prune_chain(page, blockno, maxoff, offnum, &prstate); |
584 | 0 | } |
585 | | |
586 | | /* |
587 | | * Process any heap-only tuples that were not already processed as part of |
588 | | * a HOT chain. |
589 | | */ |
590 | 0 | for (int i = prstate.nheaponly_items - 1; i >= 0; i--) |
591 | 0 | { |
592 | 0 | offnum = prstate.heaponly_items[i]; |
593 | |
|
594 | 0 | if (prstate.processed[offnum]) |
595 | 0 | continue; |
596 | | |
597 | | /* see preceding loop */ |
598 | 0 | *off_loc = offnum; |
599 | | |
600 | | /* |
601 | | * If the tuple is DEAD and doesn't chain to anything else, mark it |
602 | | * unused. (If it does chain, we can only remove it as part of |
603 | | * pruning its chain.) |
604 | | * |
605 | | * We need this primarily to handle aborted HOT updates, that is, |
606 | | * XMIN_INVALID heap-only tuples. Those might not be linked to by any |
607 | | * chain, since the parent tuple might be re-updated before any |
608 | | * pruning occurs. So we have to be able to reap them separately from |
609 | | * chain-pruning. (Note that HeapTupleHeaderIsHotUpdated will never |
610 | | * return true for an XMIN_INVALID tuple, so this code will work even |
611 | | * when there were sequential updates within the aborted transaction.) |
612 | | */ |
613 | 0 | if (prstate.htsv[offnum] == HEAPTUPLE_DEAD) |
614 | 0 | { |
615 | 0 | ItemId itemid = PageGetItemId(page, offnum); |
616 | 0 | HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, itemid); |
617 | |
|
618 | 0 | if (likely(!HeapTupleHeaderIsHotUpdated(htup))) |
619 | 0 | { |
620 | 0 | HeapTupleHeaderAdvanceConflictHorizon(htup, |
621 | 0 | &prstate.latest_xid_removed); |
622 | 0 | heap_prune_record_unused(&prstate, offnum, true); |
623 | 0 | } |
624 | 0 | else |
625 | 0 | { |
626 | | /* |
627 | | * This tuple should've been processed and removed as part of |
628 | | * a HOT chain, so something's wrong. To preserve evidence, |
629 | | * we don't dare to remove it. We cannot leave behind a DEAD |
630 | | * tuple either, because that will cause VACUUM to error out. |
631 | | * Throwing an error with a distinct error message seems like |
632 | | * the least bad option. |
633 | | */ |
634 | 0 | elog(ERROR, "dead heap-only tuple (%u, %d) is not linked to from any HOT chain", |
635 | 0 | blockno, offnum); |
636 | 0 | } |
637 | 0 | } |
638 | 0 | else |
639 | 0 | heap_prune_record_unchanged_lp_normal(page, &prstate, offnum); |
640 | 0 | } |
641 | | |
642 | | /* We should now have processed every tuple exactly once */ |
643 | | #ifdef USE_ASSERT_CHECKING |
644 | | for (offnum = FirstOffsetNumber; |
645 | | offnum <= maxoff; |
646 | | offnum = OffsetNumberNext(offnum)) |
647 | | { |
648 | | *off_loc = offnum; |
649 | | |
650 | | Assert(prstate.processed[offnum]); |
651 | | } |
652 | | #endif |
653 | | |
654 | | /* Clear the offset information once we have processed the given page. */ |
655 | 0 | *off_loc = InvalidOffsetNumber; |
656 | |
|
657 | 0 | do_prune = prstate.nredirected > 0 || |
658 | 0 | prstate.ndead > 0 || |
659 | 0 | prstate.nunused > 0; |
660 | | |
661 | | /* |
662 | | * Even if we don't prune anything, if we found a new value for the |
663 | | * pd_prune_xid field or the page was marked full, we will update the hint |
664 | | * bit. |
665 | | */ |
666 | 0 | do_hint = ((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || |
667 | 0 | PageIsFull(page); |
668 | | |
669 | | /* |
670 | | * Decide if we want to go ahead with freezing according to the freeze |
671 | | * plans we prepared, or not. |
672 | | */ |
673 | 0 | do_freeze = false; |
674 | 0 | if (prstate.freeze) |
675 | 0 | { |
676 | 0 | if (prstate.pagefrz.freeze_required) |
677 | 0 | { |
678 | | /* |
679 | | * heap_prepare_freeze_tuple indicated that at least one XID/MXID |
680 | | * from before FreezeLimit/MultiXactCutoff is present. Must |
681 | | * freeze to advance relfrozenxid/relminmxid. |
682 | | */ |
683 | 0 | do_freeze = true; |
684 | 0 | } |
685 | 0 | else |
686 | 0 | { |
687 | | /* |
688 | | * Opportunistically freeze the page if we are generating an FPI |
689 | | * anyway and if doing so means that we can set the page |
690 | | * all-frozen afterwards (might not happen until VACUUM's final |
691 | | * heap pass). |
692 | | * |
693 | | * XXX: Previously, we knew if pruning emitted an FPI by checking |
694 | | * pgWalUsage.wal_fpi before and after pruning. Once the freeze |
695 | | * and prune records were combined, this heuristic couldn't be |
696 | | * used anymore. The opportunistic freeze heuristic must be |
697 | | * improved; however, for now, try to approximate the old logic. |
698 | | */ |
699 | 0 | if (prstate.all_visible && prstate.all_frozen && prstate.nfrozen > 0) |
700 | 0 | { |
701 | | /* |
702 | | * Freezing would make the page all-frozen. Have already |
703 | | * emitted an FPI or will do so anyway? |
704 | | */ |
705 | 0 | if (RelationNeedsWAL(relation)) |
706 | 0 | { |
707 | 0 | if (hint_bit_fpi) |
708 | 0 | do_freeze = true; |
709 | 0 | else if (do_prune) |
710 | 0 | { |
711 | 0 | if (XLogCheckBufferNeedsBackup(buffer)) |
712 | 0 | do_freeze = true; |
713 | 0 | } |
714 | 0 | else if (do_hint) |
715 | 0 | { |
716 | 0 | if (XLogHintBitIsNeeded() && XLogCheckBufferNeedsBackup(buffer)) |
717 | 0 | do_freeze = true; |
718 | 0 | } |
719 | 0 | } |
720 | 0 | } |
721 | 0 | } |
722 | 0 | } |
723 | |
|
724 | 0 | if (do_freeze) |
725 | 0 | { |
726 | | /* |
727 | | * Validate the tuples we will be freezing before entering the |
728 | | * critical section. |
729 | | */ |
730 | 0 | heap_pre_freeze_checks(buffer, prstate.frozen, prstate.nfrozen); |
731 | 0 | } |
732 | 0 | else if (prstate.nfrozen > 0) |
733 | 0 | { |
734 | | /* |
735 | | * The page contained some tuples that were not already frozen, and we |
736 | | * chose not to freeze them now. The page won't be all-frozen then. |
737 | | */ |
738 | 0 | Assert(!prstate.pagefrz.freeze_required); |
739 | |
|
740 | 0 | prstate.all_frozen = false; |
741 | 0 | prstate.nfrozen = 0; /* avoid miscounts in instrumentation */ |
742 | 0 | } |
743 | 0 | else |
744 | 0 | { |
745 | | /* |
746 | | * We have no freeze plans to execute. The page might already be |
747 | | * all-frozen (perhaps only following pruning), though. Such pages |
748 | | * can be marked all-frozen in the VM by our caller, even though none |
749 | | * of its tuples were newly frozen here. |
750 | | */ |
751 | 0 | } |
752 | | |
753 | | /* Any error while applying the changes is critical */ |
754 | 0 | START_CRIT_SECTION(); |
755 | |
|
756 | 0 | if (do_hint) |
757 | 0 | { |
758 | | /* |
759 | | * Update the page's pd_prune_xid field to either zero, or the lowest |
760 | | * XID of any soon-prunable tuple. |
761 | | */ |
762 | 0 | ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; |
763 | | |
764 | | /* |
765 | | * Also clear the "page is full" flag, since there's no point in |
766 | | * repeating the prune/defrag process until something else happens to |
767 | | * the page. |
768 | | */ |
769 | 0 | PageClearFull(page); |
770 | | |
771 | | /* |
772 | | * If that's all we had to do to the page, this is a non-WAL-logged |
773 | | * hint. If we are going to freeze or prune the page, we will mark |
774 | | * the buffer dirty below. |
775 | | */ |
776 | 0 | if (!do_freeze && !do_prune) |
777 | 0 | MarkBufferDirtyHint(buffer, true); |
778 | 0 | } |
779 | |
|
780 | 0 | if (do_prune || do_freeze) |
781 | 0 | { |
782 | | /* Apply the planned item changes and repair page fragmentation. */ |
783 | 0 | if (do_prune) |
784 | 0 | { |
785 | 0 | heap_page_prune_execute(buffer, false, |
786 | 0 | prstate.redirected, prstate.nredirected, |
787 | 0 | prstate.nowdead, prstate.ndead, |
788 | 0 | prstate.nowunused, prstate.nunused); |
789 | 0 | } |
790 | |
|
791 | 0 | if (do_freeze) |
792 | 0 | heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen); |
793 | |
|
794 | 0 | MarkBufferDirty(buffer); |
795 | | |
796 | | /* |
797 | | * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did |
798 | | */ |
799 | 0 | if (RelationNeedsWAL(relation)) |
800 | 0 | { |
801 | | /* |
802 | | * The snapshotConflictHorizon for the whole record should be the |
803 | | * most conservative of all the horizons calculated for any of the |
804 | | * possible modifications. If this record will prune tuples, any |
805 | | * transactions on the standby older than the youngest xmax of the |
806 | | * most recently removed tuple this record will prune will |
807 | | * conflict. If this record will freeze tuples, any transactions |
808 | | * on the standby with xids older than the youngest tuple this |
809 | | * record will freeze will conflict. |
810 | | */ |
811 | 0 | TransactionId frz_conflict_horizon = InvalidTransactionId; |
812 | 0 | TransactionId conflict_xid; |
813 | | |
814 | | /* |
815 | | * We can use the visibility_cutoff_xid as our cutoff for |
816 | | * conflicts when the whole page is eligible to become all-frozen |
817 | | * in the VM once we're done with it. Otherwise we generate a |
818 | | * conservative cutoff by stepping back from OldestXmin. |
819 | | */ |
820 | 0 | if (do_freeze) |
821 | 0 | { |
822 | 0 | if (prstate.all_visible && prstate.all_frozen) |
823 | 0 | frz_conflict_horizon = prstate.visibility_cutoff_xid; |
824 | 0 | else |
825 | 0 | { |
826 | | /* Avoids false conflicts when hot_standby_feedback in use */ |
827 | 0 | frz_conflict_horizon = prstate.cutoffs->OldestXmin; |
828 | 0 | TransactionIdRetreat(frz_conflict_horizon); |
829 | 0 | } |
830 | 0 | } |
831 | |
|
832 | 0 | if (TransactionIdFollows(frz_conflict_horizon, prstate.latest_xid_removed)) |
833 | 0 | conflict_xid = frz_conflict_horizon; |
834 | 0 | else |
835 | 0 | conflict_xid = prstate.latest_xid_removed; |
836 | |
|
837 | 0 | log_heap_prune_and_freeze(relation, buffer, |
838 | 0 | conflict_xid, |
839 | 0 | true, reason, |
840 | 0 | prstate.frozen, prstate.nfrozen, |
841 | 0 | prstate.redirected, prstate.nredirected, |
842 | 0 | prstate.nowdead, prstate.ndead, |
843 | 0 | prstate.nowunused, prstate.nunused); |
844 | 0 | } |
845 | 0 | } |
846 | |
|
847 | 0 | END_CRIT_SECTION(); |
848 | | |
849 | | /* Copy information back for caller */ |
850 | 0 | presult->ndeleted = prstate.ndeleted; |
851 | 0 | presult->nnewlpdead = prstate.ndead; |
852 | 0 | presult->nfrozen = prstate.nfrozen; |
853 | 0 | presult->live_tuples = prstate.live_tuples; |
854 | 0 | presult->recently_dead_tuples = prstate.recently_dead_tuples; |
855 | | |
856 | | /* |
857 | | * It was convenient to ignore LP_DEAD items in all_visible earlier on to |
858 | | * make the choice of whether or not to freeze the page unaffected by the |
859 | | * short-term presence of LP_DEAD items. These LP_DEAD items were |
860 | | * effectively assumed to be LP_UNUSED items in the making. It doesn't |
861 | | * matter which vacuum heap pass (initial pass or final pass) ends up |
862 | | * setting the page all-frozen, as long as the ongoing VACUUM does it. |
863 | | * |
864 | | * Now that freezing has been finalized, unset all_visible if there are |
865 | | * any LP_DEAD items on the page. It needs to reflect the present state |
866 | | * of the page, as expected by our caller. |
867 | | */ |
868 | 0 | if (prstate.all_visible && prstate.lpdead_items == 0) |
869 | 0 | { |
870 | 0 | presult->all_visible = prstate.all_visible; |
871 | 0 | presult->all_frozen = prstate.all_frozen; |
872 | 0 | } |
873 | 0 | else |
874 | 0 | { |
875 | 0 | presult->all_visible = false; |
876 | 0 | presult->all_frozen = false; |
877 | 0 | } |
878 | |
|
879 | 0 | presult->hastup = prstate.hastup; |
880 | | |
881 | | /* |
882 | | * For callers planning to update the visibility map, the conflict horizon |
883 | | * for that record must be the newest xmin on the page. However, if the |
884 | | * page is completely frozen, there can be no conflict and the |
885 | | * vm_conflict_horizon should remain InvalidTransactionId. This includes |
886 | | * the case that we just froze all the tuples; the prune-freeze record |
887 | | * included the conflict XID already so the caller doesn't need it. |
888 | | */ |
889 | 0 | if (presult->all_frozen) |
890 | 0 | presult->vm_conflict_horizon = InvalidTransactionId; |
891 | 0 | else |
892 | 0 | presult->vm_conflict_horizon = prstate.visibility_cutoff_xid; |
893 | |
|
894 | 0 | presult->lpdead_items = prstate.lpdead_items; |
895 | | /* the presult->deadoffsets array was already filled in */ |
896 | |
|
897 | 0 | if (prstate.freeze) |
898 | 0 | { |
899 | 0 | if (presult->nfrozen > 0) |
900 | 0 | { |
901 | 0 | *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid; |
902 | 0 | *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid; |
903 | 0 | } |
904 | 0 | else |
905 | 0 | { |
906 | 0 | *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid; |
907 | 0 | *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid; |
908 | 0 | } |
909 | 0 | } |
910 | 0 | } |
911 | | |
912 | | |
913 | | /* |
914 | | * Perform visibility checks for heap pruning. |
915 | | */ |
916 | | static HTSV_Result |
917 | | heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) |
918 | 0 | { |
919 | 0 | HTSV_Result res; |
920 | 0 | TransactionId dead_after; |
921 | |
|
922 | 0 | res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after); |
923 | |
|
924 | 0 | if (res != HEAPTUPLE_RECENTLY_DEAD) |
925 | 0 | return res; |
926 | | |
927 | | /* |
928 | | * For VACUUM, we must be sure to prune tuples with xmax older than |
929 | | * OldestXmin -- a visibility cutoff determined at the beginning of |
930 | | * vacuuming the relation. OldestXmin is used for freezing determination |
931 | | * and we cannot freeze dead tuples' xmaxes. |
932 | | */ |
933 | 0 | if (prstate->cutoffs && |
934 | 0 | TransactionIdIsValid(prstate->cutoffs->OldestXmin) && |
935 | 0 | NormalTransactionIdPrecedes(dead_after, prstate->cutoffs->OldestXmin)) |
936 | 0 | return HEAPTUPLE_DEAD; |
937 | | |
938 | | /* |
939 | | * Determine whether or not the tuple is considered dead when compared |
940 | | * with the provided GlobalVisState. On-access pruning does not provide |
941 | | * VacuumCutoffs. And for vacuum, even if the tuple's xmax is not older |
942 | | * than OldestXmin, GlobalVisTestIsRemovableXid() could find the row dead |
943 | | * if the GlobalVisState has been updated since the beginning of vacuuming |
944 | | * the relation. |
945 | | */ |
946 | 0 | if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after)) |
947 | 0 | return HEAPTUPLE_DEAD; |
948 | | |
949 | 0 | return res; |
950 | 0 | } |
951 | | |
952 | | |
953 | | /* |
954 | | * Pruning calculates tuple visibility once and saves the results in an array |
955 | | * of int8. See PruneState.htsv for details. This helper function is meant |
956 | | * to guard against examining visibility status array members which have not |
957 | | * yet been computed. |
958 | | */ |
959 | | static inline HTSV_Result |
960 | | htsv_get_valid_status(int status) |
961 | 0 | { |
962 | 0 | Assert(status >= HEAPTUPLE_DEAD && |
963 | 0 | status <= HEAPTUPLE_DELETE_IN_PROGRESS); |
964 | 0 | return (HTSV_Result) status; |
965 | 0 | } |
966 | | |
967 | | /* |
968 | | * Prune specified line pointer or a HOT chain originating at line pointer. |
969 | | * |
970 | | * Tuple visibility information is provided in prstate->htsv. |
971 | | * |
972 | | * If the item is an index-referenced tuple (i.e. not a heap-only tuple), |
973 | | * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT |
974 | | * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. |
975 | | * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really |
976 | | * DEAD, our visibility test is just too coarse to detect it. |
977 | | * |
978 | | * Pruning must never leave behind a DEAD tuple that still has tuple storage. |
979 | | * VACUUM isn't prepared to deal with that case. |
980 | | * |
981 | | * The root line pointer is redirected to the tuple immediately after the |
982 | | * latest DEAD tuple. If all tuples in the chain are DEAD, the root line |
983 | | * pointer is marked LP_DEAD. (This includes the case of a DEAD simple |
984 | | * tuple, which we treat as a chain of length 1.) |
985 | | * |
986 | | * We don't actually change the page here. We just add entries to the arrays in |
987 | | * prstate showing the changes to be made. Items to be redirected are added |
988 | | * to the redirected[] array (two entries per redirection); items to be set to |
989 | | * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED |
990 | | * state are added to nowunused[]. We perform bookkeeping of live tuples, |
991 | | * visibility etc. based on what the page will look like after the changes |
992 | | * applied. All that bookkeeping is performed in the heap_prune_record_*() |
993 | | * subroutines. The division of labor is that heap_prune_chain() decides the |
994 | | * fate of each tuple, ie. whether it's going to be removed, redirected or |
995 | | * left unchanged, and the heap_prune_record_*() subroutines update PruneState |
996 | | * based on that outcome. |
997 | | */ |
998 | | static void |
999 | | heap_prune_chain(Page page, BlockNumber blockno, OffsetNumber maxoff, |
1000 | | OffsetNumber rootoffnum, PruneState *prstate) |
1001 | 0 | { |
1002 | 0 | TransactionId priorXmax = InvalidTransactionId; |
1003 | 0 | ItemId rootlp; |
1004 | 0 | OffsetNumber offnum; |
1005 | 0 | OffsetNumber chainitems[MaxHeapTuplesPerPage]; |
1006 | | |
1007 | | /* |
1008 | | * After traversing the HOT chain, ndeadchain is the index in chainitems |
1009 | | * of the first live successor after the last dead item. |
1010 | | */ |
1011 | 0 | int ndeadchain = 0, |
1012 | 0 | nchain = 0; |
1013 | |
|
1014 | 0 | rootlp = PageGetItemId(page, rootoffnum); |
1015 | | |
1016 | | /* Start from the root tuple */ |
1017 | 0 | offnum = rootoffnum; |
1018 | | |
1019 | | /* while not end of the chain */ |
1020 | 0 | for (;;) |
1021 | 0 | { |
1022 | 0 | HeapTupleHeader htup; |
1023 | 0 | ItemId lp; |
1024 | | |
1025 | | /* Sanity check (pure paranoia) */ |
1026 | 0 | if (offnum < FirstOffsetNumber) |
1027 | 0 | break; |
1028 | | |
1029 | | /* |
1030 | | * An offset past the end of page's line pointer array is possible |
1031 | | * when the array was truncated (original item must have been unused) |
1032 | | */ |
1033 | 0 | if (offnum > maxoff) |
1034 | 0 | break; |
1035 | | |
1036 | | /* If item is already processed, stop --- it must not be same chain */ |
1037 | 0 | if (prstate->processed[offnum]) |
1038 | 0 | break; |
1039 | | |
1040 | 0 | lp = PageGetItemId(page, offnum); |
1041 | | |
1042 | | /* |
1043 | | * Unused item obviously isn't part of the chain. Likewise, a dead |
1044 | | * line pointer can't be part of the chain. Both of those cases were |
1045 | | * already marked as processed. |
1046 | | */ |
1047 | 0 | Assert(ItemIdIsUsed(lp)); |
1048 | 0 | Assert(!ItemIdIsDead(lp)); |
1049 | | |
1050 | | /* |
1051 | | * If we are looking at the redirected root line pointer, jump to the |
1052 | | * first normal tuple in the chain. If we find a redirect somewhere |
1053 | | * else, stop --- it must not be same chain. |
1054 | | */ |
1055 | 0 | if (ItemIdIsRedirected(lp)) |
1056 | 0 | { |
1057 | 0 | if (nchain > 0) |
1058 | 0 | break; /* not at start of chain */ |
1059 | 0 | chainitems[nchain++] = offnum; |
1060 | 0 | offnum = ItemIdGetRedirect(rootlp); |
1061 | 0 | continue; |
1062 | 0 | } |
1063 | | |
1064 | 0 | Assert(ItemIdIsNormal(lp)); |
1065 | |
|
1066 | 0 | htup = (HeapTupleHeader) PageGetItem(page, lp); |
1067 | | |
1068 | | /* |
1069 | | * Check the tuple XMIN against prior XMAX, if any |
1070 | | */ |
1071 | 0 | if (TransactionIdIsValid(priorXmax) && |
1072 | 0 | !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) |
1073 | 0 | break; |
1074 | | |
1075 | | /* |
1076 | | * OK, this tuple is indeed a member of the chain. |
1077 | | */ |
1078 | 0 | chainitems[nchain++] = offnum; |
1079 | |
|
1080 | 0 | switch (htsv_get_valid_status(prstate->htsv[offnum])) |
1081 | 0 | { |
1082 | 0 | case HEAPTUPLE_DEAD: |
1083 | | |
1084 | | /* Remember the last DEAD tuple seen */ |
1085 | 0 | ndeadchain = nchain; |
1086 | 0 | HeapTupleHeaderAdvanceConflictHorizon(htup, |
1087 | 0 | &prstate->latest_xid_removed); |
1088 | | /* Advance to next chain member */ |
1089 | 0 | break; |
1090 | | |
1091 | 0 | case HEAPTUPLE_RECENTLY_DEAD: |
1092 | | |
1093 | | /* |
1094 | | * We don't need to advance the conflict horizon for |
1095 | | * RECENTLY_DEAD tuples, even if we are removing them. This |
1096 | | * is because we only remove RECENTLY_DEAD tuples if they |
1097 | | * precede a DEAD tuple, and the DEAD tuple must have been |
1098 | | * inserted by a newer transaction than the RECENTLY_DEAD |
1099 | | * tuple by virtue of being later in the chain. We will have |
1100 | | * advanced the conflict horizon for the DEAD tuple. |
1101 | | */ |
1102 | | |
1103 | | /* |
1104 | | * Advance past RECENTLY_DEAD tuples just in case there's a |
1105 | | * DEAD one after them. We have to make sure that we don't |
1106 | | * miss any DEAD tuples, since DEAD tuples that still have |
1107 | | * tuple storage after pruning will confuse VACUUM. |
1108 | | */ |
1109 | 0 | break; |
1110 | | |
1111 | 0 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1112 | 0 | case HEAPTUPLE_LIVE: |
1113 | 0 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1114 | 0 | goto process_chain; |
1115 | | |
1116 | 0 | default: |
1117 | 0 | elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); |
1118 | 0 | goto process_chain; |
1119 | 0 | } |
1120 | | |
1121 | | /* |
1122 | | * If the tuple is not HOT-updated, then we are at the end of this |
1123 | | * HOT-update chain. |
1124 | | */ |
1125 | 0 | if (!HeapTupleHeaderIsHotUpdated(htup)) |
1126 | 0 | goto process_chain; |
1127 | | |
1128 | | /* HOT implies it can't have moved to different partition */ |
1129 | 0 | Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); |
1130 | | |
1131 | | /* |
1132 | | * Advance to next chain member. |
1133 | | */ |
1134 | 0 | Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blockno); |
1135 | 0 | offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
1136 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
1137 | 0 | } |
1138 | | |
1139 | 0 | if (ItemIdIsRedirected(rootlp) && nchain < 2) |
1140 | 0 | { |
1141 | | /* |
1142 | | * We found a redirect item that doesn't point to a valid follow-on |
1143 | | * item. This can happen if the loop in heap_page_prune_and_freeze() |
1144 | | * caused us to visit the dead successor of a redirect item before |
1145 | | * visiting the redirect item. We can clean up by setting the |
1146 | | * redirect item to LP_DEAD state or LP_UNUSED if the caller |
1147 | | * indicated. |
1148 | | */ |
1149 | 0 | heap_prune_record_dead_or_unused(prstate, rootoffnum, false); |
1150 | 0 | return; |
1151 | 0 | } |
1152 | | |
1153 | 0 | process_chain: |
1154 | |
|
1155 | 0 | if (ndeadchain == 0) |
1156 | 0 | { |
1157 | | /* |
1158 | | * No DEAD tuple was found, so the chain is entirely composed of |
1159 | | * normal, unchanged tuples. Leave it alone. |
1160 | | */ |
1161 | 0 | int i = 0; |
1162 | |
|
1163 | 0 | if (ItemIdIsRedirected(rootlp)) |
1164 | 0 | { |
1165 | 0 | heap_prune_record_unchanged_lp_redirect(prstate, rootoffnum); |
1166 | 0 | i++; |
1167 | 0 | } |
1168 | 0 | for (; i < nchain; i++) |
1169 | 0 | heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]); |
1170 | 0 | } |
1171 | 0 | else if (ndeadchain == nchain) |
1172 | 0 | { |
1173 | | /* |
1174 | | * The entire chain is dead. Mark the root line pointer LP_DEAD, and |
1175 | | * fully remove the other tuples in the chain. |
1176 | | */ |
1177 | 0 | heap_prune_record_dead_or_unused(prstate, rootoffnum, ItemIdIsNormal(rootlp)); |
1178 | 0 | for (int i = 1; i < nchain; i++) |
1179 | 0 | heap_prune_record_unused(prstate, chainitems[i], true); |
1180 | 0 | } |
1181 | 0 | else |
1182 | 0 | { |
1183 | | /* |
1184 | | * We found a DEAD tuple in the chain. Redirect the root line pointer |
1185 | | * to the first non-DEAD tuple, and mark as unused each intermediate |
1186 | | * item that we are able to remove from the chain. |
1187 | | */ |
1188 | 0 | heap_prune_record_redirect(prstate, rootoffnum, chainitems[ndeadchain], |
1189 | 0 | ItemIdIsNormal(rootlp)); |
1190 | 0 | for (int i = 1; i < ndeadchain; i++) |
1191 | 0 | heap_prune_record_unused(prstate, chainitems[i], true); |
1192 | | |
1193 | | /* the rest of tuples in the chain are normal, unchanged tuples */ |
1194 | 0 | for (int i = ndeadchain; i < nchain; i++) |
1195 | 0 | heap_prune_record_unchanged_lp_normal(page, prstate, chainitems[i]); |
1196 | 0 | } |
1197 | 0 | } |
1198 | | |
1199 | | /* Record lowest soon-prunable XID */ |
1200 | | static void |
1201 | | heap_prune_record_prunable(PruneState *prstate, TransactionId xid) |
1202 | 0 | { |
1203 | | /* |
1204 | | * This should exactly match the PageSetPrunable macro. We can't store |
1205 | | * directly into the page header yet, so we update working state. |
1206 | | */ |
1207 | 0 | Assert(TransactionIdIsNormal(xid)); |
1208 | 0 | if (!TransactionIdIsValid(prstate->new_prune_xid) || |
1209 | 0 | TransactionIdPrecedes(xid, prstate->new_prune_xid)) |
1210 | 0 | prstate->new_prune_xid = xid; |
1211 | 0 | } |
1212 | | |
1213 | | /* Record line pointer to be redirected */ |
1214 | | static void |
1215 | | heap_prune_record_redirect(PruneState *prstate, |
1216 | | OffsetNumber offnum, OffsetNumber rdoffnum, |
1217 | | bool was_normal) |
1218 | 0 | { |
1219 | 0 | Assert(!prstate->processed[offnum]); |
1220 | 0 | prstate->processed[offnum] = true; |
1221 | | |
1222 | | /* |
1223 | | * Do not mark the redirect target here. It needs to be counted |
1224 | | * separately as an unchanged tuple. |
1225 | | */ |
1226 | |
|
1227 | 0 | Assert(prstate->nredirected < MaxHeapTuplesPerPage); |
1228 | 0 | prstate->redirected[prstate->nredirected * 2] = offnum; |
1229 | 0 | prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; |
1230 | |
|
1231 | 0 | prstate->nredirected++; |
1232 | | |
1233 | | /* |
1234 | | * If the root entry had been a normal tuple, we are deleting it, so count |
1235 | | * it in the result. But changing a redirect (even to DEAD state) doesn't |
1236 | | * count. |
1237 | | */ |
1238 | 0 | if (was_normal) |
1239 | 0 | prstate->ndeleted++; |
1240 | |
|
1241 | 0 | prstate->hastup = true; |
1242 | 0 | } |
1243 | | |
1244 | | /* Record line pointer to be marked dead */ |
1245 | | static void |
1246 | | heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum, |
1247 | | bool was_normal) |
1248 | 0 | { |
1249 | 0 | Assert(!prstate->processed[offnum]); |
1250 | 0 | prstate->processed[offnum] = true; |
1251 | |
|
1252 | 0 | Assert(prstate->ndead < MaxHeapTuplesPerPage); |
1253 | 0 | prstate->nowdead[prstate->ndead] = offnum; |
1254 | 0 | prstate->ndead++; |
1255 | | |
1256 | | /* |
1257 | | * Deliberately delay unsetting all_visible until later during pruning. |
1258 | | * Removable dead tuples shouldn't preclude freezing the page. |
1259 | | */ |
1260 | | |
1261 | | /* Record the dead offset for vacuum */ |
1262 | 0 | prstate->deadoffsets[prstate->lpdead_items++] = offnum; |
1263 | | |
1264 | | /* |
1265 | | * If the root entry had been a normal tuple, we are deleting it, so count |
1266 | | * it in the result. But changing a redirect (even to DEAD state) doesn't |
1267 | | * count. |
1268 | | */ |
1269 | 0 | if (was_normal) |
1270 | 0 | prstate->ndeleted++; |
1271 | 0 | } |
1272 | | |
1273 | | /* |
1274 | | * Depending on whether or not the caller set mark_unused_now to true, record that a |
1275 | | * line pointer should be marked LP_DEAD or LP_UNUSED. There are other cases in |
1276 | | * which we will mark line pointers LP_UNUSED, but we will not mark line |
1277 | | * pointers LP_DEAD if mark_unused_now is true. |
1278 | | */ |
1279 | | static void |
1280 | | heap_prune_record_dead_or_unused(PruneState *prstate, OffsetNumber offnum, |
1281 | | bool was_normal) |
1282 | 0 | { |
1283 | | /* |
1284 | | * If the caller set mark_unused_now to true, we can remove dead tuples |
1285 | | * during pruning instead of marking their line pointers dead. Set this |
1286 | | * tuple's line pointer LP_UNUSED. We hint that this option is less |
1287 | | * likely. |
1288 | | */ |
1289 | 0 | if (unlikely(prstate->mark_unused_now)) |
1290 | 0 | heap_prune_record_unused(prstate, offnum, was_normal); |
1291 | 0 | else |
1292 | 0 | heap_prune_record_dead(prstate, offnum, was_normal); |
1293 | 0 | } |
1294 | | |
1295 | | /* Record line pointer to be marked unused */ |
1296 | | static void |
1297 | | heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum, bool was_normal) |
1298 | 0 | { |
1299 | 0 | Assert(!prstate->processed[offnum]); |
1300 | 0 | prstate->processed[offnum] = true; |
1301 | |
|
1302 | 0 | Assert(prstate->nunused < MaxHeapTuplesPerPage); |
1303 | 0 | prstate->nowunused[prstate->nunused] = offnum; |
1304 | 0 | prstate->nunused++; |
1305 | | |
1306 | | /* |
1307 | | * If the root entry had been a normal tuple, we are deleting it, so count |
1308 | | * it in the result. But changing a redirect (even to DEAD state) doesn't |
1309 | | * count. |
1310 | | */ |
1311 | 0 | if (was_normal) |
1312 | 0 | prstate->ndeleted++; |
1313 | 0 | } |
1314 | | |
1315 | | /* |
1316 | | * Record an unused line pointer that is left unchanged. |
1317 | | */ |
1318 | | static void |
1319 | | heap_prune_record_unchanged_lp_unused(Page page, PruneState *prstate, OffsetNumber offnum) |
1320 | 0 | { |
1321 | 0 | Assert(!prstate->processed[offnum]); |
1322 | 0 | prstate->processed[offnum] = true; |
1323 | 0 | } |
1324 | | |
1325 | | /* |
1326 | | * Record line pointer that is left unchanged. We consider freezing it, and |
1327 | | * update bookkeeping of tuple counts and page visibility. |
1328 | | */ |
1329 | | static void |
1330 | | heap_prune_record_unchanged_lp_normal(Page page, PruneState *prstate, OffsetNumber offnum) |
1331 | 0 | { |
1332 | 0 | HeapTupleHeader htup; |
1333 | |
|
1334 | 0 | Assert(!prstate->processed[offnum]); |
1335 | 0 | prstate->processed[offnum] = true; |
1336 | |
|
1337 | 0 | prstate->hastup = true; /* the page is not empty */ |
1338 | | |
1339 | | /* |
1340 | | * The criteria for counting a tuple as live in this block need to match |
1341 | | * what analyze.c's acquire_sample_rows() does, otherwise VACUUM and |
1342 | | * ANALYZE may produce wildly different reltuples values, e.g. when there |
1343 | | * are many recently-dead tuples. |
1344 | | * |
1345 | | * The logic here is a bit simpler than acquire_sample_rows(), as VACUUM |
1346 | | * can't run inside a transaction block, which makes some cases impossible |
1347 | | * (e.g. in-progress insert from the same transaction). |
1348 | | * |
1349 | | * HEAPTUPLE_DEAD are handled by the other heap_prune_record_*() |
1350 | | * subroutines. They don't count dead items like acquire_sample_rows() |
1351 | | * does, because we assume that all dead items will become LP_UNUSED |
1352 | | * before VACUUM finishes. This difference is only superficial. VACUUM |
1353 | | * effectively agrees with ANALYZE about DEAD items, in the end. VACUUM |
1354 | | * won't remember LP_DEAD items, but only because they're not supposed to |
1355 | | * be left behind when it is done. (Cases where we bypass index vacuuming |
1356 | | * will violate this optimistic assumption, but the overall impact of that |
1357 | | * should be negligible.) |
1358 | | */ |
1359 | 0 | htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offnum)); |
1360 | |
|
1361 | 0 | switch (prstate->htsv[offnum]) |
1362 | 0 | { |
1363 | 0 | case HEAPTUPLE_LIVE: |
1364 | | |
1365 | | /* |
1366 | | * Count it as live. Not only is this natural, but it's also what |
1367 | | * acquire_sample_rows() does. |
1368 | | */ |
1369 | 0 | prstate->live_tuples++; |
1370 | | |
1371 | | /* |
1372 | | * Is the tuple definitely visible to all transactions? |
1373 | | * |
1374 | | * NB: Like with per-tuple hint bits, we can't set the |
1375 | | * PD_ALL_VISIBLE flag if the inserter committed asynchronously. |
1376 | | * See SetHintBits for more info. Check that the tuple is hinted |
1377 | | * xmin-committed because of that. |
1378 | | */ |
1379 | 0 | if (prstate->all_visible) |
1380 | 0 | { |
1381 | 0 | TransactionId xmin; |
1382 | |
|
1383 | 0 | if (!HeapTupleHeaderXminCommitted(htup)) |
1384 | 0 | { |
1385 | 0 | prstate->all_visible = false; |
1386 | 0 | break; |
1387 | 0 | } |
1388 | | |
1389 | | /* |
1390 | | * The inserter definitely committed. But is it old enough |
1391 | | * that everyone sees it as committed? A FrozenTransactionId |
1392 | | * is seen as committed to everyone. Otherwise, we check if |
1393 | | * there is a snapshot that considers this xid to still be |
1394 | | * running, and if so, we don't consider the page all-visible. |
1395 | | */ |
1396 | 0 | xmin = HeapTupleHeaderGetXmin(htup); |
1397 | | |
1398 | | /* |
1399 | | * For now always use prstate->cutoffs for this test, because |
1400 | | * we only update 'all_visible' when freezing is requested. We |
1401 | | * could use GlobalVisTestIsRemovableXid instead, if a |
1402 | | * non-freezing caller wanted to set the VM bit. |
1403 | | */ |
1404 | 0 | Assert(prstate->cutoffs); |
1405 | 0 | if (!TransactionIdPrecedes(xmin, prstate->cutoffs->OldestXmin)) |
1406 | 0 | { |
1407 | 0 | prstate->all_visible = false; |
1408 | 0 | break; |
1409 | 0 | } |
1410 | | |
1411 | | /* Track newest xmin on page. */ |
1412 | 0 | if (TransactionIdFollows(xmin, prstate->visibility_cutoff_xid) && |
1413 | 0 | TransactionIdIsNormal(xmin)) |
1414 | 0 | prstate->visibility_cutoff_xid = xmin; |
1415 | 0 | } |
1416 | 0 | break; |
1417 | | |
1418 | 0 | case HEAPTUPLE_RECENTLY_DEAD: |
1419 | 0 | prstate->recently_dead_tuples++; |
1420 | 0 | prstate->all_visible = false; |
1421 | | |
1422 | | /* |
1423 | | * This tuple will soon become DEAD. Update the hint field so |
1424 | | * that the page is reconsidered for pruning in future. |
1425 | | */ |
1426 | 0 | heap_prune_record_prunable(prstate, |
1427 | 0 | HeapTupleHeaderGetUpdateXid(htup)); |
1428 | 0 | break; |
1429 | | |
1430 | 0 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
1431 | | |
1432 | | /* |
1433 | | * We do not count these rows as live, because we expect the |
1434 | | * inserting transaction to update the counters at commit, and we |
1435 | | * assume that will happen only after we report our results. This |
1436 | | * assumption is a bit shaky, but it is what acquire_sample_rows() |
1437 | | * does, so be consistent. |
1438 | | */ |
1439 | 0 | prstate->all_visible = false; |
1440 | | |
1441 | | /* |
1442 | | * If we wanted to optimize for aborts, we might consider marking |
1443 | | * the page prunable when we see INSERT_IN_PROGRESS. But we |
1444 | | * don't. See related decisions about when to mark the page |
1445 | | * prunable in heapam.c. |
1446 | | */ |
1447 | 0 | break; |
1448 | | |
1449 | 0 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
1450 | | |
1451 | | /* |
1452 | | * This an expected case during concurrent vacuum. Count such |
1453 | | * rows as live. As above, we assume the deleting transaction |
1454 | | * will commit and update the counters after we report. |
1455 | | */ |
1456 | 0 | prstate->live_tuples++; |
1457 | 0 | prstate->all_visible = false; |
1458 | | |
1459 | | /* |
1460 | | * This tuple may soon become DEAD. Update the hint field so that |
1461 | | * the page is reconsidered for pruning in future. |
1462 | | */ |
1463 | 0 | heap_prune_record_prunable(prstate, |
1464 | 0 | HeapTupleHeaderGetUpdateXid(htup)); |
1465 | 0 | break; |
1466 | | |
1467 | 0 | default: |
1468 | | |
1469 | | /* |
1470 | | * DEAD tuples should've been passed to heap_prune_record_dead() |
1471 | | * or heap_prune_record_unused() instead. |
1472 | | */ |
1473 | 0 | elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result %d", |
1474 | 0 | prstate->htsv[offnum]); |
1475 | 0 | break; |
1476 | 0 | } |
1477 | | |
1478 | | /* Consider freezing any normal tuples which will not be removed */ |
1479 | 0 | if (prstate->freeze) |
1480 | 0 | { |
1481 | 0 | bool totally_frozen; |
1482 | |
|
1483 | 0 | if ((heap_prepare_freeze_tuple(htup, |
1484 | 0 | prstate->cutoffs, |
1485 | 0 | &prstate->pagefrz, |
1486 | 0 | &prstate->frozen[prstate->nfrozen], |
1487 | 0 | &totally_frozen))) |
1488 | 0 | { |
1489 | | /* Save prepared freeze plan for later */ |
1490 | 0 | prstate->frozen[prstate->nfrozen++].offset = offnum; |
1491 | 0 | } |
1492 | | |
1493 | | /* |
1494 | | * If any tuple isn't either totally frozen already or eligible to |
1495 | | * become totally frozen (according to its freeze plan), then the page |
1496 | | * definitely cannot be set all-frozen in the visibility map later on. |
1497 | | */ |
1498 | 0 | if (!totally_frozen) |
1499 | 0 | prstate->all_frozen = false; |
1500 | 0 | } |
1501 | 0 | } |
1502 | | |
1503 | | |
1504 | | /* |
1505 | | * Record line pointer that was already LP_DEAD and is left unchanged. |
1506 | | */ |
1507 | | static void |
1508 | | heap_prune_record_unchanged_lp_dead(Page page, PruneState *prstate, OffsetNumber offnum) |
1509 | 0 | { |
1510 | 0 | Assert(!prstate->processed[offnum]); |
1511 | 0 | prstate->processed[offnum] = true; |
1512 | | |
1513 | | /* |
1514 | | * Deliberately don't set hastup for LP_DEAD items. We make the soft |
1515 | | * assumption that any LP_DEAD items encountered here will become |
1516 | | * LP_UNUSED later on, before count_nondeletable_pages is reached. If we |
1517 | | * don't make this assumption then rel truncation will only happen every |
1518 | | * other VACUUM, at most. Besides, VACUUM must treat |
1519 | | * hastup/nonempty_pages as provisional no matter how LP_DEAD items are |
1520 | | * handled (handled here, or handled later on). |
1521 | | * |
1522 | | * Similarly, don't unset all_visible until later, at the end of |
1523 | | * heap_page_prune_and_freeze(). This will allow us to attempt to freeze |
1524 | | * the page after pruning. As long as we unset it before updating the |
1525 | | * visibility map, this will be correct. |
1526 | | */ |
1527 | | |
1528 | | /* Record the dead offset for vacuum */ |
1529 | 0 | prstate->deadoffsets[prstate->lpdead_items++] = offnum; |
1530 | 0 | } |
1531 | | |
1532 | | /* |
1533 | | * Record LP_REDIRECT that is left unchanged. |
1534 | | */ |
1535 | | static void |
1536 | | heap_prune_record_unchanged_lp_redirect(PruneState *prstate, OffsetNumber offnum) |
1537 | 0 | { |
1538 | | /* |
1539 | | * A redirect line pointer doesn't count as a live tuple. |
1540 | | * |
1541 | | * If we leave a redirect line pointer in place, there will be another |
1542 | | * tuple on the page that it points to. We will do the bookkeeping for |
1543 | | * that separately. So we have nothing to do here, except remember that |
1544 | | * we processed this item. |
1545 | | */ |
1546 | 0 | Assert(!prstate->processed[offnum]); |
1547 | 0 | prstate->processed[offnum] = true; |
1548 | 0 | } |
1549 | | |
1550 | | /* |
1551 | | * Perform the actual page changes needed by heap_page_prune_and_freeze(). |
1552 | | * |
1553 | | * If 'lp_truncate_only' is set, we are merely marking LP_DEAD line pointers |
1554 | | * as unused, not redirecting or removing anything else. The |
1555 | | * PageRepairFragmentation() call is skipped in that case. |
1556 | | * |
1557 | | * If 'lp_truncate_only' is not set, the caller must hold a cleanup lock on |
1558 | | * the buffer. If it is set, an ordinary exclusive lock suffices. |
1559 | | */ |
1560 | | void |
1561 | | heap_page_prune_execute(Buffer buffer, bool lp_truncate_only, |
1562 | | OffsetNumber *redirected, int nredirected, |
1563 | | OffsetNumber *nowdead, int ndead, |
1564 | | OffsetNumber *nowunused, int nunused) |
1565 | 0 | { |
1566 | 0 | Page page = BufferGetPage(buffer); |
1567 | 0 | OffsetNumber *offnum; |
1568 | 0 | HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY; |
1569 | | |
1570 | | /* Shouldn't be called unless there's something to do */ |
1571 | 0 | Assert(nredirected > 0 || ndead > 0 || nunused > 0); |
1572 | | |
1573 | | /* If 'lp_truncate_only', we can only remove already-dead line pointers */ |
1574 | 0 | Assert(!lp_truncate_only || (nredirected == 0 && ndead == 0)); |
1575 | | |
1576 | | /* Update all redirected line pointers */ |
1577 | 0 | offnum = redirected; |
1578 | 0 | for (int i = 0; i < nredirected; i++) |
1579 | 0 | { |
1580 | 0 | OffsetNumber fromoff = *offnum++; |
1581 | 0 | OffsetNumber tooff = *offnum++; |
1582 | 0 | ItemId fromlp = PageGetItemId(page, fromoff); |
1583 | 0 | ItemId tolp PG_USED_FOR_ASSERTS_ONLY; |
1584 | |
|
1585 | | #ifdef USE_ASSERT_CHECKING |
1586 | | |
1587 | | /* |
1588 | | * Any existing item that we set as an LP_REDIRECT (any 'from' item) |
1589 | | * must be the first item from a HOT chain. If the item has tuple |
1590 | | * storage then it can't be a heap-only tuple. Otherwise we are just |
1591 | | * maintaining an existing LP_REDIRECT from an existing HOT chain that |
1592 | | * has been pruned at least once before now. |
1593 | | */ |
1594 | | if (!ItemIdIsRedirected(fromlp)) |
1595 | | { |
1596 | | Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp)); |
1597 | | |
1598 | | htup = (HeapTupleHeader) PageGetItem(page, fromlp); |
1599 | | Assert(!HeapTupleHeaderIsHeapOnly(htup)); |
1600 | | } |
1601 | | else |
1602 | | { |
1603 | | /* We shouldn't need to redundantly set the redirect */ |
1604 | | Assert(ItemIdGetRedirect(fromlp) != tooff); |
1605 | | } |
1606 | | |
1607 | | /* |
1608 | | * The item that we're about to set as an LP_REDIRECT (the 'from' |
1609 | | * item) will point to an existing item (the 'to' item) that is |
1610 | | * already a heap-only tuple. There can be at most one LP_REDIRECT |
1611 | | * item per HOT chain. |
1612 | | * |
1613 | | * We need to keep around an LP_REDIRECT item (after original |
1614 | | * non-heap-only root tuple gets pruned away) so that it's always |
1615 | | * possible for VACUUM to easily figure out what TID to delete from |
1616 | | * indexes when an entire HOT chain becomes dead. A heap-only tuple |
1617 | | * can never become LP_DEAD; an LP_REDIRECT item or a regular heap |
1618 | | * tuple can. |
1619 | | * |
1620 | | * This check may miss problems, e.g. the target of a redirect could |
1621 | | * be marked as unused subsequently. The page_verify_redirects() check |
1622 | | * below will catch such problems. |
1623 | | */ |
1624 | | tolp = PageGetItemId(page, tooff); |
1625 | | Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp)); |
1626 | | htup = (HeapTupleHeader) PageGetItem(page, tolp); |
1627 | | Assert(HeapTupleHeaderIsHeapOnly(htup)); |
1628 | | #endif |
1629 | |
|
1630 | 0 | ItemIdSetRedirect(fromlp, tooff); |
1631 | 0 | } |
1632 | | |
1633 | | /* Update all now-dead line pointers */ |
1634 | 0 | offnum = nowdead; |
1635 | 0 | for (int i = 0; i < ndead; i++) |
1636 | 0 | { |
1637 | 0 | OffsetNumber off = *offnum++; |
1638 | 0 | ItemId lp = PageGetItemId(page, off); |
1639 | |
|
1640 | | #ifdef USE_ASSERT_CHECKING |
1641 | | |
1642 | | /* |
1643 | | * An LP_DEAD line pointer must be left behind when the original item |
1644 | | * (which is dead to everybody) could still be referenced by a TID in |
1645 | | * an index. This should never be necessary with any individual |
1646 | | * heap-only tuple item, though. (It's not clear how much of a problem |
1647 | | * that would be, but there is no reason to allow it.) |
1648 | | */ |
1649 | | if (ItemIdHasStorage(lp)) |
1650 | | { |
1651 | | Assert(ItemIdIsNormal(lp)); |
1652 | | htup = (HeapTupleHeader) PageGetItem(page, lp); |
1653 | | Assert(!HeapTupleHeaderIsHeapOnly(htup)); |
1654 | | } |
1655 | | else |
1656 | | { |
1657 | | /* Whole HOT chain becomes dead */ |
1658 | | Assert(ItemIdIsRedirected(lp)); |
1659 | | } |
1660 | | #endif |
1661 | |
|
1662 | 0 | ItemIdSetDead(lp); |
1663 | 0 | } |
1664 | | |
1665 | | /* Update all now-unused line pointers */ |
1666 | 0 | offnum = nowunused; |
1667 | 0 | for (int i = 0; i < nunused; i++) |
1668 | 0 | { |
1669 | 0 | OffsetNumber off = *offnum++; |
1670 | 0 | ItemId lp = PageGetItemId(page, off); |
1671 | |
|
1672 | | #ifdef USE_ASSERT_CHECKING |
1673 | | |
1674 | | if (lp_truncate_only) |
1675 | | { |
1676 | | /* Setting LP_DEAD to LP_UNUSED in vacuum's second pass */ |
1677 | | Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp)); |
1678 | | } |
1679 | | else |
1680 | | { |
1681 | | /* |
1682 | | * When heap_page_prune_and_freeze() was called, mark_unused_now |
1683 | | * may have been passed as true, which allows would-be LP_DEAD |
1684 | | * items to be made LP_UNUSED instead. This is only possible if |
1685 | | * the relation has no indexes. If there are any dead items, then |
1686 | | * mark_unused_now was not true and every item being marked |
1687 | | * LP_UNUSED must refer to a heap-only tuple. |
1688 | | */ |
1689 | | if (ndead > 0) |
1690 | | { |
1691 | | Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp)); |
1692 | | htup = (HeapTupleHeader) PageGetItem(page, lp); |
1693 | | Assert(HeapTupleHeaderIsHeapOnly(htup)); |
1694 | | } |
1695 | | else |
1696 | | Assert(ItemIdIsUsed(lp)); |
1697 | | } |
1698 | | |
1699 | | #endif |
1700 | |
|
1701 | 0 | ItemIdSetUnused(lp); |
1702 | 0 | } |
1703 | |
|
1704 | 0 | if (lp_truncate_only) |
1705 | 0 | PageTruncateLinePointerArray(page); |
1706 | 0 | else |
1707 | 0 | { |
1708 | | /* |
1709 | | * Finally, repair any fragmentation, and update the page's hint bit |
1710 | | * about whether it has free pointers. |
1711 | | */ |
1712 | 0 | PageRepairFragmentation(page); |
1713 | | |
1714 | | /* |
1715 | | * Now that the page has been modified, assert that redirect items |
1716 | | * still point to valid targets. |
1717 | | */ |
1718 | 0 | page_verify_redirects(page); |
1719 | 0 | } |
1720 | 0 | } |
1721 | | |
1722 | | |
1723 | | /* |
1724 | | * If built with assertions, verify that all LP_REDIRECT items point to a |
1725 | | * valid item. |
1726 | | * |
1727 | | * One way that bugs related to HOT pruning show is redirect items pointing to |
1728 | | * removed tuples. It's not trivial to reliably check that marking an item |
1729 | | * unused will not orphan a redirect item during heap_prune_chain() / |
1730 | | * heap_page_prune_execute(), so we additionally check the whole page after |
1731 | | * pruning. Without this check such bugs would typically only cause asserts |
1732 | | * later, potentially well after the corruption has been introduced. |
1733 | | * |
1734 | | * Also check comments in heap_page_prune_execute()'s redirection loop. |
1735 | | */ |
1736 | | static void |
1737 | | page_verify_redirects(Page page) |
1738 | 0 | { |
1739 | | #ifdef USE_ASSERT_CHECKING |
1740 | | OffsetNumber offnum; |
1741 | | OffsetNumber maxoff; |
1742 | | |
1743 | | maxoff = PageGetMaxOffsetNumber(page); |
1744 | | for (offnum = FirstOffsetNumber; |
1745 | | offnum <= maxoff; |
1746 | | offnum = OffsetNumberNext(offnum)) |
1747 | | { |
1748 | | ItemId itemid = PageGetItemId(page, offnum); |
1749 | | OffsetNumber targoff; |
1750 | | ItemId targitem; |
1751 | | HeapTupleHeader htup; |
1752 | | |
1753 | | if (!ItemIdIsRedirected(itemid)) |
1754 | | continue; |
1755 | | |
1756 | | targoff = ItemIdGetRedirect(itemid); |
1757 | | targitem = PageGetItemId(page, targoff); |
1758 | | |
1759 | | Assert(ItemIdIsUsed(targitem)); |
1760 | | Assert(ItemIdIsNormal(targitem)); |
1761 | | Assert(ItemIdHasStorage(targitem)); |
1762 | | htup = (HeapTupleHeader) PageGetItem(page, targitem); |
1763 | | Assert(HeapTupleHeaderIsHeapOnly(htup)); |
1764 | | } |
1765 | | #endif |
1766 | 0 | } |
1767 | | |
1768 | | |
1769 | | /* |
1770 | | * For all items in this page, find their respective root line pointers. |
1771 | | * If item k is part of a HOT-chain with root at item j, then we set |
1772 | | * root_offsets[k - 1] = j. |
1773 | | * |
1774 | | * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. |
1775 | | * Unused entries are filled with InvalidOffsetNumber (zero). |
1776 | | * |
1777 | | * The function must be called with at least share lock on the buffer, to |
1778 | | * prevent concurrent prune operations. |
1779 | | * |
1780 | | * Note: The information collected here is valid only as long as the caller |
1781 | | * holds a pin on the buffer. Once pin is released, a tuple might be pruned |
1782 | | * and reused by a completely unrelated tuple. |
1783 | | */ |
1784 | | void |
1785 | | heap_get_root_tuples(Page page, OffsetNumber *root_offsets) |
1786 | 0 | { |
1787 | 0 | OffsetNumber offnum, |
1788 | 0 | maxoff; |
1789 | |
|
1790 | 0 | MemSet(root_offsets, InvalidOffsetNumber, |
1791 | 0 | MaxHeapTuplesPerPage * sizeof(OffsetNumber)); |
1792 | |
|
1793 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
1794 | 0 | for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) |
1795 | 0 | { |
1796 | 0 | ItemId lp = PageGetItemId(page, offnum); |
1797 | 0 | HeapTupleHeader htup; |
1798 | 0 | OffsetNumber nextoffnum; |
1799 | 0 | TransactionId priorXmax; |
1800 | | |
1801 | | /* skip unused and dead items */ |
1802 | 0 | if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) |
1803 | 0 | continue; |
1804 | | |
1805 | 0 | if (ItemIdIsNormal(lp)) |
1806 | 0 | { |
1807 | 0 | htup = (HeapTupleHeader) PageGetItem(page, lp); |
1808 | | |
1809 | | /* |
1810 | | * Check if this tuple is part of a HOT-chain rooted at some other |
1811 | | * tuple. If so, skip it for now; we'll process it when we find |
1812 | | * its root. |
1813 | | */ |
1814 | 0 | if (HeapTupleHeaderIsHeapOnly(htup)) |
1815 | 0 | continue; |
1816 | | |
1817 | | /* |
1818 | | * This is either a plain tuple or the root of a HOT-chain. |
1819 | | * Remember it in the mapping. |
1820 | | */ |
1821 | 0 | root_offsets[offnum - 1] = offnum; |
1822 | | |
1823 | | /* If it's not the start of a HOT-chain, we're done with it */ |
1824 | 0 | if (!HeapTupleHeaderIsHotUpdated(htup)) |
1825 | 0 | continue; |
1826 | | |
1827 | | /* Set up to scan the HOT-chain */ |
1828 | 0 | nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
1829 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
1830 | 0 | } |
1831 | 0 | else |
1832 | 0 | { |
1833 | | /* Must be a redirect item. We do not set its root_offsets entry */ |
1834 | 0 | Assert(ItemIdIsRedirected(lp)); |
1835 | | /* Set up to scan the HOT-chain */ |
1836 | 0 | nextoffnum = ItemIdGetRedirect(lp); |
1837 | 0 | priorXmax = InvalidTransactionId; |
1838 | 0 | } |
1839 | | |
1840 | | /* |
1841 | | * Now follow the HOT-chain and collect other tuples in the chain. |
1842 | | * |
1843 | | * Note: Even though this is a nested loop, the complexity of the |
1844 | | * function is O(N) because a tuple in the page should be visited not |
1845 | | * more than twice, once in the outer loop and once in HOT-chain |
1846 | | * chases. |
1847 | | */ |
1848 | 0 | for (;;) |
1849 | 0 | { |
1850 | | /* Sanity check (pure paranoia) */ |
1851 | 0 | if (offnum < FirstOffsetNumber) |
1852 | 0 | break; |
1853 | | |
1854 | | /* |
1855 | | * An offset past the end of page's line pointer array is possible |
1856 | | * when the array was truncated |
1857 | | */ |
1858 | 0 | if (offnum > maxoff) |
1859 | 0 | break; |
1860 | | |
1861 | 0 | lp = PageGetItemId(page, nextoffnum); |
1862 | | |
1863 | | /* Check for broken chains */ |
1864 | 0 | if (!ItemIdIsNormal(lp)) |
1865 | 0 | break; |
1866 | | |
1867 | 0 | htup = (HeapTupleHeader) PageGetItem(page, lp); |
1868 | |
|
1869 | 0 | if (TransactionIdIsValid(priorXmax) && |
1870 | 0 | !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) |
1871 | 0 | break; |
1872 | | |
1873 | | /* Remember the root line pointer for this item */ |
1874 | 0 | root_offsets[nextoffnum - 1] = offnum; |
1875 | | |
1876 | | /* Advance to next chain member, if any */ |
1877 | 0 | if (!HeapTupleHeaderIsHotUpdated(htup)) |
1878 | 0 | break; |
1879 | | |
1880 | | /* HOT implies it can't have moved to different partition */ |
1881 | 0 | Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup)); |
1882 | |
|
1883 | 0 | nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
1884 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
1885 | 0 | } |
1886 | 0 | } |
1887 | 0 | } |
1888 | | |
1889 | | |
1890 | | /* |
1891 | | * Compare fields that describe actions required to freeze tuple with caller's |
1892 | | * open plan. If everything matches then the frz tuple plan is equivalent to |
1893 | | * caller's plan. |
1894 | | */ |
1895 | | static inline bool |
1896 | | heap_log_freeze_eq(xlhp_freeze_plan *plan, HeapTupleFreeze *frz) |
1897 | 0 | { |
1898 | 0 | if (plan->xmax == frz->xmax && |
1899 | 0 | plan->t_infomask2 == frz->t_infomask2 && |
1900 | 0 | plan->t_infomask == frz->t_infomask && |
1901 | 0 | plan->frzflags == frz->frzflags) |
1902 | 0 | return true; |
1903 | | |
1904 | | /* Caller must call heap_log_freeze_new_plan again for frz */ |
1905 | 0 | return false; |
1906 | 0 | } |
1907 | | |
1908 | | /* |
1909 | | * Comparator used to deduplicate the freeze plans used in WAL records. |
1910 | | */ |
1911 | | static int |
1912 | | heap_log_freeze_cmp(const void *arg1, const void *arg2) |
1913 | 0 | { |
1914 | 0 | HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1; |
1915 | 0 | HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2; |
1916 | |
|
1917 | 0 | if (frz1->xmax < frz2->xmax) |
1918 | 0 | return -1; |
1919 | 0 | else if (frz1->xmax > frz2->xmax) |
1920 | 0 | return 1; |
1921 | | |
1922 | 0 | if (frz1->t_infomask2 < frz2->t_infomask2) |
1923 | 0 | return -1; |
1924 | 0 | else if (frz1->t_infomask2 > frz2->t_infomask2) |
1925 | 0 | return 1; |
1926 | | |
1927 | 0 | if (frz1->t_infomask < frz2->t_infomask) |
1928 | 0 | return -1; |
1929 | 0 | else if (frz1->t_infomask > frz2->t_infomask) |
1930 | 0 | return 1; |
1931 | | |
1932 | 0 | if (frz1->frzflags < frz2->frzflags) |
1933 | 0 | return -1; |
1934 | 0 | else if (frz1->frzflags > frz2->frzflags) |
1935 | 0 | return 1; |
1936 | | |
1937 | | /* |
1938 | | * heap_log_freeze_eq would consider these tuple-wise plans to be equal. |
1939 | | * (So the tuples will share a single canonical freeze plan.) |
1940 | | * |
1941 | | * We tiebreak on page offset number to keep each freeze plan's page |
1942 | | * offset number array individually sorted. (Unnecessary, but be tidy.) |
1943 | | */ |
1944 | 0 | if (frz1->offset < frz2->offset) |
1945 | 0 | return -1; |
1946 | 0 | else if (frz1->offset > frz2->offset) |
1947 | 0 | return 1; |
1948 | | |
1949 | 0 | Assert(false); |
1950 | 0 | return 0; |
1951 | 0 | } |
1952 | | |
1953 | | /* |
1954 | | * Start new plan initialized using tuple-level actions. At least one tuple |
1955 | | * will have steps required to freeze described by caller's plan during REDO. |
1956 | | */ |
1957 | | static inline void |
1958 | | heap_log_freeze_new_plan(xlhp_freeze_plan *plan, HeapTupleFreeze *frz) |
1959 | 0 | { |
1960 | 0 | plan->xmax = frz->xmax; |
1961 | 0 | plan->t_infomask2 = frz->t_infomask2; |
1962 | 0 | plan->t_infomask = frz->t_infomask; |
1963 | 0 | plan->frzflags = frz->frzflags; |
1964 | 0 | plan->ntuples = 1; /* for now */ |
1965 | 0 | } |
1966 | | |
1967 | | /* |
1968 | | * Deduplicate tuple-based freeze plans so that each distinct set of |
1969 | | * processing steps is only stored once in the WAL record. |
1970 | | * Called during original execution of freezing (for logged relations). |
1971 | | * |
1972 | | * Return value is number of plans set in *plans_out for caller. Also writes |
1973 | | * an array of offset numbers into *offsets_out output argument for caller |
1974 | | * (actually there is one array per freeze plan, but that's not of immediate |
1975 | | * concern to our caller). |
1976 | | */ |
1977 | | static int |
1978 | | heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, |
1979 | | xlhp_freeze_plan *plans_out, |
1980 | | OffsetNumber *offsets_out) |
1981 | 0 | { |
1982 | 0 | int nplans = 0; |
1983 | | |
1984 | | /* Sort tuple-based freeze plans in the order required to deduplicate */ |
1985 | 0 | qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp); |
1986 | |
|
1987 | 0 | for (int i = 0; i < ntuples; i++) |
1988 | 0 | { |
1989 | 0 | HeapTupleFreeze *frz = tuples + i; |
1990 | |
|
1991 | 0 | if (i == 0) |
1992 | 0 | { |
1993 | | /* New canonical freeze plan starting with first tup */ |
1994 | 0 | heap_log_freeze_new_plan(plans_out, frz); |
1995 | 0 | nplans++; |
1996 | 0 | } |
1997 | 0 | else if (heap_log_freeze_eq(plans_out, frz)) |
1998 | 0 | { |
1999 | | /* tup matches open canonical plan -- include tup in it */ |
2000 | 0 | Assert(offsets_out[i - 1] < frz->offset); |
2001 | 0 | plans_out->ntuples++; |
2002 | 0 | } |
2003 | 0 | else |
2004 | 0 | { |
2005 | | /* Tup doesn't match current plan -- done with it now */ |
2006 | 0 | plans_out++; |
2007 | | |
2008 | | /* New canonical freeze plan starting with this tup */ |
2009 | 0 | heap_log_freeze_new_plan(plans_out, frz); |
2010 | 0 | nplans++; |
2011 | 0 | } |
2012 | | |
2013 | | /* |
2014 | | * Save page offset number in dedicated buffer in passing. |
2015 | | * |
2016 | | * REDO routine relies on the record's offset numbers array grouping |
2017 | | * offset numbers by freeze plan. The sort order within each grouping |
2018 | | * is ascending offset number order, just to keep things tidy. |
2019 | | */ |
2020 | 0 | offsets_out[i] = frz->offset; |
2021 | 0 | } |
2022 | |
|
2023 | 0 | Assert(nplans > 0 && nplans <= ntuples); |
2024 | |
|
2025 | 0 | return nplans; |
2026 | 0 | } |
2027 | | |
2028 | | /* |
2029 | | * Write an XLOG_HEAP2_PRUNE* WAL record |
2030 | | * |
2031 | | * This is used for several different page maintenance operations: |
2032 | | * |
2033 | | * - Page pruning, in VACUUM's 1st pass or on access: Some items are |
2034 | | * redirected, some marked dead, and some removed altogether. |
2035 | | * |
2036 | | * - Freezing: Items are marked as 'frozen'. |
2037 | | * |
2038 | | * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused. |
2039 | | * |
2040 | | * They have enough commonalities that we use a single WAL record for them |
2041 | | * all. |
2042 | | * |
2043 | | * If replaying the record requires a cleanup lock, pass cleanup_lock = true. |
2044 | | * Replaying 'redirected' or 'dead' items always requires a cleanup lock, but |
2045 | | * replaying 'unused' items depends on whether they were all previously marked |
2046 | | * as dead. |
2047 | | * |
2048 | | * Note: This function scribbles on the 'frozen' array. |
2049 | | * |
2050 | | * Note: This is called in a critical section, so careful what you do here. |
2051 | | */ |
2052 | | void |
2053 | | log_heap_prune_and_freeze(Relation relation, Buffer buffer, |
2054 | | TransactionId conflict_xid, |
2055 | | bool cleanup_lock, |
2056 | | PruneReason reason, |
2057 | | HeapTupleFreeze *frozen, int nfrozen, |
2058 | | OffsetNumber *redirected, int nredirected, |
2059 | | OffsetNumber *dead, int ndead, |
2060 | | OffsetNumber *unused, int nunused) |
2061 | 0 | { |
2062 | 0 | xl_heap_prune xlrec; |
2063 | 0 | XLogRecPtr recptr; |
2064 | 0 | uint8 info; |
2065 | | |
2066 | | /* The following local variables hold data registered in the WAL record: */ |
2067 | 0 | xlhp_freeze_plan plans[MaxHeapTuplesPerPage]; |
2068 | 0 | xlhp_freeze_plans freeze_plans; |
2069 | 0 | xlhp_prune_items redirect_items; |
2070 | 0 | xlhp_prune_items dead_items; |
2071 | 0 | xlhp_prune_items unused_items; |
2072 | 0 | OffsetNumber frz_offsets[MaxHeapTuplesPerPage]; |
2073 | |
|
2074 | 0 | xlrec.flags = 0; |
2075 | | |
2076 | | /* |
2077 | | * Prepare data for the buffer. The arrays are not actually in the |
2078 | | * buffer, but we pretend that they are. When XLogInsert stores a full |
2079 | | * page image, the arrays can be omitted. |
2080 | | */ |
2081 | 0 | XLogBeginInsert(); |
2082 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
2083 | 0 | if (nfrozen > 0) |
2084 | 0 | { |
2085 | 0 | int nplans; |
2086 | |
|
2087 | 0 | xlrec.flags |= XLHP_HAS_FREEZE_PLANS; |
2088 | | |
2089 | | /* |
2090 | | * Prepare deduplicated representation for use in the WAL record. This |
2091 | | * destructively sorts frozen tuples array in-place. |
2092 | | */ |
2093 | 0 | nplans = heap_log_freeze_plan(frozen, nfrozen, plans, frz_offsets); |
2094 | |
|
2095 | 0 | freeze_plans.nplans = nplans; |
2096 | 0 | XLogRegisterBufData(0, &freeze_plans, |
2097 | 0 | offsetof(xlhp_freeze_plans, plans)); |
2098 | 0 | XLogRegisterBufData(0, plans, |
2099 | 0 | sizeof(xlhp_freeze_plan) * nplans); |
2100 | 0 | } |
2101 | 0 | if (nredirected > 0) |
2102 | 0 | { |
2103 | 0 | xlrec.flags |= XLHP_HAS_REDIRECTIONS; |
2104 | |
|
2105 | 0 | redirect_items.ntargets = nredirected; |
2106 | 0 | XLogRegisterBufData(0, &redirect_items, |
2107 | 0 | offsetof(xlhp_prune_items, data)); |
2108 | 0 | XLogRegisterBufData(0, redirected, |
2109 | 0 | sizeof(OffsetNumber[2]) * nredirected); |
2110 | 0 | } |
2111 | 0 | if (ndead > 0) |
2112 | 0 | { |
2113 | 0 | xlrec.flags |= XLHP_HAS_DEAD_ITEMS; |
2114 | |
|
2115 | 0 | dead_items.ntargets = ndead; |
2116 | 0 | XLogRegisterBufData(0, &dead_items, |
2117 | 0 | offsetof(xlhp_prune_items, data)); |
2118 | 0 | XLogRegisterBufData(0, dead, |
2119 | 0 | sizeof(OffsetNumber) * ndead); |
2120 | 0 | } |
2121 | 0 | if (nunused > 0) |
2122 | 0 | { |
2123 | 0 | xlrec.flags |= XLHP_HAS_NOW_UNUSED_ITEMS; |
2124 | |
|
2125 | 0 | unused_items.ntargets = nunused; |
2126 | 0 | XLogRegisterBufData(0, &unused_items, |
2127 | 0 | offsetof(xlhp_prune_items, data)); |
2128 | 0 | XLogRegisterBufData(0, unused, |
2129 | 0 | sizeof(OffsetNumber) * nunused); |
2130 | 0 | } |
2131 | 0 | if (nfrozen > 0) |
2132 | 0 | XLogRegisterBufData(0, frz_offsets, |
2133 | 0 | sizeof(OffsetNumber) * nfrozen); |
2134 | | |
2135 | | /* |
2136 | | * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_* |
2137 | | * flag above. |
2138 | | */ |
2139 | 0 | if (RelationIsAccessibleInLogicalDecoding(relation)) |
2140 | 0 | xlrec.flags |= XLHP_IS_CATALOG_REL; |
2141 | 0 | if (TransactionIdIsValid(conflict_xid)) |
2142 | 0 | xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON; |
2143 | 0 | if (cleanup_lock) |
2144 | 0 | xlrec.flags |= XLHP_CLEANUP_LOCK; |
2145 | 0 | else |
2146 | 0 | { |
2147 | 0 | Assert(nredirected == 0 && ndead == 0); |
2148 | | /* also, any items in 'unused' must've been LP_DEAD previously */ |
2149 | 0 | } |
2150 | 0 | XLogRegisterData(&xlrec, SizeOfHeapPrune); |
2151 | 0 | if (TransactionIdIsValid(conflict_xid)) |
2152 | 0 | XLogRegisterData(&conflict_xid, sizeof(TransactionId)); |
2153 | |
|
2154 | 0 | switch (reason) |
2155 | 0 | { |
2156 | 0 | case PRUNE_ON_ACCESS: |
2157 | 0 | info = XLOG_HEAP2_PRUNE_ON_ACCESS; |
2158 | 0 | break; |
2159 | 0 | case PRUNE_VACUUM_SCAN: |
2160 | 0 | info = XLOG_HEAP2_PRUNE_VACUUM_SCAN; |
2161 | 0 | break; |
2162 | 0 | case PRUNE_VACUUM_CLEANUP: |
2163 | 0 | info = XLOG_HEAP2_PRUNE_VACUUM_CLEANUP; |
2164 | 0 | break; |
2165 | 0 | default: |
2166 | 0 | elog(ERROR, "unrecognized prune reason: %d", (int) reason); |
2167 | 0 | break; |
2168 | 0 | } |
2169 | 0 | recptr = XLogInsert(RM_HEAP2_ID, info); |
2170 | |
|
2171 | 0 | PageSetLSN(BufferGetPage(buffer), recptr); |
2172 | 0 | } |