/src/postgres/src/backend/access/heap/heapam.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * heapam.c |
4 | | * heap access method code |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/heap/heapam.c |
12 | | * |
13 | | * |
14 | | * INTERFACE ROUTINES |
15 | | * heap_beginscan - begin relation scan |
16 | | * heap_rescan - restart a relation scan |
17 | | * heap_endscan - end relation scan |
18 | | * heap_getnext - retrieve next tuple in scan |
19 | | * heap_fetch - retrieve tuple with given tid |
20 | | * heap_insert - insert tuple into a relation |
21 | | * heap_multi_insert - insert multiple tuples into a relation |
22 | | * heap_delete - delete a tuple from a relation |
23 | | * heap_update - replace a tuple in a relation with another tuple |
24 | | * |
25 | | * NOTES |
26 | | * This file contains the heap_ routines which implement |
27 | | * the POSTGRES heap access method used for all POSTGRES |
28 | | * relations. |
29 | | * |
30 | | *------------------------------------------------------------------------- |
31 | | */ |
32 | | #include "postgres.h" |
33 | | |
34 | | #include "access/heapam.h" |
35 | | #include "access/heaptoast.h" |
36 | | #include "access/hio.h" |
37 | | #include "access/multixact.h" |
38 | | #include "access/subtrans.h" |
39 | | #include "access/syncscan.h" |
40 | | #include "access/valid.h" |
41 | | #include "access/visibilitymap.h" |
42 | | #include "access/xloginsert.h" |
43 | | #include "catalog/pg_database.h" |
44 | | #include "catalog/pg_database_d.h" |
45 | | #include "commands/vacuum.h" |
46 | | #include "pgstat.h" |
47 | | #include "port/pg_bitutils.h" |
48 | | #include "storage/lmgr.h" |
49 | | #include "storage/predicate.h" |
50 | | #include "storage/procarray.h" |
51 | | #include "utils/datum.h" |
52 | | #include "utils/injection_point.h" |
53 | | #include "utils/inval.h" |
54 | | #include "utils/spccache.h" |
55 | | #include "utils/syscache.h" |
56 | | |
57 | | |
58 | | static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, |
59 | | TransactionId xid, CommandId cid, int options); |
60 | | static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, |
61 | | Buffer newbuf, HeapTuple oldtup, |
62 | | HeapTuple newtup, HeapTuple old_key_tuple, |
63 | | bool all_visible_cleared, bool new_all_visible_cleared); |
64 | | #ifdef USE_ASSERT_CHECKING |
65 | | static void check_lock_if_inplace_updateable_rel(Relation relation, |
66 | | ItemPointer otid, |
67 | | HeapTuple newtup); |
68 | | static void check_inplace_rel_lock(HeapTuple oldtup); |
69 | | #endif |
70 | | static Bitmapset *HeapDetermineColumnsInfo(Relation relation, |
71 | | Bitmapset *interesting_cols, |
72 | | Bitmapset *external_cols, |
73 | | HeapTuple oldtup, HeapTuple newtup, |
74 | | bool *has_external); |
75 | | static bool heap_acquire_tuplock(Relation relation, ItemPointer tid, |
76 | | LockTupleMode mode, LockWaitPolicy wait_policy, |
77 | | bool *have_tuple_lock); |
78 | | static inline BlockNumber heapgettup_advance_block(HeapScanDesc scan, |
79 | | BlockNumber block, |
80 | | ScanDirection dir); |
81 | | static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan, |
82 | | ScanDirection dir); |
83 | | static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, |
84 | | uint16 old_infomask2, TransactionId add_to_xmax, |
85 | | LockTupleMode mode, bool is_update, |
86 | | TransactionId *result_xmax, uint16 *result_infomask, |
87 | | uint16 *result_infomask2); |
88 | | static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple, |
89 | | ItemPointer ctid, TransactionId xid, |
90 | | LockTupleMode mode); |
91 | | static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, |
92 | | uint16 *new_infomask2); |
93 | | static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, |
94 | | uint16 t_infomask); |
95 | | static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, |
96 | | LockTupleMode lockmode, bool *current_is_member); |
97 | | static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, |
98 | | Relation rel, ItemPointer ctid, XLTW_Oper oper, |
99 | | int *remaining); |
100 | | static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, |
101 | | uint16 infomask, Relation rel, int *remaining, |
102 | | bool logLockFailure); |
103 | | static void index_delete_sort(TM_IndexDeleteOp *delstate); |
104 | | static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); |
105 | | static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); |
106 | | static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, |
107 | | bool *copy); |
108 | | |
109 | | |
110 | | /* |
111 | | * Each tuple lock mode has a corresponding heavyweight lock, and one or two |
112 | | * corresponding MultiXactStatuses (one to merely lock tuples, another one to |
113 | | * update them). This table (and the macros below) helps us determine the |
114 | | * heavyweight lock mode and MultiXactStatus values to use for any particular |
115 | | * tuple lock strength. |
116 | | * |
117 | | * These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock. |
118 | | * |
119 | | * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock |
120 | | * instead. |
121 | | */ |
122 | | static const struct |
123 | | { |
124 | | LOCKMODE hwlock; |
125 | | int lockstatus; |
126 | | int updstatus; |
127 | | } |
128 | | |
129 | | tupleLockExtraInfo[MaxLockTupleMode + 1] = |
130 | | { |
131 | | { /* LockTupleKeyShare */ |
132 | | AccessShareLock, |
133 | | MultiXactStatusForKeyShare, |
134 | | -1 /* KeyShare does not allow updating tuples */ |
135 | | }, |
136 | | { /* LockTupleShare */ |
137 | | RowShareLock, |
138 | | MultiXactStatusForShare, |
139 | | -1 /* Share does not allow updating tuples */ |
140 | | }, |
141 | | { /* LockTupleNoKeyExclusive */ |
142 | | ExclusiveLock, |
143 | | MultiXactStatusForNoKeyUpdate, |
144 | | MultiXactStatusNoKeyUpdate |
145 | | }, |
146 | | { /* LockTupleExclusive */ |
147 | | AccessExclusiveLock, |
148 | | MultiXactStatusForUpdate, |
149 | | MultiXactStatusUpdate |
150 | | } |
151 | | }; |
152 | | |
153 | | /* Get the LOCKMODE for a given MultiXactStatus */ |
154 | | #define LOCKMODE_from_mxstatus(status) \ |
155 | 0 | (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock) |
156 | | |
157 | | /* |
158 | | * Acquire heavyweight locks on tuples, using a LockTupleMode strength value. |
159 | | * This is more readable than having every caller translate it to lock.h's |
160 | | * LOCKMODE. |
161 | | */ |
162 | | #define LockTupleTuplock(rel, tup, mode) \ |
163 | 0 | LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) |
164 | | #define UnlockTupleTuplock(rel, tup, mode) \ |
165 | 0 | UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock) |
166 | | #define ConditionalLockTupleTuplock(rel, tup, mode, log) \ |
167 | 0 | ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log)) |
168 | | |
169 | | #ifdef USE_PREFETCH |
170 | | /* |
171 | | * heap_index_delete_tuples and index_delete_prefetch_buffer use this |
172 | | * structure to coordinate prefetching activity |
173 | | */ |
174 | | typedef struct |
175 | | { |
176 | | BlockNumber cur_hblkno; |
177 | | int next_item; |
178 | | int ndeltids; |
179 | | TM_IndexDelete *deltids; |
180 | | } IndexDeletePrefetchState; |
181 | | #endif |
182 | | |
183 | | /* heap_index_delete_tuples bottom-up index deletion costing constants */ |
184 | | #define BOTTOMUP_MAX_NBLOCKS 6 |
185 | 0 | #define BOTTOMUP_TOLERANCE_NBLOCKS 3 |
186 | | |
187 | | /* |
188 | | * heap_index_delete_tuples uses this when determining which heap blocks it |
189 | | * must visit to help its bottom-up index deletion caller |
190 | | */ |
191 | | typedef struct IndexDeleteCounts |
192 | | { |
193 | | int16 npromisingtids; /* Number of "promising" TIDs in group */ |
194 | | int16 ntids; /* Number of TIDs in group */ |
195 | | int16 ifirsttid; /* Offset to group's first deltid */ |
196 | | } IndexDeleteCounts; |
197 | | |
198 | | /* |
199 | | * This table maps tuple lock strength values for each particular |
200 | | * MultiXactStatus value. |
201 | | */ |
202 | | static const int MultiXactStatusLock[MaxMultiXactStatus + 1] = |
203 | | { |
204 | | LockTupleKeyShare, /* ForKeyShare */ |
205 | | LockTupleShare, /* ForShare */ |
206 | | LockTupleNoKeyExclusive, /* ForNoKeyUpdate */ |
207 | | LockTupleExclusive, /* ForUpdate */ |
208 | | LockTupleNoKeyExclusive, /* NoKeyUpdate */ |
209 | | LockTupleExclusive /* Update */ |
210 | | }; |
211 | | |
212 | | /* Get the LockTupleMode for a given MultiXactStatus */ |
213 | | #define TUPLOCK_from_mxstatus(status) \ |
214 | 0 | (MultiXactStatusLock[(status)]) |
215 | | |
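With both tables and both macros now in view, note how they compose: MultiXactStatusLock[] maps a MultiXactStatus back to the LockTupleMode it represents, and tupleLockExtraInfo[] maps that mode to the heavyweight lock taken on the tuple, which is exactly what LOCKMODE_from_mxstatus() above expands to. A minimal sketch spelling that expansion out step by step (illustrative only; the function name is made up, and the real code simply uses the macro):

    /* Illustrative sketch: expand LOCKMODE_from_mxstatus() by hand. */
    static LOCKMODE
    example_hwlock_for_mxstatus(MultiXactStatus status)
    {
        /* step 1: which tuple lock strength does this MultiXactStatus imply? */
        LockTupleMode mode = TUPLOCK_from_mxstatus(status);

        /* step 2: which heavyweight lock backs that tuple lock strength? */
        return tupleLockExtraInfo[mode].hwlock;
    }

For instance, MultiXactStatusForShare maps to LockTupleShare and therefore to RowShareLock.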
216 | | /* |
217 | | * Check that we have a valid snapshot if we might need TOAST access. |
218 | | */ |
219 | | static inline void |
220 | | AssertHasSnapshotForToast(Relation rel) |
221 | 0 | { |
222 | | #ifdef USE_ASSERT_CHECKING |
223 | | |
224 | | /* bootstrap mode in particular breaks this rule */ |
225 | | if (!IsNormalProcessingMode()) |
226 | | return; |
227 | | |
228 | | /* if the relation doesn't have a TOAST table, we are good */ |
229 | | if (!OidIsValid(rel->rd_rel->reltoastrelid)) |
230 | | return; |
231 | | |
232 | | Assert(HaveRegisteredOrActiveSnapshot()); |
233 | | |
234 | | #endif /* USE_ASSERT_CHECKING */ |
235 | 0 | } |
236 | | |
237 | | /* ---------------------------------------------------------------- |
238 | | * heap support routines |
239 | | * ---------------------------------------------------------------- |
240 | | */ |
241 | | |
242 | | /* |
243 | | * Streaming read API callback for parallel sequential scans. Returns the next |
244 | | * block the caller wants from the read stream or InvalidBlockNumber when done. |
245 | | */ |
246 | | static BlockNumber |
247 | | heap_scan_stream_read_next_parallel(ReadStream *stream, |
248 | | void *callback_private_data, |
249 | | void *per_buffer_data) |
250 | 0 | { |
251 | 0 | HeapScanDesc scan = (HeapScanDesc) callback_private_data; |
252 | |
253 | 0 | Assert(ScanDirectionIsForward(scan->rs_dir)); |
254 | 0 | Assert(scan->rs_base.rs_parallel); |
255 | |
256 | 0 | if (unlikely(!scan->rs_inited)) |
257 | 0 | { |
258 | | /* parallel scan */ |
259 | 0 | table_block_parallelscan_startblock_init(scan->rs_base.rs_rd, |
260 | 0 | scan->rs_parallelworkerdata, |
261 | 0 | (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); |
262 | | |
263 | | /* may return InvalidBlockNumber if there are no more blocks */ |
264 | 0 | scan->rs_prefetch_block = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, |
265 | 0 | scan->rs_parallelworkerdata, |
266 | 0 | (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel); |
267 | 0 | scan->rs_inited = true; |
268 | 0 | } |
269 | 0 | else |
270 | 0 | { |
271 | 0 | scan->rs_prefetch_block = table_block_parallelscan_nextpage(scan->rs_base.rs_rd, |
272 | 0 | scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc) |
273 | 0 | scan->rs_base.rs_parallel); |
274 | 0 | } |
275 | |
276 | 0 | return scan->rs_prefetch_block; |
277 | 0 | } |
278 | | |
279 | | /* |
280 | | * Streaming read API callback for serial sequential and TID range scans. |
281 | | * Returns the next block the caller wants from the read stream or |
282 | | * InvalidBlockNumber when done. |
283 | | */ |
284 | | static BlockNumber |
285 | | heap_scan_stream_read_next_serial(ReadStream *stream, |
286 | | void *callback_private_data, |
287 | | void *per_buffer_data) |
288 | 0 | { |
289 | 0 | HeapScanDesc scan = (HeapScanDesc) callback_private_data; |
290 | |
291 | 0 | if (unlikely(!scan->rs_inited)) |
292 | 0 | { |
293 | 0 | scan->rs_prefetch_block = heapgettup_initial_block(scan, scan->rs_dir); |
294 | 0 | scan->rs_inited = true; |
295 | 0 | } |
296 | 0 | else |
297 | 0 | scan->rs_prefetch_block = heapgettup_advance_block(scan, |
298 | 0 | scan->rs_prefetch_block, |
299 | 0 | scan->rs_dir); |
300 | |
301 | 0 | return scan->rs_prefetch_block; |
302 | 0 | } |
303 | | |
304 | | /* |
305 | | * Read stream API callback for bitmap heap scans. |
306 | | * Returns the next block the caller wants from the read stream or |
307 | | * InvalidBlockNumber when done. |
308 | | */ |
309 | | static BlockNumber |
310 | | bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, |
311 | | void *per_buffer_data) |
312 | 0 | { |
313 | 0 | TBMIterateResult *tbmres = per_buffer_data; |
314 | 0 | BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data; |
315 | 0 | HeapScanDesc hscan = (HeapScanDesc) bscan; |
316 | 0 | TableScanDesc sscan = &hscan->rs_base; |
317 | |
318 | 0 | for (;;) |
319 | 0 | { |
320 | 0 | CHECK_FOR_INTERRUPTS(); |
321 | | |
322 | | /* no more entries in the bitmap */ |
323 | 0 | if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres)) |
324 | 0 | return InvalidBlockNumber; |
325 | | |
326 | | /* |
327 | | * Ignore any claimed entries past what we think is the end of the |
328 | | * relation. It may have been extended after the start of our scan (we |
329 | | * only hold an AccessShareLock, and it could be inserts from this |
330 | | * backend). We don't take this optimization in SERIALIZABLE |
331 | | * isolation though, as we need to examine all invisible tuples |
332 | | * reachable by the index. |
333 | | */ |
334 | 0 | if (!IsolationIsSerializable() && |
335 | 0 | tbmres->blockno >= hscan->rs_nblocks) |
336 | 0 | continue; |
337 | | |
338 | 0 | return tbmres->blockno; |
339 | 0 | } |
340 | | |
341 | | /* not reachable */ |
342 | 0 | Assert(false); |
343 | 0 | } |
344 | | |
345 | | /* ---------------- |
346 | | * initscan - scan code common to heap_beginscan and heap_rescan |
347 | | * ---------------- |
348 | | */ |
349 | | static void |
350 | | initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) |
351 | 0 | { |
352 | 0 | ParallelBlockTableScanDesc bpscan = NULL; |
353 | 0 | bool allow_strat; |
354 | 0 | bool allow_sync; |
355 | | |
356 | | /* |
357 | | * Determine the number of blocks we have to scan. |
358 | | * |
359 | | * It is sufficient to do this once at scan start, since any tuples added |
360 | | * while the scan is in progress will be invisible to my snapshot anyway. |
361 | | * (That is not true when using a non-MVCC snapshot. However, we couldn't |
362 | | * guarantee to return tuples added after scan start anyway, since they |
363 | | * might go into pages we already scanned. To guarantee consistent |
364 | | * results for a non-MVCC snapshot, the caller must hold some higher-level |
365 | | * lock that ensures the interesting tuple(s) won't change.) |
366 | | */ |
367 | 0 | if (scan->rs_base.rs_parallel != NULL) |
368 | 0 | { |
369 | 0 | bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; |
370 | 0 | scan->rs_nblocks = bpscan->phs_nblocks; |
371 | 0 | } |
372 | 0 | else |
373 | 0 | scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd); |
374 | | |
375 | | /* |
376 | | * If the table is large relative to NBuffers, use a bulk-read access |
377 | | * strategy and enable synchronized scanning (see syncscan.c). Although |
378 | | * the thresholds for these features could be different, we make them the |
379 | | * same so that there are only two behaviors to tune rather than four. |
380 | | * (However, some callers need to be able to disable one or both of these |
381 | | * behaviors, independently of the size of the table; also there is a GUC |
382 | | * variable that can disable synchronized scanning.) |
383 | | * |
384 | | * Note that table_block_parallelscan_initialize has a very similar test; |
385 | | * if you change this, consider changing that one, too. |
386 | | */ |
387 | 0 | if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) && |
388 | 0 | scan->rs_nblocks > NBuffers / 4) |
389 | 0 | { |
390 | 0 | allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0; |
391 | 0 | allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0; |
392 | 0 | } |
393 | 0 | else |
394 | 0 | allow_strat = allow_sync = false; |
395 | |
396 | 0 | if (allow_strat) |
397 | 0 | { |
398 | | /* During a rescan, keep the previous strategy object. */ |
399 | 0 | if (scan->rs_strategy == NULL) |
400 | 0 | scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD); |
401 | 0 | } |
402 | 0 | else |
403 | 0 | { |
404 | 0 | if (scan->rs_strategy != NULL) |
405 | 0 | FreeAccessStrategy(scan->rs_strategy); |
406 | 0 | scan->rs_strategy = NULL; |
407 | 0 | } |
408 | |
409 | 0 | if (scan->rs_base.rs_parallel != NULL) |
410 | 0 | { |
411 | | /* For parallel scan, believe whatever ParallelTableScanDesc says. */ |
412 | 0 | if (scan->rs_base.rs_parallel->phs_syncscan) |
413 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_SYNC; |
414 | 0 | else |
415 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; |
416 | 0 | } |
417 | 0 | else if (keep_startblock) |
418 | 0 | { |
419 | | /* |
420 | | * When rescanning, we want to keep the previous startblock setting, |
421 | | * so that rewinding a cursor doesn't generate surprising results. |
422 | | * Reset the active syncscan setting, though. |
423 | | */ |
424 | 0 | if (allow_sync && synchronize_seqscans) |
425 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_SYNC; |
426 | 0 | else |
427 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; |
428 | 0 | } |
429 | 0 | else if (allow_sync && synchronize_seqscans) |
430 | 0 | { |
431 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_SYNC; |
432 | 0 | scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks); |
433 | 0 | } |
434 | 0 | else |
435 | 0 | { |
436 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; |
437 | 0 | scan->rs_startblock = 0; |
438 | 0 | } |
439 | |
|
440 | 0 | scan->rs_numblocks = InvalidBlockNumber; |
441 | 0 | scan->rs_inited = false; |
442 | 0 | scan->rs_ctup.t_data = NULL; |
443 | 0 | ItemPointerSetInvalid(&scan->rs_ctup.t_self); |
444 | 0 | scan->rs_cbuf = InvalidBuffer; |
445 | 0 | scan->rs_cblock = InvalidBlockNumber; |
446 | 0 | scan->rs_ntuples = 0; |
447 | 0 | scan->rs_cindex = 0; |
448 | | |
449 | | /* |
450 | | * Initialize to ForwardScanDirection because it is most common and |
451 | | * because heap scans go forward before going backward (e.g. CURSORs). |
452 | | */ |
453 | 0 | scan->rs_dir = ForwardScanDirection; |
454 | 0 | scan->rs_prefetch_block = InvalidBlockNumber; |
455 | | |
456 | | /* page-at-a-time fields are always invalid when not rs_inited */ |
457 | | |
458 | | /* |
459 | | * copy the scan key, if appropriate |
460 | | */ |
461 | 0 | if (key != NULL && scan->rs_base.rs_nkeys > 0) |
462 | 0 | memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData)); |
463 | | |
464 | | /* |
465 | | * Currently, we only have a stats counter for sequential heap scans (but |
466 | | * e.g. for bitmap scans the underlying bitmap index scans will be counted, |
467 | | * and for sample scans we update stats for tuple fetches). |
468 | | */ |
469 | 0 | if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN) |
470 | 0 | pgstat_count_heap_scan(scan->rs_base.rs_rd); |
471 | 0 | } |
472 | | |
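To make the NBuffers / 4 threshold above concrete (assuming the stock 8kB block size and the default shared_buffers of 128MB, i.e. NBuffers = 16384): a non-local relation larger than 4096 blocks, roughly 32MB, becomes eligible for the BAS_BULKREAD strategy and for synchronized scanning, still subject to the caller's SO_ALLOW_STRAT / SO_ALLOW_SYNC flags and the synchronize_seqscans GUC.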
473 | | /* |
474 | | * heap_setscanlimits - restrict range of a heapscan |
475 | | * |
476 | | * startBlk is the page to start at |
477 | | * numBlks is number of pages to scan (InvalidBlockNumber means "all") |
478 | | */ |
479 | | void |
480 | | heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks) |
481 | 0 | { |
482 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
483 | |
484 | 0 | Assert(!scan->rs_inited); /* else too late to change */ |
485 | | /* else rs_startblock is significant */ |
486 | 0 | Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC)); |
487 | | |
488 | | /* Check startBlk is valid (but allow case of zero blocks...) */ |
489 | 0 | Assert(startBlk == 0 || startBlk < scan->rs_nblocks); |
490 | |
491 | 0 | scan->rs_startblock = startBlk; |
492 | 0 | scan->rs_numblocks = numBlks; |
493 | 0 | } |
494 | | |
495 | | /* |
496 | | * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called |
497 | | * multiple times, with constant arguments for all_visible, |
498 | | * check_serializable. |
499 | | */ |
500 | | pg_attribute_always_inline |
501 | | static int |
502 | | page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, |
503 | | Page page, Buffer buffer, |
504 | | BlockNumber block, int lines, |
505 | | bool all_visible, bool check_serializable) |
506 | 0 | { |
507 | 0 | int ntup = 0; |
508 | 0 | OffsetNumber lineoff; |
509 | |
510 | 0 | for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++) |
511 | 0 | { |
512 | 0 | ItemId lpp = PageGetItemId(page, lineoff); |
513 | 0 | HeapTupleData loctup; |
514 | 0 | bool valid; |
515 | |
|
516 | 0 | if (!ItemIdIsNormal(lpp)) |
517 | 0 | continue; |
518 | | |
519 | 0 | loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp); |
520 | 0 | loctup.t_len = ItemIdGetLength(lpp); |
521 | 0 | loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd); |
522 | 0 | ItemPointerSet(&(loctup.t_self), block, lineoff); |
523 | |
524 | 0 | if (all_visible) |
525 | 0 | valid = true; |
526 | 0 | else |
527 | 0 | valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); |
528 | |
529 | 0 | if (check_serializable) |
530 | 0 | HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd, |
531 | 0 | &loctup, buffer, snapshot); |
532 | |
|
533 | 0 | if (valid) |
534 | 0 | { |
535 | 0 | scan->rs_vistuples[ntup] = lineoff; |
536 | 0 | ntup++; |
537 | 0 | } |
538 | 0 | } |
539 | |
540 | 0 | Assert(ntup <= MaxHeapTuplesPerPage); |
541 | |
542 | 0 | return ntup; |
543 | 0 | } |
544 | | |
545 | | /* |
546 | | * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode |
547 | | * |
548 | | * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2. |
549 | | * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples. |
550 | | */ |
551 | | void |
552 | | heap_prepare_pagescan(TableScanDesc sscan) |
553 | 0 | { |
554 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
555 | 0 | Buffer buffer = scan->rs_cbuf; |
556 | 0 | BlockNumber block = scan->rs_cblock; |
557 | 0 | Snapshot snapshot; |
558 | 0 | Page page; |
559 | 0 | int lines; |
560 | 0 | bool all_visible; |
561 | 0 | bool check_serializable; |
562 | |
563 | 0 | Assert(BufferGetBlockNumber(buffer) == block); |
564 | | |
565 | | /* ensure we're not accidentally being used when not in pagemode */ |
566 | 0 | Assert(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE); |
567 | 0 | snapshot = scan->rs_base.rs_snapshot; |
568 | | |
569 | | /* |
570 | | * Prune and repair fragmentation for the whole page, if possible. |
571 | | */ |
572 | 0 | heap_page_prune_opt(scan->rs_base.rs_rd, buffer); |
573 | | |
574 | | /* |
575 | | * We must hold share lock on the buffer content while examining tuple |
576 | | * visibility. Afterwards, however, the tuples we have found to be |
577 | | * visible are guaranteed good as long as we hold the buffer pin. |
578 | | */ |
579 | 0 | LockBuffer(buffer, BUFFER_LOCK_SHARE); |
580 | |
581 | 0 | page = BufferGetPage(buffer); |
582 | 0 | lines = PageGetMaxOffsetNumber(page); |
583 | | |
584 | | /* |
585 | | * If the all-visible flag indicates that all tuples on the page are |
586 | | * visible to everyone, we can skip the per-tuple visibility tests. |
587 | | * |
588 | | * Note: In hot standby, a tuple that's already visible to all |
589 | | * transactions on the primary might still be invisible to a read-only |
590 | | * transaction in the standby. We partly handle this problem by tracking |
591 | | * the minimum xmin of visible tuples as the cut-off XID while marking a |
592 | | * page all-visible on the primary and WAL log that along with the |
593 | | * visibility map SET operation. In hot standby, we wait for (or abort) |
594 | | * all transactions that potentially cannot see one or more tuples on |
595 | | * the page. That's how index-only scans work fine in hot standby. A |
596 | | * crucial difference between index-only scans and heap scans is that the |
597 | | * index-only scan completely relies on the visibility map, whereas a heap |
598 | | * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if |
599 | | * the page-level flag can be trusted in the same way, because it might |
600 | | * get propagated somehow without being explicitly WAL-logged, e.g. via a |
601 | | * full page write. Until we can prove that beyond doubt, let's check each |
602 | | * tuple for visibility the hard way. |
603 | | */ |
604 | 0 | all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery; |
605 | 0 | check_serializable = |
606 | 0 | CheckForSerializableConflictOutNeeded(scan->rs_base.rs_rd, snapshot); |
607 | | |
608 | | /* |
609 | | * We call page_collect_tuples() with constant arguments, to get the |
610 | | * compiler to constant fold the constant arguments. Separate calls with |
611 | | * constant arguments, rather than variables, are needed on several |
612 | | * compilers to actually perform constant folding. |
613 | | */ |
614 | 0 | if (likely(all_visible)) |
615 | 0 | { |
616 | 0 | if (likely(!check_serializable)) |
617 | 0 | scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, |
618 | 0 | block, lines, true, false); |
619 | 0 | else |
620 | 0 | scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, |
621 | 0 | block, lines, true, true); |
622 | 0 | } |
623 | 0 | else |
624 | 0 | { |
625 | 0 | if (likely(!check_serializable)) |
626 | 0 | scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, |
627 | 0 | block, lines, false, false); |
628 | 0 | else |
629 | 0 | scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer, |
630 | 0 | block, lines, false, true); |
631 | 0 | } |
632 | |
633 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
634 | 0 | } |
635 | | |
636 | | /* |
637 | | * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM. |
638 | | * |
639 | | * Read the next block of the scan relation from the read stream and save it |
640 | | * in the scan descriptor. It is already pinned. |
641 | | */ |
642 | | static inline void |
643 | | heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir) |
644 | 0 | { |
645 | 0 | Assert(scan->rs_read_stream); |
646 | | |
647 | | /* release previous scan buffer, if any */ |
648 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
649 | 0 | { |
650 | 0 | ReleaseBuffer(scan->rs_cbuf); |
651 | 0 | scan->rs_cbuf = InvalidBuffer; |
652 | 0 | } |
653 | | |
654 | | /* |
655 | | * Be sure to check for interrupts at least once per page. Checks at |
656 | | * higher code levels won't be able to stop a seqscan that encounters many |
657 | | * pages' worth of consecutive dead tuples. |
658 | | */ |
659 | 0 | CHECK_FOR_INTERRUPTS(); |
660 | | |
661 | | /* |
662 | | * If the scan direction is changing, reset the prefetch block to the |
663 | | * current block. Otherwise, we will incorrectly prefetch the blocks |
664 | | * between the prefetch block and the current block again before |
665 | | * prefetching blocks in the new, correct scan direction. |
666 | | */ |
667 | 0 | if (unlikely(scan->rs_dir != dir)) |
668 | 0 | { |
669 | 0 | scan->rs_prefetch_block = scan->rs_cblock; |
670 | 0 | read_stream_reset(scan->rs_read_stream); |
671 | 0 | } |
672 | |
673 | 0 | scan->rs_dir = dir; |
674 | |
675 | 0 | scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL); |
676 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
677 | 0 | scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf); |
678 | 0 | } |
679 | | |
680 | | /* |
681 | | * heapgettup_initial_block - return the first BlockNumber to scan |
682 | | * |
683 | | * Returns InvalidBlockNumber when there are no blocks to scan. This can |
684 | | * occur with empty tables and in parallel scans when parallel workers get all |
685 | | * of the pages before we get a chance to claim our first page. |
686 | | */ |
687 | | static pg_noinline BlockNumber |
688 | | heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir) |
689 | 0 | { |
690 | 0 | Assert(!scan->rs_inited); |
691 | 0 | Assert(scan->rs_base.rs_parallel == NULL); |
692 | | |
693 | | /* When there are no pages to scan, return InvalidBlockNumber */ |
694 | 0 | if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0) |
695 | 0 | return InvalidBlockNumber; |
696 | | |
697 | 0 | if (ScanDirectionIsForward(dir)) |
698 | 0 | { |
699 | 0 | return scan->rs_startblock; |
700 | 0 | } |
701 | 0 | else |
702 | 0 | { |
703 | | /* |
704 | | * Disable reporting to syncscan logic in a backwards scan; it's not |
705 | | * very likely anyone else is doing the same thing at the same time, |
706 | | * and much more likely that we'll just bollix things for forward |
707 | | * scanners. |
708 | | */ |
709 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; |
710 | | |
711 | | /* |
712 | | * Start from last page of the scan. Ensure we take into account |
713 | | * rs_numblocks if it's been adjusted by heap_setscanlimits(). |
714 | | */ |
715 | 0 | if (scan->rs_numblocks != InvalidBlockNumber) |
716 | 0 | return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks; |
717 | | |
718 | 0 | if (scan->rs_startblock > 0) |
719 | 0 | return scan->rs_startblock - 1; |
720 | | |
721 | 0 | return scan->rs_nblocks - 1; |
722 | 0 | } |
723 | 0 | } |
724 | | |
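A worked example of the backward-scan start block computed above: with rs_nblocks = 12 and heap_setscanlimits() having set rs_startblock = 2 and rs_numblocks = 4, the limited range covers blocks 2..5, and (2 + 4 - 1) % 12 = 5 is where a backward scan begins; without a limit the scan begins at rs_startblock - 1, or at rs_nblocks - 1 when rs_startblock is 0.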
725 | | |
726 | | /* |
727 | | * heapgettup_start_page - helper function for heapgettup() |
728 | | * |
729 | | * Return the next page to scan based on the scan->rs_cbuf and set *linesleft |
730 | | * to the number of tuples on this page. Also set *lineoff to the first |
731 | | * offset to scan with forward scans getting the first offset and backward |
732 | | * getting the final offset on the page. |
733 | | */ |
734 | | static Page |
735 | | heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, |
736 | | OffsetNumber *lineoff) |
737 | 0 | { |
738 | 0 | Page page; |
739 | |
740 | 0 | Assert(scan->rs_inited); |
741 | 0 | Assert(BufferIsValid(scan->rs_cbuf)); |
742 | | |
743 | | /* Caller is responsible for ensuring buffer is locked if needed */ |
744 | 0 | page = BufferGetPage(scan->rs_cbuf); |
745 | |
746 | 0 | *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1; |
747 | |
748 | 0 | if (ScanDirectionIsForward(dir)) |
749 | 0 | *lineoff = FirstOffsetNumber; |
750 | 0 | else |
751 | 0 | *lineoff = (OffsetNumber) (*linesleft); |
752 | | |
753 | | /* lineoff now references the physically previous or next tid */ |
754 | 0 | return page; |
755 | 0 | } |
756 | | |
757 | | |
758 | | /* |
759 | | * heapgettup_continue_page - helper function for heapgettup() |
760 | | * |
761 | | * Return the next page to scan based on the scan->rs_cbuf and set *linesleft |
762 | | * to the number of tuples left to scan on this page. Also set *lineoff to |
763 | | * the next offset to scan according to the ScanDirection in 'dir'. |
764 | | */ |
765 | | static inline Page |
766 | | heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, |
767 | | OffsetNumber *lineoff) |
768 | 0 | { |
769 | 0 | Page page; |
770 | |
771 | 0 | Assert(scan->rs_inited); |
772 | 0 | Assert(BufferIsValid(scan->rs_cbuf)); |
773 | | |
774 | | /* Caller is responsible for ensuring buffer is locked if needed */ |
775 | 0 | page = BufferGetPage(scan->rs_cbuf); |
776 | |
777 | 0 | if (ScanDirectionIsForward(dir)) |
778 | 0 | { |
779 | 0 | *lineoff = OffsetNumberNext(scan->rs_coffset); |
780 | 0 | *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1; |
781 | 0 | } |
782 | 0 | else |
783 | 0 | { |
784 | | /* |
785 | | * The previous returned tuple may have been vacuumed since the |
786 | | * previous scan when we use a non-MVCC snapshot, so we must |
787 | | * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant |
788 | | */ |
789 | 0 | *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset)); |
790 | 0 | *linesleft = *lineoff; |
791 | 0 | } |
792 | | |
793 | | /* lineoff now references the physically previous or next tid */ |
794 | 0 | return page; |
795 | 0 | } |
796 | | |
797 | | /* |
798 | | * heapgettup_advance_block - helper for heap_fetch_next_buffer() |
799 | | * |
800 | | * Given the current block number, the scan direction, and various information |
801 | | * contained in the scan descriptor, calculate the BlockNumber to scan next |
802 | | * and return it. If there are no further blocks to scan, return |
803 | | * InvalidBlockNumber to indicate this fact to the caller. |
804 | | * |
805 | | * This should not be called to determine the initial block number -- only for |
806 | | * subsequent blocks. |
807 | | * |
808 | | * This also adjusts rs_numblocks when a limit has been imposed by |
809 | | * heap_setscanlimits(). |
810 | | */ |
811 | | static inline BlockNumber |
812 | | heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir) |
813 | 0 | { |
814 | 0 | Assert(scan->rs_base.rs_parallel == NULL); |
815 | |
816 | 0 | if (likely(ScanDirectionIsForward(dir))) |
817 | 0 | { |
818 | 0 | block++; |
819 | | |
820 | | /* wrap back to the start of the heap */ |
821 | 0 | if (block >= scan->rs_nblocks) |
822 | 0 | block = 0; |
823 | | |
824 | | /* |
825 | | * Report our new scan position for synchronization purposes. We don't |
826 | | * do that when moving backwards, however. That would just mess up any |
827 | | * other forward-moving scanners. |
828 | | * |
829 | | * Note: we do this before checking for end of scan so that the final |
830 | | * state of the position hint is back at the start of the rel. That's |
831 | | * not strictly necessary, but otherwise when you run the same query |
832 | | * multiple times the starting position would shift a little bit |
833 | | * backwards on every invocation, which is confusing. We don't |
834 | | * guarantee any specific ordering in general, though. |
835 | | */ |
836 | 0 | if (scan->rs_base.rs_flags & SO_ALLOW_SYNC) |
837 | 0 | ss_report_location(scan->rs_base.rs_rd, block); |
838 | | |
839 | | /* we're done if we're back at where we started */ |
840 | 0 | if (block == scan->rs_startblock) |
841 | 0 | return InvalidBlockNumber; |
842 | | |
843 | | /* check if the limit imposed by heap_setscanlimits() is met */ |
844 | 0 | if (scan->rs_numblocks != InvalidBlockNumber) |
845 | 0 | { |
846 | 0 | if (--scan->rs_numblocks == 0) |
847 | 0 | return InvalidBlockNumber; |
848 | 0 | } |
849 | | |
850 | 0 | return block; |
851 | 0 | } |
852 | 0 | else |
853 | 0 | { |
854 | | /* we're done if the last block is the start position */ |
855 | 0 | if (block == scan->rs_startblock) |
856 | 0 | return InvalidBlockNumber; |
857 | | |
858 | | /* check if the limit imposed by heap_setscanlimits() is met */ |
859 | 0 | if (scan->rs_numblocks != InvalidBlockNumber) |
860 | 0 | { |
861 | 0 | if (--scan->rs_numblocks == 0) |
862 | 0 | return InvalidBlockNumber; |
863 | 0 | } |
864 | | |
865 | | /* wrap to the end of the heap when the last page was page 0 */ |
866 | 0 | if (block == 0) |
867 | 0 | block = scan->rs_nblocks; |
868 | |
869 | 0 | block--; |
870 | |
871 | 0 | return block; |
872 | 0 | } |
873 | 0 | } |
874 | | |
875 | | /* ---------------- |
876 | | * heapgettup - fetch next heap tuple |
877 | | * |
878 | | * Initialize the scan if not already done; then advance to the next |
879 | | * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup, |
880 | | * or set scan->rs_ctup.t_data = NULL if no more tuples. |
881 | | * |
882 | | * Note: the reason nkeys/key are passed separately, even though they are |
883 | | * kept in the scan descriptor, is that the caller may not want us to check |
884 | | * the scankeys. |
885 | | * |
886 | | * Note: when we fall off the end of the scan in either direction, we |
887 | | * reset rs_inited. This means that a further request with the same |
888 | | * scan direction will restart the scan, which is a bit odd, but a |
889 | | * request with the opposite scan direction will start a fresh scan |
890 | | * in the proper direction. The latter is required behavior for cursors, |
891 | | * while the former case is generally undefined behavior in Postgres |
892 | | * so we don't care too much. |
893 | | * ---------------- |
894 | | */ |
895 | | static void |
896 | | heapgettup(HeapScanDesc scan, |
897 | | ScanDirection dir, |
898 | | int nkeys, |
899 | | ScanKey key) |
900 | 0 | { |
901 | 0 | HeapTuple tuple = &(scan->rs_ctup); |
902 | 0 | Page page; |
903 | 0 | OffsetNumber lineoff; |
904 | 0 | int linesleft; |
905 | |
906 | 0 | if (likely(scan->rs_inited)) |
907 | 0 | { |
908 | | /* continue from previously returned page/tuple */ |
909 | 0 | LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); |
910 | 0 | page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff); |
911 | 0 | goto continue_page; |
912 | 0 | } |
913 | | |
914 | | /* |
915 | | * advance the scan until we find a qualifying tuple or run out of stuff |
916 | | * to scan |
917 | | */ |
918 | 0 | while (true) |
919 | 0 | { |
920 | 0 | heap_fetch_next_buffer(scan, dir); |
921 | | |
922 | | /* did we run out of blocks to scan? */ |
923 | 0 | if (!BufferIsValid(scan->rs_cbuf)) |
924 | 0 | break; |
925 | | |
926 | 0 | Assert(BufferGetBlockNumber(scan->rs_cbuf) == scan->rs_cblock); |
927 | |
928 | 0 | LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); |
929 | 0 | page = heapgettup_start_page(scan, dir, &linesleft, &lineoff); |
930 | 0 | continue_page: |
931 | | |
932 | | /* |
933 | | * Only continue scanning the page while we have lines left. |
934 | | * |
935 | | * Note that this protects us from accessing line pointers past |
936 | | * PageGetMaxOffsetNumber(); both for forward scans when we resume the |
937 | | * table scan, and for when we start scanning a new page. |
938 | | */ |
939 | 0 | for (; linesleft > 0; linesleft--, lineoff += dir) |
940 | 0 | { |
941 | 0 | bool visible; |
942 | 0 | ItemId lpp = PageGetItemId(page, lineoff); |
943 | |
944 | 0 | if (!ItemIdIsNormal(lpp)) |
945 | 0 | continue; |
946 | | |
947 | 0 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); |
948 | 0 | tuple->t_len = ItemIdGetLength(lpp); |
949 | 0 | ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff); |
950 | |
951 | 0 | visible = HeapTupleSatisfiesVisibility(tuple, |
952 | 0 | scan->rs_base.rs_snapshot, |
953 | 0 | scan->rs_cbuf); |
954 | |
|
955 | 0 | HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd, |
956 | 0 | tuple, scan->rs_cbuf, |
957 | 0 | scan->rs_base.rs_snapshot); |
958 | | |
959 | | /* skip tuples not visible to this snapshot */ |
960 | 0 | if (!visible) |
961 | 0 | continue; |
962 | | |
963 | | /* skip any tuples that don't match the scan key */ |
964 | 0 | if (key != NULL && |
965 | 0 | !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), |
966 | 0 | nkeys, key)) |
967 | 0 | continue; |
968 | | |
969 | 0 | LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
970 | 0 | scan->rs_coffset = lineoff; |
971 | 0 | return; |
972 | 0 | } |
973 | | |
974 | | /* |
975 | | * if we get here, it means we've exhausted the items on this page and |
976 | | * it's time to move to the next. |
977 | | */ |
978 | 0 | LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); |
979 | 0 | } |
980 | | |
981 | | /* end of scan */ |
982 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
983 | 0 | ReleaseBuffer(scan->rs_cbuf); |
984 | |
985 | 0 | scan->rs_cbuf = InvalidBuffer; |
986 | 0 | scan->rs_cblock = InvalidBlockNumber; |
987 | 0 | scan->rs_prefetch_block = InvalidBlockNumber; |
988 | 0 | tuple->t_data = NULL; |
989 | 0 | scan->rs_inited = false; |
990 | 0 | } |
991 | | |
992 | | /* ---------------- |
993 | | * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode |
994 | | * |
995 | | * Same API as heapgettup, but used in page-at-a-time mode |
996 | | * |
997 | | * The internal logic is much the same as heapgettup's too, but there are some |
998 | | * differences: we do not take the buffer content lock (that only needs to |
999 | | * happen inside heap_prepare_pagescan), and we iterate through just the |
1000 | | * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice |
1001 | | * that lineindex is 0-based, where the corresponding loop variable lineoff in |
1002 | | * heapgettup is 1-based. |
1003 | | * ---------------- |
1004 | | */ |
1005 | | static void |
1006 | | heapgettup_pagemode(HeapScanDesc scan, |
1007 | | ScanDirection dir, |
1008 | | int nkeys, |
1009 | | ScanKey key) |
1010 | 0 | { |
1011 | 0 | HeapTuple tuple = &(scan->rs_ctup); |
1012 | 0 | Page page; |
1013 | 0 | uint32 lineindex; |
1014 | 0 | uint32 linesleft; |
1015 | |
1016 | 0 | if (likely(scan->rs_inited)) |
1017 | 0 | { |
1018 | | /* continue from previously returned page/tuple */ |
1019 | 0 | page = BufferGetPage(scan->rs_cbuf); |
1020 | |
1021 | 0 | lineindex = scan->rs_cindex + dir; |
1022 | 0 | if (ScanDirectionIsForward(dir)) |
1023 | 0 | linesleft = scan->rs_ntuples - lineindex; |
1024 | 0 | else |
1025 | 0 | linesleft = scan->rs_cindex; |
1026 | | /* lineindex now references the next or previous visible tid */ |
1027 | |
1028 | 0 | goto continue_page; |
1029 | 0 | } |
1030 | | |
1031 | | /* |
1032 | | * advance the scan until we find a qualifying tuple or run out of stuff |
1033 | | * to scan |
1034 | | */ |
1035 | 0 | while (true) |
1036 | 0 | { |
1037 | 0 | heap_fetch_next_buffer(scan, dir); |
1038 | | |
1039 | | /* did we run out of blocks to scan? */ |
1040 | 0 | if (!BufferIsValid(scan->rs_cbuf)) |
1041 | 0 | break; |
1042 | | |
1043 | 0 | Assert(BufferGetBlockNumber(scan->rs_cbuf) == scan->rs_cblock); |
1044 | | |
1045 | | /* prune the page and determine visible tuple offsets */ |
1046 | 0 | heap_prepare_pagescan((TableScanDesc) scan); |
1047 | 0 | page = BufferGetPage(scan->rs_cbuf); |
1048 | 0 | linesleft = scan->rs_ntuples; |
1049 | 0 | lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1; |
1050 | | |
1051 | | /* block is the same for all tuples, set it once outside the loop */ |
1052 | 0 | ItemPointerSetBlockNumber(&tuple->t_self, scan->rs_cblock); |
1053 | | |
1054 | | /* lineindex now references the next or previous visible tid */ |
1055 | 0 | continue_page: |
1056 | |
|
1057 | 0 | for (; linesleft > 0; linesleft--, lineindex += dir) |
1058 | 0 | { |
1059 | 0 | ItemId lpp; |
1060 | 0 | OffsetNumber lineoff; |
1061 | |
1062 | 0 | Assert(lineindex <= scan->rs_ntuples); |
1063 | 0 | lineoff = scan->rs_vistuples[lineindex]; |
1064 | 0 | lpp = PageGetItemId(page, lineoff); |
1065 | 0 | Assert(ItemIdIsNormal(lpp)); |
1066 | |
1067 | 0 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp); |
1068 | 0 | tuple->t_len = ItemIdGetLength(lpp); |
1069 | 0 | ItemPointerSetOffsetNumber(&tuple->t_self, lineoff); |
1070 | | |
1071 | | /* skip any tuples that don't match the scan key */ |
1072 | 0 | if (key != NULL && |
1073 | 0 | !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd), |
1074 | 0 | nkeys, key)) |
1075 | 0 | continue; |
1076 | | |
1077 | 0 | scan->rs_cindex = lineindex; |
1078 | 0 | return; |
1079 | 0 | } |
1080 | 0 | } |
1081 | | |
1082 | | /* end of scan */ |
1083 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
1084 | 0 | ReleaseBuffer(scan->rs_cbuf); |
1085 | 0 | scan->rs_cbuf = InvalidBuffer; |
1086 | 0 | scan->rs_cblock = InvalidBlockNumber; |
1087 | 0 | scan->rs_prefetch_block = InvalidBlockNumber; |
1088 | 0 | tuple->t_data = NULL; |
1089 | 0 | scan->rs_inited = false; |
1090 | 0 | } |
1091 | | |
1092 | | |
1093 | | /* ---------------------------------------------------------------- |
1094 | | * heap access method interface |
1095 | | * ---------------------------------------------------------------- |
1096 | | */ |
1097 | | |
1098 | | |
1099 | | TableScanDesc |
1100 | | heap_beginscan(Relation relation, Snapshot snapshot, |
1101 | | int nkeys, ScanKey key, |
1102 | | ParallelTableScanDesc parallel_scan, |
1103 | | uint32 flags) |
1104 | 0 | { |
1105 | 0 | HeapScanDesc scan; |
1106 | | |
1107 | | /* |
1108 | | * increment relation ref count while scanning relation |
1109 | | * |
1110 | | * This is just to make really sure the relcache entry won't go away while |
1111 | | * the scan has a pointer to it. Caller should be holding the rel open |
1112 | | * anyway, so this is redundant in all normal scenarios... |
1113 | | */ |
1114 | 0 | RelationIncrementReferenceCount(relation); |
1115 | | |
1116 | | /* |
1117 | | * allocate and initialize scan descriptor |
1118 | | */ |
1119 | 0 | if (flags & SO_TYPE_BITMAPSCAN) |
1120 | 0 | { |
1121 | 0 | BitmapHeapScanDesc bscan = palloc(sizeof(BitmapHeapScanDescData)); |
1122 | | |
1123 | | /* |
1124 | | * Bitmap Heap scans do not have any fields that a normal Heap Scan |
1125 | | * does not have, so no special initializations are required here. |
1126 | | */ |
1127 | 0 | scan = (HeapScanDesc) bscan; |
1128 | 0 | } |
1129 | 0 | else |
1130 | 0 | scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData)); |
1131 | |
1132 | 0 | scan->rs_base.rs_rd = relation; |
1133 | 0 | scan->rs_base.rs_snapshot = snapshot; |
1134 | 0 | scan->rs_base.rs_nkeys = nkeys; |
1135 | 0 | scan->rs_base.rs_flags = flags; |
1136 | 0 | scan->rs_base.rs_parallel = parallel_scan; |
1137 | 0 | scan->rs_strategy = NULL; /* set in initscan */ |
1138 | 0 | scan->rs_cbuf = InvalidBuffer; |
1139 | | |
1140 | | /* |
1141 | | * Disable page-at-a-time mode if it's not a MVCC-safe snapshot. |
1142 | | */ |
1143 | 0 | if (!(snapshot && IsMVCCSnapshot(snapshot))) |
1144 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; |
1145 | | |
1146 | | /* |
1147 | | * For seqscan and sample scans in a serializable transaction, acquire a |
1148 | | * predicate lock on the entire relation. This is required not only to |
1149 | | * lock all the matching tuples, but also to conflict with new insertions |
1150 | | * into the table. In an indexscan, we take page locks on the index pages |
1151 | | * covering the range specified in the scan qual, but in a heap scan there |
1152 | | * is nothing more fine-grained to lock. A bitmap scan is a different |
1153 | | * story, there we have already scanned the index and locked the index |
1154 | | * pages covering the predicate. But in that case we still have to lock |
1155 | | * any matching heap tuples. For sample scan we could optimize the locking |
1156 | | * to be at least page-level granularity, but we'd need to add per-tuple |
1157 | | * locking for that. |
1158 | | */ |
1159 | 0 | if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN)) |
1160 | 0 | { |
1161 | | /* |
1162 | | * Ensure a missing snapshot is noticed reliably, even if the |
1163 | | * isolation mode means predicate locking isn't performed (and |
1164 | | * therefore the snapshot isn't used here). |
1165 | | */ |
1166 | 0 | Assert(snapshot); |
1167 | 0 | PredicateLockRelation(relation, snapshot); |
1168 | 0 | } |
1169 | | |
1170 | | /* we only need to set this up once */ |
1171 | 0 | scan->rs_ctup.t_tableOid = RelationGetRelid(relation); |
1172 | | |
1173 | | /* |
1174 | | * Allocate memory to keep track of page allocation for parallel workers |
1175 | | * when doing a parallel scan. |
1176 | | */ |
1177 | 0 | if (parallel_scan != NULL) |
1178 | 0 | scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData)); |
1179 | 0 | else |
1180 | 0 | scan->rs_parallelworkerdata = NULL; |
1181 | | |
1182 | | /* |
1183 | | * we do this here instead of in initscan() because heap_rescan also calls |
1184 | | * initscan() and we don't want to allocate memory again |
1185 | | */ |
1186 | 0 | if (nkeys > 0) |
1187 | 0 | scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys); |
1188 | 0 | else |
1189 | 0 | scan->rs_base.rs_key = NULL; |
1190 | |
1191 | 0 | initscan(scan, key, false); |
1192 | |
1193 | 0 | scan->rs_read_stream = NULL; |
1194 | | |
1195 | | /* |
1196 | | * Set up a read stream for sequential scans and TID range scans. This |
1197 | | * should be done after initscan() because initscan() allocates the |
1198 | | * BufferAccessStrategy object passed to the read stream API. |
1199 | | */ |
1200 | 0 | if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN || |
1201 | 0 | scan->rs_base.rs_flags & SO_TYPE_TIDRANGESCAN) |
1202 | 0 | { |
1203 | 0 | ReadStreamBlockNumberCB cb; |
1204 | |
1205 | 0 | if (scan->rs_base.rs_parallel) |
1206 | 0 | cb = heap_scan_stream_read_next_parallel; |
1207 | 0 | else |
1208 | 0 | cb = heap_scan_stream_read_next_serial; |
1209 | | |
1210 | | /* --- |
1211 | | * It is safe to use batchmode as the only locks taken by `cb` |
1212 | | * are never taken while waiting for IO: |
1213 | | * - SyncScanLock is used in the non-parallel case |
1214 | | * - in the parallel case, only spinlocks and atomics are used |
1215 | | * --- |
1216 | | */ |
1217 | 0 | scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_SEQUENTIAL | |
1218 | 0 | READ_STREAM_USE_BATCHING, |
1219 | 0 | scan->rs_strategy, |
1220 | 0 | scan->rs_base.rs_rd, |
1221 | 0 | MAIN_FORKNUM, |
1222 | 0 | cb, |
1223 | 0 | scan, |
1224 | 0 | 0); |
1225 | 0 | } |
1226 | 0 | else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN) |
1227 | 0 | { |
1228 | 0 | scan->rs_read_stream = read_stream_begin_relation(READ_STREAM_DEFAULT | |
1229 | 0 | READ_STREAM_USE_BATCHING, |
1230 | 0 | scan->rs_strategy, |
1231 | 0 | scan->rs_base.rs_rd, |
1232 | 0 | MAIN_FORKNUM, |
1233 | 0 | bitmapheap_stream_read_next, |
1234 | 0 | scan, |
1235 | 0 | sizeof(TBMIterateResult)); |
1236 | 0 | } |
1237 | | |
1238 | |
1239 | 0 | return (TableScanDesc) scan; |
1240 | 0 | } |
1241 | | |
1242 | | void |
1243 | | heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, |
1244 | | bool allow_strat, bool allow_sync, bool allow_pagemode) |
1245 | 0 | { |
1246 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1247 | |
1248 | 0 | if (set_params) |
1249 | 0 | { |
1250 | 0 | if (allow_strat) |
1251 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_STRAT; |
1252 | 0 | else |
1253 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT; |
1254 | |
1255 | 0 | if (allow_sync) |
1256 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_SYNC; |
1257 | 0 | else |
1258 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC; |
1259 | |
1260 | 0 | if (allow_pagemode && scan->rs_base.rs_snapshot && |
1261 | 0 | IsMVCCSnapshot(scan->rs_base.rs_snapshot)) |
1262 | 0 | scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE; |
1263 | 0 | else |
1264 | 0 | scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE; |
1265 | 0 | } |
1266 | | |
1267 | | /* |
1268 | | * unpin scan buffers |
1269 | | */ |
1270 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
1271 | 0 | { |
1272 | 0 | ReleaseBuffer(scan->rs_cbuf); |
1273 | 0 | scan->rs_cbuf = InvalidBuffer; |
1274 | 0 | } |
1275 | | |
1276 | | /* |
1277 | | * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any |
1278 | | * additional data vs a normal HeapScan |
1279 | | */ |
1280 | | |
1281 | | /* |
1282 | | * The read stream is reset on rescan. This must be done before |
1283 | | * initscan(), as some state referred to by read_stream_reset() is reset |
1284 | | * in initscan(). |
1285 | | */ |
1286 | 0 | if (scan->rs_read_stream) |
1287 | 0 | read_stream_reset(scan->rs_read_stream); |
1288 | | |
1289 | | /* |
1290 | | * reinitialize scan descriptor |
1291 | | */ |
1292 | 0 | initscan(scan, key, true); |
1293 | 0 | } |
1294 | | |
1295 | | void |
1296 | | heap_endscan(TableScanDesc sscan) |
1297 | 0 | { |
1298 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1299 | | |
1300 | | /* Note: no locking manipulations needed */ |
1301 | | |
1302 | | /* |
1303 | | * unpin scan buffers |
1304 | | */ |
1305 | 0 | if (BufferIsValid(scan->rs_cbuf)) |
1306 | 0 | ReleaseBuffer(scan->rs_cbuf); |
1307 | | |
1308 | | /* |
1309 | | * Must free the read stream before freeing the BufferAccessStrategy. |
1310 | | */ |
1311 | 0 | if (scan->rs_read_stream) |
1312 | 0 | read_stream_end(scan->rs_read_stream); |
1313 | | |
1314 | | /* |
1315 | | * decrement relation reference count and free scan descriptor storage |
1316 | | */ |
1317 | 0 | RelationDecrementReferenceCount(scan->rs_base.rs_rd); |
1318 | |
1319 | 0 | if (scan->rs_base.rs_key) |
1320 | 0 | pfree(scan->rs_base.rs_key); |
1321 | |
1322 | 0 | if (scan->rs_strategy != NULL) |
1323 | 0 | FreeAccessStrategy(scan->rs_strategy); |
1324 | |
1325 | 0 | if (scan->rs_parallelworkerdata != NULL) |
1326 | 0 | pfree(scan->rs_parallelworkerdata); |
1327 | |
1328 | 0 | if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) |
1329 | 0 | UnregisterSnapshot(scan->rs_base.rs_snapshot); |
1330 | |
1331 | 0 | pfree(scan); |
1332 | 0 | } |
1333 | | |
1334 | | HeapTuple |
1335 | | heap_getnext(TableScanDesc sscan, ScanDirection direction) |
1336 | 0 | { |
1337 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1338 | | |
1339 | | /* |
1340 | | * This is still widely used directly, without going through table AM, so |
1341 | | * add a safety check. It's possible we should, at a later point, |
1342 | | * downgrade this to an assert. The reason for checking the AM routine, |
1343 | | * rather than the AM oid, is that this allows to write regression tests |
1344 | | * rather than the AM oid, is that this allows writing regression tests |
1345 | | */ |
1346 | 0 | if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine())) |
1347 | 0 | ereport(ERROR, |
1348 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1349 | 0 | errmsg_internal("only heap AM is supported"))); |
1350 | | |
1351 | | /* |
1352 | | * We don't expect direct calls to heap_getnext with valid CheckXidAlive |
1353 | | * for catalog or regular tables. See detailed comments in xact.c where |
1354 | | * these variables are declared. Normally we have such a check at tableam |
1355 | | * level API but this is called from many places so we need to ensure it |
1356 | | * here. |
1357 | | */ |
1358 | 0 | if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) |
1359 | 0 | elog(ERROR, "unexpected heap_getnext call during logical decoding"); |
1360 | | |
1361 | | /* Note: no locking manipulations needed */ |
1362 | | |
1363 | 0 | if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) |
1364 | 0 | heapgettup_pagemode(scan, direction, |
1365 | 0 | scan->rs_base.rs_nkeys, scan->rs_base.rs_key); |
1366 | 0 | else |
1367 | 0 | heapgettup(scan, direction, |
1368 | 0 | scan->rs_base.rs_nkeys, scan->rs_base.rs_key); |
1369 | |
1370 | 0 | if (scan->rs_ctup.t_data == NULL) |
1371 | 0 | return NULL; |
1372 | | |
1373 | | /* |
1374 | | * if we get here it means we have a new current scan tuple, so point to |
1375 | | * the proper return buffer and return the tuple. |
1376 | | */ |
1377 | | |
1378 | 0 | pgstat_count_heap_getnext(scan->rs_base.rs_rd); |
1379 | |
1380 | 0 | return &scan->rs_ctup; |
1381 | 0 | } |
1382 | | |
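For orientation, here is a minimal caller-side sketch of the scan interface exercised above: begin a scan, loop with heap_getnext(), end the scan. It is illustrative only — count_visible_tuples() is a made-up helper, it assumes it runs inside a transaction with an active snapshot, and it goes through the table AM wrappers (table_beginscan()/table_endscan()) rather than calling heap_beginscan() directly:

    #include "postgres.h"

    #include "access/heapam.h"
    #include "access/table.h"
    #include "access/tableam.h"
    #include "utils/snapmgr.h"

    /* Illustrative sketch: count the tuples visible to the active snapshot. */
    static uint64
    count_visible_tuples(Oid relid)
    {
        Relation        rel = table_open(relid, AccessShareLock);
        TableScanDesc   scan;
        HeapTuple       tuple;
        uint64          ntuples = 0;

        /* plain forward sequential scan, no scan keys */
        scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL);

        /* heap_getnext() returns NULL once the scan falls off the end */
        while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
            ntuples++;

        table_endscan(scan);
        table_close(rel, AccessShareLock);

        return ntuples;
    }

(heap_getnext() itself raises the "only heap AM is supported" error shown above if the relation's access method is not heap.)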
1383 | | bool |
1384 | | heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) |
1385 | 0 | { |
1386 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1387 | | |
1388 | | /* Note: no locking manipulations needed */ |
1389 | |
1390 | 0 | if (sscan->rs_flags & SO_ALLOW_PAGEMODE) |
1391 | 0 | heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); |
1392 | 0 | else |
1393 | 0 | heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); |
1394 | |
1395 | 0 | if (scan->rs_ctup.t_data == NULL) |
1396 | 0 | { |
1397 | 0 | ExecClearTuple(slot); |
1398 | 0 | return false; |
1399 | 0 | } |
1400 | | |
1401 | | /* |
1402 | | * if we get here it means we have a new current scan tuple, so point to |
1403 | | * the proper return buffer and return the tuple. |
1404 | | */ |
1405 | | |
1406 | 0 | pgstat_count_heap_getnext(scan->rs_base.rs_rd); |
1407 | |
1408 | 0 | ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, |
1409 | 0 | scan->rs_cbuf); |
1410 | 0 | return true; |
1411 | 0 | } |
1412 | | |
1413 | | void |
1414 | | heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, |
1415 | | ItemPointer maxtid) |
1416 | 0 | { |
1417 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1418 | 0 | BlockNumber startBlk; |
1419 | 0 | BlockNumber numBlks; |
1420 | 0 | ItemPointerData highestItem; |
1421 | 0 | ItemPointerData lowestItem; |
1422 | | |
1423 | | /* |
1424 | | * For relations without any pages, we can simply leave the TID range |
1425 | | * unset. There will be no tuples to scan, therefore no tuples outside |
1426 | | * the given TID range. |
1427 | | */ |
1428 | 0 | if (scan->rs_nblocks == 0) |
1429 | 0 | return; |
1430 | | |
1431 | | /* |
1432 | | * Set up some ItemPointers which point to the first and last possible |
1433 | | * tuples in the heap. |
1434 | | */ |
1435 | 0 | ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber); |
1436 | 0 | ItemPointerSet(&lowestItem, 0, FirstOffsetNumber); |
1437 | | |
1438 | | /* |
1439 | | * If the given maximum TID is below the highest possible TID in the |
1440 | | * relation, then restrict the range to that, otherwise we scan to the end |
1441 | | * of the relation. |
1442 | | */ |
1443 | 0 | if (ItemPointerCompare(maxtid, &highestItem) < 0) |
1444 | 0 | ItemPointerCopy(maxtid, &highestItem); |
1445 | | |
1446 | | /* |
1447 | | * If the given minimum TID is above the lowest possible TID in the |
1448 | | * relation, then restrict the range to only scan for TIDs above that. |
1449 | | */ |
1450 | 0 | if (ItemPointerCompare(mintid, &lowestItem) > 0) |
1451 | 0 | ItemPointerCopy(mintid, &lowestItem); |
1452 | | |
1453 | | /* |
1454 | | * Check for an empty range and protect against would-be negative |
1455 | | * results from the numBlks calculation below. |
1456 | | */ |
1457 | 0 | if (ItemPointerCompare(&highestItem, &lowestItem) < 0) |
1458 | 0 | { |
1459 | | /* Set an empty range of blocks to scan */ |
1460 | 0 | heap_setscanlimits(sscan, 0, 0); |
1461 | 0 | return; |
1462 | 0 | } |
1463 | | |
1464 | | /* |
1465 | | * Calculate the first block and the number of blocks we must scan. We |
1466 | | * could be more aggressive here and perform some more validation to try |
1467 | | * and further narrow the scope of blocks to scan by checking if the |
1468 | | * lowestItem has an offset above MaxOffsetNumber. In this case, we could |
1469 | | * advance startBlk by one. Likewise, if highestItem has an offset of 0 |
1470 | | * we could scan one fewer blocks. However, such an optimization does not |
1471 | | * seem worth troubling over, currently. |
1472 | | */ |
1473 | 0 | startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem); |
1474 | |
1475 | 0 | numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) - |
1476 | 0 | ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1; |
1477 | | |
1478 | | /* Set the start block and number of blocks to scan */ |
1479 | 0 | heap_setscanlimits(sscan, startBlk, numBlks); |
1480 | | |
1481 | | /* Finally, set the TID range in sscan */ |
1482 | 0 | ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid); |
1483 | 0 | ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid); |
1484 | 0 | } |
1485 | | |
1486 | | bool |
1487 | | heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, |
1488 | | TupleTableSlot *slot) |
1489 | 0 | { |
1490 | 0 | HeapScanDesc scan = (HeapScanDesc) sscan; |
1491 | 0 | ItemPointer mintid = &sscan->st.tidrange.rs_mintid; |
1492 | 0 | ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid; |
1493 | | |
1494 | | /* Note: no locking manipulations needed */ |
1495 | 0 | for (;;) |
1496 | 0 | { |
1497 | 0 | if (sscan->rs_flags & SO_ALLOW_PAGEMODE) |
1498 | 0 | heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key); |
1499 | 0 | else |
1500 | 0 | heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key); |
1501 | |
1502 | 0 | if (scan->rs_ctup.t_data == NULL) |
1503 | 0 | { |
1504 | 0 | ExecClearTuple(slot); |
1505 | 0 | return false; |
1506 | 0 | } |
1507 | | |
1508 | | /* |
1509 | | * heap_set_tidrange will have used heap_setscanlimits to limit the |
1510 | | * range of pages we scan to only ones that can contain the TID range |
1511 | | * we're scanning for. Here we must filter out any tuples from these |
1512 | | * pages that are outside of that range. |
1513 | | */ |
1514 | 0 | if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0) |
1515 | 0 | { |
1516 | 0 | ExecClearTuple(slot); |
1517 | | |
1518 | | /* |
1519 | | * When scanning backwards, the TIDs will be in descending order. |
1520 | | * Future tuples in this direction will be lower still, so we can |
1521 | | * just return false to indicate there will be no more tuples. |
1522 | | */ |
1523 | 0 | if (ScanDirectionIsBackward(direction)) |
1524 | 0 | return false; |
1525 | | |
1526 | 0 | continue; |
1527 | 0 | } |
1528 | | |
1529 | | /* |
1530 | | * Likewise for the final page, we must filter out TIDs greater than |
1531 | | * maxtid. |
1532 | | */ |
1533 | 0 | if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0) |
1534 | 0 | { |
1535 | 0 | ExecClearTuple(slot); |
1536 | | |
1537 | | /* |
1538 | | * When scanning forward, the TIDs will be in ascending order. |
1539 | | * Future tuples in this direction will be higher still, so we can |
1540 | | * just return false to indicate there will be no more tuples. |
1541 | | */ |
1542 | 0 | if (ScanDirectionIsForward(direction)) |
1543 | 0 | return false; |
1544 | 0 | continue; |
1545 | 0 | } |
1546 | | |
1547 | 0 | break; |
1548 | 0 | } |
1549 | | |
1550 | | /* |
1551 | | * if we get here it means we have a new current scan tuple, so point to |
1552 | | * the proper return buffer and return the tuple. |
1553 | | */ |
1554 | 0 | pgstat_count_heap_getnext(scan->rs_base.rs_rd); |
1555 | |
1556 | 0 | ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf); |
1557 | 0 | return true; |
1558 | 0 | } |
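/*
 * Illustrative sketch (not part of heapam.c): how a TID range scan could be
 * driven through the tableam wrappers that end up in heap_set_tidrange() and
 * heap_getnextslot_tidrange() above.  example_tidrange_scan() is hypothetical;
 * the wrapper functions exist in the tableam layer.
 */
static void
example_tidrange_scan(Relation rel, Snapshot snapshot,
					  ItemPointer mintid, ItemPointer maxtid,
					  TupleTableSlot *slot)
{
	TableScanDesc scan;

	scan = table_beginscan_tidrange(rel, snapshot, mintid, maxtid);
	while (table_scan_getnextslot_tidrange(scan, ForwardScanDirection, slot))
	{
		/* ... only tuples with mintid <= ctid <= maxtid arrive here ... */
	}
	table_endscan(scan);
}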
1559 | | |
1560 | | /* |
1561 | | * heap_fetch - retrieve tuple with given tid |
1562 | | * |
1563 | | * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding |
1564 | | * the tuple, fill in the remaining fields of *tuple, and check the tuple |
1565 | | * against the specified snapshot. |
1566 | | * |
1567 | | * If successful (tuple found and passes snapshot time qual), then *userbuf |
1568 | | * is set to the buffer holding the tuple and true is returned. The caller |
1569 | | * must unpin the buffer when done with the tuple. |
1570 | | * |
1571 | | * If the tuple is not found (ie, item number references a deleted slot), |
1572 | | * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer, |
1573 | | * and false is returned. |
1574 | | * |
1575 | | * If the tuple is found but fails the time qual check, then the behavior |
1576 | | * depends on the keep_buf parameter. If keep_buf is false, the results |
1577 | | * are the same as for the tuple-not-found case. If keep_buf is true, |
1578 | | * then tuple->t_data and *userbuf are returned as for the success case, |
1579 | | * and again the caller must unpin the buffer; but false is returned. |
1580 | | * |
1581 | | * heap_fetch does not follow HOT chains: only the exact TID requested will |
1582 | | * be fetched. |
1583 | | * |
1584 | | * It is somewhat inconsistent that we ereport() on invalid block number but |
1585 | | * return false on invalid item number. There are a couple of reasons though. |
1586 | | * One is that the caller can relatively easily check the block number for |
1587 | | * validity, but cannot check the item number without reading the page |
1588 | | * himself. Another is that when we are following a t_ctid link, we can be |
1589 | | * reasonably confident that the page number is valid (since VACUUM shouldn't |
1590 | | * truncate off the destination page without having killed the referencing |
1591 | | * tuple first), but the item number might well not be good. |
1592 | | */ |
1593 | | bool |
1594 | | heap_fetch(Relation relation, |
1595 | | Snapshot snapshot, |
1596 | | HeapTuple tuple, |
1597 | | Buffer *userbuf, |
1598 | | bool keep_buf) |
1599 | 0 | { |
1600 | 0 | ItemPointer tid = &(tuple->t_self); |
1601 | 0 | ItemId lp; |
1602 | 0 | Buffer buffer; |
1603 | 0 | Page page; |
1604 | 0 | OffsetNumber offnum; |
1605 | 0 | bool valid; |
1606 | | |
1607 | | /* |
1608 | | * Fetch and pin the appropriate page of the relation. |
1609 | | */ |
1610 | 0 | buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); |
1611 | | |
1612 | | /* |
1613 | | * Need share lock on buffer to examine tuple commit status. |
1614 | | */ |
1615 | 0 | LockBuffer(buffer, BUFFER_LOCK_SHARE); |
1616 | 0 | page = BufferGetPage(buffer); |
1617 | | |
1618 | | /* |
1619 | | * We'd better check for an out-of-range offnum in case the page has been |
1620 | | * vacuumed since the TID was obtained. |
1621 | | */ |
1622 | 0 | offnum = ItemPointerGetOffsetNumber(tid); |
1623 | 0 | if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) |
1624 | 0 | { |
1625 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
1626 | 0 | ReleaseBuffer(buffer); |
1627 | 0 | *userbuf = InvalidBuffer; |
1628 | 0 | tuple->t_data = NULL; |
1629 | 0 | return false; |
1630 | 0 | } |
1631 | | |
1632 | | /* |
1633 | | * get the item line pointer corresponding to the requested tid |
1634 | | */ |
1635 | 0 | lp = PageGetItemId(page, offnum); |
1636 | | |
1637 | | /* |
1638 | | * Must check for deleted tuple. |
1639 | | */ |
1640 | 0 | if (!ItemIdIsNormal(lp)) |
1641 | 0 | { |
1642 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
1643 | 0 | ReleaseBuffer(buffer); |
1644 | 0 | *userbuf = InvalidBuffer; |
1645 | 0 | tuple->t_data = NULL; |
1646 | 0 | return false; |
1647 | 0 | } |
1648 | | |
1649 | | /* |
1650 | | * fill in *tuple fields |
1651 | | */ |
1652 | 0 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); |
1653 | 0 | tuple->t_len = ItemIdGetLength(lp); |
1654 | 0 | tuple->t_tableOid = RelationGetRelid(relation); |
1655 | | |
1656 | | /* |
1657 | | * check tuple visibility, then release lock |
1658 | | */ |
1659 | 0 | valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); |
1660 | |
1661 | 0 | if (valid) |
1662 | 0 | PredicateLockTID(relation, &(tuple->t_self), snapshot, |
1663 | 0 | HeapTupleHeaderGetXmin(tuple->t_data)); |
1664 | |
1665 | 0 | HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot); |
1666 | |
1667 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
1668 | |
1669 | 0 | if (valid) |
1670 | 0 | { |
1671 | | /* |
1672 | | * All checks passed, so return the tuple as valid. Caller is now |
1673 | | * responsible for releasing the buffer. |
1674 | | */ |
1675 | 0 | *userbuf = buffer; |
1676 | |
1677 | 0 | return true; |
1678 | 0 | } |
1679 | | |
1680 | | /* Tuple failed time qual, but maybe caller wants to see it anyway. */ |
1681 | 0 | if (keep_buf) |
1682 | 0 | *userbuf = buffer; |
1683 | 0 | else |
1684 | 0 | { |
1685 | 0 | ReleaseBuffer(buffer); |
1686 | 0 | *userbuf = InvalidBuffer; |
1687 | 0 | tuple->t_data = NULL; |
1688 | 0 | } |
1689 | |
1690 | 0 | return false; |
1691 | 0 | } |
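/*
 * Illustrative sketch (not part of heapam.c): fetching a known TID with
 * keep_buf = false.  example_fetch_tid() is hypothetical; the pattern of
 * filling t_self, calling heap_fetch() and unpinning on success follows the
 * contract described in the comment above.
 */
static bool
example_fetch_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buf;

	tuple.t_self = *tid;
	if (!heap_fetch(rel, snapshot, &tuple, &buf, false))
		return false;			/* not found, or failed the snapshot test */

	/* ... tuple.t_data stays valid while we hold the buffer pin ... */

	ReleaseBuffer(buf);
	return true;
}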
1692 | | |
1693 | | /* |
1694 | | * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot |
1695 | | * |
1696 | | * On entry, *tid is the TID of a tuple (either a simple tuple, or the root |
1697 | | * of a HOT chain), and buffer is the buffer holding this tuple. We search |
1698 | | * for the first chain member satisfying the given snapshot. If one is |
1699 | | * found, we update *tid to reference that tuple's offset number, and |
1700 | | * return true. If no match, return false without modifying *tid. |
1701 | | * |
1702 | | * heapTuple is a caller-supplied buffer. When a match is found, we return |
1703 | | * the tuple here, in addition to updating *tid. If no match is found, the |
1704 | | * contents of this buffer on return are undefined. |
1705 | | * |
1706 | | * If all_dead is not NULL, we check non-visible tuples to see if they are |
1707 | | * globally dead; *all_dead is set true if all members of the HOT chain |
1708 | | * are vacuumable, false if not. |
1709 | | * |
1710 | | * Unlike heap_fetch, the caller must already have pin and (at least) share |
1711 | | * lock on the buffer; it is still pinned/locked at exit. |
1712 | | */ |
1713 | | bool |
1714 | | heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, |
1715 | | Snapshot snapshot, HeapTuple heapTuple, |
1716 | | bool *all_dead, bool first_call) |
1717 | 0 | { |
1718 | 0 | Page page = BufferGetPage(buffer); |
1719 | 0 | TransactionId prev_xmax = InvalidTransactionId; |
1720 | 0 | BlockNumber blkno; |
1721 | 0 | OffsetNumber offnum; |
1722 | 0 | bool at_chain_start; |
1723 | 0 | bool valid; |
1724 | 0 | bool skip; |
1725 | 0 | GlobalVisState *vistest = NULL; |
1726 | | |
1727 | | /* If this is not the first call, previous call returned a (live!) tuple */ |
1728 | 0 | if (all_dead) |
1729 | 0 | *all_dead = first_call; |
1730 | |
1731 | 0 | blkno = ItemPointerGetBlockNumber(tid); |
1732 | 0 | offnum = ItemPointerGetOffsetNumber(tid); |
1733 | 0 | at_chain_start = first_call; |
1734 | 0 | skip = !first_call; |
1735 | | |
1736 | | /* XXX: we should assert that a snapshot is pushed or registered */ |
1737 | 0 | Assert(TransactionIdIsValid(RecentXmin)); |
1738 | 0 | Assert(BufferGetBlockNumber(buffer) == blkno); |
1739 | | |
1740 | | /* Scan through possible multiple members of HOT-chain */ |
1741 | 0 | for (;;) |
1742 | 0 | { |
1743 | 0 | ItemId lp; |
1744 | | |
1745 | | /* check for bogus TID */ |
1746 | 0 | if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) |
1747 | 0 | break; |
1748 | | |
1749 | 0 | lp = PageGetItemId(page, offnum); |
1750 | | |
1751 | | /* check for unused, dead, or redirected items */ |
1752 | 0 | if (!ItemIdIsNormal(lp)) |
1753 | 0 | { |
1754 | | /* We should only see a redirect at start of chain */ |
1755 | 0 | if (ItemIdIsRedirected(lp) && at_chain_start) |
1756 | 0 | { |
1757 | | /* Follow the redirect */ |
1758 | 0 | offnum = ItemIdGetRedirect(lp); |
1759 | 0 | at_chain_start = false; |
1760 | 0 | continue; |
1761 | 0 | } |
1762 | | /* else must be end of chain */ |
1763 | 0 | break; |
1764 | 0 | } |
1765 | | |
1766 | | /* |
1767 | | * Update heapTuple to point to the element of the HOT chain we're |
1768 | | * currently investigating. Having t_self set correctly is important |
1769 | | * because the SSI checks and the *Satisfies routine for historical |
1770 | | * MVCC snapshots need the correct tid to decide about visibility. |
1771 | | */ |
1772 | 0 | heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); |
1773 | 0 | heapTuple->t_len = ItemIdGetLength(lp); |
1774 | 0 | heapTuple->t_tableOid = RelationGetRelid(relation); |
1775 | 0 | ItemPointerSet(&heapTuple->t_self, blkno, offnum); |
1776 | | |
1777 | | /* |
1778 | | * Shouldn't see a HEAP_ONLY tuple at chain start. |
1779 | | */ |
1780 | 0 | if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) |
1781 | 0 | break; |
1782 | | |
1783 | | /* |
1784 | | * The xmin should match the previous xmax value, else chain is |
1785 | | * broken. |
1786 | | */ |
1787 | 0 | if (TransactionIdIsValid(prev_xmax) && |
1788 | 0 | !TransactionIdEquals(prev_xmax, |
1789 | 0 | HeapTupleHeaderGetXmin(heapTuple->t_data))) |
1790 | 0 | break; |
1791 | | |
1792 | | /* |
1793 | | * When first_call is true (and thus, skip is initially false) we'll |
1794 | | * return the first tuple we find. But on later passes, heapTuple |
1795 | | * will initially be pointing to the tuple we returned last time. |
1796 | | * Returning it again would be incorrect (and would loop forever), so |
1797 | | * we skip it and return the next match we find. |
1798 | | */ |
1799 | 0 | if (!skip) |
1800 | 0 | { |
1801 | | /* If it's visible per the snapshot, we must return it */ |
1802 | 0 | valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer); |
1803 | 0 | HeapCheckForSerializableConflictOut(valid, relation, heapTuple, |
1804 | 0 | buffer, snapshot); |
1805 | |
1806 | 0 | if (valid) |
1807 | 0 | { |
1808 | 0 | ItemPointerSetOffsetNumber(tid, offnum); |
1809 | 0 | PredicateLockTID(relation, &heapTuple->t_self, snapshot, |
1810 | 0 | HeapTupleHeaderGetXmin(heapTuple->t_data)); |
1811 | 0 | if (all_dead) |
1812 | 0 | *all_dead = false; |
1813 | 0 | return true; |
1814 | 0 | } |
1815 | 0 | } |
1816 | 0 | skip = false; |
1817 | | |
1818 | | /* |
1819 | | * If we can't see it, maybe no one else can either. At caller |
1820 | | * request, check whether all chain members are dead to all |
1821 | | * transactions. |
1822 | | * |
1823 | | * Note: if you change the criterion here for what is "dead", fix the |
1824 | | * planner's get_actual_variable_range() function to match. |
1825 | | */ |
1826 | 0 | if (all_dead && *all_dead) |
1827 | 0 | { |
1828 | 0 | if (!vistest) |
1829 | 0 | vistest = GlobalVisTestFor(relation); |
1830 | |
1831 | 0 | if (!HeapTupleIsSurelyDead(heapTuple, vistest)) |
1832 | 0 | *all_dead = false; |
1833 | 0 | } |
1834 | | |
1835 | | /* |
1836 | | * Check to see if HOT chain continues past this tuple; if so fetch |
1837 | | * the next offnum and loop around. |
1838 | | */ |
1839 | 0 | if (HeapTupleIsHotUpdated(heapTuple)) |
1840 | 0 | { |
1841 | 0 | Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) == |
1842 | 0 | blkno); |
1843 | 0 | offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid); |
1844 | 0 | at_chain_start = false; |
1845 | 0 | prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data); |
1846 | 0 | } |
1847 | 0 | else |
1848 | 0 | break; /* end of chain */ |
1849 | 0 | } |
1850 | | |
1851 | 0 | return false; |
1852 | 0 | } |
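/*
 * Illustrative sketch (not part of heapam.c): the usual pin-and-lock dance
 * around heap_hot_search_buffer() when resolving a root TID obtained from an
 * index, similar in spirit to what heapam_index_fetch_tuple() does.
 * example_hot_search() is hypothetical.
 */
static bool
example_hot_search(Relation rel, Snapshot snapshot, ItemPointer tid,
				   bool *all_dead)
{
	Buffer		buf;
	HeapTupleData heapTuple;
	bool		found;

	buf = ReadBuffer(rel, ItemPointerGetBlockNumber(tid));
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	found = heap_hot_search_buffer(tid, rel, buf, snapshot, &heapTuple,
								   all_dead, true);
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);

	return found;
}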
1853 | | |
1854 | | /* |
1855 | | * heap_get_latest_tid - get the latest tid of a specified tuple |
1856 | | * |
1857 | | * Actually, this gets the latest version that is visible according to the |
1858 | | * scan's snapshot. Create a scan using SnapshotDirty to get the very latest, |
1859 | | * possibly uncommitted version. |
1860 | | * |
1861 | | * *tid is both an input and an output parameter: it is updated to |
1862 | | * show the latest version of the row. Note that it will not be changed |
1863 | | * if no version of the row passes the snapshot test. |
1864 | | */ |
1865 | | void |
1866 | | heap_get_latest_tid(TableScanDesc sscan, |
1867 | | ItemPointer tid) |
1868 | 0 | { |
1869 | 0 | Relation relation = sscan->rs_rd; |
1870 | 0 | Snapshot snapshot = sscan->rs_snapshot; |
1871 | 0 | ItemPointerData ctid; |
1872 | 0 | TransactionId priorXmax; |
1873 | | |
1874 | | /* |
1875 | | * table_tuple_get_latest_tid() verified that the passed-in tid is valid. |
1876 | | * Assume, however, that t_ctid links are valid - there shouldn't be |
1877 | | * invalid ones in the table. |
1878 | | */ |
1879 | 0 | Assert(ItemPointerIsValid(tid)); |
1880 | | |
1881 | | /* |
1882 | | * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we |
1883 | | * need to examine, and *tid is the TID we will return if ctid turns out |
1884 | | * to be bogus. |
1885 | | * |
1886 | | * Note that we will loop until we reach the end of the t_ctid chain. |
1887 | | * Depending on the snapshot passed, there might be at most one visible |
1888 | | * version of the row, but we don't try to optimize for that. |
1889 | | */ |
1890 | 0 | ctid = *tid; |
1891 | 0 | priorXmax = InvalidTransactionId; /* cannot check first XMIN */ |
1892 | 0 | for (;;) |
1893 | 0 | { |
1894 | 0 | Buffer buffer; |
1895 | 0 | Page page; |
1896 | 0 | OffsetNumber offnum; |
1897 | 0 | ItemId lp; |
1898 | 0 | HeapTupleData tp; |
1899 | 0 | bool valid; |
1900 | | |
1901 | | /* |
1902 | | * Read, pin, and lock the page. |
1903 | | */ |
1904 | 0 | buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); |
1905 | 0 | LockBuffer(buffer, BUFFER_LOCK_SHARE); |
1906 | 0 | page = BufferGetPage(buffer); |
1907 | | |
1908 | | /* |
1909 | | * Check for bogus item number. This is not treated as an error |
1910 | | * condition because it can happen while following a t_ctid link. We |
1911 | | * just assume that the prior tid is OK and return it unchanged. |
1912 | | */ |
1913 | 0 | offnum = ItemPointerGetOffsetNumber(&ctid); |
1914 | 0 | if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page)) |
1915 | 0 | { |
1916 | 0 | UnlockReleaseBuffer(buffer); |
1917 | 0 | break; |
1918 | 0 | } |
1919 | 0 | lp = PageGetItemId(page, offnum); |
1920 | 0 | if (!ItemIdIsNormal(lp)) |
1921 | 0 | { |
1922 | 0 | UnlockReleaseBuffer(buffer); |
1923 | 0 | break; |
1924 | 0 | } |
1925 | | |
1926 | | /* OK to access the tuple */ |
1927 | 0 | tp.t_self = ctid; |
1928 | 0 | tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); |
1929 | 0 | tp.t_len = ItemIdGetLength(lp); |
1930 | 0 | tp.t_tableOid = RelationGetRelid(relation); |
1931 | | |
1932 | | /* |
1933 | | * After following a t_ctid link, we might arrive at an unrelated |
1934 | | * tuple. Check for XMIN match. |
1935 | | */ |
1936 | 0 | if (TransactionIdIsValid(priorXmax) && |
1937 | 0 | !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) |
1938 | 0 | { |
1939 | 0 | UnlockReleaseBuffer(buffer); |
1940 | 0 | break; |
1941 | 0 | } |
1942 | | |
1943 | | /* |
1944 | | * Check tuple visibility; if visible, set it as the new result |
1945 | | * candidate. |
1946 | | */ |
1947 | 0 | valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer); |
1948 | 0 | HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot); |
1949 | 0 | if (valid) |
1950 | 0 | *tid = ctid; |
1951 | | |
1952 | | /* |
1953 | | * If there's a valid t_ctid link, follow it, else we're done. |
1954 | | */ |
1955 | 0 | if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || |
1956 | 0 | HeapTupleHeaderIsOnlyLocked(tp.t_data) || |
1957 | 0 | HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) || |
1958 | 0 | ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) |
1959 | 0 | { |
1960 | 0 | UnlockReleaseBuffer(buffer); |
1961 | 0 | break; |
1962 | 0 | } |
1963 | | |
1964 | 0 | ctid = tp.t_data->t_ctid; |
1965 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data); |
1966 | 0 | UnlockReleaseBuffer(buffer); |
1967 | 0 | } /* end of loop */ |
1968 | 0 | } |
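/*
 * Illustrative sketch (not part of heapam.c): chasing a row to its latest
 * visible version via the tableam wrapper for the routine above.
 * example_latest_tid() is hypothetical.
 */
static void
example_latest_tid(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	TableScanDesc scan = table_beginscan_tid(rel, snapshot);

	/* On return, *tid points at the newest visible version of the row. */
	table_tuple_get_latest_tid(scan, tid);

	table_endscan(scan);
}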
1969 | | |
1970 | | |
1971 | | /* |
1972 | | * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends |
1973 | | * |
1974 | | * This is called after we have waited for the XMAX transaction to terminate. |
1975 | | * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will |
1976 | | * be set on exit. If the transaction committed, we set the XMAX_COMMITTED |
1977 | | * hint bit if possible --- but beware that that may not yet be possible, |
1978 | | * if the transaction committed asynchronously. |
1979 | | * |
1980 | | * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID |
1981 | | * even if it commits. |
1982 | | * |
1983 | | * Hence callers should look only at XMAX_INVALID. |
1984 | | * |
1985 | | * Note this is not allowed for tuples whose xmax is a multixact. |
1986 | | */ |
1987 | | static void |
1988 | | UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid) |
1989 | 0 | { |
1990 | 0 | Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid)); |
1991 | 0 | Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI)); |
1992 | |
1993 | 0 | if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID))) |
1994 | 0 | { |
1995 | 0 | if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) && |
1996 | 0 | TransactionIdDidCommit(xid)) |
1997 | 0 | HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, |
1998 | 0 | xid); |
1999 | 0 | else |
2000 | 0 | HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, |
2001 | 0 | InvalidTransactionId); |
2002 | 0 | } |
2003 | 0 | } |
2004 | | |
2005 | | |
2006 | | /* |
2007 | | * GetBulkInsertState - prepare status object for a bulk insert |
2008 | | */ |
2009 | | BulkInsertState |
2010 | | GetBulkInsertState(void) |
2011 | 0 | { |
2012 | 0 | BulkInsertState bistate; |
2013 | |
2014 | 0 | bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData)); |
2015 | 0 | bistate->strategy = GetAccessStrategy(BAS_BULKWRITE); |
2016 | 0 | bistate->current_buf = InvalidBuffer; |
2017 | 0 | bistate->next_free = InvalidBlockNumber; |
2018 | 0 | bistate->last_free = InvalidBlockNumber; |
2019 | 0 | bistate->already_extended_by = 0; |
2020 | 0 | return bistate; |
2021 | 0 | } |
2022 | | |
2023 | | /* |
2024 | | * FreeBulkInsertState - clean up after finishing a bulk insert |
2025 | | */ |
2026 | | void |
2027 | | FreeBulkInsertState(BulkInsertState bistate) |
2028 | 0 | { |
2029 | 0 | if (bistate->current_buf != InvalidBuffer) |
2030 | 0 | ReleaseBuffer(bistate->current_buf); |
2031 | 0 | FreeAccessStrategy(bistate->strategy); |
2032 | 0 | pfree(bistate); |
2033 | 0 | } |
2034 | | |
2035 | | /* |
2036 | | * ReleaseBulkInsertStatePin - release a buffer currently held in bistate |
2037 | | */ |
2038 | | void |
2039 | | ReleaseBulkInsertStatePin(BulkInsertState bistate) |
2040 | 0 | { |
2041 | 0 | if (bistate->current_buf != InvalidBuffer) |
2042 | 0 | ReleaseBuffer(bistate->current_buf); |
2043 | 0 | bistate->current_buf = InvalidBuffer; |
2044 | | |
2045 | | /* |
2046 | | * Despite the name, we also reset bulk relation extension state. |
2047 | | * Otherwise we can end up erroring out due to looking for free space in |
2048 | | * ->next_free of one partition, even though ->next_free was set when |
2049 | | * extending another partition. It could obviously also be bad for |
2050 | | * efficiency to look at existing blocks at offsets from another |
2051 | | * partition, even if we don't error out. |
2052 | | */ |
2053 | 0 | bistate->next_free = InvalidBlockNumber; |
2054 | 0 | bistate->last_free = InvalidBlockNumber; |
2055 | 0 | } |
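/*
 * Illustrative sketch (not part of heapam.c): the intended life cycle of a
 * BulkInsertState around a stream of heap_insert() calls, as COPY-style bulk
 * loading does.  example_bulk_load() and its tuple array are hypothetical.
 */
static void
example_bulk_load(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);

	for (int i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);
}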
2056 | | |
2057 | | |
2058 | | /* |
2059 | | * heap_insert - insert tuple into a heap |
2060 | | * |
2061 | | * The new tuple is stamped with current transaction ID and the specified |
2062 | | * command ID. |
2063 | | * |
2064 | | * See table_tuple_insert for comments about most of the input flags, except |
2065 | | * that this routine directly takes a tuple rather than a slot. |
2066 | | * |
2067 | | * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_ |
2068 | | * options, and additionally there is HEAP_INSERT_SPECULATIVE, which is used |
2069 | | * to implement table_tuple_insert_speculative(). |
2070 | | * |
2071 | | * On return the header fields of *tup are updated to match the stored tuple; |
2072 | | * in particular tup->t_self receives the actual TID where the tuple was |
2073 | | * stored. But note that any toasting of fields within the tuple data is NOT |
2074 | | * reflected into *tup. |
2075 | | */ |
2076 | | void |
2077 | | heap_insert(Relation relation, HeapTuple tup, CommandId cid, |
2078 | | int options, BulkInsertState bistate) |
2079 | 0 | { |
2080 | 0 | TransactionId xid = GetCurrentTransactionId(); |
2081 | 0 | HeapTuple heaptup; |
2082 | 0 | Buffer buffer; |
2083 | 0 | Buffer vmbuffer = InvalidBuffer; |
2084 | 0 | bool all_visible_cleared = false; |
2085 | | |
2086 | | /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ |
2087 | 0 | Assert(HeapTupleHeaderGetNatts(tup->t_data) <= |
2088 | 0 | RelationGetNumberOfAttributes(relation)); |
2089 | |
2090 | 0 | AssertHasSnapshotForToast(relation); |
2091 | | |
2092 | | /* |
2093 | | * Fill in tuple header fields and toast the tuple if necessary. |
2094 | | * |
2095 | | * Note: below this point, heaptup is the data we actually intend to store |
2096 | | * into the relation; tup is the caller's original untoasted data. |
2097 | | */ |
2098 | 0 | heaptup = heap_prepare_insert(relation, tup, xid, cid, options); |
2099 | | |
2100 | | /* |
2101 | | * Find buffer to insert this tuple into. If the page is all visible, |
2102 | | * this will also pin the requisite visibility map page. |
2103 | | */ |
2104 | 0 | buffer = RelationGetBufferForTuple(relation, heaptup->t_len, |
2105 | 0 | InvalidBuffer, options, bistate, |
2106 | 0 | &vmbuffer, NULL, |
2107 | 0 | 0); |
2108 | | |
2109 | | /* |
2110 | | * We're about to do the actual insert -- but check for conflict first, to |
2111 | | * avoid possibly having to roll back work we've just done. |
2112 | | * |
2113 | | * This is safe without a recheck as long as there is no possibility of |
2114 | | * another process scanning the page between this check and the insert |
2115 | | * being visible to the scan (i.e., an exclusive buffer content lock is |
2116 | | * continuously held from this point until the tuple insert is visible). |
2117 | | * |
2118 | | * For a heap insert, we only need to check for table-level SSI locks. Our |
2119 | | * new tuple can't possibly conflict with existing tuple locks, and heap |
2120 | | * page locks are only consolidated versions of tuple locks; they do not |
2121 | | * lock "gaps" as index page locks do. So we don't need to specify a |
2122 | | * buffer when making the call, which makes for a faster check. |
2123 | | */ |
2124 | 0 | CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); |
2125 | | |
2126 | | /* NO EREPORT(ERROR) from here till changes are logged */ |
2127 | 0 | START_CRIT_SECTION(); |
2128 | |
2129 | 0 | RelationPutHeapTuple(relation, buffer, heaptup, |
2130 | 0 | (options & HEAP_INSERT_SPECULATIVE) != 0); |
2131 | |
2132 | 0 | if (PageIsAllVisible(BufferGetPage(buffer))) |
2133 | 0 | { |
2134 | 0 | all_visible_cleared = true; |
2135 | 0 | PageClearAllVisible(BufferGetPage(buffer)); |
2136 | 0 | visibilitymap_clear(relation, |
2137 | 0 | ItemPointerGetBlockNumber(&(heaptup->t_self)), |
2138 | 0 | vmbuffer, VISIBILITYMAP_VALID_BITS); |
2139 | 0 | } |
2140 | | |
2141 | | /* |
2142 | | * XXX Should we set PageSetPrunable on this page ? |
2143 | | * |
2144 | | * The inserting transaction may eventually abort, thus making this tuple |
2145 | | * DEAD and hence available for pruning. Though we don't want to optimize |
2146 | | * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the |
2147 | | * aborted tuple will never be pruned until the next vacuum is triggered. |
2148 | | * |
2149 | | * If you do add PageSetPrunable here, add it in heap_xlog_insert too. |
2150 | | */ |
2151 | |
2152 | 0 | MarkBufferDirty(buffer); |
2153 | | |
2154 | | /* XLOG stuff */ |
2155 | 0 | if (RelationNeedsWAL(relation)) |
2156 | 0 | { |
2157 | 0 | xl_heap_insert xlrec; |
2158 | 0 | xl_heap_header xlhdr; |
2159 | 0 | XLogRecPtr recptr; |
2160 | 0 | Page page = BufferGetPage(buffer); |
2161 | 0 | uint8 info = XLOG_HEAP_INSERT; |
2162 | 0 | int bufflags = 0; |
2163 | | |
2164 | | /* |
2165 | | * If this is a catalog, we need to transmit combo CIDs to properly |
2166 | | * decode, so log that as well. |
2167 | | */ |
2168 | 0 | if (RelationIsAccessibleInLogicalDecoding(relation)) |
2169 | 0 | log_heap_new_cid(relation, heaptup); |
2170 | | |
2171 | | /* |
2172 | | * If this is the first and only tuple on the page, we can reinit the |
2173 | | * page instead of restoring the whole thing. Set the flag, and hide |
2174 | | * buffer references from XLogInsert. |
2175 | | */ |
2176 | 0 | if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber && |
2177 | 0 | PageGetMaxOffsetNumber(page) == FirstOffsetNumber) |
2178 | 0 | { |
2179 | 0 | info |= XLOG_HEAP_INIT_PAGE; |
2180 | 0 | bufflags |= REGBUF_WILL_INIT; |
2181 | 0 | } |
2182 | |
|
2183 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); |
2184 | 0 | xlrec.flags = 0; |
2185 | 0 | if (all_visible_cleared) |
2186 | 0 | xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; |
2187 | 0 | if (options & HEAP_INSERT_SPECULATIVE) |
2188 | 0 | xlrec.flags |= XLH_INSERT_IS_SPECULATIVE; |
2189 | 0 | Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer)); |
2190 | | |
2191 | | /* |
2192 | | * For logical decoding, we need the tuple even if we're doing a full |
2193 | | * page write, so make sure it's included even if we take a full-page |
2194 | | * image. (XXX We could alternatively store a pointer into the FPW). |
2195 | | */ |
2196 | 0 | if (RelationIsLogicallyLogged(relation) && |
2197 | 0 | !(options & HEAP_INSERT_NO_LOGICAL)) |
2198 | 0 | { |
2199 | 0 | xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; |
2200 | 0 | bufflags |= REGBUF_KEEP_DATA; |
2201 | |
2202 | 0 | if (IsToastRelation(relation)) |
2203 | 0 | xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; |
2204 | 0 | } |
2205 | |
2206 | 0 | XLogBeginInsert(); |
2207 | 0 | XLogRegisterData(&xlrec, SizeOfHeapInsert); |
2208 | |
2209 | 0 | xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; |
2210 | 0 | xlhdr.t_infomask = heaptup->t_data->t_infomask; |
2211 | 0 | xlhdr.t_hoff = heaptup->t_data->t_hoff; |
2212 | | |
2213 | | /* |
2214 | | * note we mark xlhdr as belonging to buffer; if XLogInsert decides to |
2215 | | * write the whole page to the xlog, we don't need to store |
2216 | | * xl_heap_header in the xlog. |
2217 | | */ |
2218 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); |
2219 | 0 | XLogRegisterBufData(0, &xlhdr, SizeOfHeapHeader); |
2220 | | /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ |
2221 | 0 | XLogRegisterBufData(0, |
2222 | 0 | (char *) heaptup->t_data + SizeofHeapTupleHeader, |
2223 | 0 | heaptup->t_len - SizeofHeapTupleHeader); |
2224 | | |
2225 | | /* filtering by origin on a row level is much more efficient */ |
2226 | 0 | XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); |
2227 | |
2228 | 0 | recptr = XLogInsert(RM_HEAP_ID, info); |
2229 | |
2230 | 0 | PageSetLSN(page, recptr); |
2231 | 0 | } |
2232 | |
2233 | 0 | END_CRIT_SECTION(); |
2234 | |
2235 | 0 | UnlockReleaseBuffer(buffer); |
2236 | 0 | if (vmbuffer != InvalidBuffer) |
2237 | 0 | ReleaseBuffer(vmbuffer); |
2238 | | |
2239 | | /* |
2240 | | * If tuple is cachable, mark it for invalidation from the caches in case |
2241 | | * we abort. Note it is OK to do this after releasing the buffer, because |
2242 | | * the heaptup data structure is all in local memory, not in the shared |
2243 | | * buffer. |
2244 | | */ |
2245 | 0 | CacheInvalidateHeapTuple(relation, heaptup, NULL); |
2246 | | |
2247 | | /* Note: speculative insertions are counted too, even if aborted later */ |
2248 | 0 | pgstat_count_heap_insert(relation, 1); |
2249 | | |
2250 | | /* |
2251 | | * If heaptup is a private copy, release it. Don't forget to copy t_self |
2252 | | * back to the caller's image, too. |
2253 | | */ |
2254 | 0 | if (heaptup != tup) |
2255 | 0 | { |
2256 | 0 | tup->t_self = heaptup->t_self; |
2257 | 0 | heap_freetuple(heaptup); |
2258 | 0 | } |
2259 | 0 | } |
2260 | | |
2261 | | /* |
2262 | | * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the |
2263 | | * tuple header fields and toasts the tuple if necessary. Returns a toasted |
2264 | | * version of the tuple if it was toasted, or the original tuple if not. Note |
2265 | | * that in any case, the header fields are also set in the original tuple. |
2266 | | */ |
2267 | | static HeapTuple |
2268 | | heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, |
2269 | | CommandId cid, int options) |
2270 | 0 | { |
2271 | | /* |
2272 | | * To allow parallel inserts, we need to ensure that they are safe to be |
2273 | | * performed in workers. We have the infrastructure to allow parallel |
2274 | | * inserts in general except for the cases where inserts generate a new |
2275 | | * CommandId (e.g., inserts into a table having a foreign key column). |
2276 | | */ |
2277 | 0 | if (IsParallelWorker()) |
2278 | 0 | ereport(ERROR, |
2279 | 0 | (errcode(ERRCODE_INVALID_TRANSACTION_STATE), |
2280 | 0 | errmsg("cannot insert tuples in a parallel worker"))); |
2281 | | |
2282 | 0 | tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); |
2283 | 0 | tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); |
2284 | 0 | tup->t_data->t_infomask |= HEAP_XMAX_INVALID; |
2285 | 0 | HeapTupleHeaderSetXmin(tup->t_data, xid); |
2286 | 0 | if (options & HEAP_INSERT_FROZEN) |
2287 | 0 | HeapTupleHeaderSetXminFrozen(tup->t_data); |
2288 | |
2289 | 0 | HeapTupleHeaderSetCmin(tup->t_data, cid); |
2290 | 0 | HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ |
2291 | 0 | tup->t_tableOid = RelationGetRelid(relation); |
2292 | | |
2293 | | /* |
2294 | | * If the new tuple is too big for storage or contains already toasted |
2295 | | * out-of-line attributes from some other relation, invoke the toaster. |
2296 | | */ |
2297 | 0 | if (relation->rd_rel->relkind != RELKIND_RELATION && |
2298 | 0 | relation->rd_rel->relkind != RELKIND_MATVIEW) |
2299 | 0 | { |
2300 | | /* toast table entries should never be recursively toasted */ |
2301 | 0 | Assert(!HeapTupleHasExternal(tup)); |
2302 | 0 | return tup; |
2303 | 0 | } |
2304 | 0 | else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD) |
2305 | 0 | return heap_toast_insert_or_update(relation, tup, NULL, options); |
2306 | 0 | else |
2307 | 0 | return tup; |
2308 | 0 | } |
2309 | | |
2310 | | /* |
2311 | | * Helper for heap_multi_insert() that computes the number of entire pages |
2312 | | * that inserting the remaining heaptuples requires. Used to determine how |
2313 | | * much the relation needs to be extended by. |
2314 | | */ |
2315 | | static int |
2316 | | heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace) |
2317 | 0 | { |
2318 | 0 | size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; |
2319 | 0 | int npages = 1; |
2320 | |
2321 | 0 | for (int i = done; i < ntuples; i++) |
2322 | 0 | { |
2323 | 0 | size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len); |
2324 | |
2325 | 0 | if (page_avail < tup_sz) |
2326 | 0 | { |
2327 | 0 | npages++; |
2328 | 0 | page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; |
2329 | 0 | } |
2330 | 0 | page_avail -= tup_sz; |
2331 | 0 | } |
2332 | |
2333 | 0 | return npages; |
2334 | 0 | } |
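/*
 * Worked example (illustrative, not part of heapam.c): on a typical 64-bit
 * build, BLCKSZ = 8192, SizeOfPageHeaderData = 24 and, with the default
 * fillfactor, saveFreeSpace = 0, so page_avail starts at 8168.  A 900-byte
 * tuple costs sizeof(ItemIdData) + MAXALIGN(900) = 4 + 904 = 908 bytes, so
 * eight such tuples fit on a page (8 * 908 = 7264, and the ninth would need
 * 908 with only 904 remaining).  For ten remaining tuples the loop above
 * therefore returns npages = 2.
 */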
2335 | | |
2336 | | /* |
2337 | | * heap_multi_insert - insert multiple tuples into a heap |
2338 | | * |
2339 | | * This is like heap_insert(), but inserts multiple tuples in one operation. |
2340 | | * That's faster than calling heap_insert() in a loop, because when multiple |
2341 | | * tuples can be inserted on a single page, we can write just a single WAL |
2342 | | * record covering all of them, and only need to lock/unlock the page once. |
2343 | | * |
2344 | | * Note: this leaks memory into the current memory context. You can create a |
2345 | | * temporary context before calling this, if that's a problem. |
2346 | | */ |
2347 | | void |
2348 | | heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, |
2349 | | CommandId cid, int options, BulkInsertState bistate) |
2350 | 0 | { |
2351 | 0 | TransactionId xid = GetCurrentTransactionId(); |
2352 | 0 | HeapTuple *heaptuples; |
2353 | 0 | int i; |
2354 | 0 | int ndone; |
2355 | 0 | PGAlignedBlock scratch; |
2356 | 0 | Page page; |
2357 | 0 | Buffer vmbuffer = InvalidBuffer; |
2358 | 0 | bool needwal; |
2359 | 0 | Size saveFreeSpace; |
2360 | 0 | bool need_tuple_data = RelationIsLogicallyLogged(relation); |
2361 | 0 | bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); |
2362 | 0 | bool starting_with_empty_page = false; |
2363 | 0 | int npages = 0; |
2364 | 0 | int npages_used = 0; |
2365 | | |
2366 | | /* currently not needed (thus unsupported) for heap_multi_insert() */ |
2367 | 0 | Assert(!(options & HEAP_INSERT_NO_LOGICAL)); |
2368 | |
2369 | 0 | AssertHasSnapshotForToast(relation); |
2370 | |
2371 | 0 | needwal = RelationNeedsWAL(relation); |
2372 | 0 | saveFreeSpace = RelationGetTargetPageFreeSpace(relation, |
2373 | 0 | HEAP_DEFAULT_FILLFACTOR); |
2374 | | |
2375 | | /* Toast and set header data in all the slots */ |
2376 | 0 | heaptuples = palloc(ntuples * sizeof(HeapTuple)); |
2377 | 0 | for (i = 0; i < ntuples; i++) |
2378 | 0 | { |
2379 | 0 | HeapTuple tuple; |
2380 | |
2381 | 0 | tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL); |
2382 | 0 | slots[i]->tts_tableOid = RelationGetRelid(relation); |
2383 | 0 | tuple->t_tableOid = slots[i]->tts_tableOid; |
2384 | 0 | heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid, |
2385 | 0 | options); |
2386 | 0 | } |
2387 | | |
2388 | | /* |
2389 | | * We're about to do the actual inserts -- but check for conflict first, |
2390 | | * to minimize the possibility of having to roll back work we've just |
2391 | | * done. |
2392 | | * |
2393 | | * A check here does not definitively prevent a serialization anomaly; |
2394 | | * that check MUST be done at least past the point of acquiring an |
2395 | | * exclusive buffer content lock on every buffer that will be affected, |
2396 | | * and MAY be done after all inserts are reflected in the buffers and |
2397 | | * those locks are released; otherwise there is a race condition. Since |
2398 | | * multiple buffers can be locked and unlocked in the loop below, and it |
2399 | | * would not be feasible to identify and lock all of those buffers before |
2400 | | * the loop, we must do a final check at the end. |
2401 | | * |
2402 | | * The check here could be omitted with no loss of correctness; it is |
2403 | | * present strictly as an optimization. |
2404 | | * |
2405 | | * For heap inserts, we only need to check for table-level SSI locks. Our |
2406 | | * new tuples can't possibly conflict with existing tuple locks, and heap |
2407 | | * page locks are only consolidated versions of tuple locks; they do not |
2408 | | * lock "gaps" as index page locks do. So we don't need to specify a |
2409 | | * buffer when making the call, which makes for a faster check. |
2410 | | */ |
2411 | 0 | CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); |
2412 | |
2413 | 0 | ndone = 0; |
2414 | 0 | while (ndone < ntuples) |
2415 | 0 | { |
2416 | 0 | Buffer buffer; |
2417 | 0 | bool all_visible_cleared = false; |
2418 | 0 | bool all_frozen_set = false; |
2419 | 0 | int nthispage; |
2420 | |
2421 | 0 | CHECK_FOR_INTERRUPTS(); |
2422 | | |
2423 | | /* |
2424 | | * Compute number of pages needed to fit the to-be-inserted tuples in |
2425 | | * the worst case. This will be used to determine how much to extend |
2426 | | * the relation by in RelationGetBufferForTuple(), if needed. If we |
2427 | | * filled a prior page from scratch, we can just update our last |
2428 | | * computation, but if we started with a partially filled page, |
2429 | | * recompute from scratch, since the number of potentially required |
2430 | | * pages can vary due to tuples needing to fit onto the page, page |
2431 | | * headers, etc. |
2432 | | */ |
2433 | 0 | if (ndone == 0 || !starting_with_empty_page) |
2434 | 0 | { |
2435 | 0 | npages = heap_multi_insert_pages(heaptuples, ndone, ntuples, |
2436 | 0 | saveFreeSpace); |
2437 | 0 | npages_used = 0; |
2438 | 0 | } |
2439 | 0 | else |
2440 | 0 | npages_used++; |
2441 | | |
2442 | | /* |
2443 | | * Find buffer where at least the next tuple will fit. If the page is |
2444 | | * all-visible, this will also pin the requisite visibility map page. |
2445 | | * |
2446 | | * Also pin visibility map page if COPY FREEZE inserts tuples into an |
2447 | | * empty page. See all_frozen_set below. |
2448 | | */ |
2449 | 0 | buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len, |
2450 | 0 | InvalidBuffer, options, bistate, |
2451 | 0 | &vmbuffer, NULL, |
2452 | 0 | npages - npages_used); |
2453 | 0 | page = BufferGetPage(buffer); |
2454 | |
2455 | 0 | starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0; |
2456 | |
2457 | 0 | if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN)) |
2458 | 0 | all_frozen_set = true; |
2459 | | |
2460 | | /* NO EREPORT(ERROR) from here till changes are logged */ |
2461 | 0 | START_CRIT_SECTION(); |
2462 | | |
2463 | | /* |
2464 | | * RelationGetBufferForTuple has ensured that the first tuple fits. |
2465 | | * Put that on the page, and then as many other tuples as fit. |
2466 | | */ |
2467 | 0 | RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false); |
2468 | | |
2469 | | /* |
2470 | | * For logical decoding we need combo CIDs to properly decode the |
2471 | | * catalog. |
2472 | | */ |
2473 | 0 | if (needwal && need_cids) |
2474 | 0 | log_heap_new_cid(relation, heaptuples[ndone]); |
2475 | |
2476 | 0 | for (nthispage = 1; ndone + nthispage < ntuples; nthispage++) |
2477 | 0 | { |
2478 | 0 | HeapTuple heaptup = heaptuples[ndone + nthispage]; |
2479 | |
2480 | 0 | if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace) |
2481 | 0 | break; |
2482 | | |
2483 | 0 | RelationPutHeapTuple(relation, buffer, heaptup, false); |
2484 | | |
2485 | | /* |
2486 | | * For logical decoding we need combo CIDs to properly decode the |
2487 | | * catalog. |
2488 | | */ |
2489 | 0 | if (needwal && need_cids) |
2490 | 0 | log_heap_new_cid(relation, heaptup); |
2491 | 0 | } |
2492 | | |
2493 | | /* |
2494 | | * If the page is all visible, need to clear that, unless we're only |
2495 | | * going to add further frozen rows to it. |
2496 | | * |
2497 | | * If we're only adding already frozen rows to a previously empty |
2498 | | * page, mark it as all-visible. |
2499 | | */ |
2500 | 0 | if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN)) |
2501 | 0 | { |
2502 | 0 | all_visible_cleared = true; |
2503 | 0 | PageClearAllVisible(page); |
2504 | 0 | visibilitymap_clear(relation, |
2505 | 0 | BufferGetBlockNumber(buffer), |
2506 | 0 | vmbuffer, VISIBILITYMAP_VALID_BITS); |
2507 | 0 | } |
2508 | 0 | else if (all_frozen_set) |
2509 | 0 | PageSetAllVisible(page); |
2510 | | |
2511 | | /* |
2512 | | * XXX Should we set PageSetPrunable on this page ? See heap_insert() |
2513 | | */ |
2514 | |
2515 | 0 | MarkBufferDirty(buffer); |
2516 | | |
2517 | | /* XLOG stuff */ |
2518 | 0 | if (needwal) |
2519 | 0 | { |
2520 | 0 | XLogRecPtr recptr; |
2521 | 0 | xl_heap_multi_insert *xlrec; |
2522 | 0 | uint8 info = XLOG_HEAP2_MULTI_INSERT; |
2523 | 0 | char *tupledata; |
2524 | 0 | int totaldatalen; |
2525 | 0 | char *scratchptr = scratch.data; |
2526 | 0 | bool init; |
2527 | 0 | int bufflags = 0; |
2528 | | |
2529 | | /* |
2530 | | * If the page was previously empty, we can reinit the page |
2531 | | * instead of restoring the whole thing. |
2532 | | */ |
2533 | 0 | init = starting_with_empty_page; |
2534 | | |
2535 | | /* allocate xl_heap_multi_insert struct from the scratch area */ |
2536 | 0 | xlrec = (xl_heap_multi_insert *) scratchptr; |
2537 | 0 | scratchptr += SizeOfHeapMultiInsert; |
2538 | | |
2539 | | /* |
2540 | | * Allocate the offsets array, unless we're reinitializing the page; |
2541 | | * in that case the tuples are stored in order starting at |
2542 | | * FirstOffsetNumber and we don't need to store the offsets |
2543 | | * explicitly. |
2544 | | */ |
2545 | 0 | if (!init) |
2546 | 0 | scratchptr += nthispage * sizeof(OffsetNumber); |
2547 | | |
2548 | | /* the rest of the scratch space is used for tuple data */ |
2549 | 0 | tupledata = scratchptr; |
2550 | | |
2551 | | /* check that the mutually exclusive flags are not both set */ |
2552 | 0 | Assert(!(all_visible_cleared && all_frozen_set)); |
2553 | |
2554 | 0 | xlrec->flags = 0; |
2555 | 0 | if (all_visible_cleared) |
2556 | 0 | xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED; |
2557 | 0 | if (all_frozen_set) |
2558 | 0 | xlrec->flags = XLH_INSERT_ALL_FROZEN_SET; |
2559 | |
2560 | 0 | xlrec->ntuples = nthispage; |
2561 | | |
2562 | | /* |
2563 | | * Write out an xl_multi_insert_tuple and the tuple data itself |
2564 | | * for each tuple. |
2565 | | */ |
2566 | 0 | for (i = 0; i < nthispage; i++) |
2567 | 0 | { |
2568 | 0 | HeapTuple heaptup = heaptuples[ndone + i]; |
2569 | 0 | xl_multi_insert_tuple *tuphdr; |
2570 | 0 | int datalen; |
2571 | |
2572 | 0 | if (!init) |
2573 | 0 | xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self); |
2574 | | /* xl_multi_insert_tuple needs two-byte alignment. */ |
2575 | 0 | tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr); |
2576 | 0 | scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple; |
2577 | |
2578 | 0 | tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; |
2579 | 0 | tuphdr->t_infomask = heaptup->t_data->t_infomask; |
2580 | 0 | tuphdr->t_hoff = heaptup->t_data->t_hoff; |
2581 | | |
2582 | | /* write bitmap [+ padding] [+ oid] + data */ |
2583 | 0 | datalen = heaptup->t_len - SizeofHeapTupleHeader; |
2584 | 0 | memcpy(scratchptr, |
2585 | 0 | (char *) heaptup->t_data + SizeofHeapTupleHeader, |
2586 | 0 | datalen); |
2587 | 0 | tuphdr->datalen = datalen; |
2588 | 0 | scratchptr += datalen; |
2589 | 0 | } |
2590 | 0 | totaldatalen = scratchptr - tupledata; |
2591 | 0 | Assert((scratchptr - scratch.data) < BLCKSZ); |
2592 | |
2593 | 0 | if (need_tuple_data) |
2594 | 0 | xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; |
2595 | | |
2596 | | /* |
2597 | | * Signal that this is the last xl_heap_multi_insert record |
2598 | | * emitted by this call to heap_multi_insert(). Needed for logical |
2599 | | * decoding so it knows when to clean up temporary data. |
2600 | | */ |
2601 | 0 | if (ndone + nthispage == ntuples) |
2602 | 0 | xlrec->flags |= XLH_INSERT_LAST_IN_MULTI; |
2603 | |
2604 | 0 | if (init) |
2605 | 0 | { |
2606 | 0 | info |= XLOG_HEAP_INIT_PAGE; |
2607 | 0 | bufflags |= REGBUF_WILL_INIT; |
2608 | 0 | } |
2609 | | |
2610 | | /* |
2611 | | * If we're doing logical decoding, include the new tuple data |
2612 | | * even if we take a full-page image of the page. |
2613 | | */ |
2614 | 0 | if (need_tuple_data) |
2615 | 0 | bufflags |= REGBUF_KEEP_DATA; |
2616 | |
2617 | 0 | XLogBeginInsert(); |
2618 | 0 | XLogRegisterData(xlrec, tupledata - scratch.data); |
2619 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags); |
2620 | |
2621 | 0 | XLogRegisterBufData(0, tupledata, totaldatalen); |
2622 | | |
2623 | | /* filtering by origin on a row level is much more efficient */ |
2624 | 0 | XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); |
2625 | |
2626 | 0 | recptr = XLogInsert(RM_HEAP2_ID, info); |
2627 | |
2628 | 0 | PageSetLSN(page, recptr); |
2629 | 0 | } |
2630 | |
2631 | 0 | END_CRIT_SECTION(); |
2632 | | |
2633 | | /* |
2634 | | * If we've frozen everything on the page, update the visibilitymap. |
2635 | | * We're already holding pin on the vmbuffer. |
2636 | | */ |
2637 | 0 | if (all_frozen_set) |
2638 | 0 | { |
2639 | 0 | Assert(PageIsAllVisible(page)); |
2640 | 0 | Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer)); |
2641 | | |
2642 | | /* |
2643 | | * It's fine to use InvalidTransactionId here - this is only used |
2644 | | * when HEAP_INSERT_FROZEN is specified, which intentionally |
2645 | | * violates visibility rules. |
2646 | | */ |
2647 | 0 | visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer, |
2648 | 0 | InvalidXLogRecPtr, vmbuffer, |
2649 | 0 | InvalidTransactionId, |
2650 | 0 | VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); |
2651 | 0 | } |
2652 | |
2653 | 0 | UnlockReleaseBuffer(buffer); |
2654 | 0 | ndone += nthispage; |
2655 | | |
2656 | | /* |
2657 | | * NB: Only release vmbuffer after inserting all tuples - it's fairly |
2658 | | * likely that we'll insert into subsequent heap pages that are likely |
2659 | | * to use the same vm page. |
2660 | | */ |
2661 | 0 | } |
2662 | | |
2663 | | /* We're done with inserting all tuples, so release the last vmbuffer. */ |
2664 | 0 | if (vmbuffer != InvalidBuffer) |
2665 | 0 | ReleaseBuffer(vmbuffer); |
2666 | | |
2667 | | /* |
2668 | | * We're done with the actual inserts. Check for conflicts again, to |
2669 | | * ensure that all rw-conflicts in to these inserts are detected. Without |
2670 | | * this final check, a sequential scan of the heap may have locked the |
2671 | | * table after the "before" check, missing one opportunity to detect the |
2672 | | * conflict, and then scanned the table before the new tuples were there, |
2673 | | * missing the other chance to detect the conflict. |
2674 | | * |
2675 | | * For heap inserts, we only need to check for table-level SSI locks. Our |
2676 | | * new tuples can't possibly conflict with existing tuple locks, and heap |
2677 | | * page locks are only consolidated versions of tuple locks; they do not |
2678 | | * lock "gaps" as index page locks do. So we don't need to specify a |
2679 | | * buffer when making the call. |
2680 | | */ |
2681 | 0 | CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber); |
2682 | | |
2683 | | /* |
2684 | | * If tuples are cachable, mark them for invalidation from the caches in |
2685 | | * case we abort. Note it is OK to do this after releasing the buffer, |
2686 | | * because the heaptuples data structure is all in local memory, not in |
2687 | | * the shared buffer. |
2688 | | */ |
2689 | 0 | if (IsCatalogRelation(relation)) |
2690 | 0 | { |
2691 | 0 | for (i = 0; i < ntuples; i++) |
2692 | 0 | CacheInvalidateHeapTuple(relation, heaptuples[i], NULL); |
2693 | 0 | } |
2694 | | |
2695 | | /* copy t_self fields back to the caller's slots */ |
2696 | 0 | for (i = 0; i < ntuples; i++) |
2697 | 0 | slots[i]->tts_tid = heaptuples[i]->t_self; |
2698 | |
2699 | 0 | pgstat_count_heap_insert(relation, ntuples); |
2700 | 0 | } |
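/*
 * Illustrative sketch (not part of heapam.c): a minimal caller of
 * heap_multi_insert() for a batch of already-filled slots, in the spirit of
 * the multi-insert path used by COPY.  example_multi_insert() is hypothetical.
 */
static void
example_multi_insert(Relation rel, TupleTableSlot **slots, int nslots)
{
	CommandId	cid = GetCurrentCommandId(true);
	BulkInsertState bistate = GetBulkInsertState();

	heap_multi_insert(rel, slots, nslots, cid, 0, bistate);

	FreeBulkInsertState(bistate);
}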
2701 | | |
2702 | | /* |
2703 | | * simple_heap_insert - insert a tuple |
2704 | | * |
2705 | | * Currently, this routine differs from heap_insert only in supplying |
2706 | | * a default command ID and not allowing access to the speedup options. |
2707 | | * |
2708 | | * This should be used rather than using heap_insert directly in most places |
2709 | | * where we are modifying system catalogs. |
2710 | | */ |
2711 | | void |
2712 | | simple_heap_insert(Relation relation, HeapTuple tup) |
2713 | 0 | { |
2714 | 0 | heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL); |
2715 | 0 | } |
2716 | | |
2717 | | /* |
2718 | | * Given infomask/infomask2, compute the bits that must be saved in the |
2719 | | * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock, |
2720 | | * xl_heap_lock_updated WAL records. |
2721 | | * |
2722 | | * See fix_infomask_from_infobits. |
2723 | | */ |
2724 | | static uint8 |
2725 | | compute_infobits(uint16 infomask, uint16 infomask2) |
2726 | 0 | { |
2727 | 0 | return |
2728 | 0 | ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) | |
2729 | 0 | ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) | |
2730 | 0 | ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) | |
2731 | | /* note we ignore HEAP_XMAX_SHR_LOCK here */ |
2732 | 0 | ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) | |
2733 | 0 | ((infomask2 & HEAP_KEYS_UPDATED) != 0 ? |
2734 | 0 | XLHL_KEYS_UPDATED : 0); |
2735 | 0 | } |
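/*
 * Worked example (illustrative, not part of heapam.c): for an infomask with
 * HEAP_XMAX_EXCL_LOCK and HEAP_XMAX_LOCK_ONLY set and no other relevant bits,
 * compute_infobits() returns XLHL_XMAX_EXCL_LOCK | XLHL_XMAX_LOCK_ONLY, and
 * fix_infomask_from_infobits() in the redo code maps those bits back again.
 */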
2736 | | |
2737 | | /* |
2738 | | * Given two versions of the same t_infomask for a tuple, compare them and |
2739 | | * return whether the relevant status for a tuple Xmax has changed. This is |
2740 | | * used after a buffer lock has been released and reacquired: we want to ensure |
2741 | | * that the tuple state continues to be the same as it was when we previously |
2742 | | * examined it. |
2743 | | * |
2744 | | * Note the Xmax field itself must be compared separately. |
2745 | | */ |
2746 | | static inline bool |
2747 | | xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask) |
2748 | 0 | { |
2749 | 0 | const uint16 interesting = |
2750 | 0 | HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK; |
2751 | |
|
2752 | 0 | if ((new_infomask & interesting) != (old_infomask & interesting)) |
2753 | 0 | return true; |
2754 | | |
2755 | 0 | return false; |
2756 | 0 | } |
2757 | | |
2758 | | /* |
2759 | | * heap_delete - delete a tuple |
2760 | | * |
2761 | | * See table_tuple_delete() for an explanation of the parameters, except that |
2762 | | * this routine directly takes a tuple rather than a slot. |
2763 | | * |
2764 | | * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, |
2765 | | * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last |
2766 | | * only for TM_SelfModified, since we cannot obtain cmax from a combo CID |
2767 | | * generated by another transaction). |
2768 | | */ |
2769 | | TM_Result |
2770 | | heap_delete(Relation relation, ItemPointer tid, |
2771 | | CommandId cid, Snapshot crosscheck, bool wait, |
2772 | | TM_FailureData *tmfd, bool changingPart) |
2773 | 0 | { |
2774 | 0 | TM_Result result; |
2775 | 0 | TransactionId xid = GetCurrentTransactionId(); |
2776 | 0 | ItemId lp; |
2777 | 0 | HeapTupleData tp; |
2778 | 0 | Page page; |
2779 | 0 | BlockNumber block; |
2780 | 0 | Buffer buffer; |
2781 | 0 | Buffer vmbuffer = InvalidBuffer; |
2782 | 0 | TransactionId new_xmax; |
2783 | 0 | uint16 new_infomask, |
2784 | 0 | new_infomask2; |
2785 | 0 | bool have_tuple_lock = false; |
2786 | 0 | bool iscombo; |
2787 | 0 | bool all_visible_cleared = false; |
2788 | 0 | HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ |
2789 | 0 | bool old_key_copied = false; |
2790 | |
|
2791 | 0 | Assert(ItemPointerIsValid(tid)); |
2792 | |
2793 | 0 | AssertHasSnapshotForToast(relation); |
2794 | | |
2795 | | /* |
2796 | | * Forbid this during a parallel operation, lest it allocate a combo CID. |
2797 | | * Other workers might need that combo CID for visibility checks, and we |
2798 | | * have no provision for broadcasting it to them. |
2799 | | */ |
2800 | 0 | if (IsInParallelMode()) |
2801 | 0 | ereport(ERROR, |
2802 | 0 | (errcode(ERRCODE_INVALID_TRANSACTION_STATE), |
2803 | 0 | errmsg("cannot delete tuples during a parallel operation"))); |
2804 | | |
2805 | 0 | block = ItemPointerGetBlockNumber(tid); |
2806 | 0 | buffer = ReadBuffer(relation, block); |
2807 | 0 | page = BufferGetPage(buffer); |
2808 | | |
2809 | | /* |
2810 | | * Before locking the buffer, pin the visibility map page if it appears to |
2811 | | * be necessary. Since we haven't got the lock yet, someone else might be |
2812 | | * in the middle of changing this, so we'll need to recheck after we have |
2813 | | * the lock. |
2814 | | */ |
2815 | 0 | if (PageIsAllVisible(page)) |
2816 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
2817 | |
|
2818 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
2819 | |
|
2820 | 0 | lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); |
2821 | 0 | Assert(ItemIdIsNormal(lp)); |
2822 | |
|
2823 | 0 | tp.t_tableOid = RelationGetRelid(relation); |
2824 | 0 | tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); |
2825 | 0 | tp.t_len = ItemIdGetLength(lp); |
2826 | 0 | tp.t_self = *tid; |
2827 | |
|
2828 | 0 | l1: |
2829 | | |
2830 | | /* |
2831 | | * If we didn't pin the visibility map page and the page has become all |
2832 | | * visible while we were busy locking the buffer, we'll have to unlock and |
2833 | | * re-lock, to avoid holding the buffer lock across an I/O. That's a bit |
2834 | | * unfortunate, but hopefully shouldn't happen often. |
2835 | | */ |
2836 | 0 | if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) |
2837 | 0 | { |
2838 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
2839 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
2840 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
2841 | 0 | } |
2842 | |
|
2843 | 0 | result = HeapTupleSatisfiesUpdate(&tp, cid, buffer); |
2844 | |
|
2845 | 0 | if (result == TM_Invisible) |
2846 | 0 | { |
2847 | 0 | UnlockReleaseBuffer(buffer); |
2848 | 0 | ereport(ERROR, |
2849 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
2850 | 0 | errmsg("attempted to delete invisible tuple"))); |
2851 | 0 | } |
2852 | 0 | else if (result == TM_BeingModified && wait) |
2853 | 0 | { |
2854 | 0 | TransactionId xwait; |
2855 | 0 | uint16 infomask; |
2856 | | |
2857 | | /* must copy state data before unlocking buffer */ |
2858 | 0 | xwait = HeapTupleHeaderGetRawXmax(tp.t_data); |
2859 | 0 | infomask = tp.t_data->t_infomask; |
2860 | | |
2861 | | /* |
2862 | | * Sleep until concurrent transaction ends -- except when there's a |
2863 | | * single locker and it's our own transaction. Note we don't care |
2864 | | * which lock mode the locker has, because we need the strongest one. |
2865 | | * |
2866 | | * Before sleeping, we need to acquire tuple lock to establish our |
2867 | | * priority for the tuple (see heap_lock_tuple). LockTuple will |
2868 | | * release us when we are next-in-line for the tuple. |
2869 | | * |
2870 | | * If we are forced to "start over" below, we keep the tuple lock; |
2871 | | * this arranges that we stay at the head of the line while rechecking |
2872 | | * tuple state. |
2873 | | */ |
2874 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
2875 | 0 | { |
2876 | 0 | bool current_is_member = false; |
2877 | |
|
2878 | 0 | if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, |
2879 | 0 | LockTupleExclusive, ¤t_is_member)) |
2880 | 0 | { |
2881 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
2882 | | |
2883 | | /* |
2884 | | * Acquire the lock, if necessary (but skip it when we're |
2885 | | * requesting a lock and already have one; avoids deadlock). |
2886 | | */ |
2887 | 0 | if (!current_is_member) |
2888 | 0 | heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, |
2889 | 0 | LockWaitBlock, &have_tuple_lock); |
2890 | | |
2891 | | /* wait for multixact */ |
2892 | 0 | MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask, |
2893 | 0 | relation, &(tp.t_self), XLTW_Delete, |
2894 | 0 | NULL); |
2895 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
2896 | | |
2897 | | /* |
2898 | | * If xwait had just locked the tuple then some other xact |
2899 | | * could update this tuple before we get to this point. Check |
2900 | | * for xmax change, and start over if so. |
2901 | | * |
2902 | | * We also must start over if we didn't pin the VM page, and |
2903 | | * the page has become all visible. |
2904 | | */ |
2905 | 0 | if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || |
2906 | 0 | xmax_infomask_changed(tp.t_data->t_infomask, infomask) || |
2907 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), |
2908 | 0 | xwait)) |
2909 | 0 | goto l1; |
2910 | 0 | } |
2911 | | |
2912 | | /* |
2913 | | * You might think the multixact is necessarily done here, but not |
2914 | | * so: it could have surviving members, namely our own xact or |
2915 | | * other subxacts of this backend. It is legal for us to delete |
2916 | | * the tuple in either case, however (the latter case is |
2917 | | * essentially a situation of upgrading our former shared lock to |
2918 | | * exclusive). We don't bother changing the on-disk hint bits |
2919 | | * since we are about to overwrite the xmax altogether. |
2920 | | */ |
2921 | 0 | } |
2922 | 0 | else if (!TransactionIdIsCurrentTransactionId(xwait)) |
2923 | 0 | { |
2924 | | /* |
2925 | | * Wait for regular transaction to end; but first, acquire tuple |
2926 | | * lock. |
2927 | | */ |
2928 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
2929 | 0 | heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive, |
2930 | 0 | LockWaitBlock, &have_tuple_lock); |
2931 | 0 | XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete); |
2932 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
2933 | | |
2934 | | /* |
2935 | | * xwait is done, but if xwait had just locked the tuple then some |
2936 | | * other xact could update this tuple before we get to this point. |
2937 | | * Check for xmax change, and start over if so. |
2938 | | * |
2939 | | * We also must start over if we didn't pin the VM page, and the |
2940 | | * page has become all visible. |
2941 | | */ |
2942 | 0 | if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) || |
2943 | 0 | xmax_infomask_changed(tp.t_data->t_infomask, infomask) || |
2944 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), |
2945 | 0 | xwait)) |
2946 | 0 | goto l1; |
2947 | | |
2948 | | /* Otherwise check if it committed or aborted */ |
2949 | 0 | UpdateXmaxHintBits(tp.t_data, buffer, xwait); |
2950 | 0 | } |
2951 | | |
2952 | | /* |
2953 | | * We may overwrite if previous xmax aborted, or if it committed but |
2954 | | * only locked the tuple without updating it. |
2955 | | */ |
2956 | 0 | if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) || |
2957 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) || |
2958 | 0 | HeapTupleHeaderIsOnlyLocked(tp.t_data)) |
2959 | 0 | result = TM_Ok; |
2960 | 0 | else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) |
2961 | 0 | result = TM_Updated; |
2962 | 0 | else |
2963 | 0 | result = TM_Deleted; |
2964 | 0 | } |
2965 | | |
2966 | | /* sanity check the result of HeapTupleSatisfiesUpdate() and the logic above */ |
2967 | 0 | if (result != TM_Ok) |
2968 | 0 | { |
2969 | 0 | Assert(result == TM_SelfModified || |
2970 | 0 | result == TM_Updated || |
2971 | 0 | result == TM_Deleted || |
2972 | 0 | result == TM_BeingModified); |
2973 | 0 | Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); |
2974 | 0 | Assert(result != TM_Updated || |
2975 | 0 | !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)); |
2976 | 0 | } |
2977 | |
|
2978 | 0 | if (crosscheck != InvalidSnapshot && result == TM_Ok) |
2979 | 0 | { |
2980 | | /* Perform additional check for transaction-snapshot mode RI updates */ |
2981 | 0 | if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer)) |
2982 | 0 | result = TM_Updated; |
2983 | 0 | } |
2984 | |
|
2985 | 0 | if (result != TM_Ok) |
2986 | 0 | { |
2987 | 0 | tmfd->ctid = tp.t_data->t_ctid; |
2988 | 0 | tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data); |
2989 | 0 | if (result == TM_SelfModified) |
2990 | 0 | tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data); |
2991 | 0 | else |
2992 | 0 | tmfd->cmax = InvalidCommandId; |
2993 | 0 | UnlockReleaseBuffer(buffer); |
2994 | 0 | if (have_tuple_lock) |
2995 | 0 | UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); |
2996 | 0 | if (vmbuffer != InvalidBuffer) |
2997 | 0 | ReleaseBuffer(vmbuffer); |
2998 | 0 | return result; |
2999 | 0 | } |
3000 | | |
3001 | | /* |
3002 | | * We're about to do the actual delete -- check for conflict first, to |
3003 | | * avoid possibly having to roll back work we've just done. |
3004 | | * |
3005 | | * This is safe without a recheck as long as there is no possibility of |
3006 | | * another process scanning the page between this check and the delete |
3007 | | * being visible to the scan (i.e., an exclusive buffer content lock is |
3008 | | * continuously held from this point until the tuple delete is visible). |
3009 | | */ |
3010 | 0 | CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer)); |
3011 | | |
3012 | | /* replace cid with a combo CID if necessary */ |
3013 | 0 | HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo); |
3014 | | |
3015 | | /* |
3016 | | * Compute replica identity tuple before entering the critical section so |
3017 | | * we don't PANIC upon a memory allocation failure. |
3018 | | */ |
3019 | 0 | old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied); |
3020 | | |
3021 | | /* |
3022 | | * If this is the first possibly-multixact-able operation in the current |
3023 | | * transaction, set my per-backend OldestMemberMXactId setting. We can be |
3024 | | * certain that the transaction will never become a member of any older |
3025 | | * MultiXactIds than that. (We have to do this even if we end up just |
3026 | | * using our own TransactionId below, since some other backend could |
3027 | | * incorporate our XID into a MultiXact immediately afterwards.) |
3028 | | */ |
3029 | 0 | MultiXactIdSetOldestMember(); |
3030 | |
|
3031 | 0 | compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data), |
3032 | 0 | tp.t_data->t_infomask, tp.t_data->t_infomask2, |
3033 | 0 | xid, LockTupleExclusive, true, |
3034 | 0 | &new_xmax, &new_infomask, &new_infomask2); |
3035 | |
|
3036 | 0 | START_CRIT_SECTION(); |
3037 | | |
3038 | | /* |
3039 | | * If this transaction commits, the tuple will become DEAD sooner or |
3040 | | * later. Set flag that this page is a candidate for pruning once our xid |
3041 | | * falls below the OldestXmin horizon. If the transaction finally aborts, |
3042 | | * the subsequent page pruning will be a no-op and the hint will be |
3043 | | * cleared. |
3044 | | */ |
3045 | 0 | PageSetPrunable(page, xid); |
3046 | |
|
3047 | 0 | if (PageIsAllVisible(page)) |
3048 | 0 | { |
3049 | 0 | all_visible_cleared = true; |
3050 | 0 | PageClearAllVisible(page); |
3051 | 0 | visibilitymap_clear(relation, BufferGetBlockNumber(buffer), |
3052 | 0 | vmbuffer, VISIBILITYMAP_VALID_BITS); |
3053 | 0 | } |
3054 | | |
3055 | | /* store transaction information of xact deleting the tuple */ |
3056 | 0 | tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); |
3057 | 0 | tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
3058 | 0 | tp.t_data->t_infomask |= new_infomask; |
3059 | 0 | tp.t_data->t_infomask2 |= new_infomask2; |
3060 | 0 | HeapTupleHeaderClearHotUpdated(tp.t_data); |
3061 | 0 | HeapTupleHeaderSetXmax(tp.t_data, new_xmax); |
3062 | 0 | HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo); |
3063 | | /* Make sure there is no forward chain link in t_ctid */ |
3064 | 0 | tp.t_data->t_ctid = tp.t_self; |
3065 | | |
3066 | | /* Signal that this is actually a move into another partition */ |
3067 | 0 | if (changingPart) |
3068 | 0 | HeapTupleHeaderSetMovedPartitions(tp.t_data); |
3069 | |
|
3070 | 0 | MarkBufferDirty(buffer); |
3071 | | |
3072 | | /* |
3073 | | * XLOG stuff |
3074 | | * |
3075 | | * NB: heap_abort_speculative() uses the same xlog record and replay |
3076 | | * routines. |
3077 | | */ |
3078 | 0 | if (RelationNeedsWAL(relation)) |
3079 | 0 | { |
3080 | 0 | xl_heap_delete xlrec; |
3081 | 0 | xl_heap_header xlhdr; |
3082 | 0 | XLogRecPtr recptr; |
3083 | | |
3084 | | /* |
3085 | | * For logical decode we need combo CIDs to properly decode the |
3086 | | * catalog |
3087 | | */ |
3088 | 0 | if (RelationIsAccessibleInLogicalDecoding(relation)) |
3089 | 0 | log_heap_new_cid(relation, &tp); |
3090 | |
|
3091 | 0 | xlrec.flags = 0; |
3092 | 0 | if (all_visible_cleared) |
3093 | 0 | xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED; |
3094 | 0 | if (changingPart) |
3095 | 0 | xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE; |
3096 | 0 | xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, |
3097 | 0 | tp.t_data->t_infomask2); |
3098 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); |
3099 | 0 | xlrec.xmax = new_xmax; |
3100 | |
|
3101 | 0 | if (old_key_tuple != NULL) |
3102 | 0 | { |
3103 | 0 | if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) |
3104 | 0 | xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE; |
3105 | 0 | else |
3106 | 0 | xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY; |
3107 | 0 | } |
3108 | |
|
3109 | 0 | XLogBeginInsert(); |
3110 | 0 | XLogRegisterData(&xlrec, SizeOfHeapDelete); |
3111 | |
|
3112 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
3113 | | |
3114 | | /* |
3115 | | * Log replica identity of the deleted tuple if there is one |
3116 | | */ |
3117 | 0 | if (old_key_tuple != NULL) |
3118 | 0 | { |
3119 | 0 | xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; |
3120 | 0 | xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; |
3121 | 0 | xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; |
3122 | |
|
3123 | 0 | XLogRegisterData(&xlhdr, SizeOfHeapHeader); |
3124 | 0 | XLogRegisterData((char *) old_key_tuple->t_data |
3125 | 0 | + SizeofHeapTupleHeader, |
3126 | 0 | old_key_tuple->t_len |
3127 | 0 | - SizeofHeapTupleHeader); |
3128 | 0 | } |
3129 | | |
3130 | | /* filtering by origin on a row level is much more efficient */ |
3131 | 0 | XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); |
3132 | |
|
3133 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); |
3134 | |
|
3135 | 0 | PageSetLSN(page, recptr); |
3136 | 0 | } |
3137 | |
|
3138 | 0 | END_CRIT_SECTION(); |
3139 | |
|
3140 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3141 | |
|
3142 | 0 | if (vmbuffer != InvalidBuffer) |
3143 | 0 | ReleaseBuffer(vmbuffer); |
3144 | | |
3145 | | /* |
3146 | | * If the tuple has toasted out-of-line attributes, we need to delete |
3147 | | * those items too. We have to do this before releasing the buffer |
3148 | | * because we need to look at the contents of the tuple, but it's OK to |
3149 | | * release the content lock on the buffer first. |
3150 | | */ |
3151 | 0 | if (relation->rd_rel->relkind != RELKIND_RELATION && |
3152 | 0 | relation->rd_rel->relkind != RELKIND_MATVIEW) |
3153 | 0 | { |
3154 | | /* toast table entries should never be recursively toasted */ |
3155 | 0 | Assert(!HeapTupleHasExternal(&tp)); |
3156 | 0 | } |
3157 | 0 | else if (HeapTupleHasExternal(&tp)) |
3158 | 0 | heap_toast_delete(relation, &tp, false); |
3159 | | |
3160 | | /* |
3161 | | * Mark tuple for invalidation from system caches at next command |
3162 | | * boundary. We have to do this before releasing the buffer because we |
3163 | | * need to look at the contents of the tuple. |
3164 | | */ |
3165 | 0 | CacheInvalidateHeapTuple(relation, &tp, NULL); |
3166 | | |
3167 | | /* Now we can release the buffer */ |
3168 | 0 | ReleaseBuffer(buffer); |
3169 | | |
3170 | | /* |
3171 | | * Release the lmgr tuple lock, if we had it. |
3172 | | */ |
3173 | 0 | if (have_tuple_lock) |
3174 | 0 | UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive); |
3175 | |
|
3176 | 0 | pgstat_count_heap_delete(relation); |
3177 | |
|
3178 | 0 | if (old_key_tuple != NULL && old_key_copied) |
3179 | 0 | heap_freetuple(old_key_tuple); |
3180 | |
|
3181 | 0 | return TM_Ok; |
3182 | 0 | } |
3183 | | |
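A hypothetical caller-side sketch (the function name and DEBUG logging are illustrative, not an existing heapam API): it shows how the TM_FailureData filled in above is interpreted, with cmax meaningful only for TM_SelfModified and ctid pointing at the successor tuple version when the result is TM_Updated.

#include "postgres.h"
#include "access/tableam.h"
#include "storage/itemptr.h"

static void
report_heap_delete_failure(TM_Result result, TM_FailureData *tmfd)
{
    switch (result)
    {
        case TM_SelfModified:
            /* cmax is only valid in this case, per the comment above */
            elog(DEBUG1, "tuple already modified by command %u of this transaction",
                 tmfd->cmax);
            break;
        case TM_Updated:
            /* ctid points at the newer version written by xmax */
            elog(DEBUG1, "tuple updated by xid %u; successor at (%u,%u)",
                 tmfd->xmax,
                 ItemPointerGetBlockNumber(&tmfd->ctid),
                 ItemPointerGetOffsetNumber(&tmfd->ctid));
            break;
        case TM_Deleted:
            elog(DEBUG1, "tuple deleted by xid %u", tmfd->xmax);
            break;
        default:
            break;
    }
}
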
3184 | | /* |
3185 | | * simple_heap_delete - delete a tuple |
3186 | | * |
3187 | | * This routine may be used to delete a tuple when concurrent updates of |
3188 | | * the target tuple are not expected (for example, because we have a lock |
3189 | | * on the relation associated with the tuple). Any failure is reported |
3190 | | * via ereport(). |
3191 | | */ |
3192 | | void |
3193 | | simple_heap_delete(Relation relation, ItemPointer tid) |
3194 | 0 | { |
3195 | 0 | TM_Result result; |
3196 | 0 | TM_FailureData tmfd; |
3197 | |
|
3198 | 0 | result = heap_delete(relation, tid, |
3199 | 0 | GetCurrentCommandId(true), InvalidSnapshot, |
3200 | 0 | true /* wait for commit */ , |
3201 | 0 | &tmfd, false /* changingPart */ ); |
3202 | 0 | switch (result) |
3203 | 0 | { |
3204 | 0 | case TM_SelfModified: |
3205 | | /* Tuple was already updated in current command? */ |
3206 | 0 | elog(ERROR, "tuple already updated by self"); |
3207 | 0 | break; |
3208 | | |
3209 | 0 | case TM_Ok: |
3210 | | /* done successfully */ |
3211 | 0 | break; |
3212 | | |
3213 | 0 | case TM_Updated: |
3214 | 0 | elog(ERROR, "tuple concurrently updated"); |
3215 | 0 | break; |
3216 | | |
3217 | 0 | case TM_Deleted: |
3218 | 0 | elog(ERROR, "tuple concurrently deleted"); |
3219 | 0 | break; |
3220 | | |
3221 | 0 | default: |
3222 | 0 | elog(ERROR, "unrecognized heap_delete status: %u", result); |
3223 | 0 | break; |
3224 | 0 | } |
3225 | 0 | } |
3226 | | |
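A minimal usage sketch for simple_heap_delete(), assuming the caller holds a lock on the relation that rules out concurrent updates, as the comment above requires. The function name, relation OID, and scan keys are hypothetical placeholders; the pattern mirrors how catalog code deletes rows by TID. Any concurrent modification makes simple_heap_delete() error out, as the switch statement above shows.

#include "postgres.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/table.h"
#include "storage/lockdefs.h"
#include "utils/rel.h"

static void
delete_matching_rows(Oid reloid, int nkeys, ScanKey keys)
{
    Relation    rel = table_open(reloid, RowExclusiveLock);
    SysScanDesc scan;
    HeapTuple   tup;

    /* NULL snapshot: scan with the current catalog snapshot */
    scan = systable_beginscan(rel, InvalidOid, false, NULL, nkeys, keys);

    while (HeapTupleIsValid(tup = systable_getnext(scan)))
        simple_heap_delete(rel, &tup->t_self);  /* ereports on conflict */

    systable_endscan(scan);
    table_close(rel, RowExclusiveLock);
}
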
3227 | | /* |
3228 | | * heap_update - replace a tuple |
3229 | | * |
3230 | | * See table_tuple_update() for an explanation of the parameters, except that |
3231 | | * this routine directly takes a tuple rather than a slot. |
3232 | | * |
3233 | | * In the failure cases, the routine fills *tmfd with the tuple's t_ctid, |
3234 | | * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last |
3235 | | * only for TM_SelfModified, since we cannot obtain cmax from a combo CID |
3236 | | * generated by another transaction). |
3237 | | */ |
3238 | | TM_Result |
3239 | | heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, |
3240 | | CommandId cid, Snapshot crosscheck, bool wait, |
3241 | | TM_FailureData *tmfd, LockTupleMode *lockmode, |
3242 | | TU_UpdateIndexes *update_indexes) |
3243 | 0 | { |
3244 | 0 | TM_Result result; |
3245 | 0 | TransactionId xid = GetCurrentTransactionId(); |
3246 | 0 | Bitmapset *hot_attrs; |
3247 | 0 | Bitmapset *sum_attrs; |
3248 | 0 | Bitmapset *key_attrs; |
3249 | 0 | Bitmapset *id_attrs; |
3250 | 0 | Bitmapset *interesting_attrs; |
3251 | 0 | Bitmapset *modified_attrs; |
3252 | 0 | ItemId lp; |
3253 | 0 | HeapTupleData oldtup; |
3254 | 0 | HeapTuple heaptup; |
3255 | 0 | HeapTuple old_key_tuple = NULL; |
3256 | 0 | bool old_key_copied = false; |
3257 | 0 | Page page; |
3258 | 0 | BlockNumber block; |
3259 | 0 | MultiXactStatus mxact_status; |
3260 | 0 | Buffer buffer, |
3261 | 0 | newbuf, |
3262 | 0 | vmbuffer = InvalidBuffer, |
3263 | 0 | vmbuffer_new = InvalidBuffer; |
3264 | 0 | bool need_toast; |
3265 | 0 | Size newtupsize, |
3266 | 0 | pagefree; |
3267 | 0 | bool have_tuple_lock = false; |
3268 | 0 | bool iscombo; |
3269 | 0 | bool use_hot_update = false; |
3270 | 0 | bool summarized_update = false; |
3271 | 0 | bool key_intact; |
3272 | 0 | bool all_visible_cleared = false; |
3273 | 0 | bool all_visible_cleared_new = false; |
3274 | 0 | bool checked_lockers; |
3275 | 0 | bool locker_remains; |
3276 | 0 | bool id_has_external = false; |
3277 | 0 | TransactionId xmax_new_tuple, |
3278 | 0 | xmax_old_tuple; |
3279 | 0 | uint16 infomask_old_tuple, |
3280 | 0 | infomask2_old_tuple, |
3281 | 0 | infomask_new_tuple, |
3282 | 0 | infomask2_new_tuple; |
3283 | |
|
3284 | 0 | Assert(ItemPointerIsValid(otid)); |
3285 | | |
3286 | | /* Cheap, simplistic check that the tuple matches the rel's rowtype. */ |
3287 | 0 | Assert(HeapTupleHeaderGetNatts(newtup->t_data) <= |
3288 | 0 | RelationGetNumberOfAttributes(relation)); |
3289 | |
|
3290 | 0 | AssertHasSnapshotForToast(relation); |
3291 | | |
3292 | | /* |
3293 | | * Forbid this during a parallel operation, lest it allocate a combo CID. |
3294 | | * Other workers might need that combo CID for visibility checks, and we |
3295 | | * have no provision for broadcasting it to them. |
3296 | | */ |
3297 | 0 | if (IsInParallelMode()) |
3298 | 0 | ereport(ERROR, |
3299 | 0 | (errcode(ERRCODE_INVALID_TRANSACTION_STATE), |
3300 | 0 | errmsg("cannot update tuples during a parallel operation"))); |
3301 | | |
3302 | | #ifdef USE_ASSERT_CHECKING |
3303 | | check_lock_if_inplace_updateable_rel(relation, otid, newtup); |
3304 | | #endif |
3305 | | |
3306 | | /* |
3307 | | * Fetch the list of attributes to be checked for various operations. |
3308 | | * |
3309 | | * For HOT considerations, this is wasted effort if we fail to update or |
3310 | | * have to put the new tuple on a different page. But we must compute the |
3311 | | * list before obtaining buffer lock --- in the worst case, if we are |
3312 | | * doing an update on one of the relevant system catalogs, we could |
3313 | | * deadlock if we try to fetch the list later. In any case, the relcache |
3314 | | * caches the data so this is usually pretty cheap. |
3315 | | * |
3316 | | * We also need columns used by the replica identity and columns that are |
3317 | | * considered the "key" of rows in the table. |
3318 | | * |
3319 | | * Note that we get copies of each bitmap, so we need not worry about |
3320 | | * relcache flush happening midway through. |
3321 | | */ |
3322 | 0 | hot_attrs = RelationGetIndexAttrBitmap(relation, |
3323 | 0 | INDEX_ATTR_BITMAP_HOT_BLOCKING); |
3324 | 0 | sum_attrs = RelationGetIndexAttrBitmap(relation, |
3325 | 0 | INDEX_ATTR_BITMAP_SUMMARIZED); |
3326 | 0 | key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY); |
3327 | 0 | id_attrs = RelationGetIndexAttrBitmap(relation, |
3328 | 0 | INDEX_ATTR_BITMAP_IDENTITY_KEY); |
3329 | 0 | interesting_attrs = NULL; |
3330 | 0 | interesting_attrs = bms_add_members(interesting_attrs, hot_attrs); |
3331 | 0 | interesting_attrs = bms_add_members(interesting_attrs, sum_attrs); |
3332 | 0 | interesting_attrs = bms_add_members(interesting_attrs, key_attrs); |
3333 | 0 | interesting_attrs = bms_add_members(interesting_attrs, id_attrs); |
3334 | |
|
3335 | 0 | block = ItemPointerGetBlockNumber(otid); |
3336 | 0 | INJECTION_POINT("heap_update-before-pin", NULL); |
3337 | 0 | buffer = ReadBuffer(relation, block); |
3338 | 0 | page = BufferGetPage(buffer); |
3339 | | |
3340 | | /* |
3341 | | * Before locking the buffer, pin the visibility map page if it appears to |
3342 | | * be necessary. Since we haven't got the lock yet, someone else might be |
3343 | | * in the middle of changing this, so we'll need to recheck after we have |
3344 | | * the lock. |
3345 | | */ |
3346 | 0 | if (PageIsAllVisible(page)) |
3347 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
3348 | |
|
3349 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
3350 | |
|
3351 | 0 | lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid)); |
3352 | | |
3353 | | /* |
3354 | | * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring |
3355 | | * we see LP_NORMAL here. When the otid origin is a syscache, we may have |
3356 | | * neither a pin nor a snapshot. Hence, we may see other LP_ states, each |
3357 | | * of which indicates concurrent pruning. |
3358 | | * |
3359 | | * Failing with TM_Updated would be most accurate. However, unlike other |
3360 | | * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and |
3361 | | * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted |
3362 | | * does matter to SQL statements UPDATE and MERGE, those SQL statements |
3363 | | * hold a snapshot that ensures LP_NORMAL. Hence, the choice between |
3364 | | * TM_Updated and TM_Deleted affects only the wording of error messages. |
3365 | | * Settle on TM_Deleted, for two reasons. First, it avoids complicating |
3366 | | * the specification of when tmfd->ctid is valid. Second, it creates |
3367 | | * error log evidence that we took this branch. |
3368 | | * |
3369 | | * Since it's possible to see LP_UNUSED at otid, it's also possible to see |
3370 | | * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an |
3371 | | * unrelated row, we'll fail with "duplicate key value violates unique". |
3372 | | * XXX if otid is the live, newer version of the newtup row, we'll discard |
3373 | | * changes originating in versions of this catalog row after the version |
3374 | | * the caller got from syscache. See syscache-update-pruned.spec. |
3375 | | */ |
3376 | 0 | if (!ItemIdIsNormal(lp)) |
3377 | 0 | { |
3378 | 0 | Assert(RelationSupportsSysCache(RelationGetRelid(relation))); |
3379 | |
|
3380 | 0 | UnlockReleaseBuffer(buffer); |
3381 | 0 | Assert(!have_tuple_lock); |
3382 | 0 | if (vmbuffer != InvalidBuffer) |
3383 | 0 | ReleaseBuffer(vmbuffer); |
3384 | 0 | tmfd->ctid = *otid; |
3385 | 0 | tmfd->xmax = InvalidTransactionId; |
3386 | 0 | tmfd->cmax = InvalidCommandId; |
3387 | 0 | *update_indexes = TU_None; |
3388 | |
|
3389 | 0 | bms_free(hot_attrs); |
3390 | 0 | bms_free(sum_attrs); |
3391 | 0 | bms_free(key_attrs); |
3392 | 0 | bms_free(id_attrs); |
3393 | | /* modified_attrs not yet initialized */ |
3394 | 0 | bms_free(interesting_attrs); |
3395 | 0 | return TM_Deleted; |
3396 | 0 | } |
3397 | | |
3398 | | /* |
3399 | | * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work |
3400 | | * properly. |
3401 | | */ |
3402 | 0 | oldtup.t_tableOid = RelationGetRelid(relation); |
3403 | 0 | oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp); |
3404 | 0 | oldtup.t_len = ItemIdGetLength(lp); |
3405 | 0 | oldtup.t_self = *otid; |
3406 | | |
3407 | | /* the new tuple is ready, except for this: */ |
3408 | 0 | newtup->t_tableOid = RelationGetRelid(relation); |
3409 | | |
3410 | | /* |
3411 | | * Determine columns modified by the update. Additionally, identify |
3412 | | * whether any of the unmodified replica identity key attributes in the |
3413 | | * old tuple is externally stored or not. This is required because for |
3414 | | * such attributes the flattened value won't be WAL logged as part of the |
3415 | | * new tuple so we must include it as part of the old_key_tuple. See |
3416 | | * ExtractReplicaIdentity. |
3417 | | */ |
3418 | 0 | modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs, |
3419 | 0 | id_attrs, &oldtup, |
3420 | 0 | newtup, &id_has_external); |
3421 | | |
3422 | | /* |
3423 | | * If we're not updating any "key" column, we can grab a weaker lock type. |
3424 | | * This allows for more concurrency when we are running simultaneously |
3425 | | * with foreign key checks. |
3426 | | * |
3427 | | * Note that if a column gets detoasted while executing the update, but |
3428 | | * the value ends up being the same, this test will fail and we will use |
3429 | | * the stronger lock. This is acceptable; the important case to optimize |
3430 | | * is updates that don't manipulate key columns, not those that |
3431 | | * serendipitously arrive at the same key values. |
3432 | | */ |
3433 | 0 | if (!bms_overlap(modified_attrs, key_attrs)) |
3434 | 0 | { |
3435 | 0 | *lockmode = LockTupleNoKeyExclusive; |
3436 | 0 | mxact_status = MultiXactStatusNoKeyUpdate; |
3437 | 0 | key_intact = true; |
3438 | | |
3439 | | /* |
3440 | | * If this is the first possibly-multixact-able operation in the |
3441 | | * current transaction, set my per-backend OldestMemberMXactId |
3442 | | * setting. We can be certain that the transaction will never become a |
3443 | | * member of any older MultiXactIds than that. (We have to do this |
3444 | | * even if we end up just using our own TransactionId below, since |
3445 | | * some other backend could incorporate our XID into a MultiXact |
3446 | | * immediately afterwards.) |
3447 | | */ |
3448 | 0 | MultiXactIdSetOldestMember(); |
3449 | 0 | } |
3450 | 0 | else |
3451 | 0 | { |
3452 | 0 | *lockmode = LockTupleExclusive; |
3453 | 0 | mxact_status = MultiXactStatusUpdate; |
3454 | 0 | key_intact = false; |
3455 | 0 | } |
3456 | | |
3457 | | /* |
3458 | | * Note: beyond this point, use oldtup not otid to refer to old tuple. |
3459 | | * otid may very well point at newtup->t_self, which we will overwrite |
3460 | | * with the new tuple's location, so there's great risk of confusion if we |
3461 | | * use otid anymore. |
3462 | | */ |
3463 | |
|
3464 | 0 | l2: |
3465 | 0 | checked_lockers = false; |
3466 | 0 | locker_remains = false; |
3467 | 0 | result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer); |
3468 | | |
3469 | | /* see below about the "no wait" case */ |
3470 | 0 | Assert(result != TM_BeingModified || wait); |
3471 | |
|
3472 | 0 | if (result == TM_Invisible) |
3473 | 0 | { |
3474 | 0 | UnlockReleaseBuffer(buffer); |
3475 | 0 | ereport(ERROR, |
3476 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
3477 | 0 | errmsg("attempted to update invisible tuple"))); |
3478 | 0 | } |
3479 | 0 | else if (result == TM_BeingModified && wait) |
3480 | 0 | { |
3481 | 0 | TransactionId xwait; |
3482 | 0 | uint16 infomask; |
3483 | 0 | bool can_continue = false; |
3484 | | |
3485 | | /* |
3486 | | * XXX note that we don't consider the "no wait" case here. This |
3487 | | * isn't a problem currently because no caller uses that case, but it |
3488 | | * should be fixed if such a caller is introduced. It wasn't a |
3489 | | * problem previously because this code would always wait, but now |
3490 | | * that some tuple locks do not conflict with one of the lock modes we |
3491 | | * use, it is possible that this case is interesting to handle |
3492 | | * specially. |
3493 | | * |
3494 | | * This may cause failures with third-party code that calls |
3495 | | * heap_update directly. |
3496 | | */ |
3497 | | |
3498 | | /* must copy state data before unlocking buffer */ |
3499 | 0 | xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); |
3500 | 0 | infomask = oldtup.t_data->t_infomask; |
3501 | | |
3502 | | /* |
3503 | | * Now we have to do something about the existing locker. If it's a |
3504 | | * multi, sleep on it; we might be awakened before it is completely |
3505 | | * gone (or even not sleep at all in some cases); we need to preserve |
3506 | | * it as locker, unless it is gone completely. |
3507 | | * |
3508 | | * If it's not a multi, we need to check for sleeping conditions |
3509 | | * before actually going to sleep. If the update doesn't conflict |
3510 | | * with the locks, we just continue without sleeping (but making sure |
3511 | | * it is preserved). |
3512 | | * |
3513 | | * Before sleeping, we need to acquire tuple lock to establish our |
3514 | | * priority for the tuple (see heap_lock_tuple). LockTuple will |
3515 | | * release us when we are next-in-line for the tuple. Note we must |
3516 | | * not acquire the tuple lock until we're sure we're going to sleep; |
3517 | | * otherwise we're open for race conditions with other transactions |
3518 | | * holding the tuple lock which sleep on us. |
3519 | | * |
3520 | | * If we are forced to "start over" below, we keep the tuple lock; |
3521 | | * this arranges that we stay at the head of the line while rechecking |
3522 | | * tuple state. |
3523 | | */ |
3524 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
3525 | 0 | { |
3526 | 0 | TransactionId update_xact; |
3527 | 0 | int remain; |
3528 | 0 | bool current_is_member = false; |
3529 | |
|
3530 | 0 | if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, |
3531 | 0 | *lockmode, ¤t_is_member)) |
3532 | 0 | { |
3533 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3534 | | |
3535 | | /* |
3536 | | * Acquire the lock, if necessary (but skip it when we're |
3537 | | * requesting a lock and already have one; avoids deadlock). |
3538 | | */ |
3539 | 0 | if (!current_is_member) |
3540 | 0 | heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, |
3541 | 0 | LockWaitBlock, &have_tuple_lock); |
3542 | | |
3543 | | /* wait for multixact */ |
3544 | 0 | MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask, |
3545 | 0 | relation, &oldtup.t_self, XLTW_Update, |
3546 | 0 | &remain); |
3547 | 0 | checked_lockers = true; |
3548 | 0 | locker_remains = remain != 0; |
3549 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
3550 | | |
3551 | | /* |
3552 | | * If xwait had just locked the tuple then some other xact |
3553 | | * could update this tuple before we get to this point. Check |
3554 | | * for xmax change, and start over if so. |
3555 | | */ |
3556 | 0 | if (xmax_infomask_changed(oldtup.t_data->t_infomask, |
3557 | 0 | infomask) || |
3558 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data), |
3559 | 0 | xwait)) |
3560 | 0 | goto l2; |
3561 | 0 | } |
3562 | | |
3563 | | /* |
3564 | | * Note that the multixact may not be done by now. It could have |
3565 | | * surviving members; our own xact or other subxacts of this |
3566 | | * backend, and also any other concurrent transaction that locked |
3567 | | * the tuple with LockTupleKeyShare if we only got |
3568 | | * LockTupleNoKeyExclusive. If this is the case, we have to be |
3569 | | * careful to mark the updated tuple with the surviving members in |
3570 | | * Xmax. |
3571 | | * |
3572 | | * Note that there could have been another update in the |
3573 | | * MultiXact. In that case, we need to check whether it committed |
3574 | | * or aborted. If it aborted we are safe to update it again; |
3575 | | * otherwise there is an update conflict, and we have to return |
3576 | | * TM_{Updated, Deleted} below. |
3577 | | * |
3578 | | * In the LockTupleExclusive case, we still need to preserve the |
3579 | | * surviving members: those would include the tuple locks we had |
3580 | | * before this one, which are important to keep in case this |
3581 | | * subxact aborts. |
3582 | | */ |
3583 | 0 | if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask)) |
3584 | 0 | update_xact = HeapTupleGetUpdateXid(oldtup.t_data); |
3585 | 0 | else |
3586 | 0 | update_xact = InvalidTransactionId; |
3587 | | |
3588 | | /* |
3589 | | * There was no UPDATE in the MultiXact; or it aborted. No |
3590 | | * TransactionIdIsInProgress() call needed here, since we called |
3591 | | * MultiXactIdWait() above. |
3592 | | */ |
3593 | 0 | if (!TransactionIdIsValid(update_xact) || |
3594 | 0 | TransactionIdDidAbort(update_xact)) |
3595 | 0 | can_continue = true; |
3596 | 0 | } |
3597 | 0 | else if (TransactionIdIsCurrentTransactionId(xwait)) |
3598 | 0 | { |
3599 | | /* |
3600 | | * The only locker is ourselves; we can avoid grabbing the tuple |
3601 | | * lock here, but must preserve our locking information. |
3602 | | */ |
3603 | 0 | checked_lockers = true; |
3604 | 0 | locker_remains = true; |
3605 | 0 | can_continue = true; |
3606 | 0 | } |
3607 | 0 | else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact) |
3608 | 0 | { |
3609 | | /* |
3610 | | * If it's just a key-share locker, and we're not changing the key |
3611 | | * columns, we don't need to wait for it to end; but we need to |
3612 | | * preserve it as locker. |
3613 | | */ |
3614 | 0 | checked_lockers = true; |
3615 | 0 | locker_remains = true; |
3616 | 0 | can_continue = true; |
3617 | 0 | } |
3618 | 0 | else |
3619 | 0 | { |
3620 | | /* |
3621 | | * Wait for regular transaction to end; but first, acquire tuple |
3622 | | * lock. |
3623 | | */ |
3624 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3625 | 0 | heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode, |
3626 | 0 | LockWaitBlock, &have_tuple_lock); |
3627 | 0 | XactLockTableWait(xwait, relation, &oldtup.t_self, |
3628 | 0 | XLTW_Update); |
3629 | 0 | checked_lockers = true; |
3630 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
3631 | | |
3632 | | /* |
3633 | | * xwait is done, but if xwait had just locked the tuple then some |
3634 | | * other xact could update this tuple before we get to this point. |
3635 | | * Check for xmax change, and start over if so. |
3636 | | */ |
3637 | 0 | if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) || |
3638 | 0 | !TransactionIdEquals(xwait, |
3639 | 0 | HeapTupleHeaderGetRawXmax(oldtup.t_data))) |
3640 | 0 | goto l2; |
3641 | | |
3642 | | /* Otherwise check if it committed or aborted */ |
3643 | 0 | UpdateXmaxHintBits(oldtup.t_data, buffer, xwait); |
3644 | 0 | if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) |
3645 | 0 | can_continue = true; |
3646 | 0 | } |
3647 | | |
3648 | 0 | if (can_continue) |
3649 | 0 | result = TM_Ok; |
3650 | 0 | else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)) |
3651 | 0 | result = TM_Updated; |
3652 | 0 | else |
3653 | 0 | result = TM_Deleted; |
3654 | 0 | } |
3655 | | |
3656 | | /* Sanity check the result of HeapTupleSatisfiesUpdate() and the logic above */ |
3657 | 0 | if (result != TM_Ok) |
3658 | 0 | { |
3659 | 0 | Assert(result == TM_SelfModified || |
3660 | 0 | result == TM_Updated || |
3661 | 0 | result == TM_Deleted || |
3662 | 0 | result == TM_BeingModified); |
3663 | 0 | Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); |
3664 | 0 | Assert(result != TM_Updated || |
3665 | 0 | !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid)); |
3666 | 0 | } |
3667 | |
|
3668 | 0 | if (crosscheck != InvalidSnapshot && result == TM_Ok) |
3669 | 0 | { |
3670 | | /* Perform additional check for transaction-snapshot mode RI updates */ |
3671 | 0 | if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer)) |
3672 | 0 | result = TM_Updated; |
3673 | 0 | } |
3674 | |
|
3675 | 0 | if (result != TM_Ok) |
3676 | 0 | { |
3677 | 0 | tmfd->ctid = oldtup.t_data->t_ctid; |
3678 | 0 | tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data); |
3679 | 0 | if (result == TM_SelfModified) |
3680 | 0 | tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data); |
3681 | 0 | else |
3682 | 0 | tmfd->cmax = InvalidCommandId; |
3683 | 0 | UnlockReleaseBuffer(buffer); |
3684 | 0 | if (have_tuple_lock) |
3685 | 0 | UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); |
3686 | 0 | if (vmbuffer != InvalidBuffer) |
3687 | 0 | ReleaseBuffer(vmbuffer); |
3688 | 0 | *update_indexes = TU_None; |
3689 | |
|
3690 | 0 | bms_free(hot_attrs); |
3691 | 0 | bms_free(sum_attrs); |
3692 | 0 | bms_free(key_attrs); |
3693 | 0 | bms_free(id_attrs); |
3694 | 0 | bms_free(modified_attrs); |
3695 | 0 | bms_free(interesting_attrs); |
3696 | 0 | return result; |
3697 | 0 | } |
3698 | | |
3699 | | /* |
3700 | | * If we didn't pin the visibility map page and the page has become all |
3701 | | * visible while we were busy locking the buffer, or during some |
3702 | | * subsequent window during which we had it unlocked, we'll have to unlock |
3703 | | * and re-lock, to avoid holding the buffer lock across an I/O. That's a |
3704 | | * bit unfortunate, especially since we'll now have to recheck whether the |
3705 | | * tuple has been locked or updated under us, but hopefully it won't |
3706 | | * happen very often. |
3707 | | */ |
3708 | 0 | if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) |
3709 | 0 | { |
3710 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3711 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
3712 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
3713 | 0 | goto l2; |
3714 | 0 | } |
3715 | | |
3716 | | /* Fill in transaction status data */ |
3717 | | |
3718 | | /* |
3719 | | * If the tuple we're updating is locked, we need to preserve the locking |
3720 | | * info in the old tuple's Xmax. Prepare a new Xmax value for this. |
3721 | | */ |
3722 | 0 | compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), |
3723 | 0 | oldtup.t_data->t_infomask, |
3724 | 0 | oldtup.t_data->t_infomask2, |
3725 | 0 | xid, *lockmode, true, |
3726 | 0 | &xmax_old_tuple, &infomask_old_tuple, |
3727 | 0 | &infomask2_old_tuple); |
3728 | | |
3729 | | /* |
3730 | | * And also prepare an Xmax value for the new copy of the tuple. If there |
3731 | | * was no xmax previously, or there was one but all lockers are now gone, |
3732 | | * then use InvalidTransactionId; otherwise, get the xmax from the old |
3733 | | * tuple. (In rare cases that might also be InvalidTransactionId and yet |
3734 | | * not have the HEAP_XMAX_INVALID bit set; that's fine.) |
3735 | | */ |
3736 | 0 | if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) || |
3737 | 0 | HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) || |
3738 | 0 | (checked_lockers && !locker_remains)) |
3739 | 0 | xmax_new_tuple = InvalidTransactionId; |
3740 | 0 | else |
3741 | 0 | xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data); |
3742 | |
|
3743 | 0 | if (!TransactionIdIsValid(xmax_new_tuple)) |
3744 | 0 | { |
3745 | 0 | infomask_new_tuple = HEAP_XMAX_INVALID; |
3746 | 0 | infomask2_new_tuple = 0; |
3747 | 0 | } |
3748 | 0 | else |
3749 | 0 | { |
3750 | | /* |
3751 | | * If we found a valid Xmax for the new tuple, then the infomask bits |
3752 | | * to use on the new tuple depend on what was there on the old one. |
3753 | | * Note that since we're doing an update, the only possibility is that |
3754 | | * the lockers had FOR KEY SHARE lock. |
3755 | | */ |
3756 | 0 | if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI) |
3757 | 0 | { |
3758 | 0 | GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple, |
3759 | 0 | &infomask2_new_tuple); |
3760 | 0 | } |
3761 | 0 | else |
3762 | 0 | { |
3763 | 0 | infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY; |
3764 | 0 | infomask2_new_tuple = 0; |
3765 | 0 | } |
3766 | 0 | } |
3767 | | |
3768 | | /* |
3769 | | * Prepare the new tuple with the appropriate initial values of Xmin and |
3770 | | * Xmax, as well as initial infomask bits as computed above. |
3771 | | */ |
3772 | 0 | newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK); |
3773 | 0 | newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); |
3774 | 0 | HeapTupleHeaderSetXmin(newtup->t_data, xid); |
3775 | 0 | HeapTupleHeaderSetCmin(newtup->t_data, cid); |
3776 | 0 | newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple; |
3777 | 0 | newtup->t_data->t_infomask2 |= infomask2_new_tuple; |
3778 | 0 | HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple); |
3779 | | |
3780 | | /* |
3781 | | * Replace cid with a combo CID if necessary. Note that we already put |
3782 | | * the plain cid into the new tuple. |
3783 | | */ |
3784 | 0 | HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo); |
3785 | | |
3786 | | /* |
3787 | | * If the toaster needs to be activated, OR if the new tuple will not fit |
3788 | | * on the same page as the old, then we need to release the content lock |
3789 | | * (but not the pin!) on the old tuple's buffer while we are off doing |
3790 | | * TOAST and/or table-file-extension work. We must mark the old tuple to |
3791 | | * show that it's locked, else other processes may try to update it |
3792 | | * themselves. |
3793 | | * |
3794 | | * We need to invoke the toaster if there are already any out-of-line |
3795 | | * toasted values present, or if the new tuple is over-threshold. |
3796 | | */ |
3797 | 0 | if (relation->rd_rel->relkind != RELKIND_RELATION && |
3798 | 0 | relation->rd_rel->relkind != RELKIND_MATVIEW) |
3799 | 0 | { |
3800 | | /* toast table entries should never be recursively toasted */ |
3801 | 0 | Assert(!HeapTupleHasExternal(&oldtup)); |
3802 | 0 | Assert(!HeapTupleHasExternal(newtup)); |
3803 | 0 | need_toast = false; |
3804 | 0 | } |
3805 | 0 | else |
3806 | 0 | need_toast = (HeapTupleHasExternal(&oldtup) || |
3807 | 0 | HeapTupleHasExternal(newtup) || |
3808 | 0 | newtup->t_len > TOAST_TUPLE_THRESHOLD); |
3809 | |
|
3810 | 0 | pagefree = PageGetHeapFreeSpace(page); |
3811 | |
|
3812 | 0 | newtupsize = MAXALIGN(newtup->t_len); |
3813 | |
|
3814 | 0 | if (need_toast || newtupsize > pagefree) |
3815 | 0 | { |
3816 | 0 | TransactionId xmax_lock_old_tuple; |
3817 | 0 | uint16 infomask_lock_old_tuple, |
3818 | 0 | infomask2_lock_old_tuple; |
3819 | 0 | bool cleared_all_frozen = false; |
3820 | | |
3821 | | /* |
3822 | | * To prevent concurrent sessions from updating the tuple, we have to |
3823 | | * temporarily mark it locked, while we release the page-level lock. |
3824 | | * |
3825 | | * To satisfy the rule that any xid potentially appearing in a buffer |
3826 | | * written out to disk must be WAL-logged first, we unfortunately have to log this |
3827 | | * temporary modification. We can reuse xl_heap_lock for this |
3828 | | * purpose. If we crash/error before following through with the |
3829 | | * actual update, xmax will be of an aborted transaction, allowing |
3830 | | * other sessions to proceed. |
3831 | | */ |
3832 | | |
3833 | | /* |
3834 | | * Compute xmax / infomask appropriate for locking the tuple. This has |
3835 | | * to be done separately from the combo that's going to be used for |
3836 | | * updating, because the potentially created multixact would otherwise |
3837 | | * be wrong. |
3838 | | */ |
3839 | 0 | compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data), |
3840 | 0 | oldtup.t_data->t_infomask, |
3841 | 0 | oldtup.t_data->t_infomask2, |
3842 | 0 | xid, *lockmode, false, |
3843 | 0 | &xmax_lock_old_tuple, &infomask_lock_old_tuple, |
3844 | 0 | &infomask2_lock_old_tuple); |
3845 | |
|
3846 | 0 | Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple)); |
3847 | |
|
3848 | 0 | START_CRIT_SECTION(); |
3849 | | |
3850 | | /* Clear obsolete visibility flags ... */ |
3851 | 0 | oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); |
3852 | 0 | oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
3853 | 0 | HeapTupleClearHotUpdated(&oldtup); |
3854 | | /* ... and store info about transaction updating this tuple */ |
3855 | 0 | Assert(TransactionIdIsValid(xmax_lock_old_tuple)); |
3856 | 0 | HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple); |
3857 | 0 | oldtup.t_data->t_infomask |= infomask_lock_old_tuple; |
3858 | 0 | oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple; |
3859 | 0 | HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); |
3860 | | |
3861 | | /* temporarily make it look not-updated, but locked */ |
3862 | 0 | oldtup.t_data->t_ctid = oldtup.t_self; |
3863 | | |
3864 | | /* |
3865 | | * Clear all-frozen bit on visibility map if needed. We could |
3866 | | * immediately reset ALL_VISIBLE, but given that the WAL logging |
3867 | | * overhead would be unchanged, that doesn't seem necessarily |
3868 | | * worthwhile. |
3869 | | */ |
3870 | 0 | if (PageIsAllVisible(page) && |
3871 | 0 | visibilitymap_clear(relation, block, vmbuffer, |
3872 | 0 | VISIBILITYMAP_ALL_FROZEN)) |
3873 | 0 | cleared_all_frozen = true; |
3874 | |
|
3875 | 0 | MarkBufferDirty(buffer); |
3876 | |
|
3877 | 0 | if (RelationNeedsWAL(relation)) |
3878 | 0 | { |
3879 | 0 | xl_heap_lock xlrec; |
3880 | 0 | XLogRecPtr recptr; |
3881 | |
|
3882 | 0 | XLogBeginInsert(); |
3883 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
3884 | |
|
3885 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self); |
3886 | 0 | xlrec.xmax = xmax_lock_old_tuple; |
3887 | 0 | xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask, |
3888 | 0 | oldtup.t_data->t_infomask2); |
3889 | 0 | xlrec.flags = |
3890 | 0 | cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; |
3891 | 0 | XLogRegisterData(&xlrec, SizeOfHeapLock); |
3892 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); |
3893 | 0 | PageSetLSN(page, recptr); |
3894 | 0 | } |
3895 | |
|
3896 | 0 | END_CRIT_SECTION(); |
3897 | |
|
3898 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3899 | | |
3900 | | /* |
3901 | | * Let the toaster do its thing, if needed. |
3902 | | * |
3903 | | * Note: below this point, heaptup is the data we actually intend to |
3904 | | * store into the relation; newtup is the caller's original untoasted |
3905 | | * data. |
3906 | | */ |
3907 | 0 | if (need_toast) |
3908 | 0 | { |
3909 | | /* Note we always use WAL and FSM during updates */ |
3910 | 0 | heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0); |
3911 | 0 | newtupsize = MAXALIGN(heaptup->t_len); |
3912 | 0 | } |
3913 | 0 | else |
3914 | 0 | heaptup = newtup; |
3915 | | |
3916 | | /* |
3917 | | * Now, do we need a new page for the tuple, or not? This is a bit |
3918 | | * tricky since someone else could have added tuples to the page while |
3919 | | * we weren't looking. We have to recheck the available space after |
3920 | | * reacquiring the buffer lock. But don't bother to do that if the |
3921 | | * former amount of free space is still not enough; it's unlikely |
3922 | | * there's more free now than before. |
3923 | | * |
3924 | | * What's more, if we need to get a new page, we will need to acquire |
3925 | | * buffer locks on both old and new pages. To avoid deadlock against |
3926 | | * some other backend trying to get the same two locks in the other |
3927 | | * order, we must be consistent about the order we get the locks in. |
3928 | | * We use the rule "lock the lower-numbered page of the relation |
3929 | | * first". To implement this, we must do RelationGetBufferForTuple |
3930 | | * while not holding the lock on the old page, and we must rely on it |
3931 | | * to get the locks on both pages in the correct order. |
3932 | | * |
3933 | | * Another consideration is that we need visibility map page pin(s) if |
3934 | | * we will have to clear the all-visible flag on either page. If we |
3935 | | * call RelationGetBufferForTuple, we rely on it to acquire any such |
3936 | | * pins; but if we don't, we have to handle that here. Hence we need |
3937 | | * a loop. |
3938 | | */ |
3939 | 0 | for (;;) |
3940 | 0 | { |
3941 | 0 | if (newtupsize > pagefree) |
3942 | 0 | { |
3943 | | /* It doesn't fit, must use RelationGetBufferForTuple. */ |
3944 | 0 | newbuf = RelationGetBufferForTuple(relation, heaptup->t_len, |
3945 | 0 | buffer, 0, NULL, |
3946 | 0 | &vmbuffer_new, &vmbuffer, |
3947 | 0 | 0); |
3948 | | /* We're all done. */ |
3949 | 0 | break; |
3950 | 0 | } |
3951 | | /* Acquire VM page pin if needed and we don't have it. */ |
3952 | 0 | if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) |
3953 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
3954 | | /* Re-acquire the lock on the old tuple's page. */ |
3955 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
3956 | | /* Re-check using the up-to-date free space */ |
3957 | 0 | pagefree = PageGetHeapFreeSpace(page); |
3958 | 0 | if (newtupsize > pagefree || |
3959 | 0 | (vmbuffer == InvalidBuffer && PageIsAllVisible(page))) |
3960 | 0 | { |
3961 | | /* |
3962 | | * Rats, it doesn't fit anymore, or somebody just now set the |
3963 | | * all-visible flag. We must now unlock and loop to avoid |
3964 | | * deadlock. Fortunately, this path should seldom be taken. |
3965 | | */ |
3966 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
3967 | 0 | } |
3968 | 0 | else |
3969 | 0 | { |
3970 | | /* We're all done. */ |
3971 | 0 | newbuf = buffer; |
3972 | 0 | break; |
3973 | 0 | } |
3974 | 0 | } |
3975 | 0 | } |
3976 | 0 | else |
3977 | 0 | { |
3978 | | /* No TOAST work needed, and it'll fit on same page */ |
3979 | 0 | newbuf = buffer; |
3980 | 0 | heaptup = newtup; |
3981 | 0 | } |
3982 | | |
3983 | | /* |
3984 | | * We're about to do the actual update -- check for conflict first, to |
3985 | | * avoid possibly having to roll back work we've just done. |
3986 | | * |
3987 | | * This is safe without a recheck as long as there is no possibility of |
3988 | | * another process scanning the pages between this check and the update |
3989 | | * being visible to the scan (i.e., exclusive buffer content lock(s) are |
3990 | | * continuously held from this point until the tuple update is visible). |
3991 | | * |
3992 | | * For the new tuple the only check needed is at the relation level, but |
3993 | | * since both tuples are in the same relation and the check for oldtup |
3994 | | * will include checking the relation level, there is no benefit to a |
3995 | | * separate check for the new tuple. |
3996 | | */ |
3997 | 0 | CheckForSerializableConflictIn(relation, &oldtup.t_self, |
3998 | 0 | BufferGetBlockNumber(buffer)); |
3999 | | |
4000 | | /* |
4001 | | * At this point newbuf and buffer are both pinned and locked, and newbuf |
4002 | | * has enough space for the new tuple. If they are the same buffer, only |
4003 | | * one pin is held. |
4004 | | */ |
4005 | |
|
4006 | 0 | if (newbuf == buffer) |
4007 | 0 | { |
4008 | | /* |
4009 | | * Since the new tuple is going into the same page, we might be able |
4010 | | * to do a HOT update. Check if any of the index columns have been |
4011 | | * changed. |
4012 | | */ |
4013 | 0 | if (!bms_overlap(modified_attrs, hot_attrs)) |
4014 | 0 | { |
4015 | 0 | use_hot_update = true; |
4016 | | |
4017 | | /* |
4018 | | * None of the columns used in hot-blocking indexes were updated, |
4019 | | * so a HOT update is possible.  However, if any summarizing index |
4020 | | * columns were modified, those indexes must still be updated; |
4021 | | * otherwise we may fail to detect e.g. value bound changes in |
4022 | | * BRIN minmax indexes. |
4023 | | */ |
4024 | 0 | if (bms_overlap(modified_attrs, sum_attrs)) |
4025 | 0 | summarized_update = true; |
4026 | 0 | } |
4027 | 0 | } |
4028 | 0 | else |
4029 | 0 | { |
4030 | | /* Set a hint that the old page could use prune/defrag */ |
4031 | 0 | PageSetFull(page); |
4032 | 0 | } |
4033 | | |
4034 | | /* |
4035 | | * Compute replica identity tuple before entering the critical section so |
4036 | | * we don't PANIC upon a memory allocation failure. |
4037 | | * ExtractReplicaIdentity() will return NULL if nothing needs to be |
4038 | | * logged. Pass old key required as true only if the replica identity key |
4039 | | * columns are modified or it has external data. |
4040 | | */ |
4041 | 0 | old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, |
4042 | 0 | bms_overlap(modified_attrs, id_attrs) || |
4043 | 0 | id_has_external, |
4044 | 0 | &old_key_copied); |
4045 | | |
4046 | | /* NO EREPORT(ERROR) from here till changes are logged */ |
4047 | 0 | START_CRIT_SECTION(); |
4048 | | |
4049 | | /* |
4050 | | * If this transaction commits, the old tuple will become DEAD sooner or |
4051 | | * later. Set flag that this page is a candidate for pruning once our xid |
4052 | | * falls below the OldestXmin horizon. If the transaction finally aborts, |
4053 | | * the subsequent page pruning will be a no-op and the hint will be |
4054 | | * cleared. |
4055 | | * |
4056 | | * XXX Should we set hint on newbuf as well? If the transaction aborts, |
4057 | | * there would be a prunable tuple in the newbuf; but for now we choose |
4058 | | * not to optimize for aborts. Note that heap_xlog_update must be kept in |
4059 | | * sync if this decision changes. |
4060 | | */ |
4061 | 0 | PageSetPrunable(page, xid); |
4062 | |
|
4063 | 0 | if (use_hot_update) |
4064 | 0 | { |
4065 | | /* Mark the old tuple as HOT-updated */ |
4066 | 0 | HeapTupleSetHotUpdated(&oldtup); |
4067 | | /* And mark the new tuple as heap-only */ |
4068 | 0 | HeapTupleSetHeapOnly(heaptup); |
4069 | | /* Mark the caller's copy too, in case different from heaptup */ |
4070 | 0 | HeapTupleSetHeapOnly(newtup); |
4071 | 0 | } |
4072 | 0 | else |
4073 | 0 | { |
4074 | | /* Make sure tuples are correctly marked as not-HOT */ |
4075 | 0 | HeapTupleClearHotUpdated(&oldtup); |
4076 | 0 | HeapTupleClearHeapOnly(heaptup); |
4077 | 0 | HeapTupleClearHeapOnly(newtup); |
4078 | 0 | } |
4079 | |
|
4080 | 0 | RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */ |
4081 | | |
4082 | | |
4083 | | /* Clear obsolete visibility flags, possibly set by ourselves above... */ |
4084 | 0 | oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); |
4085 | 0 | oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
4086 | | /* ... and store info about transaction updating this tuple */ |
4087 | 0 | Assert(TransactionIdIsValid(xmax_old_tuple)); |
4088 | 0 | HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple); |
4089 | 0 | oldtup.t_data->t_infomask |= infomask_old_tuple; |
4090 | 0 | oldtup.t_data->t_infomask2 |= infomask2_old_tuple; |
4091 | 0 | HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo); |
4092 | | |
4093 | | /* record address of new tuple in t_ctid of old one */ |
4094 | 0 | oldtup.t_data->t_ctid = heaptup->t_self; |
4095 | | |
4096 | | /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */ |
4097 | 0 | if (PageIsAllVisible(BufferGetPage(buffer))) |
4098 | 0 | { |
4099 | 0 | all_visible_cleared = true; |
4100 | 0 | PageClearAllVisible(BufferGetPage(buffer)); |
4101 | 0 | visibilitymap_clear(relation, BufferGetBlockNumber(buffer), |
4102 | 0 | vmbuffer, VISIBILITYMAP_VALID_BITS); |
4103 | 0 | } |
4104 | 0 | if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf))) |
4105 | 0 | { |
4106 | 0 | all_visible_cleared_new = true; |
4107 | 0 | PageClearAllVisible(BufferGetPage(newbuf)); |
4108 | 0 | visibilitymap_clear(relation, BufferGetBlockNumber(newbuf), |
4109 | 0 | vmbuffer_new, VISIBILITYMAP_VALID_BITS); |
4110 | 0 | } |
4111 | |
|
4112 | 0 | if (newbuf != buffer) |
4113 | 0 | MarkBufferDirty(newbuf); |
4114 | 0 | MarkBufferDirty(buffer); |
4115 | | |
4116 | | /* XLOG stuff */ |
4117 | 0 | if (RelationNeedsWAL(relation)) |
4118 | 0 | { |
4119 | 0 | XLogRecPtr recptr; |
4120 | | |
4121 | | /* |
4122 | | * For logical decoding we need combo CIDs to properly decode the |
4123 | | * catalog. |
4124 | | */ |
4125 | 0 | if (RelationIsAccessibleInLogicalDecoding(relation)) |
4126 | 0 | { |
4127 | 0 | log_heap_new_cid(relation, &oldtup); |
4128 | 0 | log_heap_new_cid(relation, heaptup); |
4129 | 0 | } |
4130 | |
|
4131 | 0 | recptr = log_heap_update(relation, buffer, |
4132 | 0 | newbuf, &oldtup, heaptup, |
4133 | 0 | old_key_tuple, |
4134 | 0 | all_visible_cleared, |
4135 | 0 | all_visible_cleared_new); |
4136 | 0 | if (newbuf != buffer) |
4137 | 0 | { |
4138 | 0 | PageSetLSN(BufferGetPage(newbuf), recptr); |
4139 | 0 | } |
4140 | 0 | PageSetLSN(BufferGetPage(buffer), recptr); |
4141 | 0 | } |
4142 | |
|
4143 | 0 | END_CRIT_SECTION(); |
4144 | |
|
4145 | 0 | if (newbuf != buffer) |
4146 | 0 | LockBuffer(newbuf, BUFFER_LOCK_UNLOCK); |
4147 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
4148 | | |
4149 | | /* |
4150 | | * Mark old tuple for invalidation from system caches at next command |
4151 | | * boundary, and mark the new tuple for invalidation in case we abort. We |
4152 | | * have to do this before releasing the buffer because oldtup is in the |
4153 | | * buffer. (heaptup is all in local memory, but it's necessary to process |
4154 | | * both tuple versions in one call to inval.c so we can avoid redundant |
4155 | | * sinval messages.) |
4156 | | */ |
4157 | 0 | CacheInvalidateHeapTuple(relation, &oldtup, heaptup); |
4158 | | |
4159 | | /* Now we can release the buffer(s) */ |
4160 | 0 | if (newbuf != buffer) |
4161 | 0 | ReleaseBuffer(newbuf); |
4162 | 0 | ReleaseBuffer(buffer); |
4163 | 0 | if (BufferIsValid(vmbuffer_new)) |
4164 | 0 | ReleaseBuffer(vmbuffer_new); |
4165 | 0 | if (BufferIsValid(vmbuffer)) |
4166 | 0 | ReleaseBuffer(vmbuffer); |
4167 | | |
4168 | | /* |
4169 | | * Release the lmgr tuple lock, if we had it. |
4170 | | */ |
4171 | 0 | if (have_tuple_lock) |
4172 | 0 | UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode); |
4173 | |
|
4174 | 0 | pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer); |
4175 | | |
4176 | | /* |
4177 | | * If heaptup is a private copy, release it. Don't forget to copy t_self |
4178 | | * back to the caller's image, too. |
4179 | | */ |
4180 | 0 | if (heaptup != newtup) |
4181 | 0 | { |
4182 | 0 | newtup->t_self = heaptup->t_self; |
4183 | 0 | heap_freetuple(heaptup); |
4184 | 0 | } |
4185 | | |
4186 | | /* |
4187 | | * If it is a HOT update, the update may still need to update summarized |
4188 | | * indexes, lest we fail to update those summaries and get incorrect |
4189 | | * results (for example, minmax bounds of the block may change with this |
4190 | | * update). |
4191 | | */ |
4192 | 0 | if (use_hot_update) |
4193 | 0 | { |
4194 | 0 | if (summarized_update) |
4195 | 0 | *update_indexes = TU_Summarizing; |
4196 | 0 | else |
4197 | 0 | *update_indexes = TU_None; |
4198 | 0 | } |
4199 | 0 | else |
4200 | 0 | *update_indexes = TU_All; |
4201 | |
|
4202 | 0 | if (old_key_tuple != NULL && old_key_copied) |
4203 | 0 | heap_freetuple(old_key_tuple); |
4204 | |
|
4205 | 0 | bms_free(hot_attrs); |
4206 | 0 | bms_free(sum_attrs); |
4207 | 0 | bms_free(key_attrs); |
4208 | 0 | bms_free(id_attrs); |
4209 | 0 | bms_free(modified_attrs); |
4210 | 0 | bms_free(interesting_attrs); |
4211 | |
|
4212 | 0 | return TM_Ok; |
4213 | 0 | } |
4214 | | |
4215 | | #ifdef USE_ASSERT_CHECKING |
4216 | | /* |
4217 | | * Confirm adequate lock held during heap_update(), per rules from |
4218 | | * README.tuplock section "Locking to write inplace-updated tables". |
4219 | | */ |
4220 | | static void |
4221 | | check_lock_if_inplace_updateable_rel(Relation relation, |
4222 | | ItemPointer otid, |
4223 | | HeapTuple newtup) |
4224 | | { |
4225 | | /* LOCKTAG_TUPLE acceptable for any catalog */ |
4226 | | switch (RelationGetRelid(relation)) |
4227 | | { |
4228 | | case RelationRelationId: |
4229 | | case DatabaseRelationId: |
4230 | | { |
4231 | | LOCKTAG tuptag; |
4232 | | |
4233 | | SET_LOCKTAG_TUPLE(tuptag, |
4234 | | relation->rd_lockInfo.lockRelId.dbId, |
4235 | | relation->rd_lockInfo.lockRelId.relId, |
4236 | | ItemPointerGetBlockNumber(otid), |
4237 | | ItemPointerGetOffsetNumber(otid)); |
4238 | | if (LockHeldByMe(&tuptag, InplaceUpdateTupleLock, false)) |
4239 | | return; |
4240 | | } |
4241 | | break; |
4242 | | default: |
4243 | | Assert(!IsInplaceUpdateRelation(relation)); |
4244 | | return; |
4245 | | } |
4246 | | |
4247 | | switch (RelationGetRelid(relation)) |
4248 | | { |
4249 | | case RelationRelationId: |
4250 | | { |
4251 | | /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */ |
4252 | | Form_pg_class classForm = (Form_pg_class) GETSTRUCT(newtup); |
4253 | | Oid relid = classForm->oid; |
4254 | | Oid dbid; |
4255 | | LOCKTAG tag; |
4256 | | |
4257 | | if (IsSharedRelation(relid)) |
4258 | | dbid = InvalidOid; |
4259 | | else |
4260 | | dbid = MyDatabaseId; |
4261 | | |
4262 | | if (classForm->relkind == RELKIND_INDEX) |
4263 | | { |
4264 | | Relation irel = index_open(relid, AccessShareLock); |
4265 | | |
4266 | | SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid); |
4267 | | index_close(irel, AccessShareLock); |
4268 | | } |
4269 | | else |
4270 | | SET_LOCKTAG_RELATION(tag, dbid, relid); |
4271 | | |
4272 | | if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) && |
4273 | | !LockHeldByMe(&tag, ShareRowExclusiveLock, true)) |
4274 | | elog(WARNING, |
4275 | | "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)", |
4276 | | NameStr(classForm->relname), |
4277 | | relid, |
4278 | | classForm->relkind, |
4279 | | ItemPointerGetBlockNumber(otid), |
4280 | | ItemPointerGetOffsetNumber(otid)); |
4281 | | } |
4282 | | break; |
4283 | | case DatabaseRelationId: |
4284 | | { |
4285 | | /* LOCKTAG_TUPLE required */ |
4286 | | Form_pg_database dbForm = (Form_pg_database) GETSTRUCT(newtup); |
4287 | | |
4288 | | elog(WARNING, |
4289 | | "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)", |
4290 | | NameStr(dbForm->datname), |
4291 | | dbForm->oid, |
4292 | | ItemPointerGetBlockNumber(otid), |
4293 | | ItemPointerGetOffsetNumber(otid)); |
4294 | | } |
4295 | | break; |
4296 | | } |
4297 | | } |
4298 | | |
4299 | | /* |
4300 | | * Confirm adequate relation lock held, per rules from README.tuplock section |
4301 | | * "Locking to write inplace-updated tables". |
4302 | | */ |
4303 | | static void |
4304 | | check_inplace_rel_lock(HeapTuple oldtup) |
4305 | | { |
4306 | | Form_pg_class classForm = (Form_pg_class) GETSTRUCT(oldtup); |
4307 | | Oid relid = classForm->oid; |
4308 | | Oid dbid; |
4309 | | LOCKTAG tag; |
4310 | | |
4311 | | if (IsSharedRelation(relid)) |
4312 | | dbid = InvalidOid; |
4313 | | else |
4314 | | dbid = MyDatabaseId; |
4315 | | |
4316 | | if (classForm->relkind == RELKIND_INDEX) |
4317 | | { |
4318 | | Relation irel = index_open(relid, AccessShareLock); |
4319 | | |
4320 | | SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid); |
4321 | | index_close(irel, AccessShareLock); |
4322 | | } |
4323 | | else |
4324 | | SET_LOCKTAG_RELATION(tag, dbid, relid); |
4325 | | |
4326 | | if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true)) |
4327 | | elog(WARNING, |
4328 | | "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)", |
4329 | | NameStr(classForm->relname), |
4330 | | relid, |
4331 | | classForm->relkind, |
4332 | | ItemPointerGetBlockNumber(&oldtup->t_self), |
4333 | | ItemPointerGetOffsetNumber(&oldtup->t_self)); |
4334 | | } |
4335 | | #endif |
4336 | | |
4337 | | /* |
4338 | | * Check if the specified attribute's values are the same. Subroutine for |
4339 | | * HeapDetermineColumnsInfo. |
4340 | | */ |
4341 | | static bool |
4342 | | heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, |
4343 | | bool isnull1, bool isnull2) |
4344 | 0 | { |
4345 | | /* |
4346 | | * If one value is NULL and other is not, then they are certainly not |
4347 | | * equal |
4348 | | */ |
4349 | 0 | if (isnull1 != isnull2) |
4350 | 0 | return false; |
4351 | | |
4352 | | /* |
4353 | | * If both are NULL, they can be considered equal. |
4354 | | */ |
4355 | 0 | if (isnull1) |
4356 | 0 | return true; |
4357 | | |
4358 | | /* |
4359 | | * We do simple binary comparison of the two datums. This may be overly |
4360 | | * strict because there can be multiple binary representations for the |
4361 | | * same logical value. But we should be OK as long as there are no false |
4362 | | * positives. Using a type-specific equality operator is messy because |
4363 | | * there could be multiple notions of equality in different operator |
4364 | | * classes; furthermore, we cannot safely invoke user-defined functions |
4365 | | * while holding exclusive buffer lock. |
4366 | | */ |
4367 | 0 | if (attrnum <= 0) |
4368 | 0 | { |
4369 | | /* The only allowed system columns are OIDs, so do this */ |
4370 | 0 | return (DatumGetObjectId(value1) == DatumGetObjectId(value2)); |
4371 | 0 | } |
4372 | 0 | else |
4373 | 0 | { |
4374 | 0 | CompactAttribute *att; |
4375 | |
|
4376 | 0 | Assert(attrnum <= tupdesc->natts); |
4377 | 0 | att = TupleDescCompactAttr(tupdesc, attrnum - 1); |
4378 | 0 | return datumIsEqual(value1, value2, att->attbyval, att->attlen); |
4379 | 0 | } |
4380 | 0 | } |
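To make the binary-comparison rule above concrete, here is a minimal hypothetical sketch (not part of heapam.c) of the datumIsEqual() call that heap_attr_equals() relies on; the attribute metadata (attbyval, attlen) is assumed for a plain int4 column.

static bool
example_int4_binary_equal(int32 a, int32 b)
{
	/*
	 * Hypothetical illustration: compare two pass-by-value datums the way
	 * heap_attr_equals() does.  Reporting "not equal" for logically equal
	 * values is acceptable (overly strict); reporting "equal" for distinct
	 * values would not be.
	 */
	return datumIsEqual(Int32GetDatum(a), Int32GetDatum(b),
						true /* attbyval */ , sizeof(int32) /* attlen */ );
}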
4381 | | |
4382 | | /* |
4383 | | * Check which columns are being updated. |
4384 | | * |
4385 | | * Given an updated tuple, determine (and return into the output bitmapset), |
4386 | | * from those listed as interesting, the set of columns that changed. |
4387 | | * |
4388 | | * has_external indicates if any of the unmodified attributes (from those |
4389 | | * listed as interesting) of the old tuple is a member of external_cols and is |
4390 | | * stored externally. |
4391 | | */ |
4392 | | static Bitmapset * |
4393 | | HeapDetermineColumnsInfo(Relation relation, |
4394 | | Bitmapset *interesting_cols, |
4395 | | Bitmapset *external_cols, |
4396 | | HeapTuple oldtup, HeapTuple newtup, |
4397 | | bool *has_external) |
4398 | 0 | { |
4399 | 0 | int attidx; |
4400 | 0 | Bitmapset *modified = NULL; |
4401 | 0 | TupleDesc tupdesc = RelationGetDescr(relation); |
4402 | |
|
4403 | 0 | attidx = -1; |
4404 | 0 | while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0) |
4405 | 0 | { |
4406 | | /* attidx is zero-based, attrnum is the normal attribute number */ |
4407 | 0 | AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber; |
4408 | 0 | Datum value1, |
4409 | 0 | value2; |
4410 | 0 | bool isnull1, |
4411 | 0 | isnull2; |
4412 | | |
4413 | | /* |
4414 | | * If it's a whole-tuple reference, say "not equal". It's not really |
4415 | | * worth supporting this case, since it could only succeed after a |
4416 | | * no-op update, which is hardly a case worth optimizing for. |
4417 | | */ |
4418 | 0 | if (attrnum == 0) |
4419 | 0 | { |
4420 | 0 | modified = bms_add_member(modified, attidx); |
4421 | 0 | continue; |
4422 | 0 | } |
4423 | | |
4424 | | /* |
4425 | | * Likewise, automatically say "not equal" for any system attribute |
4426 | | * other than tableOID; we cannot expect these to be consistent in a |
4427 | | * HOT chain, or even to be set correctly yet in the new tuple. |
4428 | | */ |
4429 | 0 | if (attrnum < 0) |
4430 | 0 | { |
4431 | 0 | if (attrnum != TableOidAttributeNumber) |
4432 | 0 | { |
4433 | 0 | modified = bms_add_member(modified, attidx); |
4434 | 0 | continue; |
4435 | 0 | } |
4436 | 0 | } |
4437 | | |
4438 | | /* |
4439 | | * Extract the corresponding values. XXX this is pretty inefficient |
4440 | | * if there are many indexed columns. Should we do a single |
4441 | | * heap_deform_tuple call on each tuple, instead? But that doesn't |
4442 | | * work for system columns ... |
4443 | | */ |
4444 | 0 | value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1); |
4445 | 0 | value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2); |
4446 | |
|
4447 | 0 | if (!heap_attr_equals(tupdesc, attrnum, value1, |
4448 | 0 | value2, isnull1, isnull2)) |
4449 | 0 | { |
4450 | 0 | modified = bms_add_member(modified, attidx); |
4451 | 0 | continue; |
4452 | 0 | } |
4453 | | |
4454 | | /* |
4455 | | * No need to check attributes that can't be stored externally. Note |
4456 | | * that system attributes can't be stored externally. |
4457 | | */ |
4458 | 0 | if (attrnum < 0 || isnull1 || |
4459 | 0 | TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1) |
4460 | 0 | continue; |
4461 | | |
4462 | | /* |
4463 | | * Check if the old tuple's attribute is stored externally and is a |
4464 | | * member of external_cols. |
4465 | | */ |
4466 | 0 | if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) && |
4467 | 0 | bms_is_member(attidx, external_cols)) |
4468 | 0 | *has_external = true; |
4469 | 0 | } |
4470 | |
|
4471 | 0 | return modified; |
4472 | 0 | } |
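As a reminder of the offset convention used above (a bitmapset member is attnum - FirstLowInvalidHeapAttributeNumber, so system columns can be represented too), here is a hypothetical sketch of how a caller could build such a set; the attribute numbers are made up for illustration.

static Bitmapset *
example_interesting_cols(void)
{
	Bitmapset  *cols = NULL;

	/* mark user columns 1 and 3 as interesting (attnums chosen arbitrarily) */
	cols = bms_add_member(cols, 1 - FirstLowInvalidHeapAttributeNumber);
	cols = bms_add_member(cols, 3 - FirstLowInvalidHeapAttributeNumber);

	return cols;
}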
4473 | | |
4474 | | /* |
4475 | | * simple_heap_update - replace a tuple |
4476 | | * |
4477 | | * This routine may be used to update a tuple when concurrent updates of |
4478 | | * the target tuple are not expected (for example, because we have a lock |
4479 | | * on the relation associated with the tuple). Any failure is reported |
4480 | | * via ereport(). |
4481 | | */ |
4482 | | void |
4483 | | simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup, |
4484 | | TU_UpdateIndexes *update_indexes) |
4485 | 0 | { |
4486 | 0 | TM_Result result; |
4487 | 0 | TM_FailureData tmfd; |
4488 | 0 | LockTupleMode lockmode; |
4489 | |
|
4490 | 0 | result = heap_update(relation, otid, tup, |
4491 | 0 | GetCurrentCommandId(true), InvalidSnapshot, |
4492 | 0 | true /* wait for commit */ , |
4493 | 0 | &tmfd, &lockmode, update_indexes); |
4494 | 0 | switch (result) |
4495 | 0 | { |
4496 | 0 | case TM_SelfModified: |
4497 | | /* Tuple was already updated in current command? */ |
4498 | 0 | elog(ERROR, "tuple already updated by self"); |
4499 | 0 | break; |
4500 | | |
4501 | 0 | case TM_Ok: |
4502 | | /* done successfully */ |
4503 | 0 | break; |
4504 | | |
4505 | 0 | case TM_Updated: |
4506 | 0 | elog(ERROR, "tuple concurrently updated"); |
4507 | 0 | break; |
4508 | | |
4509 | 0 | case TM_Deleted: |
4510 | 0 | elog(ERROR, "tuple concurrently deleted"); |
4511 | 0 | break; |
4512 | | |
4513 | 0 | default: |
4514 | 0 | elog(ERROR, "unrecognized heap_update status: %u", result); |
4515 | 0 | break; |
4516 | 0 | } |
4517 | 0 | } |
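A minimal hypothetical caller sketch (not part of this file) showing how simple_heap_update() and its update_indexes hint are meant to be used; the relation and tuples are assumed to be prepared, and locked against concurrent updates, by the caller.

static void
example_simple_update(Relation rel, HeapTuple oldtup, HeapTuple newtup)
{
	TU_UpdateIndexes update_indexes;

	simple_heap_update(rel, &oldtup->t_self, newtup, &update_indexes);

	/*
	 * TU_None:        HOT update, no new index entries are needed
	 * TU_Summarizing: only summarizing (e.g. BRIN) indexes need new entries
	 * TU_All:         insert entries into all indexes
	 */
	if (update_indexes != TU_None)
		elog(DEBUG2, "index maintenance needed (%d)", (int) update_indexes);
}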
4518 | | |
4519 | | |
4520 | | /* |
4521 | | * Return the MultiXactStatus corresponding to the given tuple lock mode. |
4522 | | */ |
4523 | | static MultiXactStatus |
4524 | | get_mxact_status_for_lock(LockTupleMode mode, bool is_update) |
4525 | 0 | { |
4526 | 0 | int retval; |
4527 | |
|
4528 | 0 | if (is_update) |
4529 | 0 | retval = tupleLockExtraInfo[mode].updstatus; |
4530 | 0 | else |
4531 | 0 | retval = tupleLockExtraInfo[mode].lockstatus; |
4532 | |
|
4533 | 0 | if (retval == -1) |
4534 | 0 | elog(ERROR, "invalid lock tuple mode %d/%s", mode, |
4535 | 0 | is_update ? "true" : "false"); |
4536 | | |
4537 | 0 | return (MultiXactStatus) retval; |
4538 | 0 | } |
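For illustration only, a hypothetical call showing the mapping this helper performs; the expected result is an assumption based on the lock/status tables it consults.

static MultiXactStatus
example_share_lock_status(void)
{
	/* expected to return MultiXactStatusForShare (assumption, for illustration) */
	return get_mxact_status_for_lock(LockTupleShare, false);
}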
4539 | | |
4540 | | /* |
4541 | | * heap_lock_tuple - lock a tuple in shared or exclusive mode |
4542 | | * |
4543 | | * Note that this acquires a buffer pin, which the caller must release. |
4544 | | * |
4545 | | * Input parameters: |
4546 | | * relation: relation containing tuple (caller must hold suitable lock) |
4547 | | * tid: TID of tuple to lock |
4548 | | * cid: current command ID (used for visibility test, and stored into |
4549 | | * tuple's cmax if lock is successful) |
4550 | | * mode: indicates if shared or exclusive tuple lock is desired |
4551 | | * wait_policy: what to do if tuple lock is not available |
4552 | | * follow_updates: if true, follow the update chain to also lock descendant |
4553 | | * tuples. |
4554 | | * |
4555 | | * Output parameters: |
4556 | | * *tuple: all fields filled in |
4557 | | * *buffer: set to buffer holding tuple (pinned but not locked at exit) |
4558 | | * *tmfd: filled in failure cases (see below) |
4559 | | * |
4560 | | * Function results are the same as the ones for table_tuple_lock(). |
4561 | | * |
4562 | | * In the failure cases other than TM_Invisible, the routine fills |
4563 | | * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact, |
4564 | | * if necessary), and t_cmax (the last only for TM_SelfModified, |
4565 | | * since we cannot obtain cmax from a combo CID generated by another |
4566 | | * transaction). |
4567 | | * See comments for struct TM_FailureData for additional info. |
4568 | | * |
4569 | | * See README.tuplock for a thorough explanation of this mechanism. |
4570 | | */ |
4571 | | TM_Result |
4572 | | heap_lock_tuple(Relation relation, HeapTuple tuple, |
4573 | | CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, |
4574 | | bool follow_updates, |
4575 | | Buffer *buffer, TM_FailureData *tmfd) |
4576 | 0 | { |
4577 | 0 | TM_Result result; |
4578 | 0 | ItemPointer tid = &(tuple->t_self); |
4579 | 0 | ItemId lp; |
4580 | 0 | Page page; |
4581 | 0 | Buffer vmbuffer = InvalidBuffer; |
4582 | 0 | BlockNumber block; |
4583 | 0 | TransactionId xid, |
4584 | 0 | xmax; |
4585 | 0 | uint16 old_infomask, |
4586 | 0 | new_infomask, |
4587 | 0 | new_infomask2; |
4588 | 0 | bool first_time = true; |
4589 | 0 | bool skip_tuple_lock = false; |
4590 | 0 | bool have_tuple_lock = false; |
4591 | 0 | bool cleared_all_frozen = false; |
4592 | |
|
4593 | 0 | *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); |
4594 | 0 | block = ItemPointerGetBlockNumber(tid); |
4595 | | |
4596 | | /* |
4597 | | * Before locking the buffer, pin the visibility map page if it appears to |
4598 | | * be necessary. Since we haven't got the lock yet, someone else might be |
4599 | | * in the middle of changing this, so we'll need to recheck after we have |
4600 | | * the lock. |
4601 | | */ |
4602 | 0 | if (PageIsAllVisible(BufferGetPage(*buffer))) |
4603 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
4604 | |
|
4605 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4606 | |
|
4607 | 0 | page = BufferGetPage(*buffer); |
4608 | 0 | lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); |
4609 | 0 | Assert(ItemIdIsNormal(lp)); |
4610 | |
|
4611 | 0 | tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); |
4612 | 0 | tuple->t_len = ItemIdGetLength(lp); |
4613 | 0 | tuple->t_tableOid = RelationGetRelid(relation); |
4614 | |
|
4615 | 0 | l3: |
4616 | 0 | result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer); |
4617 | |
|
4618 | 0 | if (result == TM_Invisible) |
4619 | 0 | { |
4620 | | /* |
4621 | | * This is possible, but only when locking a tuple for ON CONFLICT |
4622 | | * UPDATE. We return this value here rather than throwing an error in |
4623 | | * order to give that case the opportunity to throw a more specific |
4624 | | * error. |
4625 | | */ |
4626 | 0 | result = TM_Invisible; |
4627 | 0 | goto out_locked; |
4628 | 0 | } |
4629 | 0 | else if (result == TM_BeingModified || |
4630 | 0 | result == TM_Updated || |
4631 | 0 | result == TM_Deleted) |
4632 | 0 | { |
4633 | 0 | TransactionId xwait; |
4634 | 0 | uint16 infomask; |
4635 | 0 | uint16 infomask2; |
4636 | 0 | bool require_sleep; |
4637 | 0 | ItemPointerData t_ctid; |
4638 | | |
4639 | | /* must copy state data before unlocking buffer */ |
4640 | 0 | xwait = HeapTupleHeaderGetRawXmax(tuple->t_data); |
4641 | 0 | infomask = tuple->t_data->t_infomask; |
4642 | 0 | infomask2 = tuple->t_data->t_infomask2; |
4643 | 0 | ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid); |
4644 | |
|
4645 | 0 | LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); |
4646 | | |
4647 | | /* |
4648 | | * If any subtransaction of the current top transaction already holds |
4649 | | * a lock as strong as or stronger than what we're requesting, we |
4650 | | * effectively hold the desired lock already. We *must* succeed |
4651 | | * without trying to take the tuple lock, else we will deadlock |
4652 | | * against anyone wanting to acquire a stronger lock. |
4653 | | * |
4654 | | * Note we only do this the first time we loop on the HTSU result; |
4655 | | * there is no point in testing in subsequent passes, because |
4656 | | * evidently our own transaction cannot have acquired a new lock after |
4657 | | * the first time we checked. |
4658 | | */ |
4659 | 0 | if (first_time) |
4660 | 0 | { |
4661 | 0 | first_time = false; |
4662 | |
|
4663 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
4664 | 0 | { |
4665 | 0 | int i; |
4666 | 0 | int nmembers; |
4667 | 0 | MultiXactMember *members; |
4668 | | |
4669 | | /* |
4670 | | * We don't need to allow old multixacts here; if that had |
4671 | | * been the case, HeapTupleSatisfiesUpdate would have returned |
4672 | | * MayBeUpdated and we wouldn't be here. |
4673 | | */ |
4674 | 0 | nmembers = |
4675 | 0 | GetMultiXactIdMembers(xwait, &members, false, |
4676 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(infomask)); |
4677 | |
|
4678 | 0 | for (i = 0; i < nmembers; i++) |
4679 | 0 | { |
4680 | | /* only consider members of our own transaction */ |
4681 | 0 | if (!TransactionIdIsCurrentTransactionId(members[i].xid)) |
4682 | 0 | continue; |
4683 | | |
4684 | 0 | if (TUPLOCK_from_mxstatus(members[i].status) >= mode) |
4685 | 0 | { |
4686 | 0 | pfree(members); |
4687 | 0 | result = TM_Ok; |
4688 | 0 | goto out_unlocked; |
4689 | 0 | } |
4690 | 0 | else |
4691 | 0 | { |
4692 | | /* |
4693 | | * Disable acquisition of the heavyweight tuple lock. |
4694 | | * Otherwise, when promoting a weaker lock, we might |
4695 | | * deadlock with another locker that has acquired the |
4696 | | * heavyweight tuple lock and is waiting for our |
4697 | | * transaction to finish. |
4698 | | * |
4699 | | * Note that in this case we still need to wait for |
4700 | | * the multixact if required, to avoid acquiring |
4701 | | * conflicting locks. |
4702 | | */ |
4703 | 0 | skip_tuple_lock = true; |
4704 | 0 | } |
4705 | 0 | } |
4706 | | |
4707 | 0 | if (members) |
4708 | 0 | pfree(members); |
4709 | 0 | } |
4710 | 0 | else if (TransactionIdIsCurrentTransactionId(xwait)) |
4711 | 0 | { |
4712 | 0 | switch (mode) |
4713 | 0 | { |
4714 | 0 | case LockTupleKeyShare: |
4715 | 0 | Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) || |
4716 | 0 | HEAP_XMAX_IS_SHR_LOCKED(infomask) || |
4717 | 0 | HEAP_XMAX_IS_EXCL_LOCKED(infomask)); |
4718 | 0 | result = TM_Ok; |
4719 | 0 | goto out_unlocked; |
4720 | 0 | case LockTupleShare: |
4721 | 0 | if (HEAP_XMAX_IS_SHR_LOCKED(infomask) || |
4722 | 0 | HEAP_XMAX_IS_EXCL_LOCKED(infomask)) |
4723 | 0 | { |
4724 | 0 | result = TM_Ok; |
4725 | 0 | goto out_unlocked; |
4726 | 0 | } |
4727 | 0 | break; |
4728 | 0 | case LockTupleNoKeyExclusive: |
4729 | 0 | if (HEAP_XMAX_IS_EXCL_LOCKED(infomask)) |
4730 | 0 | { |
4731 | 0 | result = TM_Ok; |
4732 | 0 | goto out_unlocked; |
4733 | 0 | } |
4734 | 0 | break; |
4735 | 0 | case LockTupleExclusive: |
4736 | 0 | if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) && |
4737 | 0 | infomask2 & HEAP_KEYS_UPDATED) |
4738 | 0 | { |
4739 | 0 | result = TM_Ok; |
4740 | 0 | goto out_unlocked; |
4741 | 0 | } |
4742 | 0 | break; |
4743 | 0 | } |
4744 | 0 | } |
4745 | 0 | } |
4746 | | |
4747 | | /* |
4748 | | * Initially assume that we will have to wait for the locking |
4749 | | * transaction(s) to finish. We check various cases below in which |
4750 | | * this can be turned off. |
4751 | | */ |
4752 | 0 | require_sleep = true; |
4753 | 0 | if (mode == LockTupleKeyShare) |
4754 | 0 | { |
4755 | | /* |
4756 | | * If we're requesting KeyShare, and there's no update present, we |
4757 | | * don't need to wait. Even if there is an update, we can still |
4758 | | * continue if the key hasn't been modified. |
4759 | | * |
4760 | | * However, if there are updates, we need to walk the update chain |
4761 | | * to mark future versions of the row as locked, too. That way, |
4762 | | * if somebody deletes that future version, we're protected |
4763 | | * against the key going away. This locking of future versions |
4764 | | * could block momentarily, if a concurrent transaction is |
4765 | | * deleting a key; or it could return a value to the effect that |
4766 | | * the transaction deleting the key has already committed. So we |
4767 | | * do this before re-locking the buffer; otherwise this would be |
4768 | | * prone to deadlocks. |
4769 | | * |
4770 | | * Note that the TID we're locking was grabbed before we unlocked |
4771 | | * the buffer. For it to change while we're not looking, the |
4772 | | * other properties we're testing for below after re-locking the |
4773 | | * buffer would also change, in which case we would restart this |
4774 | | * loop above. |
4775 | | */ |
4776 | 0 | if (!(infomask2 & HEAP_KEYS_UPDATED)) |
4777 | 0 | { |
4778 | 0 | bool updated; |
4779 | |
|
4780 | 0 | updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask); |
4781 | | |
4782 | | /* |
4783 | | * If there are updates, follow the update chain; bail out if |
4784 | | * that cannot be done. |
4785 | | */ |
4786 | 0 | if (follow_updates && updated) |
4787 | 0 | { |
4788 | 0 | TM_Result res; |
4789 | |
|
4790 | 0 | res = heap_lock_updated_tuple(relation, tuple, &t_ctid, |
4791 | 0 | GetCurrentTransactionId(), |
4792 | 0 | mode); |
4793 | 0 | if (res != TM_Ok) |
4794 | 0 | { |
4795 | 0 | result = res; |
4796 | | /* recovery code expects to have buffer lock held */ |
4797 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4798 | 0 | goto failed; |
4799 | 0 | } |
4800 | 0 | } |
4801 | | |
4802 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4803 | | |
4804 | | /* |
4805 | | * Make sure it's still an appropriate lock, else start over. |
4806 | | * Also, if it wasn't updated before we released the lock, but |
4807 | | * is updated now, we start over too; the reason is that we |
4808 | | * now need to follow the update chain to lock the new |
4809 | | * versions. |
4810 | | */ |
4811 | 0 | if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) && |
4812 | 0 | ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) || |
4813 | 0 | !updated)) |
4814 | 0 | goto l3; |
4815 | | |
4816 | | /* Things look okay, so we can skip sleeping */ |
4817 | 0 | require_sleep = false; |
4818 | | |
4819 | | /* |
4820 | | * Note we allow Xmax to change here; other updaters/lockers |
4821 | | * could have modified it before we grabbed the buffer lock. |
4822 | | * However, this is not a problem, because with the recheck we |
4823 | | * just did we ensure that they still don't conflict with the |
4824 | | * lock we want. |
4825 | | */ |
4826 | 0 | } |
4827 | 0 | } |
4828 | 0 | else if (mode == LockTupleShare) |
4829 | 0 | { |
4830 | | /* |
4831 | | * If we're requesting Share, we can similarly avoid sleeping if |
4832 | | * there's no update and no exclusive lock present. |
4833 | | */ |
4834 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) && |
4835 | 0 | !HEAP_XMAX_IS_EXCL_LOCKED(infomask)) |
4836 | 0 | { |
4837 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4838 | | |
4839 | | /* |
4840 | | * Make sure it's still an appropriate lock, else start over. |
4841 | | * See above about allowing xmax to change. |
4842 | | */ |
4843 | 0 | if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || |
4844 | 0 | HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask)) |
4845 | 0 | goto l3; |
4846 | 0 | require_sleep = false; |
4847 | 0 | } |
4848 | 0 | } |
4849 | 0 | else if (mode == LockTupleNoKeyExclusive) |
4850 | 0 | { |
4851 | | /* |
4852 | | * If we're requesting NoKeyExclusive, we might also be able to |
4853 | | * avoid sleeping; just ensure that there no conflicting lock |
4854 | | * already acquired. |
4855 | | */ |
4856 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
4857 | 0 | { |
4858 | 0 | if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask, |
4859 | 0 | mode, NULL)) |
4860 | 0 | { |
4861 | | /* |
4862 | | * No conflict, but if the xmax changed under us in the |
4863 | | * meantime, start over. |
4864 | | */ |
4865 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4866 | 0 | if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || |
4867 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), |
4868 | 0 | xwait)) |
4869 | 0 | goto l3; |
4870 | | |
4871 | | /* otherwise, we're good */ |
4872 | 0 | require_sleep = false; |
4873 | 0 | } |
4874 | 0 | } |
4875 | 0 | else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) |
4876 | 0 | { |
4877 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4878 | | |
4879 | | /* if the xmax changed in the meantime, start over */ |
4880 | 0 | if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || |
4881 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), |
4882 | 0 | xwait)) |
4883 | 0 | goto l3; |
4884 | | /* otherwise, we're good */ |
4885 | 0 | require_sleep = false; |
4886 | 0 | } |
4887 | 0 | } |
4888 | | |
4889 | | /* |
4890 | | * As a check independent from those above, we can also avoid sleeping |
4891 | | * if the current transaction is the sole locker of the tuple. Note |
4892 | | * that the strength of the lock already held is irrelevant; this is |
4893 | | * not about recording the lock in Xmax (which will be done regardless |
4894 | | * of this optimization, below). Also, note that the cases where we |
4895 | | * hold a lock stronger than we are requesting are already handled |
4896 | | * above by not doing anything. |
4897 | | * |
4898 | | * Note we only deal with the non-multixact case here; MultiXactIdWait |
4899 | | * is well equipped to deal with this situation on its own. |
4900 | | */ |
4901 | 0 | if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) && |
4902 | 0 | TransactionIdIsCurrentTransactionId(xwait)) |
4903 | 0 | { |
4904 | | /* ... but if the xmax changed in the meantime, start over */ |
4905 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4906 | 0 | if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || |
4907 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), |
4908 | 0 | xwait)) |
4909 | 0 | goto l3; |
4910 | 0 | Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask)); |
4911 | 0 | require_sleep = false; |
4912 | 0 | } |
4913 | | |
4914 | | /* |
4915 | | * Time to sleep on the other transaction/multixact, if necessary. |
4916 | | * |
4917 | | * If the other transaction is an update/delete that's already |
4918 | | * committed, then sleeping cannot possibly do any good: if we're |
4919 | | * required to sleep, get out to raise an error instead. |
4920 | | * |
4921 | | * By here, we either have already acquired the buffer exclusive lock, |
4922 | | * or we must wait for the locking transaction or multixact; so below |
4923 | | * we ensure that we grab buffer lock after the sleep. |
4924 | | */ |
4925 | 0 | if (require_sleep && (result == TM_Updated || result == TM_Deleted)) |
4926 | 0 | { |
4927 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4928 | 0 | goto failed; |
4929 | 0 | } |
4930 | 0 | else if (require_sleep) |
4931 | 0 | { |
4932 | | /* |
4933 | | * Acquire tuple lock to establish our priority for the tuple, or |
4934 | | * die trying. LockTuple will release us when we are next-in-line |
4935 | | * for the tuple. We must do this even if we are share-locking, |
4936 | | * but not if we already have a weaker lock on the tuple. |
4937 | | * |
4938 | | * If we are forced to "start over" below, we keep the tuple lock; |
4939 | | * this arranges that we stay at the head of the line while |
4940 | | * rechecking tuple state. |
4941 | | */ |
4942 | 0 | if (!skip_tuple_lock && |
4943 | 0 | !heap_acquire_tuplock(relation, tid, mode, wait_policy, |
4944 | 0 | &have_tuple_lock)) |
4945 | 0 | { |
4946 | | /* |
4947 | | * This can only happen if wait_policy is Skip and the lock |
4948 | | * couldn't be obtained. |
4949 | | */ |
4950 | 0 | result = TM_WouldBlock; |
4951 | | /* recovery code expects to have buffer lock held */ |
4952 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4953 | 0 | goto failed; |
4954 | 0 | } |
4955 | | |
4956 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
4957 | 0 | { |
4958 | 0 | MultiXactStatus status = get_mxact_status_for_lock(mode, false); |
4959 | | |
4960 | | /* We only ever lock tuples, never update them */ |
4961 | 0 | if (status >= MultiXactStatusNoKeyUpdate) |
4962 | 0 | elog(ERROR, "invalid lock mode in heap_lock_tuple"); |
4963 | | |
4964 | | /* wait for multixact to end, or die trying */ |
4965 | 0 | switch (wait_policy) |
4966 | 0 | { |
4967 | 0 | case LockWaitBlock: |
4968 | 0 | MultiXactIdWait((MultiXactId) xwait, status, infomask, |
4969 | 0 | relation, &tuple->t_self, XLTW_Lock, NULL); |
4970 | 0 | break; |
4971 | 0 | case LockWaitSkip: |
4972 | 0 | if (!ConditionalMultiXactIdWait((MultiXactId) xwait, |
4973 | 0 | status, infomask, relation, |
4974 | 0 | NULL, false)) |
4975 | 0 | { |
4976 | 0 | result = TM_WouldBlock; |
4977 | | /* recovery code expects to have buffer lock held */ |
4978 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
4979 | 0 | goto failed; |
4980 | 0 | } |
4981 | 0 | break; |
4982 | 0 | case LockWaitError: |
4983 | 0 | if (!ConditionalMultiXactIdWait((MultiXactId) xwait, |
4984 | 0 | status, infomask, relation, |
4985 | 0 | NULL, log_lock_failures)) |
4986 | 0 | ereport(ERROR, |
4987 | 0 | (errcode(ERRCODE_LOCK_NOT_AVAILABLE), |
4988 | 0 | errmsg("could not obtain lock on row in relation \"%s\"", |
4989 | 0 | RelationGetRelationName(relation)))); |
4990 | | |
4991 | 0 | break; |
4992 | 0 | } |
4993 | | |
4994 | | /* |
4995 | | * Of course, the multixact might not be done here: if we're |
4996 | | * requesting a light lock mode, other transactions with light |
4997 | | * locks could still be alive, as well as locks owned by our |
4998 | | * own xact or other subxacts of this backend. We need to |
4999 | | * preserve the surviving MultiXact members. Note that it |
5000 | | * isn't absolutely necessary in the latter case, but doing so |
5001 | | * is simpler. |
5002 | | */ |
5003 | 0 | } |
5004 | 0 | else |
5005 | 0 | { |
5006 | | /* wait for regular transaction to end, or die trying */ |
5007 | 0 | switch (wait_policy) |
5008 | 0 | { |
5009 | 0 | case LockWaitBlock: |
5010 | 0 | XactLockTableWait(xwait, relation, &tuple->t_self, |
5011 | 0 | XLTW_Lock); |
5012 | 0 | break; |
5013 | 0 | case LockWaitSkip: |
5014 | 0 | if (!ConditionalXactLockTableWait(xwait, false)) |
5015 | 0 | { |
5016 | 0 | result = TM_WouldBlock; |
5017 | | /* recovery code expects to have buffer lock held */ |
5018 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
5019 | 0 | goto failed; |
5020 | 0 | } |
5021 | 0 | break; |
5022 | 0 | case LockWaitError: |
5023 | 0 | if (!ConditionalXactLockTableWait(xwait, log_lock_failures)) |
5024 | 0 | ereport(ERROR, |
5025 | 0 | (errcode(ERRCODE_LOCK_NOT_AVAILABLE), |
5026 | 0 | errmsg("could not obtain lock on row in relation \"%s\"", |
5027 | 0 | RelationGetRelationName(relation)))); |
5028 | 0 | break; |
5029 | 0 | } |
5030 | 0 | } |
5031 | | |
5032 | | /* if there are updates, follow the update chain */ |
5033 | 0 | if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask)) |
5034 | 0 | { |
5035 | 0 | TM_Result res; |
5036 | |
|
5037 | 0 | res = heap_lock_updated_tuple(relation, tuple, &t_ctid, |
5038 | 0 | GetCurrentTransactionId(), |
5039 | 0 | mode); |
5040 | 0 | if (res != TM_Ok) |
5041 | 0 | { |
5042 | 0 | result = res; |
5043 | | /* recovery code expects to have buffer lock held */ |
5044 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
5045 | 0 | goto failed; |
5046 | 0 | } |
5047 | 0 | } |
5048 | | |
5049 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
5050 | | |
5051 | | /* |
5052 | | * xwait is done, but if xwait had just locked the tuple then some |
5053 | | * other xact could update this tuple before we get to this point. |
5054 | | * Check for xmax change, and start over if so. |
5055 | | */ |
5056 | 0 | if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) || |
5057 | 0 | !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data), |
5058 | 0 | xwait)) |
5059 | 0 | goto l3; |
5060 | | |
5061 | 0 | if (!(infomask & HEAP_XMAX_IS_MULTI)) |
5062 | 0 | { |
5063 | | /* |
5064 | | * Otherwise check if it committed or aborted. Note we cannot |
5065 | | * be here if the tuple was only locked by somebody who didn't |
5066 | | * conflict with us; that would have been handled above. So |
5067 | | * that transaction must necessarily be gone by now. But |
5068 | | * don't check for this in the multixact case, because some |
5069 | | * locker transactions might still be running. |
5070 | | */ |
5071 | 0 | UpdateXmaxHintBits(tuple->t_data, *buffer, xwait); |
5072 | 0 | } |
5073 | 0 | } |
5074 | | |
5075 | | /* By here, we're certain that we hold buffer exclusive lock again */ |
5076 | | |
5077 | | /* |
5078 | | * We may lock if previous xmax aborted, or if it committed but only |
5079 | | * locked the tuple without updating it; or if we didn't have to wait |
5080 | | * at all for whatever reason. |
5081 | | */ |
5082 | 0 | if (!require_sleep || |
5083 | 0 | (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) || |
5084 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) || |
5085 | 0 | HeapTupleHeaderIsOnlyLocked(tuple->t_data)) |
5086 | 0 | result = TM_Ok; |
5087 | 0 | else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)) |
5088 | 0 | result = TM_Updated; |
5089 | 0 | else |
5090 | 0 | result = TM_Deleted; |
5091 | 0 | } |
5092 | | |
5093 | 0 | failed: |
5094 | 0 | if (result != TM_Ok) |
5095 | 0 | { |
5096 | 0 | Assert(result == TM_SelfModified || result == TM_Updated || |
5097 | 0 | result == TM_Deleted || result == TM_WouldBlock); |
5098 | | |
5099 | | /* |
5100 | | * When we lock a tuple under LockWaitSkip semantics and fail with |
5101 | | * TM_WouldBlock above, it's possible for concurrent transactions to |
5102 | | * release the lock and set HEAP_XMAX_INVALID in the meantime. So |
5103 | | * this assert is slightly different from the equivalent one in |
5104 | | * heap_delete and heap_update. |
5105 | | */ |
5106 | 0 | Assert((result == TM_WouldBlock) || |
5107 | 0 | !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); |
5108 | 0 | Assert(result != TM_Updated || |
5109 | 0 | !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid)); |
5110 | 0 | tmfd->ctid = tuple->t_data->t_ctid; |
5111 | 0 | tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data); |
5112 | 0 | if (result == TM_SelfModified) |
5113 | 0 | tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data); |
5114 | 0 | else |
5115 | 0 | tmfd->cmax = InvalidCommandId; |
5116 | 0 | goto out_locked; |
5117 | 0 | } |
5118 | | |
5119 | | /* |
5120 | | * If we didn't pin the visibility map page and the page has become all |
5121 | | * visible while we were busy locking the buffer, or during some |
5122 | | * subsequent window during which we had it unlocked, we'll have to unlock |
5123 | | * and re-lock, to avoid holding the buffer lock across I/O. That's a bit |
5124 | | * unfortunate, especially since we'll now have to recheck whether the |
5125 | | * tuple has been locked or updated under us, but hopefully it won't |
5126 | | * happen very often. |
5127 | | */ |
5128 | 0 | if (vmbuffer == InvalidBuffer && PageIsAllVisible(page)) |
5129 | 0 | { |
5130 | 0 | LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); |
5131 | 0 | visibilitymap_pin(relation, block, &vmbuffer); |
5132 | 0 | LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE); |
5133 | 0 | goto l3; |
5134 | 0 | } |
5135 | | |
5136 | 0 | xmax = HeapTupleHeaderGetRawXmax(tuple->t_data); |
5137 | 0 | old_infomask = tuple->t_data->t_infomask; |
5138 | | |
5139 | | /* |
5140 | | * If this is the first possibly-multixact-able operation in the current |
5141 | | * transaction, set my per-backend OldestMemberMXactId setting. We can be |
5142 | | * certain that the transaction will never become a member of any older |
5143 | | * MultiXactIds than that. (We have to do this even if we end up just |
5144 | | * using our own TransactionId below, since some other backend could |
5145 | | * incorporate our XID into a MultiXact immediately afterwards.) |
5146 | | */ |
5147 | 0 | MultiXactIdSetOldestMember(); |
5148 | | |
5149 | | /* |
5150 | | * Compute the new xmax and infomask to store into the tuple. Note we do |
5151 | | * not modify the tuple just yet, because that would leave it in the wrong |
5152 | | * state if multixact.c elogs. |
5153 | | */ |
5154 | 0 | compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2, |
5155 | 0 | GetCurrentTransactionId(), mode, false, |
5156 | 0 | &xid, &new_infomask, &new_infomask2); |
5157 | |
|
5158 | 0 | START_CRIT_SECTION(); |
5159 | | |
5160 | | /* |
5161 | | * Store transaction information of xact locking the tuple. |
5162 | | * |
5163 | | * Note: Cmax is meaningless in this context, so don't set it; this avoids |
5164 | | * possibly generating a useless combo CID. Moreover, if we're locking a |
5165 | | * previously updated tuple, it's important to preserve the Cmax. |
5166 | | * |
5167 | | * Also reset the HOT UPDATE bit, but only if there's no update; otherwise |
5168 | | * we would break the HOT chain. |
5169 | | */ |
5170 | 0 | tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS; |
5171 | 0 | tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
5172 | 0 | tuple->t_data->t_infomask |= new_infomask; |
5173 | 0 | tuple->t_data->t_infomask2 |= new_infomask2; |
5174 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) |
5175 | 0 | HeapTupleHeaderClearHotUpdated(tuple->t_data); |
5176 | 0 | HeapTupleHeaderSetXmax(tuple->t_data, xid); |
5177 | | |
5178 | | /* |
5179 | | * Make sure there is no forward chain link in t_ctid. Note that in the |
5180 | | * cases where the tuple has been updated, we must not overwrite t_ctid, |
5181 | | * because it was set by the updater. Moreover, if the tuple has been |
5182 | | * updated, we need to follow the update chain to lock the new versions of |
5183 | | * the tuple as well. |
5184 | | */ |
5185 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask)) |
5186 | 0 | tuple->t_data->t_ctid = *tid; |
5187 | | |
5188 | | /* Clear only the all-frozen bit on visibility map if needed */ |
5189 | 0 | if (PageIsAllVisible(page) && |
5190 | 0 | visibilitymap_clear(relation, block, vmbuffer, |
5191 | 0 | VISIBILITYMAP_ALL_FROZEN)) |
5192 | 0 | cleared_all_frozen = true; |
5193 | | |
5194 | |
|
5195 | 0 | MarkBufferDirty(*buffer); |
5196 | | |
5197 | | /* |
5198 | | * XLOG stuff. You might think that we don't need an XLOG record because |
5199 | | * there is no state change worth restoring after a crash. You would be |
5200 | | * wrong however: we have just written either a TransactionId or a |
5201 | | * MultiXactId that may never have been seen on disk before, and we need |
5202 | | * to make sure that there are XLOG entries covering those ID numbers. |
5203 | | * Else the same IDs might be re-used after a crash, which would be |
5204 | | * disastrous if this page made it to disk before the crash. Essentially |
5205 | | * we have to enforce the WAL log-before-data rule even in this case. |
5206 | | * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG |
5207 | | * entries for everything anyway.) |
5208 | | */ |
5209 | 0 | if (RelationNeedsWAL(relation)) |
5210 | 0 | { |
5211 | 0 | xl_heap_lock xlrec; |
5212 | 0 | XLogRecPtr recptr; |
5213 | |
|
5214 | 0 | XLogBeginInsert(); |
5215 | 0 | XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD); |
5216 | |
|
5217 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); |
5218 | 0 | xlrec.xmax = xid; |
5219 | 0 | xlrec.infobits_set = compute_infobits(new_infomask, |
5220 | 0 | tuple->t_data->t_infomask2); |
5221 | 0 | xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; |
5222 | 0 | XLogRegisterData(&xlrec, SizeOfHeapLock); |
5223 | | |
5224 | | /* we don't decode row locks atm, so no need to log the origin */ |
5225 | |
|
5226 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); |
5227 | |
|
5228 | 0 | PageSetLSN(page, recptr); |
5229 | 0 | } |
5230 | |
|
5231 | 0 | END_CRIT_SECTION(); |
5232 | |
|
5233 | 0 | result = TM_Ok; |
5234 | |
|
5235 | 0 | out_locked: |
5236 | 0 | LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); |
5237 | |
|
5238 | 0 | out_unlocked: |
5239 | 0 | if (BufferIsValid(vmbuffer)) |
5240 | 0 | ReleaseBuffer(vmbuffer); |
5241 | | |
5242 | | /* |
5243 | | * Don't update the visibility map here. Locking a tuple doesn't change |
5244 | | * visibility info. |
5245 | | */ |
5246 | | |
5247 | | /* |
5248 | | * Now that we have successfully marked the tuple as locked, we can |
5249 | | * release the lmgr tuple lock, if we had it. |
5250 | | */ |
5251 | 0 | if (have_tuple_lock) |
5252 | 0 | UnlockTupleTuplock(relation, tid, mode); |
5253 | |
|
5254 | 0 | return result; |
5255 | 0 | } |
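A hypothetical caller sketch (not part of heapam.c) of the interface documented above heap_lock_tuple(): the target TID goes in via tuple->t_self, and the pinned buffer returned in *buffer must be released by the caller.

static TM_Result
example_lock_row_for_share(Relation rel, ItemPointer tid, CommandId cid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	TM_FailureData tmfd;
	TM_Result	res;

	tuple.t_self = *tid;
	res = heap_lock_tuple(rel, &tuple, cid, LockTupleShare, LockWaitBlock,
						  true /* follow_updates */ , &buffer, &tmfd);

	/* heap_lock_tuple always leaves a pin on the tuple's buffer */
	ReleaseBuffer(buffer);

	return res;
}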
5256 | | |
5257 | | /* |
5258 | | * Acquire heavyweight lock on the given tuple, in preparation for acquiring |
5259 | | * its normal, Xmax-based tuple lock. |
5260 | | * |
5261 | | * have_tuple_lock is an input and output parameter: on input, it indicates |
5262 | | * whether the lock has previously been acquired (and this function does |
5263 | | * nothing in that case). If this function returns success, have_tuple_lock |
5264 | | * has been flipped to true. |
5265 | | * |
5266 | | * Returns false if it was unable to obtain the lock; this can only happen if |
5267 | | * wait_policy is Skip. |
5268 | | */ |
5269 | | static bool |
5270 | | heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, |
5271 | | LockWaitPolicy wait_policy, bool *have_tuple_lock) |
5272 | 0 | { |
5273 | 0 | if (*have_tuple_lock) |
5274 | 0 | return true; |
5275 | | |
5276 | 0 | switch (wait_policy) |
5277 | 0 | { |
5278 | 0 | case LockWaitBlock: |
5279 | 0 | LockTupleTuplock(relation, tid, mode); |
5280 | 0 | break; |
5281 | | |
5282 | 0 | case LockWaitSkip: |
5283 | 0 | if (!ConditionalLockTupleTuplock(relation, tid, mode, false)) |
5284 | 0 | return false; |
5285 | 0 | break; |
5286 | | |
5287 | 0 | case LockWaitError: |
5288 | 0 | if (!ConditionalLockTupleTuplock(relation, tid, mode, log_lock_failures)) |
5289 | 0 | ereport(ERROR, |
5290 | 0 | (errcode(ERRCODE_LOCK_NOT_AVAILABLE), |
5291 | 0 | errmsg("could not obtain lock on row in relation \"%s\"", |
5292 | 0 | RelationGetRelationName(relation)))); |
5293 | 0 | break; |
5294 | 0 | } |
5295 | 0 | *have_tuple_lock = true; |
5296 | |
|
5297 | 0 | return true; |
5298 | 0 | } |
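A hypothetical sketch of the calling pattern described in the comment above: have_tuple_lock starts out false, is flipped by the first successful acquisition, and tells the caller whether UnlockTupleTuplock() is needed afterwards.

static bool
example_acquire_and_release(Relation rel, ItemPointer tid)
{
	bool		have_tuple_lock = false;

	/* a false return is only possible with LockWaitSkip */
	if (!heap_acquire_tuplock(rel, tid, LockTupleExclusive, LockWaitSkip,
							  &have_tuple_lock))
		return false;

	/* ... recheck tuple state here, as heap_lock_tuple() does ... */

	if (have_tuple_lock)
		UnlockTupleTuplock(rel, tid, LockTupleExclusive);

	return true;
}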
5299 | | |
5300 | | /* |
5301 | | * Given an original set of Xmax and infomask, and a transaction (identified by |
5302 | | * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and |
5303 | | * corresponding infomasks to use on the tuple. |
5304 | | * |
5305 | | * Note that this might have side effects such as creating a new MultiXactId. |
5306 | | * |
5307 | | * Most callers will have called HeapTupleSatisfiesUpdate before this function; |
5308 | | * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId |
5309 | | * but it was not running anymore. There is a race condition, which is that the |
5310 | | * MultiXactId may have finished since then, but that uncommon case is handled |
5311 | | * either here, or within MultiXactIdExpand. |
5312 | | * |
5313 | | * There is a similar race condition possible when the old xmax was a regular |
5314 | | * TransactionId. We test TransactionIdIsInProgress again just to narrow the |
5315 | | * window, but it's still possible to end up creating an unnecessary |
5316 | | * MultiXactId. Fortunately this is harmless. |
5317 | | */ |
5318 | | static void |
5319 | | compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, |
5320 | | uint16 old_infomask2, TransactionId add_to_xmax, |
5321 | | LockTupleMode mode, bool is_update, |
5322 | | TransactionId *result_xmax, uint16 *result_infomask, |
5323 | | uint16 *result_infomask2) |
5324 | 0 | { |
5325 | 0 | TransactionId new_xmax; |
5326 | 0 | uint16 new_infomask, |
5327 | 0 | new_infomask2; |
5328 | |
|
5329 | 0 | Assert(TransactionIdIsCurrentTransactionId(add_to_xmax)); |
5330 | |
|
5331 | 0 | l5: |
5332 | 0 | new_infomask = 0; |
5333 | 0 | new_infomask2 = 0; |
5334 | 0 | if (old_infomask & HEAP_XMAX_INVALID) |
5335 | 0 | { |
5336 | | /* |
5337 | | * No previous locker; we just insert our own TransactionId. |
5338 | | * |
5339 | | * Note that it's critical that this case be the first one checked, |
5340 | | * because there are several blocks below that come back to this one |
5341 | | * to implement certain optimizations; old_infomask might contain |
5342 | | * other dirty bits in those cases, but we don't really care. |
5343 | | */ |
5344 | 0 | if (is_update) |
5345 | 0 | { |
5346 | 0 | new_xmax = add_to_xmax; |
5347 | 0 | if (mode == LockTupleExclusive) |
5348 | 0 | new_infomask2 |= HEAP_KEYS_UPDATED; |
5349 | 0 | } |
5350 | 0 | else |
5351 | 0 | { |
5352 | 0 | new_infomask |= HEAP_XMAX_LOCK_ONLY; |
5353 | 0 | switch (mode) |
5354 | 0 | { |
5355 | 0 | case LockTupleKeyShare: |
5356 | 0 | new_xmax = add_to_xmax; |
5357 | 0 | new_infomask |= HEAP_XMAX_KEYSHR_LOCK; |
5358 | 0 | break; |
5359 | 0 | case LockTupleShare: |
5360 | 0 | new_xmax = add_to_xmax; |
5361 | 0 | new_infomask |= HEAP_XMAX_SHR_LOCK; |
5362 | 0 | break; |
5363 | 0 | case LockTupleNoKeyExclusive: |
5364 | 0 | new_xmax = add_to_xmax; |
5365 | 0 | new_infomask |= HEAP_XMAX_EXCL_LOCK; |
5366 | 0 | break; |
5367 | 0 | case LockTupleExclusive: |
5368 | 0 | new_xmax = add_to_xmax; |
5369 | 0 | new_infomask |= HEAP_XMAX_EXCL_LOCK; |
5370 | 0 | new_infomask2 |= HEAP_KEYS_UPDATED; |
5371 | 0 | break; |
5372 | 0 | default: |
5373 | 0 | new_xmax = InvalidTransactionId; /* silence compiler */ |
5374 | 0 | elog(ERROR, "invalid lock mode"); |
5375 | 0 | } |
5376 | 0 | } |
5377 | 0 | } |
5378 | 0 | else if (old_infomask & HEAP_XMAX_IS_MULTI) |
5379 | 0 | { |
5380 | 0 | MultiXactStatus new_status; |
5381 | | |
5382 | | /* |
5383 | | * Currently we don't allow XMAX_COMMITTED to be set for multis, so |
5384 | | * cross-check. |
5385 | | */ |
5386 | 0 | Assert(!(old_infomask & HEAP_XMAX_COMMITTED)); |
5387 | | |
5388 | | /* |
5389 | | * A multixact together with LOCK_ONLY set but neither lock bit set |
5390 | | * (i.e. a pg_upgraded share locked tuple) cannot possibly be running |
5391 | | * anymore. This check is critical for databases upgraded by |
5392 | | * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume |
5393 | | * that such multis are never passed. |
5394 | | */ |
5395 | 0 | if (HEAP_LOCKED_UPGRADED(old_infomask)) |
5396 | 0 | { |
5397 | 0 | old_infomask &= ~HEAP_XMAX_IS_MULTI; |
5398 | 0 | old_infomask |= HEAP_XMAX_INVALID; |
5399 | 0 | goto l5; |
5400 | 0 | } |
5401 | | |
5402 | | /* |
5403 | | * If the XMAX is already a MultiXactId, then we need to expand it to |
5404 | | * include add_to_xmax; but if all the members were lockers and are |
5405 | | * all gone, we can do away with the IS_MULTI bit and just set |
5406 | | * add_to_xmax as the only locker/updater. If all lockers are gone |
5407 | | * and we have an updater that aborted, we can also do without a |
5408 | | * multi. |
5409 | | * |
5410 | | * The cost of doing GetMultiXactIdMembers would be paid by |
5411 | | * MultiXactIdExpand if we weren't to do this, so this check is not |
5412 | | * incurring extra work anyhow. |
5413 | | */ |
5414 | 0 | if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))) |
5415 | 0 | { |
5416 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) || |
5417 | 0 | !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax, |
5418 | 0 | old_infomask))) |
5419 | 0 | { |
5420 | | /* |
5421 | | * Reset these bits and restart; otherwise fall through to |
5422 | | * create a new multi below. |
5423 | | */ |
5424 | 0 | old_infomask &= ~HEAP_XMAX_IS_MULTI; |
5425 | 0 | old_infomask |= HEAP_XMAX_INVALID; |
5426 | 0 | goto l5; |
5427 | 0 | } |
5428 | 0 | } |
5429 | | |
5430 | 0 | new_status = get_mxact_status_for_lock(mode, is_update); |
5431 | |
|
5432 | 0 | new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax, |
5433 | 0 | new_status); |
5434 | 0 | GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); |
5435 | 0 | } |
5436 | 0 | else if (old_infomask & HEAP_XMAX_COMMITTED) |
5437 | 0 | { |
5438 | | /* |
5439 | | * It's a committed update, so we need to preserve it as the updater of |
5440 | | * the tuple. |
5441 | | */ |
5442 | 0 | MultiXactStatus status; |
5443 | 0 | MultiXactStatus new_status; |
5444 | |
|
5445 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5446 | 0 | status = MultiXactStatusUpdate; |
5447 | 0 | else |
5448 | 0 | status = MultiXactStatusNoKeyUpdate; |
5449 | |
|
5450 | 0 | new_status = get_mxact_status_for_lock(mode, is_update); |
5451 | | |
5452 | | /* |
5453 | | * since it's not running, it's obviously impossible for the old |
5454 | | * updater to be identical to the current one, so we need not check |
5455 | | * for that case as we do in the block above. |
5456 | | */ |
5457 | 0 | new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); |
5458 | 0 | GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); |
5459 | 0 | } |
5460 | 0 | else if (TransactionIdIsInProgress(xmax)) |
5461 | 0 | { |
5462 | | /* |
5463 | | * If the XMAX is a valid, in-progress TransactionId, then we need to |
5464 | | * create a new MultiXactId that includes both the old locker or |
5465 | | * updater and our own TransactionId. |
5466 | | */ |
5467 | 0 | MultiXactStatus new_status; |
5468 | 0 | MultiXactStatus old_status; |
5469 | 0 | LockTupleMode old_mode; |
5470 | |
|
5471 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) |
5472 | 0 | { |
5473 | 0 | if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) |
5474 | 0 | old_status = MultiXactStatusForKeyShare; |
5475 | 0 | else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) |
5476 | 0 | old_status = MultiXactStatusForShare; |
5477 | 0 | else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) |
5478 | 0 | { |
5479 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5480 | 0 | old_status = MultiXactStatusForUpdate; |
5481 | 0 | else |
5482 | 0 | old_status = MultiXactStatusForNoKeyUpdate; |
5483 | 0 | } |
5484 | 0 | else |
5485 | 0 | { |
5486 | | /* |
5487 | | * LOCK_ONLY can be present alone only when a page has been |
5488 | | * upgraded by pg_upgrade. But in that case, |
5489 | | * TransactionIdIsInProgress() should have returned false. We |
5490 | | * assume it's no longer locked in this case. |
5491 | | */ |
5492 | 0 | elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); |
5493 | 0 | old_infomask |= HEAP_XMAX_INVALID; |
5494 | 0 | old_infomask &= ~HEAP_XMAX_LOCK_ONLY; |
5495 | 0 | goto l5; |
5496 | 0 | } |
5497 | 0 | } |
5498 | 0 | else |
5499 | 0 | { |
5500 | | /* it's an update, but which kind? */ |
5501 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5502 | 0 | old_status = MultiXactStatusUpdate; |
5503 | 0 | else |
5504 | 0 | old_status = MultiXactStatusNoKeyUpdate; |
5505 | 0 | } |
5506 | | |
5507 | 0 | old_mode = TUPLOCK_from_mxstatus(old_status); |
5508 | | |
5509 | | /* |
5510 | | * If the lock to be acquired is for the same TransactionId as the |
5511 | | * existing lock, there's an optimization possible: consider only the |
5512 | | * strongest of both locks as the only one present, and restart. |
5513 | | */ |
5514 | 0 | if (xmax == add_to_xmax) |
5515 | 0 | { |
5516 | | /* |
5517 | | * Note that it's not possible for the original tuple to be |
5518 | | * updated: we wouldn't be here because the tuple would have been |
5519 | | * invisible and we wouldn't try to update it. As a subtlety, |
5520 | | * this code can also run when traversing an update chain to lock |
5521 | | * future versions of a tuple. But we wouldn't be here either, |
5522 | | * because the add_to_xmax would be different from the original |
5523 | | * updater. |
5524 | | */ |
5525 | 0 | Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); |
5526 | | |
5527 | | /* acquire the strongest of both */ |
5528 | 0 | if (mode < old_mode) |
5529 | 0 | mode = old_mode; |
5530 | | /* mustn't touch is_update */ |
5531 | |
|
5532 | 0 | old_infomask |= HEAP_XMAX_INVALID; |
5533 | 0 | goto l5; |
5534 | 0 | } |
5535 | | |
5536 | | /* otherwise, just fall back to creating a new multixact */ |
5537 | 0 | new_status = get_mxact_status_for_lock(mode, is_update); |
5538 | 0 | new_xmax = MultiXactIdCreate(xmax, old_status, |
5539 | 0 | add_to_xmax, new_status); |
5540 | 0 | GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); |
5541 | 0 | } |
5542 | 0 | else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && |
5543 | 0 | TransactionIdDidCommit(xmax)) |
5544 | 0 | { |
5545 | | /* |
5546 | | * It's a committed update, so we need to preserve it as the updater of |
5547 | | * the tuple. |
5548 | | */ |
5549 | 0 | MultiXactStatus status; |
5550 | 0 | MultiXactStatus new_status; |
5551 | |
|
5552 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5553 | 0 | status = MultiXactStatusUpdate; |
5554 | 0 | else |
5555 | 0 | status = MultiXactStatusNoKeyUpdate; |
5556 | |
|
5557 | 0 | new_status = get_mxact_status_for_lock(mode, is_update); |
5558 | | |
5559 | | /* |
5560 | | * since it's not running, it's obviously impossible for the old |
5561 | | * updater to be identical to the current one, so we need not check |
5562 | | * for that case as we do in the block above. |
5563 | | */ |
5564 | 0 | new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); |
5565 | 0 | GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); |
5566 | 0 | } |
5567 | 0 | else |
5568 | 0 | { |
5569 | | /* |
5570 | | * Can get here iff the locking/updating transaction was running when |
5571 | | * the infomask was extracted from the tuple, but finished before |
5572 | | * TransactionIdIsInProgress got to run. Deal with it as if there was |
5573 | | * no locker at all in the first place. |
5574 | | */ |
5575 | 0 | old_infomask |= HEAP_XMAX_INVALID; |
5576 | 0 | goto l5; |
5577 | 0 | } |
5578 | | |
5579 | 0 | *result_infomask = new_infomask; |
5580 | 0 | *result_infomask2 = new_infomask2; |
5581 | 0 | *result_xmax = new_xmax; |
5582 | 0 | } |
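/*
 * Illustrative sketch (hypothetical helper, not an existing routine): one
 * way the "create a new MultiXactId" branch above is exercised.  Suppose
 * another, still-running transaction holds a FOR SHARE tuple lock (a plain,
 * lock-only XID in xmax) and we now want to lock the same tuple FOR UPDATE.
 */
static void
example_escalate_to_multixact(HeapTuple tup, TransactionId locker_xid)
{
	TransactionId new_xmax;
	uint16		new_infomask;
	uint16		new_infomask2;

	compute_new_xmax_infomask(locker_xid,
							  tup->t_data->t_infomask,
							  tup->t_data->t_infomask2,
							  GetCurrentTransactionId(),
							  LockTupleExclusive,
							  false,	/* is_update */
							  &new_xmax, &new_infomask, &new_infomask2);

	/*
	 * Because locker_xid is a different, in-progress transaction, the code
	 * above reaches MultiXactIdCreate(locker_xid, MultiXactStatusForShare,
	 * <our xid>, MultiXactStatusForUpdate), and the returned infomask marks
	 * xmax as a MultiXactId.
	 */
}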
5583 | | |
5584 | | /* |
5585 | | * Subroutine for heap_lock_updated_tuple_rec. |
5586 | | * |
5587 | | * Given a hypothetical multixact status held by the transaction identified |
5588 | | * with the given xid, does the current transaction need to wait, fail, or can |
5589 | | * it continue if it wanted to acquire a lock of the given mode? "needwait" |
5590 | | * is set to true if waiting is necessary; if it can continue, then TM_Ok is |
5591 | | * returned. If the lock is already held by the current transaction, return |
5592 | | * TM_SelfModified. In case of a conflict with another transaction, a |
5593 | | * different HeapTupleSatisfiesUpdate return code is returned. |
5594 | | * |
5595 | | * The held status is said to be hypothetical because it might correspond to a |
5596 | | * lock held by a single Xid, i.e. not a real MultiXactId; we express it this |
5597 | | * way for simplicity of API. |
5598 | | */ |
5599 | | static TM_Result |
5600 | | test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, |
5601 | | LockTupleMode mode, HeapTuple tup, |
5602 | | bool *needwait) |
5603 | 0 | { |
5604 | 0 | MultiXactStatus wantedstatus; |
5605 | |
|
5606 | 0 | *needwait = false; |
5607 | 0 | wantedstatus = get_mxact_status_for_lock(mode, false); |
5608 | | |
5609 | | /* |
5610 | | * Note: we *must* check TransactionIdIsInProgress before |
5611 | | * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c |
5612 | | * for an explanation. |
5613 | | */ |
5614 | 0 | if (TransactionIdIsCurrentTransactionId(xid)) |
5615 | 0 | { |
5616 | | /* |
5617 | | * The tuple has already been locked by our own transaction. This is |
5618 | | * very rare but can happen if multiple transactions are trying to |
5619 | | * lock an ancient version of the same tuple. |
5620 | | */ |
5621 | 0 | return TM_SelfModified; |
5622 | 0 | } |
5623 | 0 | else if (TransactionIdIsInProgress(xid)) |
5624 | 0 | { |
5625 | | /* |
5626 | | * If the locking transaction is running, what we do depends on |
5627 | | * whether the lock modes conflict: if they do, then we must wait for |
5628 | | * it to finish; otherwise we can fall through to lock this tuple |
5629 | | * version without waiting. |
5630 | | */ |
5631 | 0 | if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), |
5632 | 0 | LOCKMODE_from_mxstatus(wantedstatus))) |
5633 | 0 | { |
5634 | 0 | *needwait = true; |
5635 | 0 | } |
5636 | | |
5637 | | /* |
5638 | | * If we set needwait above, then this value doesn't matter; |
5639 | | * otherwise, this value signals to caller that it's okay to proceed. |
5640 | | */ |
5641 | 0 | return TM_Ok; |
5642 | 0 | } |
5643 | 0 | else if (TransactionIdDidAbort(xid)) |
5644 | 0 | return TM_Ok; |
5645 | 0 | else if (TransactionIdDidCommit(xid)) |
5646 | 0 | { |
5647 | | /* |
5648 | | * The other transaction committed. If it was only a locker, then the |
5649 | | * lock is completely gone now and we can return success; but if it |
5650 | | * was an update, then what we do depends on whether the two lock |
5651 | | * modes conflict. If they conflict, then we must report error to |
5652 | | * caller. But if they don't, we can fall through to allow the current |
5653 | | * transaction to lock the tuple. |
5654 | | * |
5655 | | * Note: we worry about ISUPDATE here because as soon as a |
5656 | | * transaction ends, all its locks are gone and meaningless, and |
5657 | | * thus we can ignore them; whereas its updates persist. In the |
5658 | | * TransactionIdIsInProgress case, above, we don't need to check |
5659 | | * because we know the lock is still "alive" and thus a conflict must |
5660 | | * always be checked. |
5661 | | */ |
5662 | 0 | if (!ISUPDATE_from_mxstatus(status)) |
5663 | 0 | return TM_Ok; |
5664 | | |
5665 | 0 | if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), |
5666 | 0 | LOCKMODE_from_mxstatus(wantedstatus))) |
5667 | 0 | { |
5668 | | /* bummer */ |
5669 | 0 | if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid)) |
5670 | 0 | return TM_Updated; |
5671 | 0 | else |
5672 | 0 | return TM_Deleted; |
5673 | 0 | } |
5674 | | |
5675 | 0 | return TM_Ok; |
5676 | 0 | } |
5677 | | |
5678 | | /* Not in progress, not aborted, not committed -- must have crashed */ |
5679 | 0 | return TM_Ok; |
5680 | 0 | } |
5681 | | |
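/*
 * Worked example (illustrative only): suppose the old multixact member had
 * committed after updating the row's key, i.e. its status was
 * MultiXactStatusUpdate, and we want LockTupleKeyShare.  Those modes
 * conflict, so test_lockmode_for_conflict() reports TM_Updated (or
 * TM_Deleted when the tuple has no successor version).  Had the member
 * merely held MultiXactStatusForShare, its lock died with the committing
 * transaction and the result would be TM_Ok.
 */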
5682 | | |
5683 | | /* |
5684 | | * Recursive part of heap_lock_updated_tuple |
5685 | | * |
5686 | | * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given |
5687 | | * xid with the given mode; if this tuple is updated, recurse to lock the new |
5688 | | * version as well. |
5689 | | */ |
5690 | | static TM_Result |
5691 | | heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid, |
5692 | | LockTupleMode mode) |
5693 | 0 | { |
5694 | 0 | TM_Result result; |
5695 | 0 | ItemPointerData tupid; |
5696 | 0 | HeapTupleData mytup; |
5697 | 0 | Buffer buf; |
5698 | 0 | uint16 new_infomask, |
5699 | 0 | new_infomask2, |
5700 | 0 | old_infomask, |
5701 | 0 | old_infomask2; |
5702 | 0 | TransactionId xmax, |
5703 | 0 | new_xmax; |
5704 | 0 | TransactionId priorXmax = InvalidTransactionId; |
5705 | 0 | bool cleared_all_frozen = false; |
5706 | 0 | bool pinned_desired_page; |
5707 | 0 | Buffer vmbuffer = InvalidBuffer; |
5708 | 0 | BlockNumber block; |
5709 | |
|
5710 | 0 | ItemPointerCopy(tid, &tupid); |
5711 | |
|
5712 | 0 | for (;;) |
5713 | 0 | { |
5714 | 0 | new_infomask = 0; |
5715 | 0 | new_xmax = InvalidTransactionId; |
5716 | 0 | block = ItemPointerGetBlockNumber(&tupid); |
5717 | 0 | ItemPointerCopy(&tupid, &(mytup.t_self)); |
5718 | |
|
5719 | 0 | if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false)) |
5720 | 0 | { |
5721 | | /* |
5722 | | * if we fail to find the updated version of the tuple, it's |
5723 | | * because it was vacuumed/pruned away after its creator |
5724 | | * transaction aborted. So behave as if we got to the end of the |
5725 | | * chain, and there's no further tuple to lock: return success to |
5726 | | * caller. |
5727 | | */ |
5728 | 0 | result = TM_Ok; |
5729 | 0 | goto out_unlocked; |
5730 | 0 | } |
5731 | | |
5732 | 0 | l4: |
5733 | 0 | CHECK_FOR_INTERRUPTS(); |
5734 | | |
5735 | | /* |
5736 | | * Before locking the buffer, pin the visibility map page if it |
5737 | | * appears to be necessary. Since we haven't got the lock yet, |
5738 | | * someone else might be in the middle of changing this, so we'll need |
5739 | | * to recheck after we have the lock. |
5740 | | */ |
5741 | 0 | if (PageIsAllVisible(BufferGetPage(buf))) |
5742 | 0 | { |
5743 | 0 | visibilitymap_pin(rel, block, &vmbuffer); |
5744 | 0 | pinned_desired_page = true; |
5745 | 0 | } |
5746 | 0 | else |
5747 | 0 | pinned_desired_page = false; |
5748 | |
|
5749 | 0 | LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
5750 | | |
5751 | | /* |
5752 | | * If we didn't pin the visibility map page and the page has become |
5753 | | * all visible while we were busy locking the buffer, we'll have to |
5754 | | * unlock and re-lock, to avoid holding the buffer lock across I/O. |
5755 | | * That's a bit unfortunate, but hopefully shouldn't happen often. |
5756 | | * |
5757 | | * Note: in some paths through this function, we will reach here |
5758 | | * holding a pin on a vm page that may or may not be the one matching |
5759 | | * this page. If this page isn't all-visible, we won't use the vm |
5760 | | * page, but we hold onto such a pin till the end of the function. |
5761 | | */ |
5762 | 0 | if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf))) |
5763 | 0 | { |
5764 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
5765 | 0 | visibilitymap_pin(rel, block, &vmbuffer); |
5766 | 0 | LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
5767 | 0 | } |
5768 | | |
5769 | | /* |
5770 | | * Check the tuple XMIN against prior XMAX, if any. If we reached the |
5771 | | * end of the chain, we're done, so return success. |
5772 | | */ |
5773 | 0 | if (TransactionIdIsValid(priorXmax) && |
5774 | 0 | !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data), |
5775 | 0 | priorXmax)) |
5776 | 0 | { |
5777 | 0 | result = TM_Ok; |
5778 | 0 | goto out_locked; |
5779 | 0 | } |
5780 | | |
5781 | | /* |
5782 | | * Also check Xmin: if this tuple was created by an aborted |
5783 | | * (sub)transaction, then we already locked the last live one in the |
5784 | | * chain, thus we're done, so return success. |
5785 | | */ |
5786 | 0 | if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data))) |
5787 | 0 | { |
5788 | 0 | result = TM_Ok; |
5789 | 0 | goto out_locked; |
5790 | 0 | } |
5791 | | |
5792 | 0 | old_infomask = mytup.t_data->t_infomask; |
5793 | 0 | old_infomask2 = mytup.t_data->t_infomask2; |
5794 | 0 | xmax = HeapTupleHeaderGetRawXmax(mytup.t_data); |
5795 | | |
5796 | | /* |
5797 | | * If this tuple version has been updated or locked by some concurrent |
5798 | | * transaction(s), what we do depends on whether our lock mode |
5799 | | * conflicts with what those other transactions hold, and also on the |
5800 | | * status of them. |
5801 | | */ |
5802 | 0 | if (!(old_infomask & HEAP_XMAX_INVALID)) |
5803 | 0 | { |
5804 | 0 | TransactionId rawxmax; |
5805 | 0 | bool needwait; |
5806 | |
|
5807 | 0 | rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data); |
5808 | 0 | if (old_infomask & HEAP_XMAX_IS_MULTI) |
5809 | 0 | { |
5810 | 0 | int nmembers; |
5811 | 0 | int i; |
5812 | 0 | MultiXactMember *members; |
5813 | | |
5814 | | /* |
5815 | | * We don't need a test for pg_upgrade'd tuples: this is only |
5816 | | * applied to tuples after the first in an update chain. Said |
5817 | | * first tuple in the chain may well be locked-in-9.2-and- |
5818 | | * pg_upgraded, but that one was already locked by our caller, |
5819 | | * not us; and any subsequent ones cannot be because our |
5820 | | * caller must necessarily have obtained a snapshot later than |
5821 | | * the pg_upgrade itself. |
5822 | | */ |
5823 | 0 | Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask)); |
5824 | |
|
5825 | 0 | nmembers = GetMultiXactIdMembers(rawxmax, &members, false, |
5826 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)); |
5827 | 0 | for (i = 0; i < nmembers; i++) |
5828 | 0 | { |
5829 | 0 | result = test_lockmode_for_conflict(members[i].status, |
5830 | 0 | members[i].xid, |
5831 | 0 | mode, |
5832 | 0 | &mytup, |
5833 | 0 | &needwait); |
5834 | | |
5835 | | /* |
5836 | | * If the tuple was already locked by ourselves in a |
5837 | | * previous iteration of this (say heap_lock_tuple was |
5838 | | * forced to restart the locking loop because of a change |
5839 | | * in xmax), then we hold the lock already on this tuple |
5840 | | * version and we don't need to do anything; and this is |
5841 | | * not an error condition either. We just need to skip |
5842 | | * this tuple and continue locking the next version in the |
5843 | | * update chain. |
5844 | | */ |
5845 | 0 | if (result == TM_SelfModified) |
5846 | 0 | { |
5847 | 0 | pfree(members); |
5848 | 0 | goto next; |
5849 | 0 | } |
5850 | | |
5851 | 0 | if (needwait) |
5852 | 0 | { |
5853 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
5854 | 0 | XactLockTableWait(members[i].xid, rel, |
5855 | 0 | &mytup.t_self, |
5856 | 0 | XLTW_LockUpdated); |
5857 | 0 | pfree(members); |
5858 | 0 | goto l4; |
5859 | 0 | } |
5860 | 0 | if (result != TM_Ok) |
5861 | 0 | { |
5862 | 0 | pfree(members); |
5863 | 0 | goto out_locked; |
5864 | 0 | } |
5865 | 0 | } |
5866 | 0 | if (members) |
5867 | 0 | pfree(members); |
5868 | 0 | } |
5869 | 0 | else |
5870 | 0 | { |
5871 | 0 | MultiXactStatus status; |
5872 | | |
5873 | | /* |
5874 | | * For a non-multi Xmax, we first need to compute the |
5875 | | * corresponding MultiXactStatus by using the infomask bits. |
5876 | | */ |
5877 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)) |
5878 | 0 | { |
5879 | 0 | if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask)) |
5880 | 0 | status = MultiXactStatusForKeyShare; |
5881 | 0 | else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask)) |
5882 | 0 | status = MultiXactStatusForShare; |
5883 | 0 | else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask)) |
5884 | 0 | { |
5885 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5886 | 0 | status = MultiXactStatusForUpdate; |
5887 | 0 | else |
5888 | 0 | status = MultiXactStatusForNoKeyUpdate; |
5889 | 0 | } |
5890 | 0 | else |
5891 | 0 | { |
5892 | | /* |
5893 | | * LOCK_ONLY present alone (a pg_upgraded tuple marked |
5894 | | * as share-locked in the old cluster) shouldn't be |
5895 | | * seen in the middle of an update chain. |
5896 | | */ |
5897 | 0 | elog(ERROR, "invalid lock status in tuple"); |
5898 | 0 | } |
5899 | 0 | } |
5900 | 0 | else |
5901 | 0 | { |
5902 | | /* it's an update, but which kind? */ |
5903 | 0 | if (old_infomask2 & HEAP_KEYS_UPDATED) |
5904 | 0 | status = MultiXactStatusUpdate; |
5905 | 0 | else |
5906 | 0 | status = MultiXactStatusNoKeyUpdate; |
5907 | 0 | } |
5908 | | |
5909 | 0 | result = test_lockmode_for_conflict(status, rawxmax, mode, |
5910 | 0 | &mytup, &needwait); |
5911 | | |
5912 | | /* |
5913 | | * If the tuple was already locked by ourselves in a previous |
5914 | | * iteration of this (say heap_lock_tuple was forced to |
5915 | | * restart the locking loop because of a change in xmax), then |
5916 | | * we hold the lock already on this tuple version and we don't |
5917 | | * need to do anything; and this is not an error condition |
5918 | | * either. We just need to skip this tuple and continue |
5919 | | * locking the next version in the update chain. |
5920 | | */ |
5921 | 0 | if (result == TM_SelfModified) |
5922 | 0 | goto next; |
5923 | | |
5924 | 0 | if (needwait) |
5925 | 0 | { |
5926 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
5927 | 0 | XactLockTableWait(rawxmax, rel, &mytup.t_self, |
5928 | 0 | XLTW_LockUpdated); |
5929 | 0 | goto l4; |
5930 | 0 | } |
5931 | 0 | if (result != TM_Ok) |
5932 | 0 | { |
5933 | 0 | goto out_locked; |
5934 | 0 | } |
5935 | 0 | } |
5936 | 0 | } |
5937 | | |
5938 | | /* compute the new Xmax and infomask values for the tuple ... */ |
5939 | 0 | compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2, |
5940 | 0 | xid, mode, false, |
5941 | 0 | &new_xmax, &new_infomask, &new_infomask2); |
5942 | |
|
5943 | 0 | if (PageIsAllVisible(BufferGetPage(buf)) && |
5944 | 0 | visibilitymap_clear(rel, block, vmbuffer, |
5945 | 0 | VISIBILITYMAP_ALL_FROZEN)) |
5946 | 0 | cleared_all_frozen = true; |
5947 | |
|
5948 | 0 | START_CRIT_SECTION(); |
5949 | | |
5950 | | /* ... and set them */ |
5951 | 0 | HeapTupleHeaderSetXmax(mytup.t_data, new_xmax); |
5952 | 0 | mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS; |
5953 | 0 | mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
5954 | 0 | mytup.t_data->t_infomask |= new_infomask; |
5955 | 0 | mytup.t_data->t_infomask2 |= new_infomask2; |
5956 | |
|
5957 | 0 | MarkBufferDirty(buf); |
5958 | | |
5959 | | /* XLOG stuff */ |
5960 | 0 | if (RelationNeedsWAL(rel)) |
5961 | 0 | { |
5962 | 0 | xl_heap_lock_updated xlrec; |
5963 | 0 | XLogRecPtr recptr; |
5964 | 0 | Page page = BufferGetPage(buf); |
5965 | |
|
5966 | 0 | XLogBeginInsert(); |
5967 | 0 | XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
5968 | |
|
5969 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self); |
5970 | 0 | xlrec.xmax = new_xmax; |
5971 | 0 | xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2); |
5972 | 0 | xlrec.flags = |
5973 | 0 | cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; |
5974 | |
|
5975 | 0 | XLogRegisterData(&xlrec, SizeOfHeapLockUpdated); |
5976 | |
|
5977 | 0 | recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED); |
5978 | |
|
5979 | 0 | PageSetLSN(page, recptr); |
5980 | 0 | } |
5981 | |
|
5982 | 0 | END_CRIT_SECTION(); |
5983 | |
|
5984 | 0 | next: |
5985 | | /* if we find the end of the update chain, we're done. */ |
5986 | 0 | if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID || |
5987 | 0 | HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) || |
5988 | 0 | ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) || |
5989 | 0 | HeapTupleHeaderIsOnlyLocked(mytup.t_data)) |
5990 | 0 | { |
5991 | 0 | result = TM_Ok; |
5992 | 0 | goto out_locked; |
5993 | 0 | } |
5994 | | |
5995 | | /* tail recursion */ |
5996 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data); |
5997 | 0 | ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid); |
5998 | 0 | UnlockReleaseBuffer(buf); |
5999 | 0 | } |
6000 | | |
6001 | 0 | result = TM_Ok; |
6002 | |
|
6003 | 0 | out_locked: |
6004 | 0 | UnlockReleaseBuffer(buf); |
6005 | |
|
6006 | 0 | out_unlocked: |
6007 | 0 | if (vmbuffer != InvalidBuffer) |
6008 | 0 | ReleaseBuffer(vmbuffer); |
6009 | |
|
6010 | 0 | return result; |
6011 | 0 | } |
6012 | | |
6013 | | /* |
6014 | | * heap_lock_updated_tuple |
6015 | | * Follow update chain when locking an updated tuple, acquiring locks (row |
6016 | | * marks) on the updated versions. |
6017 | | * |
6018 | | * The initial tuple is assumed to be already locked. |
6019 | | * |
6020 | | * This function doesn't check visibility; it just unconditionally marks the |
6021 | | * tuple(s) as locked. If any tuple in the updated chain is being deleted |
6022 | | * concurrently (or updated with the key being modified), sleep until the |
6023 | | * transaction doing it is finished. |
6024 | | * |
6025 | | * Note that we don't acquire heavyweight tuple locks on the tuples we walk |
6026 | | * when we have to wait for other transactions to release them, as opposed to |
6027 | | * what heap_lock_tuple does. The reason is that having more than one |
6028 | | * transaction walking the chain is probably uncommon enough that risk of |
6029 | | * starvation is not likely: one of the preconditions for being here is that |
6030 | | * the snapshot in use predates the update that created this tuple (because we |
6031 | | * started at an earlier version of the tuple), but at the same time such a |
6032 | | * transaction cannot be using repeatable read or serializable isolation |
6033 | | * levels, because that would lead to a serializability failure. |
6034 | | */ |
6035 | | static TM_Result |
6036 | | heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid, |
6037 | | TransactionId xid, LockTupleMode mode) |
6038 | 0 | { |
6039 | | /* |
6040 | | * If the tuple has not been updated, or has moved into another partition |
6041 | | * (effectively a delete) stop here. |
6042 | | */ |
6043 | 0 | if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) && |
6044 | 0 | !ItemPointerEquals(&tuple->t_self, ctid)) |
6045 | 0 | { |
6046 | | /* |
6047 | | * If this is the first possibly-multixact-able operation in the |
6048 | | * current transaction, set my per-backend OldestMemberMXactId |
6049 | | * setting. We can be certain that the transaction will never become a |
6050 | | * member of any older MultiXactIds than that. (We have to do this |
6051 | | * even if we end up just using our own TransactionId below, since |
6052 | | * some other backend could incorporate our XID into a MultiXact |
6053 | | * immediately afterwards.) |
6054 | | */ |
6055 | 0 | MultiXactIdSetOldestMember(); |
6056 | |
|
6057 | 0 | return heap_lock_updated_tuple_rec(rel, ctid, xid, mode); |
6058 | 0 | } |
6059 | | |
6060 | | /* nothing to lock */ |
6061 | 0 | return TM_Ok; |
6062 | 0 | } |
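/*
 * Illustrative sketch (hypothetical helper, not an existing routine): a
 * caller that has already locked one version of a row can use
 * heap_lock_updated_tuple() to propagate its row mark to any later
 * versions; heap_lock_tuple() does essentially this when it follows an
 * update chain.
 */
static TM_Result
example_lock_rest_of_chain(Relation rel, HeapTuple locked_tuple,
						   LockTupleMode mode)
{
	/* t_ctid of the locked version points to its successor, if any */
	return heap_lock_updated_tuple(rel, locked_tuple,
								   &locked_tuple->t_data->t_ctid,
								   GetCurrentTransactionId(), mode);
}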
6063 | | |
6064 | | /* |
6065 | | * heap_finish_speculative - mark speculative insertion as successful |
6066 | | * |
6067 | | * To successfully finish a speculative insertion we have to clear the |
6068 | | * speculative token from the tuple. To do so the t_ctid field, which will |
6069 | | * contain a speculative token value, is modified in place to point to the |
6070 | | * tuple itself, which is characteristic of a newly inserted ordinary tuple. |
6071 | | * |
6072 | | * NB: It is not ok to commit without either finishing or aborting a |
6073 | | * speculative insertion. We could treat speculative tuples of committed |
6074 | | * transactions implicitly as completed, but then we would have to be prepared |
6075 | | * to deal with speculative tokens on committed tuples. That wouldn't be |
6076 | | * difficult - no-one looks at the ctid field of a tuple with invalid xmax - |
6077 | | * but clearing the token at completion isn't very expensive either. |
6078 | | * An explicit confirmation WAL record also makes logical decoding simpler. |
6079 | | */ |
6080 | | void |
6081 | | heap_finish_speculative(Relation relation, ItemPointer tid) |
6082 | 0 | { |
6083 | 0 | Buffer buffer; |
6084 | 0 | Page page; |
6085 | 0 | OffsetNumber offnum; |
6086 | 0 | ItemId lp = NULL; |
6087 | 0 | HeapTupleHeader htup; |
6088 | |
|
6089 | 0 | buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); |
6090 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
6091 | 0 | page = (Page) BufferGetPage(buffer); |
6092 | |
|
6093 | 0 | offnum = ItemPointerGetOffsetNumber(tid); |
6094 | 0 | if (PageGetMaxOffsetNumber(page) >= offnum) |
6095 | 0 | lp = PageGetItemId(page, offnum); |
6096 | |
|
6097 | 0 | if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp)) |
6098 | 0 | elog(ERROR, "invalid lp"); |
6099 | | |
6100 | 0 | htup = (HeapTupleHeader) PageGetItem(page, lp); |
6101 | | |
6102 | | /* NO EREPORT(ERROR) from here till changes are logged */ |
6103 | 0 | START_CRIT_SECTION(); |
6104 | |
|
6105 | 0 | Assert(HeapTupleHeaderIsSpeculative(htup)); |
6106 | |
|
6107 | 0 | MarkBufferDirty(buffer); |
6108 | | |
6109 | | /* |
6110 | | * Replace the speculative insertion token with a real t_ctid, pointing to |
6111 | | * itself like it does on regular tuples. |
6112 | | */ |
6113 | 0 | htup->t_ctid = *tid; |
6114 | | |
6115 | | /* XLOG stuff */ |
6116 | 0 | if (RelationNeedsWAL(relation)) |
6117 | 0 | { |
6118 | 0 | xl_heap_confirm xlrec; |
6119 | 0 | XLogRecPtr recptr; |
6120 | |
|
6121 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(tid); |
6122 | |
|
6123 | 0 | XLogBeginInsert(); |
6124 | | |
6125 | | /* We want the same filtering on this as on a plain insert */ |
6126 | 0 | XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); |
6127 | |
|
6128 | 0 | XLogRegisterData(&xlrec, SizeOfHeapConfirm); |
6129 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
6130 | |
|
6131 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM); |
6132 | |
|
6133 | 0 | PageSetLSN(page, recptr); |
6134 | 0 | } |
6135 | |
|
6136 | 0 | END_CRIT_SECTION(); |
6137 | |
|
6138 | 0 | UnlockReleaseBuffer(buffer); |
6139 | 0 | } |
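/*
 * Illustrative sketch (hypothetical helper, not an existing routine): how a
 * reader can recognize a still-pending speculative insertion, whose t_ctid
 * holds a token rather than a self-pointer, and what it waits on while the
 * inserter decides whether to confirm or abort.
 */
static bool
example_wait_if_speculative(HeapTuple tup)
{
	if (HeapTupleHeaderIsSpeculative(tup->t_data))
	{
		uint32		token = HeapTupleHeaderGetSpeculativeToken(tup->t_data);

		/* sleep until the inserter confirms or aborts the insertion */
		SpeculativeInsertionWait(HeapTupleHeaderGetRawXmin(tup->t_data),
								 token);
		return true;
	}

	return false;				/* t_ctid already points at the tuple itself */
}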
6140 | | |
6141 | | /* |
6142 | | * heap_abort_speculative - kill a speculatively inserted tuple |
6143 | | * |
6144 | | * Marks a tuple that was speculatively inserted in the same command as dead, |
6145 | | * by setting its xmin as invalid. That makes it immediately appear as dead |
6146 | | * to all transactions, including our own. In particular, it makes |
6147 | | * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend |
6148 | | * inserting a duplicate key value won't unnecessarily wait for our whole |
6149 | | * transaction to finish (it'll just wait for our speculative insertion to |
6150 | | * finish). |
6151 | | * |
6152 | | * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks |
6153 | | * that arise due to a mutual dependency that is not user visible. By |
6154 | | * definition, unprincipled deadlocks cannot be prevented by the user |
6155 | | * reordering lock acquisition in client code, because the implementation level |
6156 | | * lock acquisitions are not under the user's direct control. If speculative |
6157 | | * inserters did not take this precaution, then under high concurrency they |
6158 | | * could deadlock with each other, which would not be acceptable. |
6159 | | * |
6160 | | * This is somewhat redundant with heap_delete, but we prefer to have a |
6161 | | * dedicated routine with stripped down requirements. Note that this is also |
6162 | | * used to delete the TOAST tuples created during speculative insertion. |
6163 | | * |
6164 | | * This routine does not affect logical decoding as it only looks at |
6165 | | * confirmation records. |
6166 | | */ |
6167 | | void |
6168 | | heap_abort_speculative(Relation relation, ItemPointer tid) |
6169 | 0 | { |
6170 | 0 | TransactionId xid = GetCurrentTransactionId(); |
6171 | 0 | ItemId lp; |
6172 | 0 | HeapTupleData tp; |
6173 | 0 | Page page; |
6174 | 0 | BlockNumber block; |
6175 | 0 | Buffer buffer; |
6176 | |
|
6177 | 0 | Assert(ItemPointerIsValid(tid)); |
6178 | |
|
6179 | 0 | block = ItemPointerGetBlockNumber(tid); |
6180 | 0 | buffer = ReadBuffer(relation, block); |
6181 | 0 | page = BufferGetPage(buffer); |
6182 | |
|
6183 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
6184 | | |
6185 | | /* |
6186 | | * Page can't be all visible, we just inserted into it, and are still |
6187 | | * running. |
6188 | | */ |
6189 | 0 | Assert(!PageIsAllVisible(page)); |
6190 | |
|
6191 | 0 | lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid)); |
6192 | 0 | Assert(ItemIdIsNormal(lp)); |
6193 | |
|
6194 | 0 | tp.t_tableOid = RelationGetRelid(relation); |
6195 | 0 | tp.t_data = (HeapTupleHeader) PageGetItem(page, lp); |
6196 | 0 | tp.t_len = ItemIdGetLength(lp); |
6197 | 0 | tp.t_self = *tid; |
6198 | | |
6199 | | /* |
6200 | | * Sanity check that the tuple really is a speculatively inserted tuple, |
6201 | | * inserted by us. |
6202 | | */ |
6203 | 0 | if (tp.t_data->t_choice.t_heap.t_xmin != xid) |
6204 | 0 | elog(ERROR, "attempted to kill a tuple inserted by another transaction"); |
6205 | 0 | if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data))) |
6206 | 0 | elog(ERROR, "attempted to kill a non-speculative tuple"); |
6207 | 0 | Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data)); |
6208 | | |
6209 | | /* |
6210 | | * No need to check for serializable conflicts here. There is never a |
6211 | | * need for a combo CID, either. No need to extract replica identity, or |
6212 | | * do anything special with infomask bits. |
6213 | | */ |
6214 | |
|
6215 | 0 | START_CRIT_SECTION(); |
6216 | | |
6217 | | /* |
6218 | | * The tuple will become DEAD immediately. Flag that this page is a |
6219 | | * candidate for pruning by setting xmin to TransactionXmin. While not |
6220 | | * immediately prunable, it is the oldest xid we can cheaply determine |
6221 | | * that's safe against wraparound / being older than the table's |
6222 | | * relfrozenxid. To defend against the unlikely case of a new relation |
6223 | | * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid |
6224 | | * if so (vacuum can't subsequently move relfrozenxid to beyond |
6225 | | * TransactionXmin, so there's no race here). |
6226 | | */ |
6227 | 0 | Assert(TransactionIdIsValid(TransactionXmin)); |
6228 | 0 | { |
6229 | 0 | TransactionId relfrozenxid = relation->rd_rel->relfrozenxid; |
6230 | 0 | TransactionId prune_xid; |
6231 | |
|
6232 | 0 | if (TransactionIdPrecedes(TransactionXmin, relfrozenxid)) |
6233 | 0 | prune_xid = relfrozenxid; |
6234 | 0 | else |
6235 | 0 | prune_xid = TransactionXmin; |
6236 | 0 | PageSetPrunable(page, prune_xid); |
6237 | 0 | } |
6238 | | |
6239 | | /* store transaction information of xact deleting the tuple */ |
6240 | 0 | tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); |
6241 | 0 | tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
6242 | | |
6243 | | /* |
6244 | | * Set the tuple header xmin to InvalidTransactionId. This makes the |
6245 | | * tuple immediately invisible to everyone. (In particular, to any |
6246 | | * transactions waiting on the speculative token, woken up later.) |
6247 | | */ |
6248 | 0 | HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId); |
6249 | | |
6250 | | /* Clear the speculative insertion token too */ |
6251 | 0 | tp.t_data->t_ctid = tp.t_self; |
6252 | |
|
6253 | 0 | MarkBufferDirty(buffer); |
6254 | | |
6255 | | /* |
6256 | | * XLOG stuff |
6257 | | * |
6258 | | * The WAL records generated here match heap_delete(). The same recovery |
6259 | | * routines are used. |
6260 | | */ |
6261 | 0 | if (RelationNeedsWAL(relation)) |
6262 | 0 | { |
6263 | 0 | xl_heap_delete xlrec; |
6264 | 0 | XLogRecPtr recptr; |
6265 | |
|
6266 | 0 | xlrec.flags = XLH_DELETE_IS_SUPER; |
6267 | 0 | xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask, |
6268 | 0 | tp.t_data->t_infomask2); |
6269 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); |
6270 | 0 | xlrec.xmax = xid; |
6271 | |
|
6272 | 0 | XLogBeginInsert(); |
6273 | 0 | XLogRegisterData(&xlrec, SizeOfHeapDelete); |
6274 | 0 | XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); |
6275 | | |
6276 | | /* No replica identity & replication origin logged */ |
6277 | |
|
6278 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); |
6279 | |
|
6280 | 0 | PageSetLSN(page, recptr); |
6281 | 0 | } |
6282 | |
|
6283 | 0 | END_CRIT_SECTION(); |
6284 | |
|
6285 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6286 | |
|
6287 | 0 | if (HeapTupleHasExternal(&tp)) |
6288 | 0 | { |
6289 | 0 | Assert(!IsToastRelation(relation)); |
6290 | 0 | heap_toast_delete(relation, &tp, true); |
6291 | 0 | } |
6292 | | |
6293 | | /* |
6294 | | * Never need to mark tuple for invalidation, since catalogs don't support |
6295 | | * speculative insertion |
6296 | | */ |
6297 | | |
6298 | | /* Now we can release the buffer */ |
6299 | 0 | ReleaseBuffer(buffer); |
6300 | | |
6301 | | /* count deletion, as we counted the insertion too */ |
6302 | 0 | pgstat_count_heap_delete(relation); |
6303 | 0 | } |
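/*
 * Illustrative sketch (hypothetical and heavily simplified): the speculative
 * insertion protocol that heap_finish_speculative() and
 * heap_abort_speculative() support.  In the real system this flow runs
 * through the executor and the table AM wrappers in heapam_handler.c; the
 * conflict check is supplied by the caller here because its real form
 * (speculative index insertion) is out of scope.
 */
static void
example_speculative_insertion(Relation rel, HeapTuple tup, CommandId cid,
							  bool (*found_conflict) (Relation, HeapTuple))
{
	TransactionId xid = GetCurrentTransactionId();
	uint32		token = SpeculativeInsertionLockAcquire(xid);

	/* stamp the token into t_ctid, then insert the speculative tuple */
	HeapTupleHeaderSetSpeculativeToken(tup->t_data, token);
	heap_insert(rel, tup, cid, HEAP_INSERT_SPECULATIVE, NULL);

	if (found_conflict(rel, tup))
		heap_abort_speculative(rel, &tup->t_self);	/* "super-delete" */
	else
		heap_finish_speculative(rel, &tup->t_self); /* confirm insertion */

	SpeculativeInsertionLockRelease(xid);
}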
6304 | | |
6305 | | /* |
6306 | | * heap_inplace_lock - protect inplace update from concurrent heap_update() |
6307 | | * |
6308 | | * Evaluate whether the tuple's state is compatible with a no-key update. |
6309 | | * Current transaction rowmarks are fine, as is KEY SHARE from any |
6310 | | * transaction. If compatible, return true with the buffer exclusive-locked, |
6311 | | * and the caller must release that by calling |
6312 | | * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising |
6313 | | * an error. Otherwise, call release_callback(arg), wait for blocking |
6314 | | * transactions to end, and return false. |
6315 | | * |
6316 | | * Since this is intended for system catalogs and SERIALIZABLE doesn't cover |
6317 | | * DDL, this doesn't guarantee any particular predicate locking. |
6318 | | * |
6319 | | * One could modify this to return true for tuples with delete in progress. |
6320 | | * All inplace updaters take a lock that conflicts with DROP. If explicit |
6321 | | * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an |
6322 | | * update. |
6323 | | * |
6324 | | * Readers of inplace-updated fields expect changes to those fields are |
6325 | | * durable. For example, vac_truncate_clog() reads datfrozenxid from |
6326 | | * pg_database tuples via catalog snapshots. A future snapshot must not |
6327 | | * return a lower datfrozenxid for the same database OID (lower in the |
6328 | | * FullTransactionIdPrecedes() sense). We achieve that since no update of a |
6329 | | * tuple can start while we hold a lock on its buffer. In cases like |
6330 | | * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only |
6331 | | * to this transaction. ROLLBACK then is one case where it's okay to lose |
6332 | | * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since |
6333 | | * any concurrent CREATE INDEX would have blocked, then inplace-updated the |
6334 | | * committed tuple.) |
6335 | | * |
6336 | | * In principle, we could avoid waiting by overwriting every tuple in the |
6337 | | * updated tuple chain. Reader expectations permit updating a tuple only if |
6338 | | * it's aborted, is the tail of the chain, or we already updated the tuple |
6339 | | * referenced in its t_ctid. Hence, we would need to overwrite the tuples in |
6340 | | * order from tail to head. That would imply either (a) mutating all tuples |
6341 | | * in one critical section or (b) accepting a chance of partial completion. |
6342 | | * Partial completion of a relfrozenxid update would have the weird |
6343 | | * consequence that the table's next VACUUM could see the table's relfrozenxid |
6344 | | * move forward between vacuum_get_cutoffs() and finishing. |
6345 | | */ |
6346 | | bool |
6347 | | heap_inplace_lock(Relation relation, |
6348 | | HeapTuple oldtup_ptr, Buffer buffer, |
6349 | | void (*release_callback) (void *), void *arg) |
6350 | 0 | { |
6351 | 0 | HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */ |
6352 | 0 | TM_Result result; |
6353 | 0 | bool ret; |
6354 | |
|
6355 | | #ifdef USE_ASSERT_CHECKING |
6356 | | if (RelationGetRelid(relation) == RelationRelationId) |
6357 | | check_inplace_rel_lock(oldtup_ptr); |
6358 | | #endif |
6359 | |
|
6360 | 0 | Assert(BufferIsValid(buffer)); |
6361 | | |
6362 | | /* |
6363 | | * Construct shared cache inval if necessary. Because we pass a tuple |
6364 | | * version without our own inplace changes or inplace changes other |
6365 | | * sessions complete while we wait for locks, inplace update mustn't |
6366 | | * change catcache lookup keys. But we aren't bothering with index |
6367 | | * updates either, so that's true a fortiori. After LockBuffer(), it |
6368 | | * would be too late, because this might reach a |
6369 | | * CatalogCacheInitializeCache() that locks "buffer". |
6370 | | */ |
6371 | 0 | CacheInvalidateHeapTupleInplace(relation, oldtup_ptr, NULL); |
6372 | |
|
6373 | 0 | LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock); |
6374 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
6375 | | |
6376 | | /*---------- |
6377 | | * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except: |
6378 | | * |
6379 | | * - wait unconditionally |
6380 | | * - already locked tuple above, since inplace needs that unconditionally |
6381 | | * - don't recheck header after wait: simpler to defer to next iteration |
6382 | | * - don't try to continue even if the updater aborts: likewise |
6383 | | * - no crosscheck |
6384 | | */ |
6385 | 0 | result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false), |
6386 | 0 | buffer); |
6387 | |
|
6388 | 0 | if (result == TM_Invisible) |
6389 | 0 | { |
6390 | | /* no known way this can happen */ |
6391 | 0 | ereport(ERROR, |
6392 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
6393 | 0 | errmsg_internal("attempted to overwrite invisible tuple"))); |
6394 | 0 | } |
6395 | 0 | else if (result == TM_SelfModified) |
6396 | 0 | { |
6397 | | /* |
6398 | | * CREATE INDEX might reach this if an expression is silly enough to |
6399 | | * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL |
6400 | | * statements might get here after a heap_update() of the same row, in |
6401 | | * the absence of an intervening CommandCounterIncrement(). |
6402 | | */ |
6403 | 0 | ereport(ERROR, |
6404 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
6405 | 0 | errmsg("tuple to be updated was already modified by an operation triggered by the current command"))); |
6406 | 0 | } |
6407 | 0 | else if (result == TM_BeingModified) |
6408 | 0 | { |
6409 | 0 | TransactionId xwait; |
6410 | 0 | uint16 infomask; |
6411 | |
|
6412 | 0 | xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data); |
6413 | 0 | infomask = oldtup.t_data->t_infomask; |
6414 | |
|
6415 | 0 | if (infomask & HEAP_XMAX_IS_MULTI) |
6416 | 0 | { |
6417 | 0 | LockTupleMode lockmode = LockTupleNoKeyExclusive; |
6418 | 0 | MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate; |
6419 | 0 | int remain; |
6420 | |
|
6421 | 0 | if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask, |
6422 | 0 | lockmode, NULL)) |
6423 | 0 | { |
6424 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6425 | 0 | release_callback(arg); |
6426 | 0 | ret = false; |
6427 | 0 | MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask, |
6428 | 0 | relation, &oldtup.t_self, XLTW_Update, |
6429 | 0 | &remain); |
6430 | 0 | } |
6431 | 0 | else |
6432 | 0 | ret = true; |
6433 | 0 | } |
6434 | 0 | else if (TransactionIdIsCurrentTransactionId(xwait)) |
6435 | 0 | ret = true; |
6436 | 0 | else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask)) |
6437 | 0 | ret = true; |
6438 | 0 | else |
6439 | 0 | { |
6440 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6441 | 0 | release_callback(arg); |
6442 | 0 | ret = false; |
6443 | 0 | XactLockTableWait(xwait, relation, &oldtup.t_self, |
6444 | 0 | XLTW_Update); |
6445 | 0 | } |
6446 | 0 | } |
6447 | 0 | else |
6448 | 0 | { |
6449 | 0 | ret = (result == TM_Ok); |
6450 | 0 | if (!ret) |
6451 | 0 | { |
6452 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6453 | 0 | release_callback(arg); |
6454 | 0 | } |
6455 | 0 | } |
6456 | | |
6457 | | /* |
6458 | | * GetCatalogSnapshot() relies on invalidation messages to know when to |
6459 | | * take a new snapshot. COMMIT of xwait is responsible for sending the |
6460 | | * invalidation. We're not acquiring heavyweight locks sufficient to |
6461 | | * block if not yet sent, so we must take a new snapshot to ensure a later |
6462 | | * attempt has a fair chance. While we don't need this if xwait aborted, |
6463 | | * don't bother optimizing that. |
6464 | | */ |
6465 | 0 | if (!ret) |
6466 | 0 | { |
6467 | 0 | UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock); |
6468 | 0 | ForgetInplace_Inval(); |
6469 | 0 | InvalidateCatalogSnapshot(); |
6470 | 0 | } |
6471 | 0 | return ret; |
6472 | 0 | } |
6473 | | |
6474 | | /* |
6475 | | * heap_inplace_update_and_unlock - core of systable_inplace_update_finish |
6476 | | * |
6477 | | * The tuple cannot change size, and therefore its header fields and null |
6478 | | * bitmap (if any) don't change either. |
6479 | | * |
6480 | | * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple. |
6481 | | */ |
6482 | | void |
6483 | | heap_inplace_update_and_unlock(Relation relation, |
6484 | | HeapTuple oldtup, HeapTuple tuple, |
6485 | | Buffer buffer) |
6486 | 0 | { |
6487 | 0 | HeapTupleHeader htup = oldtup->t_data; |
6488 | 0 | uint32 oldlen; |
6489 | 0 | uint32 newlen; |
6490 | 0 | char *dst; |
6491 | 0 | char *src; |
6492 | 0 | int nmsgs = 0; |
6493 | 0 | SharedInvalidationMessage *invalMessages = NULL; |
6494 | 0 | bool RelcacheInitFileInval = false; |
6495 | |
|
6496 | 0 | Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self)); |
6497 | 0 | oldlen = oldtup->t_len - htup->t_hoff; |
6498 | 0 | newlen = tuple->t_len - tuple->t_data->t_hoff; |
6499 | 0 | if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff) |
6500 | 0 | elog(ERROR, "wrong tuple length"); |
6501 | | |
6502 | 0 | dst = (char *) htup + htup->t_hoff; |
6503 | 0 | src = (char *) tuple->t_data + tuple->t_data->t_hoff; |
6504 | | |
6505 | | /* Like RecordTransactionCommit(), log only if needed */ |
6506 | 0 | if (XLogStandbyInfoActive()) |
6507 | 0 | nmsgs = inplaceGetInvalidationMessages(&invalMessages, |
6508 | 0 | &RelcacheInitFileInval); |
6509 | | |
6510 | | /* |
6511 | | * Unlink relcache init files as needed. If unlinking, acquire |
6512 | | * RelCacheInitLock until after associated invalidations. By doing this |
6513 | | * in advance, if we checkpoint and then crash between inplace |
6514 | | * XLogInsert() and inval, we don't rely on StartupXLOG() -> |
6515 | | * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would |
6516 | | * neglect to PANIC on EIO. |
6517 | | */ |
6518 | 0 | PreInplace_Inval(); |
6519 | | |
6520 | | /*---------- |
6521 | | * NO EREPORT(ERROR) from here till changes are complete |
6522 | | * |
6523 | | * Our buffer lock won't stop a reader having already pinned and checked |
6524 | | * visibility for this tuple. Hence, we write WAL first, then mutate the |
6525 | | * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(), |
6526 | | * checkpoint delay makes that acceptable. With the usual order of |
6527 | | * changes, a crash after memcpy() and before XLogInsert() could allow |
6528 | | * datfrozenxid to overtake relfrozenxid: |
6529 | | * |
6530 | | * ["D" is a VACUUM (ONLY_DATABASE_STATS)] |
6531 | | * ["R" is a VACUUM tbl] |
6532 | | * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class) |
6533 | | * D: systable_getnext() returns pg_class tuple of tbl |
6534 | | * R: memcpy() into pg_class tuple of tbl |
6535 | | * D: raise pg_database.datfrozenxid, XLogInsert(), finish |
6536 | | * [crash] |
6537 | | * [recovery restores datfrozenxid w/o relfrozenxid] |
6538 | | * |
6539 | | * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(). |
6540 | | * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack. |
6541 | | * The stack copy facilitates a FPI of the post-mutation block before we |
6542 | | * accept other sessions seeing it. DELAY_CHKPT_START allows us to |
6543 | | * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint() |
6544 | | * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START. |
6545 | | * This function, however, likely could avoid it with the following order |
6546 | | * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use |
6547 | | * DELAY_CHKPT_START here, too, as a way to have fewer distinct code |
6548 | | * patterns to analyze. Inplace update isn't so frequent that it should |
6549 | | * pursue the small optimization of skipping DELAY_CHKPT_START. |
6550 | | */ |
6551 | 0 | Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); |
6552 | 0 | START_CRIT_SECTION(); |
6553 | 0 | MyProc->delayChkptFlags |= DELAY_CHKPT_START; |
6554 | | |
6555 | | /* XLOG stuff */ |
6556 | 0 | if (RelationNeedsWAL(relation)) |
6557 | 0 | { |
6558 | 0 | xl_heap_inplace xlrec; |
6559 | 0 | PGAlignedBlock copied_buffer; |
6560 | 0 | char *origdata = (char *) BufferGetBlock(buffer); |
6561 | 0 | Page page = BufferGetPage(buffer); |
6562 | 0 | uint16 lower = ((PageHeader) page)->pd_lower; |
6563 | 0 | uint16 upper = ((PageHeader) page)->pd_upper; |
6564 | 0 | uintptr_t dst_offset_in_block; |
6565 | 0 | RelFileLocator rlocator; |
6566 | 0 | ForkNumber forkno; |
6567 | 0 | BlockNumber blkno; |
6568 | 0 | XLogRecPtr recptr; |
6569 | |
|
6570 | 0 | xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self); |
6571 | 0 | xlrec.dbId = MyDatabaseId; |
6572 | 0 | xlrec.tsId = MyDatabaseTableSpace; |
6573 | 0 | xlrec.relcacheInitFileInval = RelcacheInitFileInval; |
6574 | 0 | xlrec.nmsgs = nmsgs; |
6575 | |
|
6576 | 0 | XLogBeginInsert(); |
6577 | 0 | XLogRegisterData(&xlrec, MinSizeOfHeapInplace); |
6578 | 0 | if (nmsgs != 0) |
6579 | 0 | XLogRegisterData(invalMessages, |
6580 | 0 | nmsgs * sizeof(SharedInvalidationMessage)); |
6581 | | |
6582 | | /* register block matching what buffer will look like after changes */ |
6583 | 0 | memcpy(copied_buffer.data, origdata, lower); |
6584 | 0 | memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); |
6585 | 0 | dst_offset_in_block = dst - origdata; |
6586 | 0 | memcpy(copied_buffer.data + dst_offset_in_block, src, newlen); |
6587 | 0 | BufferGetTag(buffer, &rlocator, &forkno, &blkno); |
6588 | 0 | Assert(forkno == MAIN_FORKNUM); |
6589 | 0 | XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data, |
6590 | 0 | REGBUF_STANDARD); |
6591 | 0 | XLogRegisterBufData(0, src, newlen); |
6592 | | |
6593 | | /* inplace updates aren't decoded atm, don't log the origin */ |
6594 | |
|
6595 | 0 | recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); |
6596 | |
|
6597 | 0 | PageSetLSN(page, recptr); |
6598 | 0 | } |
6599 | |
|
6600 | 0 | memcpy(dst, src, newlen); |
6601 | |
|
6602 | 0 | MarkBufferDirty(buffer); |
6603 | |
|
6604 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6605 | | |
6606 | | /* |
6607 | | * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we |
6608 | | * do this before UnlockTuple(). |
6609 | | * |
6610 | | * If we're mutating a tuple visible only to this transaction, there's an |
6611 | | * equivalent transactional inval from the action that created the tuple, |
6612 | | * and this inval is superfluous. |
6613 | | */ |
6614 | 0 | AtInplace_Inval(); |
6615 | |
|
6616 | 0 | MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; |
6617 | 0 | END_CRIT_SECTION(); |
6618 | 0 | UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock); |
6619 | |
|
6620 | 0 | AcceptInvalidationMessages(); /* local processing of just-sent inval */ |
6621 | | |
6622 | | /* |
6623 | | * Queue a transactional inval. The immediate invalidation we just sent |
6624 | | * is the only one known to be necessary. To reduce risk from the |
6625 | | * transition to immediate invalidation, continue sending a transactional |
6626 | | * invalidation like we've long done. Third-party code might rely on it. |
6627 | | */ |
6628 | 0 | if (!IsBootstrapProcessingMode()) |
6629 | 0 | CacheInvalidateHeapTuple(relation, tuple, NULL); |
6630 | 0 | } |
6631 | | |
6632 | | /* |
6633 | | * heap_inplace_unlock - reverse of heap_inplace_lock |
6634 | | */ |
6635 | | void |
6636 | | heap_inplace_unlock(Relation relation, |
6637 | | HeapTuple oldtup, Buffer buffer) |
6638 | 0 | { |
6639 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
6640 | 0 | UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock); |
6641 | 0 | ForgetInplace_Inval(); |
6642 | 0 | } |
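/*
 * Illustrative sketch (hypothetical helper, not an existing routine): the
 * lock / mutate-a-copy / update-and-unlock pattern built from the three
 * routines above; the systable_inplace_update_* helpers in genam.c wrap
 * essentially this sequence.  A caller that locks but then decides not to
 * update would call heap_inplace_unlock() instead.  Both callbacks are
 * supplied by the (hypothetical) caller.
 */
static bool
example_inplace_update(Relation catalog, HeapTuple oldtup, Buffer buf,
					   void (*release_callback) (void *), void *arg,
					   void (*mutate_fixed_width_fields) (HeapTuple))
{
	HeapTuple	newtup;

	if (!heap_inplace_lock(catalog, oldtup, buf, release_callback, arg))
		return false;			/* caller refetches the tuple and retries */

	/* buffer is now exclusive-locked; build a same-length replacement */
	newtup = heap_copytuple(oldtup);
	mutate_fixed_width_fields(newtup);

	/* writes WAL, overwrites the tuple in place, and releases the locks */
	heap_inplace_update_and_unlock(catalog, oldtup, newtup, buf);
	heap_freetuple(newtup);

	return true;
}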
6643 | | |
6644 | 0 | #define FRM_NOOP 0x0001 |
6645 | 0 | #define FRM_INVALIDATE_XMAX 0x0002 |
6646 | 0 | #define FRM_RETURN_IS_XID 0x0004 |
6647 | 0 | #define FRM_RETURN_IS_MULTI 0x0008 |
6648 | 0 | #define FRM_MARK_COMMITTED 0x0010 |
6649 | | |
6650 | | /* |
6651 | | * FreezeMultiXactId |
6652 | | * Determine what to do during freezing when a tuple is marked by a |
6653 | | * MultiXactId. |
6654 | | * |
6655 | | * "flags" is an output value; it's used to tell caller what to do on return. |
6656 | | * "pagefrz" is an input/output value, used to manage page level freezing. |
6657 | | * |
6658 | | * Possible values that we can set in "flags": |
6659 | | * FRM_NOOP |
6660 | | * don't do anything -- keep existing Xmax |
6661 | | * FRM_INVALIDATE_XMAX |
6662 | | * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag. |
6663 | | * FRM_RETURN_IS_XID |
6664 | | * The Xid return value is a single update Xid to set as xmax. |
6665 | | * FRM_MARK_COMMITTED |
6666 | | * Xmax can be marked as HEAP_XMAX_COMMITTED |
6667 | | * FRM_RETURN_IS_MULTI |
6668 | | * The return value is a new MultiXactId to set as new Xmax. |
6669 | | * (caller must obtain proper infomask bits using GetMultiXactIdHintBits) |
6670 | | * |
6671 | | * Caller delegates control of page freezing to us. In practice we always |
6672 | | * force freezing of caller's page unless FRM_NOOP processing is indicated. |
6673 | | * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff |
6674 | | * can never be left behind. We freely choose when and how to process each |
6675 | | * Multi, without ever violating the cutoff postconditions for freezing. |
6676 | | * |
6677 | | * It's useful to remove Multis on a proactive timeline (relative to freezing |
6678 | | * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also |
6679 | | * be cheaper in the short run, for us, since we too can avoid SLRU buffer |
6680 | | * misses through eager processing. |
6681 | | * |
6682 | | * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only |
6683 | | * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice. |
6684 | | * This can usually be put off, which is often enough to avoid it altogether. |
6685 | | * Allocating new multis during VACUUM should be avoided on general principle; |
6686 | | * only VACUUM can advance relminmxid, so allocating new Multis here comes with |
6687 | | * its own special risks. |
6688 | | * |
6689 | | * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers |
6690 | | * using heap_tuple_should_freeze when we haven't forced page-level freezing. |
6691 | | * |
6692 | | * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we |
6693 | | * have already forced page-level freezing, since that might incur the same |
6694 | | * SLRU buffer misses that we specifically intended to avoid by freezing. |
6695 | | */ |
6696 | | static TransactionId |
6697 | | FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, |
6698 | | const struct VacuumCutoffs *cutoffs, uint16 *flags, |
6699 | | HeapPageFreeze *pagefrz) |
6700 | 0 | { |
6701 | 0 | TransactionId newxmax; |
6702 | 0 | MultiXactMember *members; |
6703 | 0 | int nmembers; |
6704 | 0 | bool need_replace; |
6705 | 0 | int nnewmembers; |
6706 | 0 | MultiXactMember *newmembers; |
6707 | 0 | bool has_lockers; |
6708 | 0 | TransactionId update_xid; |
6709 | 0 | bool update_committed; |
6710 | 0 | TransactionId FreezePageRelfrozenXid; |
6711 | |
|
6712 | 0 | *flags = 0; |
6713 | | |
6714 | | /* We should only be called in Multis */ |
6715 | 0 | Assert(t_infomask & HEAP_XMAX_IS_MULTI); |
6716 | |
|
6717 | 0 | if (!MultiXactIdIsValid(multi) || |
6718 | 0 | HEAP_LOCKED_UPGRADED(t_infomask)) |
6719 | 0 | { |
6720 | 0 | *flags |= FRM_INVALIDATE_XMAX; |
6721 | 0 | pagefrz->freeze_required = true; |
6722 | 0 | return InvalidTransactionId; |
6723 | 0 | } |
6724 | 0 | else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid)) |
6725 | 0 | ereport(ERROR, |
6726 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6727 | 0 | errmsg_internal("found multixact %u from before relminmxid %u", |
6728 | 0 | multi, cutoffs->relminmxid))); |
6729 | 0 | else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact)) |
6730 | 0 | { |
6731 | 0 | TransactionId update_xact; |
6732 | | |
6733 | | /* |
6734 | | * This old multi cannot possibly have members still running, but |
6735 | | * verify just in case. If it was a locker only, it can be removed |
6736 | | * without any further consideration; but if it contained an update, |
6737 | | * we might need to preserve it. |
6738 | | */ |
6739 | 0 | if (MultiXactIdIsRunning(multi, |
6740 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))) |
6741 | 0 | ereport(ERROR, |
6742 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6743 | 0 | errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running", |
6744 | 0 | multi, cutoffs->OldestMxact))); |
6745 | | |
6746 | 0 | if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)) |
6747 | 0 | { |
6748 | 0 | *flags |= FRM_INVALIDATE_XMAX; |
6749 | 0 | pagefrz->freeze_required = true; |
6750 | 0 | return InvalidTransactionId; |
6751 | 0 | } |
6752 | | |
6753 | | /* replace multi with single XID for its updater? */ |
6754 | 0 | update_xact = MultiXactIdGetUpdateXid(multi, t_infomask); |
6755 | 0 | if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid)) |
6756 | 0 | ereport(ERROR, |
6757 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6758 | 0 | errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u", |
6759 | 0 | multi, update_xact, |
6760 | 0 | cutoffs->relfrozenxid))); |
6761 | 0 | else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin)) |
6762 | 0 | { |
6763 | | /* |
6764 | | * Updater XID has to have aborted (otherwise the tuple would have |
6765 | | * been pruned away instead, since updater XID is < OldestXmin). |
6766 | | * Just remove xmax. |
6767 | | */ |
6768 | 0 | if (TransactionIdDidCommit(update_xact)) |
6769 | 0 | ereport(ERROR, |
6770 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6771 | 0 | errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", |
6772 | 0 | multi, update_xact, |
6773 | 0 | cutoffs->OldestXmin))); |
6774 | 0 | *flags |= FRM_INVALIDATE_XMAX; |
6775 | 0 | pagefrz->freeze_required = true; |
6776 | 0 | return InvalidTransactionId; |
6777 | 0 | } |
6778 | | |
6779 | | /* Have to keep updater XID as new xmax */ |
6780 | 0 | *flags |= FRM_RETURN_IS_XID; |
6781 | 0 | pagefrz->freeze_required = true; |
6782 | 0 | return update_xact; |
6783 | 0 | } |
6784 | | |
6785 | | /* |
6786 | | * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we |
6787 | | * need to walk the whole members array to figure out what to do, if |
6788 | | * anything. |
6789 | | */ |
6790 | 0 | nmembers = |
6791 | 0 | GetMultiXactIdMembers(multi, &members, false, |
6792 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)); |
6793 | 0 | if (nmembers <= 0) |
6794 | 0 | { |
6795 | | /* Nothing worth keeping */ |
6796 | 0 | *flags |= FRM_INVALIDATE_XMAX; |
6797 | 0 | pagefrz->freeze_required = true; |
6798 | 0 | return InvalidTransactionId; |
6799 | 0 | } |
6800 | | |
6801 | | /* |
6802 | | * The FRM_NOOP case is the only case where we might need to ratchet back |
6803 | | * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only |
6804 | | * case where our caller might ratchet back its NoFreezePageRelfrozenXid |
6805 | | * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi. |
6806 | | * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid |
6807 | | * trackers managed by VACUUM being ratcheted back by xmax to the degree |
6808 | | * required to make it safe to leave xmax undisturbed, independent of |
6809 | | * whether or not page freezing is triggered somewhere else. |
6810 | | * |
6811 | | * Our policy is to force freezing in every case other than FRM_NOOP, |
6812 | | * which obviates the need to maintain either set of trackers, anywhere. |
6813 | | * Every other case will reliably execute a freeze plan for xmax that |
6814 | | * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or |
6815 | | * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen. |
6816 | | * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with |
6817 | | * OldestXmin/OldestMxact, so later values never need to be tracked here.) |
6818 | | */ |
6819 | 0 | need_replace = false; |
6820 | 0 | FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid; |
6821 | 0 | for (int i = 0; i < nmembers; i++) |
6822 | 0 | { |
6823 | 0 | TransactionId xid = members[i].xid; |
6824 | |
|
6825 | 0 | Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); |
6826 | |
|
6827 | 0 | if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) |
6828 | 0 | { |
6829 | | /* Can't violate the FreezeLimit postcondition */ |
6830 | 0 | need_replace = true; |
6831 | 0 | break; |
6832 | 0 | } |
6833 | 0 | if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid)) |
6834 | 0 | FreezePageRelfrozenXid = xid; |
6835 | 0 | } |
6836 | | |
6837 | | /* Can't violate the MultiXactCutoff postcondition, either */ |
6838 | 0 | if (!need_replace) |
6839 | 0 | need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff); |
6840 | |
|
6841 | 0 | if (!need_replace) |
6842 | 0 | { |
6843 | | /* |
6844 | | * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or |
6845 | | * both together to make it safe to retain this particular multi after |
6846 | | * freezing its page |
6847 | | */ |
6848 | 0 | *flags |= FRM_NOOP; |
6849 | 0 | pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid; |
6850 | 0 | if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid)) |
6851 | 0 | pagefrz->FreezePageRelminMxid = multi; |
6852 | 0 | pfree(members); |
6853 | 0 | return multi; |
6854 | 0 | } |
6855 | | |
6856 | | /* |
6857 | | * Do a more thorough second pass over the multi to figure out which |
6858 | | * member XIDs actually need to be kept. Checking the precise status of |
6859 | | * individual members might even show that we don't need to keep anything. |
6860 | | * That is quite possible even though the Multi must be >= OldestMxact, |
6861 | | * since our second pass only keeps member XIDs when it's truly necessary; |
6862 | | * even member XIDs >= OldestXmin often won't be kept by the second pass. |
6863 | | */ |
6864 | 0 | nnewmembers = 0; |
6865 | 0 | newmembers = palloc(sizeof(MultiXactMember) * nmembers); |
6866 | 0 | has_lockers = false; |
6867 | 0 | update_xid = InvalidTransactionId; |
6868 | 0 | update_committed = false; |
6869 | | |
6870 | | /* |
6871 | | * Determine whether to keep each member xid, or to ignore it instead |
6872 | | */ |
6873 | 0 | for (int i = 0; i < nmembers; i++) |
6874 | 0 | { |
6875 | 0 | TransactionId xid = members[i].xid; |
6876 | 0 | MultiXactStatus mstatus = members[i].status; |
6877 | |
6878 | 0 | Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid)); |
6879 | |
6880 | 0 | if (!ISUPDATE_from_mxstatus(mstatus)) |
6881 | 0 | { |
6882 | | /* |
6883 | | * Locker XID (not updater XID). We only keep lockers that are |
6884 | | * still running. |
6885 | | */ |
6886 | 0 | if (TransactionIdIsCurrentTransactionId(xid) || |
6887 | 0 | TransactionIdIsInProgress(xid)) |
6888 | 0 | { |
6889 | 0 | if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) |
6890 | 0 | ereport(ERROR, |
6891 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6892 | 0 | errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u", |
6893 | 0 | multi, xid, |
6894 | 0 | cutoffs->OldestXmin))); |
6895 | 0 | newmembers[nnewmembers++] = members[i]; |
6896 | 0 | has_lockers = true; |
6897 | 0 | } |
6898 | | |
6899 | 0 | continue; |
6900 | 0 | } |
6901 | | |
6902 | | /* |
6903 | | * Updater XID (not locker XID). Should we keep it? |
6904 | | * |
6905 | | * Since the tuple wasn't totally removed when vacuum pruned, the |
6906 | | * update Xid cannot possibly be older than the OldestXmin cutoff unless |
6907 | | * the updater XID aborted. If the updater transaction is known |
6908 | | * aborted or crashed then it's okay to ignore it, otherwise not. |
6909 | | * |
6910 | | * In any case the Multi should never contain two updaters, whatever |
6911 | | * their individual commit status. Check for that first, in passing. |
6912 | | */ |
6913 | 0 | if (TransactionIdIsValid(update_xid)) |
6914 | 0 | ereport(ERROR, |
6915 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6916 | 0 | errmsg_internal("multixact %u has two or more updating members", |
6917 | 0 | multi), |
6918 | 0 | errdetail_internal("First updater XID=%u second updater XID=%u.", |
6919 | 0 | update_xid, xid))); |
6920 | | |
6921 | | /* |
6922 | | * As with all tuple visibility routines, it's critical to test |
6923 | | * TransactionIdIsInProgress before TransactionIdDidCommit, because of |
6924 | | * race conditions explained in detail in heapam_visibility.c. |
6925 | | */ |
6926 | 0 | if (TransactionIdIsCurrentTransactionId(xid) || |
6927 | 0 | TransactionIdIsInProgress(xid)) |
6928 | 0 | update_xid = xid; |
6929 | 0 | else if (TransactionIdDidCommit(xid)) |
6930 | 0 | { |
6931 | | /* |
6932 | | * The transaction committed, so we can tell caller to set |
6933 | | * HEAP_XMAX_COMMITTED. (We can only do this because we know the |
6934 | | * transaction is not running.) |
6935 | | */ |
6936 | 0 | update_committed = true; |
6937 | 0 | update_xid = xid; |
6938 | 0 | } |
6939 | 0 | else |
6940 | 0 | { |
6941 | | /* |
6942 | | * Not in progress, not committed -- must be aborted or crashed; |
6943 | | * we can ignore it. |
6944 | | */ |
6945 | 0 | continue; |
6946 | 0 | } |
6947 | | |
6948 | | /* |
6949 | | * We determined that updater must be kept -- add it to pending new |
6950 | | * members list |
6951 | | */ |
6952 | 0 | if (TransactionIdPrecedes(xid, cutoffs->OldestXmin)) |
6953 | 0 | ereport(ERROR, |
6954 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
6955 | 0 | errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u", |
6956 | 0 | multi, xid, cutoffs->OldestXmin))); |
6957 | 0 | newmembers[nnewmembers++] = members[i]; |
6958 | 0 | } |
6959 | | |
6960 | 0 | pfree(members); |
6961 | | |
6962 | | /* |
6963 | | * Determine what to do with caller's multi based on information gathered |
6964 | | * during our second pass |
6965 | | */ |
6966 | 0 | if (nnewmembers == 0) |
6967 | 0 | { |
6968 | | /* Nothing worth keeping */ |
6969 | 0 | *flags |= FRM_INVALIDATE_XMAX; |
6970 | 0 | newxmax = InvalidTransactionId; |
6971 | 0 | } |
6972 | 0 | else if (TransactionIdIsValid(update_xid) && !has_lockers) |
6973 | 0 | { |
6974 | | /* |
6975 | | * If there's a single member and it's an update, pass it back alone |
6976 | | * without creating a new Multi. (XXX we could do this when there's a |
6977 | | * single remaining locker, too, but that would complicate the API too |
6978 | | * much; moreover, the case with the single updater is more |
6979 | | * interesting, because those are longer-lived.) |
6980 | | */ |
6981 | 0 | Assert(nnewmembers == 1); |
6982 | 0 | *flags |= FRM_RETURN_IS_XID; |
6983 | 0 | if (update_committed) |
6984 | 0 | *flags |= FRM_MARK_COMMITTED; |
6985 | 0 | newxmax = update_xid; |
6986 | 0 | } |
6987 | 0 | else |
6988 | 0 | { |
6989 | | /* |
6990 | | * Create a new multixact with the surviving members of the previous |
6991 | | * one, to set as new Xmax in the tuple |
6992 | | */ |
6993 | 0 | newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers); |
6994 | 0 | *flags |= FRM_RETURN_IS_MULTI; |
6995 | 0 | } |
6996 | |
6997 | 0 | pfree(newmembers); |
6998 | |
6999 | 0 | pagefrz->freeze_required = true; |
7000 | 0 | return newxmax; |
7001 | 0 | } |
7002 | | |
7003 | | /* |
7004 | | * heap_prepare_freeze_tuple |
7005 | | * |
7006 | | * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) |
7007 | | * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so, |
7008 | | * setup enough state (in the *frz output argument) to enable caller to |
7009 | | * process this tuple as part of freezing its page, and return true. Return |
7010 | | * false if nothing can be changed about the tuple right now. |
7011 | | * |
7012 | | * Also sets *totally_frozen to true if the tuple will be totally frozen once |
7013 | | * caller executes returned freeze plan (or if the tuple was already totally |
7014 | | * frozen by an earlier VACUUM). This indicates that there are no remaining |
7015 | | * XIDs or MultiXactIds that will need to be processed by a future VACUUM. |
7016 | | * |
7017 | | * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every |
7018 | | * tuple that we returned true for, and then execute freezing. Caller must |
7019 | | * initialize pagefrz fields for page as a whole before first call here for |
7020 | | * each heap page. |
7021 | | * |
7022 | | * VACUUM caller decides on whether or not to freeze the page as a whole. |
7023 | | * We'll often prepare freeze plans for a page that caller just discards. |
7024 | | * However, VACUUM doesn't always get to make a choice; it must freeze when |
7025 | | * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and |
7026 | | * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure |
7027 | | * that VACUUM always follows that rule. |
7028 | | * |
7029 | | * We sometimes force freezing of xmax MultiXactId values long before it is |
7030 | | * strictly necessary to do so just to ensure the FreezeLimit postcondition. |
7031 | | * It's worth processing MultiXactIds proactively when it is cheap to do so, |
7032 | | * and it's convenient to make that happen by piggy-backing it on the "force |
7033 | | * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds |
7034 | | * because it is expensive right now (though only when it's still possible to |
7035 | | * do so without violating the FreezeLimit/MultiXactCutoff postcondition). |
7036 | | * |
7037 | | * It is assumed that the caller has checked the tuple with |
7038 | | * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD |
7039 | | * (else we should be removing the tuple, not freezing it). |
7040 | | * |
7041 | | * NB: This function has side effects: it might allocate a new MultiXactId. |
7042 | | * It will be set as tuple's new xmax when our *frz output is processed within |
7043 | | * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer |
7044 | | * then caller had better have an exclusive lock on it already. |
7045 | | */ |
7046 | | bool |
7047 | | heap_prepare_freeze_tuple(HeapTupleHeader tuple, |
7048 | | const struct VacuumCutoffs *cutoffs, |
7049 | | HeapPageFreeze *pagefrz, |
7050 | | HeapTupleFreeze *frz, bool *totally_frozen) |
7051 | 0 | { |
7052 | 0 | bool xmin_already_frozen = false, |
7053 | 0 | xmax_already_frozen = false; |
7054 | 0 | bool freeze_xmin = false, |
7055 | 0 | replace_xvac = false, |
7056 | 0 | replace_xmax = false, |
7057 | 0 | freeze_xmax = false; |
7058 | 0 | TransactionId xid; |
7059 | |
7060 | 0 | frz->xmax = HeapTupleHeaderGetRawXmax(tuple); |
7061 | 0 | frz->t_infomask2 = tuple->t_infomask2; |
7062 | 0 | frz->t_infomask = tuple->t_infomask; |
7063 | 0 | frz->frzflags = 0; |
7064 | 0 | frz->checkflags = 0; |
7065 | | |
7066 | | /* |
7067 | | * Process xmin, while keeping track of whether it's already frozen, or |
7068 | | * will become frozen iff our freeze plan is executed by caller (could be |
7069 | | * neither). |
7070 | | */ |
7071 | 0 | xid = HeapTupleHeaderGetXmin(tuple); |
7072 | 0 | if (!TransactionIdIsNormal(xid)) |
7073 | 0 | xmin_already_frozen = true; |
7074 | 0 | else |
7075 | 0 | { |
7076 | 0 | if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) |
7077 | 0 | ereport(ERROR, |
7078 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
7079 | 0 | errmsg_internal("found xmin %u from before relfrozenxid %u", |
7080 | 0 | xid, cutoffs->relfrozenxid))); |
7081 | | |
7082 | | /* Will set freeze_xmin flags in freeze plan below */ |
7083 | 0 | freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin); |
7084 | | |
7085 | | /* Verify that xmin committed if and when freeze plan is executed */ |
7086 | 0 | if (freeze_xmin) |
7087 | 0 | frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED; |
7088 | 0 | } |
7089 | | |
7090 | | /* |
7091 | | * Old-style VACUUM FULL is gone, but we have to process xvac for as long |
7092 | | * as we support having MOVED_OFF/MOVED_IN tuples in the database |
7093 | | */ |
7094 | 0 | xid = HeapTupleHeaderGetXvac(tuple); |
7095 | 0 | if (TransactionIdIsNormal(xid)) |
7096 | 0 | { |
7097 | 0 | Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); |
7098 | 0 | Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin)); |
7099 | | |
7100 | | /* |
7101 | | * For Xvac, we always freeze proactively. This allows totally_frozen |
7102 | | * tracking to ignore xvac. |
7103 | | */ |
7104 | 0 | replace_xvac = pagefrz->freeze_required = true; |
7105 | | |
7106 | | /* Will set replace_xvac flags in freeze plan below */ |
7107 | 0 | } |
7108 | | |
7109 | | /* Now process xmax */ |
7110 | 0 | xid = frz->xmax; |
7111 | 0 | if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) |
7112 | 0 | { |
7113 | | /* Raw xmax is a MultiXactId */ |
7114 | 0 | TransactionId newxmax; |
7115 | 0 | uint16 flags; |
7116 | | |
7117 | | /* |
7118 | | * We will either remove xmax completely (in the "freeze_xmax" path), |
7119 | | * process xmax by replacing it (in the "replace_xmax" path), or |
7120 | | * perform no-op xmax processing. The only constraint is that the |
7121 | | * FreezeLimit/MultiXactCutoff postcondition must never be violated. |
7122 | | */ |
7123 | 0 | newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs, |
7124 | 0 | &flags, pagefrz); |
7125 | |
7126 | 0 | if (flags & FRM_NOOP) |
7127 | 0 | { |
7128 | | /* |
7129 | | * xmax is a MultiXactId, and nothing about it changes for now. |
7130 | | * This is the only case where 'freeze_required' won't have been |
7131 | | * set for us by FreezeMultiXactId, as well as the only case where |
7132 | | * neither freeze_xmax nor replace_xmax are set (given a multi). |
7133 | | * |
7134 | | * This is a no-op, but the call to FreezeMultiXactId might have |
7135 | | * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers |
7136 | | * for us (the "freeze page" variants, specifically). That'll |
7137 | | * make it safe for our caller to freeze the page later on, while |
7138 | | * leaving this particular xmax undisturbed. |
7139 | | * |
7140 | | * FreezeMultiXactId is _not_ responsible for the "no freeze" |
7141 | | * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our |
7142 | | * job. A call to heap_tuple_should_freeze for this same tuple |
7143 | | * will take place below if 'freeze_required' isn't set already. |
7144 | | * (This repeats work from FreezeMultiXactId, but allows "no |
7145 | | * freeze" tracker maintenance to happen in only one place.) |
7146 | | */ |
7147 | 0 | Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff)); |
7148 | 0 | Assert(MultiXactIdIsValid(newxmax) && xid == newxmax); |
7149 | 0 | } |
7150 | 0 | else if (flags & FRM_RETURN_IS_XID) |
7151 | 0 | { |
7152 | | /* |
7153 | | * xmax will become an updater Xid (original MultiXact's updater |
7154 | | * member Xid will be carried forward as a simple Xid in Xmax). |
7155 | | */ |
7156 | 0 | Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin)); |
7157 | | |
7158 | | /* |
7159 | | * NB -- some of these transformations are only valid because we |
7160 | | * know the return Xid is a tuple updater (i.e. not merely a |
7161 | | * locker.) Also note that the only reason we don't explicitly |
7162 | | * worry about HEAP_KEYS_UPDATED is because it lives in |
7163 | | * t_infomask2 rather than t_infomask. |
7164 | | */ |
7165 | 0 | frz->t_infomask &= ~HEAP_XMAX_BITS; |
7166 | 0 | frz->xmax = newxmax; |
7167 | 0 | if (flags & FRM_MARK_COMMITTED) |
7168 | 0 | frz->t_infomask |= HEAP_XMAX_COMMITTED; |
7169 | 0 | replace_xmax = true; |
7170 | 0 | } |
7171 | 0 | else if (flags & FRM_RETURN_IS_MULTI) |
7172 | 0 | { |
7173 | 0 | uint16 newbits; |
7174 | 0 | uint16 newbits2; |
7175 | | |
7176 | | /* |
7177 | | * xmax is an old MultiXactId that we have to replace with a new |
7178 | | * MultiXactId, to carry forward two or more original member XIDs. |
7179 | | */ |
7180 | 0 | Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact)); |
7181 | | |
7182 | | /* |
7183 | | * We can't use GetMultiXactIdHintBits directly on the new multi |
7184 | | * here; that routine initializes the masks to all zeroes, which |
7185 | | * would lose other bits we need. Doing it this way ensures all |
7186 | | * unrelated bits remain untouched. |
7187 | | */ |
7188 | 0 | frz->t_infomask &= ~HEAP_XMAX_BITS; |
7189 | 0 | frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
7190 | 0 | GetMultiXactIdHintBits(newxmax, &newbits, &newbits2); |
7191 | 0 | frz->t_infomask |= newbits; |
7192 | 0 | frz->t_infomask2 |= newbits2; |
7193 | 0 | frz->xmax = newxmax; |
7194 | 0 | replace_xmax = true; |
7195 | 0 | } |
7196 | 0 | else |
7197 | 0 | { |
7198 | | /* |
7199 | | * Freeze plan for tuple "freezes xmax" in the strictest sense: |
7200 | | * it'll leave nothing in xmax (neither an Xid nor a MultiXactId). |
7201 | | */ |
7202 | 0 | Assert(flags & FRM_INVALIDATE_XMAX); |
7203 | 0 | Assert(!TransactionIdIsValid(newxmax)); |
7204 | | |
7205 | | /* Will set freeze_xmax flags in freeze plan below */ |
7206 | 0 | freeze_xmax = true; |
7207 | 0 | } |
7208 | | |
7209 | | /* MultiXactId processing forces freezing (barring FRM_NOOP case) */ |
7210 | 0 | Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax)); |
7211 | 0 | } |
7212 | 0 | else if (TransactionIdIsNormal(xid)) |
7213 | 0 | { |
7214 | | /* Raw xmax is normal XID */ |
7215 | 0 | if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid)) |
7216 | 0 | ereport(ERROR, |
7217 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
7218 | 0 | errmsg_internal("found xmax %u from before relfrozenxid %u", |
7219 | 0 | xid, cutoffs->relfrozenxid))); |
7220 | | |
7221 | | /* Will set freeze_xmax flags in freeze plan below */ |
7222 | 0 | freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin); |
7223 | | |
7224 | | /* |
7225 | | * Verify that xmax aborted if and when freeze plan is executed, |
7226 | | * provided it's from an update. (A lock-only xmax can be removed |
7227 | | * independent of this, since the lock is released at xact end.) |
7228 | | */ |
7229 | 0 | if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) |
7230 | 0 | frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED; |
7231 | 0 | } |
7232 | 0 | else if (!TransactionIdIsValid(xid)) |
7233 | 0 | { |
7234 | | /* Raw xmax is InvalidTransactionId XID */ |
7235 | 0 | Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0); |
7236 | 0 | xmax_already_frozen = true; |
7237 | 0 | } |
7238 | 0 | else |
7239 | 0 | ereport(ERROR, |
7240 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
7241 | 0 | errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi", |
7242 | 0 | xid, tuple->t_infomask))); |
7243 | | |
7244 | 0 | if (freeze_xmin) |
7245 | 0 | { |
7246 | 0 | Assert(!xmin_already_frozen); |
7247 | |
7248 | 0 | frz->t_infomask |= HEAP_XMIN_FROZEN; |
7249 | 0 | } |
7250 | 0 | if (replace_xvac) |
7251 | 0 | { |
7252 | | /* |
7253 | | * If a MOVED_OFF tuple is not dead, the xvac transaction must have |
7254 | | * failed; whereas a non-dead MOVED_IN tuple must mean the xvac |
7255 | | * transaction succeeded. |
7256 | | */ |
7257 | 0 | Assert(pagefrz->freeze_required); |
7258 | 0 | if (tuple->t_infomask & HEAP_MOVED_OFF) |
7259 | 0 | frz->frzflags |= XLH_INVALID_XVAC; |
7260 | 0 | else |
7261 | 0 | frz->frzflags |= XLH_FREEZE_XVAC; |
7262 | 0 | } |
7263 | 0 | if (replace_xmax) |
7264 | 0 | { |
7265 | 0 | Assert(!xmax_already_frozen && !freeze_xmax); |
7266 | 0 | Assert(pagefrz->freeze_required); |
7267 | | |
7268 | | /* Already set replace_xmax flags in freeze plan earlier */ |
7269 | 0 | } |
7270 | 0 | if (freeze_xmax) |
7271 | 0 | { |
7272 | 0 | Assert(!xmax_already_frozen && !replace_xmax); |
7273 | |
7274 | 0 | frz->xmax = InvalidTransactionId; |
7275 | | |
7276 | | /* |
7277 | | * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED + |
7278 | | * LOCKED. Normalize to INVALID just to be sure no one gets confused. |
7279 | | * Also get rid of the HEAP_KEYS_UPDATED bit. |
7280 | | */ |
7281 | 0 | frz->t_infomask &= ~HEAP_XMAX_BITS; |
7282 | 0 | frz->t_infomask |= HEAP_XMAX_INVALID; |
7283 | 0 | frz->t_infomask2 &= ~HEAP_HOT_UPDATED; |
7284 | 0 | frz->t_infomask2 &= ~HEAP_KEYS_UPDATED; |
7285 | 0 | } |
7286 | | |
7287 | | /* |
7288 | | * Determine if this tuple is already totally frozen, or will become |
7289 | | * totally frozen (provided caller executes freeze plans for the page) |
7290 | | */ |
7291 | 0 | *totally_frozen = ((freeze_xmin || xmin_already_frozen) && |
7292 | 0 | (freeze_xmax || xmax_already_frozen)); |
7293 | |
7294 | 0 | if (!pagefrz->freeze_required && !(xmin_already_frozen && |
7295 | 0 | xmax_already_frozen)) |
7296 | 0 | { |
7297 | | /* |
7298 | | * So far no previous tuple from the page made freezing mandatory. |
7299 | | * Does this tuple force caller to freeze the entire page? |
7300 | | */ |
7301 | 0 | pagefrz->freeze_required = |
7302 | 0 | heap_tuple_should_freeze(tuple, cutoffs, |
7303 | 0 | &pagefrz->NoFreezePageRelfrozenXid, |
7304 | 0 | &pagefrz->NoFreezePageRelminMxid); |
7305 | 0 | } |
7306 | | |
7307 | | /* Tell caller if this tuple has a usable freeze plan set in *frz */ |
7308 | 0 | return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax; |
7309 | 0 | } |
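To make the prepare-then-execute protocol described in the header comment concrete, here is a minimal caller-side sketch of the preparation pass, loosely modeled on what a VACUUM-style caller does per heap page. It assumes the caller already holds a pin and exclusive lock on buf, has a VacuumCutoffs pointer in cutoffs, and has initialized pagefrz for the page; the array and counter names are illustrative rather than taken from this file.

    Page        page = BufferGetPage(buf);
    OffsetNumber offnum,
                maxoff = PageGetMaxOffsetNumber(page);
    HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
    int         nfrozen = 0;

    for (offnum = FirstOffsetNumber; offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        HeapTupleHeader htup;
        bool        totally_frozen;

        /* Only normal line pointers carry tuple headers worth examining */
        if (!ItemIdIsNormal(itemid))
            continue;

        htup = (HeapTupleHeader) PageGetItem(page, itemid);
        if (heap_prepare_freeze_tuple(htup, cutoffs, &pagefrz,
                                      &frozen[nfrozen], &totally_frozen))
        {
            /* Caller must fill in the offset before the execute step */
            frozen[nfrozen].offset = offnum;
            nfrozen++;
        }
    }

Whether the accumulated plans are actually executed remains the caller's choice, except when pagefrz.freeze_required ends up set.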
7310 | | |
7311 | | /* |
7312 | | * Perform xmin/xmax XID status sanity checks before actually executing freeze |
7313 | | * plans. |
7314 | | * |
7315 | | * heap_prepare_freeze_tuple doesn't perform these checks directly because |
7316 | | * pg_xact lookups are relatively expensive. They shouldn't be repeated by |
7317 | | * successive VACUUMs that each decide against freezing the same page. |
7318 | | */ |
7319 | | void |
7320 | | heap_pre_freeze_checks(Buffer buffer, |
7321 | | HeapTupleFreeze *tuples, int ntuples) |
7322 | 0 | { |
7323 | 0 | Page page = BufferGetPage(buffer); |
7324 | |
7325 | 0 | for (int i = 0; i < ntuples; i++) |
7326 | 0 | { |
7327 | 0 | HeapTupleFreeze *frz = tuples + i; |
7328 | 0 | ItemId itemid = PageGetItemId(page, frz->offset); |
7329 | 0 | HeapTupleHeader htup; |
7330 | |
7331 | 0 | htup = (HeapTupleHeader) PageGetItem(page, itemid); |
7332 | | |
7333 | | /* Deliberately avoid relying on tuple hint bits here */ |
7334 | 0 | if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED) |
7335 | 0 | { |
7336 | 0 | TransactionId xmin = HeapTupleHeaderGetRawXmin(htup); |
7337 | |
7338 | 0 | Assert(!HeapTupleHeaderXminFrozen(htup)); |
7339 | 0 | if (unlikely(!TransactionIdDidCommit(xmin))) |
7340 | 0 | ereport(ERROR, |
7341 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
7342 | 0 | errmsg_internal("uncommitted xmin %u needs to be frozen", |
7343 | 0 | xmin))); |
7344 | 0 | } |
7345 | | |
7346 | | /* |
7347 | | * TransactionIdDidAbort won't work reliably in the presence of XIDs |
7348 | | * left behind by transactions that were in progress during a crash, |
7349 | | * so we can only check that xmax didn't commit |
7350 | | */ |
7351 | 0 | if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED) |
7352 | 0 | { |
7353 | 0 | TransactionId xmax = HeapTupleHeaderGetRawXmax(htup); |
7354 | |
7355 | 0 | Assert(TransactionIdIsNormal(xmax)); |
7356 | 0 | if (unlikely(TransactionIdDidCommit(xmax))) |
7357 | 0 | ereport(ERROR, |
7358 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
7359 | 0 | errmsg_internal("cannot freeze committed xmax %u", |
7360 | 0 | xmax))); |
7361 | 0 | } |
7362 | 0 | } |
7363 | 0 | } |
7364 | | |
7365 | | /* |
7366 | | * Helper which executes freezing of one or more heap tuples on a page on |
7367 | | * behalf of caller. Caller passes an array of tuple plans from |
7368 | | * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us. |
7369 | | * Must be called in a critical section that also marks the buffer dirty and, |
7370 | | * if needed, emits WAL. |
7371 | | */ |
7372 | | void |
7373 | | heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples) |
7374 | 0 | { |
7375 | 0 | Page page = BufferGetPage(buffer); |
7376 | |
7377 | 0 | for (int i = 0; i < ntuples; i++) |
7378 | 0 | { |
7379 | 0 | HeapTupleFreeze *frz = tuples + i; |
7380 | 0 | ItemId itemid = PageGetItemId(page, frz->offset); |
7381 | 0 | HeapTupleHeader htup; |
7382 | |
7383 | 0 | htup = (HeapTupleHeader) PageGetItem(page, itemid); |
7384 | 0 | heap_execute_freeze_tuple(htup, frz); |
7385 | 0 | } |
7386 | 0 | } |
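Continuing the preparation sketch shown after heap_prepare_freeze_tuple: the execute side runs the pg_xact sanity checks first, then applies the plans inside a critical section. The WAL step is reduced to a comment because the record format belongs to the pruning/freezing caller rather than to this helper; buf, frozen, and nfrozen carry over from that earlier sketch.

    if (nfrozen > 0)
    {
        /* Relatively expensive pg_xact lookups, before the critical section */
        heap_pre_freeze_checks(buf, frozen, nfrozen);

        START_CRIT_SECTION();

        heap_freeze_prepared_tuples(buf, frozen, nfrozen);
        MarkBufferDirty(buf);

        /*
         * A WAL-logged caller would emit its freeze (or prune-and-freeze)
         * record here, before leaving the critical section.
         */

        END_CRIT_SECTION();
    }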
7387 | | |
7388 | | /* |
7389 | | * heap_freeze_tuple |
7390 | | * Freeze tuple in place, without WAL logging. |
7391 | | * |
7392 | | * Useful for callers like CLUSTER that perform their own WAL logging. |
7393 | | */ |
7394 | | bool |
7395 | | heap_freeze_tuple(HeapTupleHeader tuple, |
7396 | | TransactionId relfrozenxid, TransactionId relminmxid, |
7397 | | TransactionId FreezeLimit, TransactionId MultiXactCutoff) |
7398 | 0 | { |
7399 | 0 | HeapTupleFreeze frz; |
7400 | 0 | bool do_freeze; |
7401 | 0 | bool totally_frozen; |
7402 | 0 | struct VacuumCutoffs cutoffs; |
7403 | 0 | HeapPageFreeze pagefrz; |
7404 | |
7405 | 0 | cutoffs.relfrozenxid = relfrozenxid; |
7406 | 0 | cutoffs.relminmxid = relminmxid; |
7407 | 0 | cutoffs.OldestXmin = FreezeLimit; |
7408 | 0 | cutoffs.OldestMxact = MultiXactCutoff; |
7409 | 0 | cutoffs.FreezeLimit = FreezeLimit; |
7410 | 0 | cutoffs.MultiXactCutoff = MultiXactCutoff; |
7411 | |
7412 | 0 | pagefrz.freeze_required = true; |
7413 | 0 | pagefrz.FreezePageRelfrozenXid = FreezeLimit; |
7414 | 0 | pagefrz.FreezePageRelminMxid = MultiXactCutoff; |
7415 | 0 | pagefrz.NoFreezePageRelfrozenXid = FreezeLimit; |
7416 | 0 | pagefrz.NoFreezePageRelminMxid = MultiXactCutoff; |
7417 | |
7418 | 0 | do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs, |
7419 | 0 | &pagefrz, &frz, &totally_frozen); |
7420 | | |
7421 | | /* |
7422 | | * Note that because this is not a WAL-logged operation, we don't need to |
7423 | | * fill in the offset in the freeze record. |
7424 | | */ |
7425 | |
7426 | 0 | if (do_freeze) |
7427 | 0 | heap_execute_freeze_tuple(tuple, &frz); |
7428 | 0 | return do_freeze; |
7429 | 0 | } |
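A hedged sketch of how a rewrite-style caller might wrap this routine; CLUSTER and VACUUM FULL operate on private tuple copies and WAL-log pages themselves (via rewriteheap.c in recent releases). The wrapper below is illustrative only and not code from the tree.

    /* Illustrative wrapper: freeze one private tuple copy during a rewrite. */
    static void
    rewrite_freeze_tuple_copy(HeapTupleHeader tup,
                              TransactionId relfrozenxid,
                              MultiXactId relminmxid,
                              TransactionId freeze_xid,
                              MultiXactId cutoff_multi)
    {
        /* Modifies the header in place; the caller logs the new page itself */
        (void) heap_freeze_tuple(tup, relfrozenxid, relminmxid,
                                 freeze_xid, cutoff_multi);
    }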
7430 | | |
7431 | | /* |
7432 | | * For a given MultiXactId, return the hint bits that should be set in the |
7433 | | * tuple's infomask. |
7434 | | * |
7435 | | * Normally this should be called for a multixact that was just created, and |
7436 | | * so is on our local cache, so the GetMembers call is fast. |
7437 | | */ |
7438 | | static void |
7439 | | GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, |
7440 | | uint16 *new_infomask2) |
7441 | 0 | { |
7442 | 0 | int nmembers; |
7443 | 0 | MultiXactMember *members; |
7444 | 0 | int i; |
7445 | 0 | uint16 bits = HEAP_XMAX_IS_MULTI; |
7446 | 0 | uint16 bits2 = 0; |
7447 | 0 | bool has_update = false; |
7448 | 0 | LockTupleMode strongest = LockTupleKeyShare; |
7449 | | |
7450 | | /* |
7451 | | * We only use this in multis we just created, so they cannot be values |
7452 | | * pre-pg_upgrade. |
7453 | | */ |
7454 | 0 | nmembers = GetMultiXactIdMembers(multi, &members, false, false); |
7455 | |
7456 | 0 | for (i = 0; i < nmembers; i++) |
7457 | 0 | { |
7458 | 0 | LockTupleMode mode; |
7459 | | |
7460 | | /* |
7461 | | * Remember the strongest lock mode held by any member of the |
7462 | | * multixact. |
7463 | | */ |
7464 | 0 | mode = TUPLOCK_from_mxstatus(members[i].status); |
7465 | 0 | if (mode > strongest) |
7466 | 0 | strongest = mode; |
7467 | | |
7468 | | /* See what other bits we need */ |
7469 | 0 | switch (members[i].status) |
7470 | 0 | { |
7471 | 0 | case MultiXactStatusForKeyShare: |
7472 | 0 | case MultiXactStatusForShare: |
7473 | 0 | case MultiXactStatusForNoKeyUpdate: |
7474 | 0 | break; |
7475 | | |
7476 | 0 | case MultiXactStatusForUpdate: |
7477 | 0 | bits2 |= HEAP_KEYS_UPDATED; |
7478 | 0 | break; |
7479 | | |
7480 | 0 | case MultiXactStatusNoKeyUpdate: |
7481 | 0 | has_update = true; |
7482 | 0 | break; |
7483 | | |
7484 | 0 | case MultiXactStatusUpdate: |
7485 | 0 | bits2 |= HEAP_KEYS_UPDATED; |
7486 | 0 | has_update = true; |
7487 | 0 | break; |
7488 | 0 | } |
7489 | 0 | } |
7490 | | |
7491 | 0 | if (strongest == LockTupleExclusive || |
7492 | 0 | strongest == LockTupleNoKeyExclusive) |
7493 | 0 | bits |= HEAP_XMAX_EXCL_LOCK; |
7494 | 0 | else if (strongest == LockTupleShare) |
7495 | 0 | bits |= HEAP_XMAX_SHR_LOCK; |
7496 | 0 | else if (strongest == LockTupleKeyShare) |
7497 | 0 | bits |= HEAP_XMAX_KEYSHR_LOCK; |
7498 | |
7499 | 0 | if (!has_update) |
7500 | 0 | bits |= HEAP_XMAX_LOCK_ONLY; |
7501 | |
7502 | 0 | if (nmembers > 0) |
7503 | 0 | pfree(members); |
7504 | |
7505 | 0 | *new_infomask = bits; |
7506 | 0 | *new_infomask2 = bits2; |
7507 | 0 | } |
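A worked example, traced from the switch statement above rather than from any external documentation: suppose a freshly created multi has two members, a ForKeyShare locker and a NoKeyUpdate updater.

    uint16      new_infomask,
                new_infomask2;

    GetMultiXactIdHintBits(multi, &new_infomask, &new_infomask2);

    /*
     * With members {ForKeyShare, NoKeyUpdate} the strongest lock mode is
     * LockTupleNoKeyExclusive and one member is an updater, so:
     *   new_infomask  == HEAP_XMAX_IS_MULTI | HEAP_XMAX_EXCL_LOCK
     *                    (no HEAP_XMAX_LOCK_ONLY, because of the updater)
     *   new_infomask2 == 0   (no HEAP_KEYS_UPDATED: it was a no-key update)
     */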
7508 | | |
7509 | | /* |
7510 | | * MultiXactIdGetUpdateXid |
7511 | | * |
7512 | | * Given a multixact Xmax and corresponding infomask, which does not have the |
7513 | | * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating |
7514 | | * transaction. |
7515 | | * |
7516 | | * Caller is expected to check the status of the updating transaction, if |
7517 | | * necessary. |
7518 | | */ |
7519 | | static TransactionId |
7520 | | MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask) |
7521 | 0 | { |
7522 | 0 | TransactionId update_xact = InvalidTransactionId; |
7523 | 0 | MultiXactMember *members; |
7524 | 0 | int nmembers; |
7525 | |
7526 | 0 | Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY)); |
7527 | 0 | Assert(t_infomask & HEAP_XMAX_IS_MULTI); |
7528 | | |
7529 | | /* |
7530 | | * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from |
7531 | | * pre-pg_upgrade. |
7532 | | */ |
7533 | 0 | nmembers = GetMultiXactIdMembers(xmax, &members, false, false); |
7534 | |
7535 | 0 | if (nmembers > 0) |
7536 | 0 | { |
7537 | 0 | int i; |
7538 | |
7539 | 0 | for (i = 0; i < nmembers; i++) |
7540 | 0 | { |
7541 | | /* Ignore lockers */ |
7542 | 0 | if (!ISUPDATE_from_mxstatus(members[i].status)) |
7543 | 0 | continue; |
7544 | | |
7545 | | /* there can be at most one updater */ |
7546 | 0 | Assert(update_xact == InvalidTransactionId); |
7547 | 0 | update_xact = members[i].xid; |
7548 | 0 | #ifndef USE_ASSERT_CHECKING |
7549 | | |
7550 | | /* |
7551 | | * in an assert-enabled build, walk the whole array to ensure |
7552 | | * there's no other updater. |
7553 | | */ |
7554 | 0 | break; |
7555 | 0 | #endif |
7556 | 0 | } |
7557 | |
7558 | 0 | pfree(members); |
7559 | 0 | } |
7560 | |
7561 | 0 | return update_xact; |
7562 | 0 | } |
7563 | | |
7564 | | /* |
7565 | | * HeapTupleGetUpdateXid |
7566 | | * As above, but use a HeapTupleHeader |
7567 | | * |
7568 | | * See also HeapTupleHeaderGetUpdateXid, which can be used without previously |
7569 | | * checking the hint bits. |
7570 | | */ |
7571 | | TransactionId |
7572 | | HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup) |
7573 | 0 | { |
7574 | 0 | return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tup), |
7575 | 0 | tup->t_infomask); |
7576 | 0 | } |
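A small illustrative helper (not present in the tree) showing the guard that the comments above insist on: the infomask must already say this is a multixact xmax that is not lock-only before asking for its updater.

    static TransactionId
    sketch_get_updater(HeapTupleHeader tup)
    {
        /* Lock-only multis and plain-XID xmaxes have no updater to return */
        if (!(tup->t_infomask & HEAP_XMAX_IS_MULTI) ||
            HEAP_XMAX_IS_LOCKED_ONLY(tup->t_infomask))
            return InvalidTransactionId;

        return HeapTupleGetUpdateXid(tup);
    }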
7577 | | |
7578 | | /* |
7579 | | * Does the given multixact conflict with the current transaction grabbing a |
7580 | | * tuple lock of the given strength? |
7581 | | * |
7582 | | * The passed infomask pairs up with the given multixact in the tuple header. |
7583 | | * |
7584 | | * If current_is_member is not NULL, it is set to 'true' if the current |
7585 | | * transaction is a member of the given multixact. |
7586 | | */ |
7587 | | static bool |
7588 | | DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, |
7589 | | LockTupleMode lockmode, bool *current_is_member) |
7590 | 0 | { |
7591 | 0 | int nmembers; |
7592 | 0 | MultiXactMember *members; |
7593 | 0 | bool result = false; |
7594 | 0 | LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock; |
7595 | |
7596 | 0 | if (HEAP_LOCKED_UPGRADED(infomask)) |
7597 | 0 | return false; |
7598 | | |
7599 | 0 | nmembers = GetMultiXactIdMembers(multi, &members, false, |
7600 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(infomask)); |
7601 | 0 | if (nmembers >= 0) |
7602 | 0 | { |
7603 | 0 | int i; |
7604 | |
7605 | 0 | for (i = 0; i < nmembers; i++) |
7606 | 0 | { |
7607 | 0 | TransactionId memxid; |
7608 | 0 | LOCKMODE memlockmode; |
7609 | |
7610 | 0 | if (result && (current_is_member == NULL || *current_is_member)) |
7611 | 0 | break; |
7612 | | |
7613 | 0 | memlockmode = LOCKMODE_from_mxstatus(members[i].status); |
7614 | | |
7615 | | /* ignore members from current xact (but track their presence) */ |
7616 | 0 | memxid = members[i].xid; |
7617 | 0 | if (TransactionIdIsCurrentTransactionId(memxid)) |
7618 | 0 | { |
7619 | 0 | if (current_is_member != NULL) |
7620 | 0 | *current_is_member = true; |
7621 | 0 | continue; |
7622 | 0 | } |
7623 | 0 | else if (result) |
7624 | 0 | continue; |
7625 | | |
7626 | | /* ignore members that don't conflict with the lock we want */ |
7627 | 0 | if (!DoLockModesConflict(memlockmode, wanted)) |
7628 | 0 | continue; |
7629 | | |
7630 | 0 | if (ISUPDATE_from_mxstatus(members[i].status)) |
7631 | 0 | { |
7632 | | /* ignore aborted updaters */ |
7633 | 0 | if (TransactionIdDidAbort(memxid)) |
7634 | 0 | continue; |
7635 | 0 | } |
7636 | 0 | else |
7637 | 0 | { |
7638 | | /* ignore lockers-only that are no longer in progress */ |
7639 | 0 | if (!TransactionIdIsInProgress(memxid)) |
7640 | 0 | continue; |
7641 | 0 | } |
7642 | | |
7643 | | /* |
7644 | | * Whatever remains are either live lockers that conflict with our |
7645 | | * wanted lock, or updaters that are not aborted. Those conflict |
7646 | | * with what we want. Set up to return true, but keep going to |
7647 | | * look for the current transaction among the multixact members, |
7648 | | * if needed. |
7649 | | */ |
7650 | 0 | result = true; |
7651 | 0 | } |
7652 | 0 | pfree(members); |
7653 | 0 | } |
7654 | |
7655 | 0 | return result; |
7656 | 0 | } |
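A hedged sketch of the typical call site, in the style of this file's lock/update paths; xwait, infomask, and mode are assumed to come from the surrounding code.

    bool        current_is_member = false;
    bool        skip_wait;

    skip_wait = !DoesMultiXactIdConflict((MultiXactId) xwait, infomask, mode,
                                         &current_is_member);

    /*
     * skip_wait means no live locker or non-aborted updater in the multi
     * conflicts with "mode", so the caller need not sleep on it at all;
     * current_is_member additionally tells the caller whether it is itself
     * among the members.
     */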
7657 | | |
7658 | | /* |
7659 | | * Do_MultiXactIdWait |
7660 | | * Actual implementation for the two functions below. |
7661 | | * |
7662 | | * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is |
7663 | | * needed to ensure we only sleep on conflicting members, and the infomask is |
7664 | | * used to optimize multixact access in case it's a lock-only multi); 'nowait' |
7665 | | * indicates whether to use conditional lock acquisition, to allow callers to |
7666 | | * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up |
7667 | | * context information for error messages. 'remaining', if not NULL, receives |
7668 | | * the number of members that are still running, including any (non-aborted) |
7669 | | * subtransactions of our own transaction. 'logLockFailure' indicates whether |
7670 | | * to log details when a lock acquisition fails with 'nowait' enabled. |
7671 | | * |
7672 | | * We do this by sleeping on each member using XactLockTableWait. Any |
7673 | | * members that belong to the current backend are *not* waited for, however; |
7674 | | * this would not merely be useless but would lead to Assert failure inside |
7675 | | * XactLockTableWait. By the time this returns, it is certain that all |
7676 | | * transactions *of other backends* that were members of the MultiXactId |
7677 | | * that conflict with the requested status are dead (and no new ones can have |
7678 | | * been added, since it is not legal to add members to an existing |
7679 | | * MultiXactId). |
7680 | | * |
7681 | | * But by the time we finish sleeping, someone else may have changed the Xmax |
7682 | | * of the containing tuple, so the caller needs to iterate on us somehow. |
7683 | | * |
7684 | | * Note that in case we return false, the number of remaining members is |
7685 | | * not to be trusted. |
7686 | | */ |
7687 | | static bool |
7688 | | Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, |
7689 | | uint16 infomask, bool nowait, |
7690 | | Relation rel, ItemPointer ctid, XLTW_Oper oper, |
7691 | | int *remaining, bool logLockFailure) |
7692 | 0 | { |
7693 | 0 | bool result = true; |
7694 | 0 | MultiXactMember *members; |
7695 | 0 | int nmembers; |
7696 | 0 | int remain = 0; |
7697 | | |
7698 | | /* for pre-pg_upgrade tuples, no need to sleep at all */ |
7699 | 0 | nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 : |
7700 | 0 | GetMultiXactIdMembers(multi, &members, false, |
7701 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(infomask)); |
7702 | |
7703 | 0 | if (nmembers >= 0) |
7704 | 0 | { |
7705 | 0 | int i; |
7706 | |
7707 | 0 | for (i = 0; i < nmembers; i++) |
7708 | 0 | { |
7709 | 0 | TransactionId memxid = members[i].xid; |
7710 | 0 | MultiXactStatus memstatus = members[i].status; |
7711 | |
7712 | 0 | if (TransactionIdIsCurrentTransactionId(memxid)) |
7713 | 0 | { |
7714 | 0 | remain++; |
7715 | 0 | continue; |
7716 | 0 | } |
7717 | | |
7718 | 0 | if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus), |
7719 | 0 | LOCKMODE_from_mxstatus(status))) |
7720 | 0 | { |
7721 | 0 | if (remaining && TransactionIdIsInProgress(memxid)) |
7722 | 0 | remain++; |
7723 | 0 | continue; |
7724 | 0 | } |
7725 | | |
7726 | | /* |
7727 | | * This member conflicts with our multi, so we have to sleep (or |
7728 | | * return failure, if asked to avoid waiting.) |
7729 | | * |
7730 | | * Note that we don't set up an error context callback ourselves, |
7731 | | * but instead we pass the info down to XactLockTableWait. This |
7732 | | * might seem a bit wasteful because the context is set up and |
7733 | | * tore down for each member of the multixact, but in reality it |
7734 | | * torn down for each member of the multixact, but in reality it |
7735 | | */ |
7736 | 0 | if (nowait) |
7737 | 0 | { |
7738 | 0 | result = ConditionalXactLockTableWait(memxid, logLockFailure); |
7739 | 0 | if (!result) |
7740 | 0 | break; |
7741 | 0 | } |
7742 | 0 | else |
7743 | 0 | XactLockTableWait(memxid, rel, ctid, oper); |
7744 | 0 | } |
7745 | |
7746 | 0 | pfree(members); |
7747 | 0 | } |
7748 | |
7749 | 0 | if (remaining) |
7750 | 0 | *remaining = remain; |
7751 | |
7752 | 0 | return result; |
7753 | 0 | } |
7754 | | |
7755 | | /* |
7756 | | * MultiXactIdWait |
7757 | | * Sleep on a MultiXactId. |
7758 | | * |
7759 | | * By the time we finish sleeping, someone else may have changed the Xmax |
7760 | | * of the containing tuple, so the caller needs to iterate on us somehow. |
7761 | | * |
7762 | | * We return (in *remaining, if not NULL) the number of members that are still |
7763 | | * running, including any (non-aborted) subtransactions of our own transaction. |
7764 | | */ |
7765 | | static void |
7766 | | MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, |
7767 | | Relation rel, ItemPointer ctid, XLTW_Oper oper, |
7768 | | int *remaining) |
7769 | 0 | { |
7770 | 0 | (void) Do_MultiXactIdWait(multi, status, infomask, false, |
7771 | 0 | rel, ctid, oper, remaining, false); |
7772 | 0 | } |
7773 | | |
7774 | | /* |
7775 | | * ConditionalMultiXactIdWait |
7776 | | * As above, but only lock if we can get the lock without blocking. |
7777 | | * |
7778 | | * By the time we finish sleeping, someone else may have changed the Xmax |
7779 | | * of the containing tuple, so the caller needs to iterate on us somehow. |
7780 | | * |
7781 | | * If the multixact is now all gone, return true. Returns false if some |
7782 | | * transactions might still be running. |
7783 | | * |
7784 | | * We return (in *remaining, if not NULL) the number of members that are still |
7785 | | * running, including any (non-aborted) subtransactions of our own transaction. |
7786 | | */ |
7787 | | static bool |
7788 | | ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, |
7789 | | uint16 infomask, Relation rel, int *remaining, |
7790 | | bool logLockFailure) |
7791 | 0 | { |
7792 | 0 | return Do_MultiXactIdWait(multi, status, infomask, true, |
7793 | 0 | rel, NULL, XLTW_None, remaining, logLockFailure); |
7794 | 0 | } |
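A hedged sketch of how a caller typically chooses between the two wrappers based on its wait policy, modeled loosely on this file's tuple-locking code; xwait, status, infomask, relation, tuple, and wait_policy are assumed to come from the caller.

    int         remain = 0;

    if (wait_policy == LockWaitBlock)
        MultiXactIdWait((MultiXactId) xwait, status, infomask,
                        relation, &tuple->t_self, XLTW_Lock, &remain);
    else if (!ConditionalMultiXactIdWait((MultiXactId) xwait, status, infomask,
                                         relation, &remain, false))
        ereport(ERROR,
                (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                 errmsg("could not obtain lock on row in relation \"%s\"",
                        RelationGetRelationName(relation))));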
7795 | | |
7796 | | /* |
7797 | | * heap_tuple_needs_eventual_freeze |
7798 | | * |
7799 | | * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac) |
7800 | | * will eventually require freezing (if tuple isn't removed by pruning first). |
7801 | | */ |
7802 | | bool |
7803 | | heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple) |
7804 | 0 | { |
7805 | 0 | TransactionId xid; |
7806 | | |
7807 | | /* |
7808 | | * If xmin is a normal transaction ID, this tuple is definitely not |
7809 | | * frozen. |
7810 | | */ |
7811 | 0 | xid = HeapTupleHeaderGetXmin(tuple); |
7812 | 0 | if (TransactionIdIsNormal(xid)) |
7813 | 0 | return true; |
7814 | | |
7815 | | /* |
7816 | | * If xmax is a valid xact or multixact, this tuple is also not frozen. |
7817 | | */ |
7818 | 0 | if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) |
7819 | 0 | { |
7820 | 0 | MultiXactId multi; |
7821 | |
7822 | 0 | multi = HeapTupleHeaderGetRawXmax(tuple); |
7823 | 0 | if (MultiXactIdIsValid(multi)) |
7824 | 0 | return true; |
7825 | 0 | } |
7826 | 0 | else |
7827 | 0 | { |
7828 | 0 | xid = HeapTupleHeaderGetRawXmax(tuple); |
7829 | 0 | if (TransactionIdIsNormal(xid)) |
7830 | 0 | return true; |
7831 | 0 | } |
7832 | | |
7833 | 0 | if (tuple->t_infomask & HEAP_MOVED) |
7834 | 0 | { |
7835 | 0 | xid = HeapTupleHeaderGetXvac(tuple); |
7836 | 0 | if (TransactionIdIsNormal(xid)) |
7837 | 0 | return true; |
7838 | 0 | } |
7839 | | |
7840 | 0 | return false; |
7841 | 0 | } |
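An illustrative page-level helper (not in the tree) built on this routine: it reports whether any normal tuple on a page still carries an XID or MultiXactId that some future VACUUM would have to freeze, assuming the caller holds at least a share lock on the buffer.

    static bool
    sketch_page_needs_eventual_freeze(Page page)
    {
        OffsetNumber offnum,
                    maxoff = PageGetMaxOffsetNumber(page);

        for (offnum = FirstOffsetNumber; offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsNormal(itemid))
                continue;

            if (heap_tuple_needs_eventual_freeze((HeapTupleHeader)
                                                 PageGetItem(page, itemid)))
                return true;
        }

        return false;
    }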
7842 | | |
7843 | | /* |
7844 | | * heap_tuple_should_freeze |
7845 | | * |
7846 | | * Return value indicates if heap_prepare_freeze_tuple sibling function would |
7847 | | * (or should) force freezing of the heap page that contains caller's tuple. |
7848 | | * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing. |
7849 | | * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs. |
7850 | | * |
7851 | | * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output |
7852 | | * arguments help VACUUM track the oldest extant XID/MXID remaining in rel. |
7853 | | * Our working assumption is that caller won't decide to freeze this tuple. |
7854 | | * It's up to caller to only ratchet back its own top-level trackers after the |
7855 | | * point that it fully commits to not freezing the tuple/page in question. |
7856 | | */ |
7857 | | bool |
7858 | | heap_tuple_should_freeze(HeapTupleHeader tuple, |
7859 | | const struct VacuumCutoffs *cutoffs, |
7860 | | TransactionId *NoFreezePageRelfrozenXid, |
7861 | | MultiXactId *NoFreezePageRelminMxid) |
7862 | 0 | { |
7863 | 0 | TransactionId xid; |
7864 | 0 | MultiXactId multi; |
7865 | 0 | bool freeze = false; |
7866 | | |
7867 | | /* First deal with xmin */ |
7868 | 0 | xid = HeapTupleHeaderGetXmin(tuple); |
7869 | 0 | if (TransactionIdIsNormal(xid)) |
7870 | 0 | { |
7871 | 0 | Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); |
7872 | 0 | if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) |
7873 | 0 | *NoFreezePageRelfrozenXid = xid; |
7874 | 0 | if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) |
7875 | 0 | freeze = true; |
7876 | 0 | } |
7877 | | |
7878 | | /* Now deal with xmax */ |
7879 | 0 | xid = InvalidTransactionId; |
7880 | 0 | multi = InvalidMultiXactId; |
7881 | 0 | if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) |
7882 | 0 | multi = HeapTupleHeaderGetRawXmax(tuple); |
7883 | 0 | else |
7884 | 0 | xid = HeapTupleHeaderGetRawXmax(tuple); |
7885 | |
7886 | 0 | if (TransactionIdIsNormal(xid)) |
7887 | 0 | { |
7888 | 0 | Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); |
7889 | | /* xmax is a non-permanent XID */ |
7890 | 0 | if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) |
7891 | 0 | *NoFreezePageRelfrozenXid = xid; |
7892 | 0 | if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) |
7893 | 0 | freeze = true; |
7894 | 0 | } |
7895 | 0 | else if (!MultiXactIdIsValid(multi)) |
7896 | 0 | { |
7897 | | /* xmax is a permanent XID or invalid MultiXactId/XID */ |
7898 | 0 | } |
7899 | 0 | else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask)) |
7900 | 0 | { |
7901 | | /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */ |
7902 | 0 | if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) |
7903 | 0 | *NoFreezePageRelminMxid = multi; |
7904 | | /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */ |
7905 | 0 | freeze = true; |
7906 | 0 | } |
7907 | 0 | else |
7908 | 0 | { |
7909 | | /* xmax is a MultiXactId that may have an updater XID */ |
7910 | 0 | MultiXactMember *members; |
7911 | 0 | int nmembers; |
7912 | |
7913 | 0 | Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi)); |
7914 | 0 | if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid)) |
7915 | 0 | *NoFreezePageRelminMxid = multi; |
7916 | 0 | if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff)) |
7917 | 0 | freeze = true; |
7918 | | |
7919 | | /* need to check whether any member of the mxact is old */ |
7920 | 0 | nmembers = GetMultiXactIdMembers(multi, &members, false, |
7921 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); |
7922 | |
7923 | 0 | for (int i = 0; i < nmembers; i++) |
7924 | 0 | { |
7925 | 0 | xid = members[i].xid; |
7926 | 0 | Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); |
7927 | 0 | if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) |
7928 | 0 | *NoFreezePageRelfrozenXid = xid; |
7929 | 0 | if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit)) |
7930 | 0 | freeze = true; |
7931 | 0 | } |
7932 | 0 | if (nmembers > 0) |
7933 | 0 | pfree(members); |
7934 | 0 | } |
7935 | |
7936 | 0 | if (tuple->t_infomask & HEAP_MOVED) |
7937 | 0 | { |
7938 | 0 | xid = HeapTupleHeaderGetXvac(tuple); |
7939 | 0 | if (TransactionIdIsNormal(xid)) |
7940 | 0 | { |
7941 | 0 | Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid)); |
7942 | 0 | if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid)) |
7943 | 0 | *NoFreezePageRelfrozenXid = xid; |
7944 | | /* heap_prepare_freeze_tuple forces xvac freezing */ |
7945 | 0 | freeze = true; |
7946 | 0 | } |
7947 | 0 | } |
7948 | |
7949 | 0 | return freeze; |
7950 | 0 | } |
7951 | | |
7952 | | /* |
7953 | | * Maintain snapshotConflictHorizon for caller by ratcheting forward its value |
7954 | | * using any committed XIDs contained in 'tuple', an obsolescent heap tuple |
7955 | | * that caller is in the process of physically removing, e.g. via HOT pruning |
7956 | | * or index deletion. |
7957 | | * |
7958 | | * Caller must initialize its value to InvalidTransactionId, which is |
7959 | | * generally interpreted as "definitely no need for a recovery conflict". |
7960 | | * Final value must reflect all heap tuples that caller will physically remove |
7961 | | * (or remove TID references to) via its ongoing pruning/deletion operation. |
7962 | | * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from |
7963 | | * caller's WAL record) by REDO routine when it replays caller's operation. |
7964 | | */ |
7965 | | void |
7966 | | HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, |
7967 | | TransactionId *snapshotConflictHorizon) |
7968 | 0 | { |
7969 | 0 | TransactionId xmin = HeapTupleHeaderGetXmin(tuple); |
7970 | 0 | TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple); |
7971 | 0 | TransactionId xvac = HeapTupleHeaderGetXvac(tuple); |
7972 | |
7973 | 0 | if (tuple->t_infomask & HEAP_MOVED) |
7974 | 0 | { |
7975 | 0 | if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac)) |
7976 | 0 | *snapshotConflictHorizon = xvac; |
7977 | 0 | } |
7978 | | |
7979 | | /* |
7980 | | * Ignore tuples inserted by an aborted transaction or if the tuple was |
7981 | | * updated/deleted by the inserting transaction. |
7982 | | * |
7983 | | * Look for a committed hint bit, or if no xmin bit is set, check clog. |
7984 | | */ |
7985 | 0 | if (HeapTupleHeaderXminCommitted(tuple) || |
7986 | 0 | (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) |
7987 | 0 | { |
7988 | 0 | if (xmax != xmin && |
7989 | 0 | TransactionIdFollows(xmax, *snapshotConflictHorizon)) |
7990 | 0 | *snapshotConflictHorizon = xmax; |
7991 | 0 | } |
7992 | 0 | } |
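A hedged sketch of the accumulate-then-log pattern this comment describes, as a pruning-style caller might use it; the WAL plumbing is reduced to comments and htup stands for each tuple being removed.

    TransactionId snapshotConflictHorizon = InvalidTransactionId;

    /* ... for each heap tuple about to be physically removed ... */
    HeapTupleHeaderAdvanceConflictHorizon(htup, &snapshotConflictHorizon);

    /*
     * The final value goes into the caller's WAL record; during recovery the
     * REDO routine hands it to ResolveRecoveryConflictWithSnapshot().
     */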
7993 | | |
7994 | | #ifdef USE_PREFETCH |
7995 | | /* |
7996 | | * Helper function for heap_index_delete_tuples. Issues prefetch requests for |
7997 | | * prefetch_count buffers. The prefetch_state keeps track of all the buffers |
7998 | | * we can prefetch, and which have already been prefetched; each call to this |
7999 | | * function picks up where the previous call left off. |
8000 | | * |
8001 | | * Note: we expect the deltids array to be sorted in an order that groups TIDs |
8002 | | * by heap block, with all TIDs for each block appearing together in exactly |
8003 | | * one group. |
8004 | | */ |
8005 | | static void |
8006 | | index_delete_prefetch_buffer(Relation rel, |
8007 | | IndexDeletePrefetchState *prefetch_state, |
8008 | | int prefetch_count) |
8009 | 0 | { |
8010 | 0 | BlockNumber cur_hblkno = prefetch_state->cur_hblkno; |
8011 | 0 | int count = 0; |
8012 | 0 | int i; |
8013 | 0 | int ndeltids = prefetch_state->ndeltids; |
8014 | 0 | TM_IndexDelete *deltids = prefetch_state->deltids; |
8015 | |
8016 | 0 | for (i = prefetch_state->next_item; |
8017 | 0 | i < ndeltids && count < prefetch_count; |
8018 | 0 | i++) |
8019 | 0 | { |
8020 | 0 | ItemPointer htid = &deltids[i].tid; |
8021 | |
8022 | 0 | if (cur_hblkno == InvalidBlockNumber || |
8023 | 0 | ItemPointerGetBlockNumber(htid) != cur_hblkno) |
8024 | 0 | { |
8025 | 0 | cur_hblkno = ItemPointerGetBlockNumber(htid); |
8026 | 0 | PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno); |
8027 | 0 | count++; |
8028 | 0 | } |
8029 | 0 | } |
8030 | | |
8031 | | /* |
8032 | | * Save the prefetch position so that next time we can continue from that |
8033 | | * position. |
8034 | | */ |
8035 | 0 | prefetch_state->next_item = i; |
8036 | 0 | prefetch_state->cur_hblkno = cur_hblkno; |
8037 | 0 | } |
8038 | | #endif |
8039 | | |
8040 | | /* |
8041 | | * Helper function for heap_index_delete_tuples. Checks for index corruption |
8042 | | * involving an invalid TID in index AM caller's index page. |
8043 | | * |
8044 | | * This is an ideal place for these checks. The index AM must hold a buffer |
8045 | | * lock on the index page containing the TIDs we examine here, so we don't |
8046 | | * have to worry about concurrent VACUUMs at all. We can be sure that the |
8047 | | * index is corrupt when htid points directly to an LP_UNUSED item or |
8048 | | * heap-only tuple, which is not the case during standard index scans. |
8049 | | */ |
8050 | | static inline void |
8051 | | index_delete_check_htid(TM_IndexDeleteOp *delstate, |
8052 | | Page page, OffsetNumber maxoff, |
8053 | | ItemPointer htid, TM_IndexStatus *istatus) |
8054 | 0 | { |
8055 | 0 | OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid); |
8056 | 0 | ItemId iid; |
8057 | |
8058 | 0 | Assert(OffsetNumberIsValid(istatus->idxoffnum)); |
8059 | |
8060 | 0 | if (unlikely(indexpagehoffnum > maxoff)) |
8061 | 0 | ereport(ERROR, |
8062 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
8063 | 0 | errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"", |
8064 | 0 | ItemPointerGetBlockNumber(htid), |
8065 | 0 | indexpagehoffnum, |
8066 | 0 | istatus->idxoffnum, delstate->iblknum, |
8067 | 0 | RelationGetRelationName(delstate->irel)))); |
8068 | | |
8069 | 0 | iid = PageGetItemId(page, indexpagehoffnum); |
8070 | 0 | if (unlikely(!ItemIdIsUsed(iid))) |
8071 | 0 | ereport(ERROR, |
8072 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
8073 | 0 | errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"", |
8074 | 0 | ItemPointerGetBlockNumber(htid), |
8075 | 0 | indexpagehoffnum, |
8076 | 0 | istatus->idxoffnum, delstate->iblknum, |
8077 | 0 | RelationGetRelationName(delstate->irel)))); |
8078 | | |
8079 | 0 | if (ItemIdHasStorage(iid)) |
8080 | 0 | { |
8081 | 0 | HeapTupleHeader htup; |
8082 | |
8083 | 0 | Assert(ItemIdIsNormal(iid)); |
8084 | 0 | htup = (HeapTupleHeader) PageGetItem(page, iid); |
8085 | |
8086 | 0 | if (unlikely(HeapTupleHeaderIsHeapOnly(htup))) |
8087 | 0 | ereport(ERROR, |
8088 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
8089 | 0 | errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"", |
8090 | 0 | ItemPointerGetBlockNumber(htid), |
8091 | 0 | indexpagehoffnum, |
8092 | 0 | istatus->idxoffnum, delstate->iblknum, |
8093 | 0 | RelationGetRelationName(delstate->irel)))); |
8094 | 0 | } |
8095 | 0 | } |
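Before the heapam implementation below, a hedged sketch of the index-AM side of this tableam interface. The TM_IndexDeleteOp fields follow the usage visible in this file; the buffer and sizing variables (irel, ibuf, heapRel, maxdeltids) and the generic table_index_delete_tuples() entry point are assumptions for illustration, not a quotation from any particular index AM.

    TM_IndexDeleteOp delstate;
    TransactionId conflict_horizon;

    delstate.irel = irel;       /* index relation, used in error reports */
    delstate.iblknum = BufferGetBlockNumber(ibuf);
    delstate.bottomup = false;  /* simple deletion pass, not bottom-up */
    delstate.bottomupfreespace = 0;
    delstate.ndeltids = 0;
    delstate.deltids = palloc(maxdeltids * sizeof(TM_IndexDelete));
    delstate.status = palloc(maxdeltids * sizeof(TM_IndexStatus));

    /* ... fill deltids[]/status[] from known-deletable index tuples ... */

    conflict_horizon = table_index_delete_tuples(heapRel, &delstate);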
8096 | | |
8097 | | /* |
8098 | | * heapam implementation of tableam's index_delete_tuples interface. |
8099 | | * |
8100 | | * This helper function is called by index AMs during index tuple deletion. |
8101 | | * See tableam header comments for an explanation of the interface implemented |
8102 | | * here and a general theory of operation. Note that each call here is either |
8103 | | * a simple index deletion call, or a bottom-up index deletion call. |
8104 | | * |
8105 | | * It's possible for this to generate a fair amount of I/O, since we may be |
8106 | | * deleting hundreds of tuples from a single index block. To amortize that |
8107 | | * cost to some degree, this uses prefetching and combines repeat accesses to |
8108 | | * the same heap block. |
8109 | | */ |
8110 | | TransactionId |
8111 | | heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) |
8112 | 0 | { |
8113 | | /* Initial assumption is that earlier pruning took care of conflict */ |
8114 | 0 | TransactionId snapshotConflictHorizon = InvalidTransactionId; |
8115 | 0 | BlockNumber blkno = InvalidBlockNumber; |
8116 | 0 | Buffer buf = InvalidBuffer; |
8117 | 0 | Page page = NULL; |
8118 | 0 | OffsetNumber maxoff = InvalidOffsetNumber; |
8119 | 0 | TransactionId priorXmax; |
8120 | 0 | #ifdef USE_PREFETCH |
8121 | 0 | IndexDeletePrefetchState prefetch_state; |
8122 | 0 | int prefetch_distance; |
8123 | 0 | #endif |
8124 | 0 | SnapshotData SnapshotNonVacuumable; |
8125 | 0 | int finalndeltids = 0, |
8126 | 0 | nblocksaccessed = 0; |
8127 | | |
8128 | | /* State that's only used in bottom-up index deletion case */ |
8129 | 0 | int nblocksfavorable = 0; |
8130 | 0 | int curtargetfreespace = delstate->bottomupfreespace, |
8131 | 0 | lastfreespace = 0, |
8132 | 0 | actualfreespace = 0; |
8133 | 0 | bool bottomup_final_block = false; |
8134 | |
8135 | 0 | InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel)); |
8136 | | |
8137 | | /* Sort caller's deltids array by TID for further processing */ |
8138 | 0 | index_delete_sort(delstate); |
8139 | | |
8140 | | /* |
8141 | | * Bottom-up case: resort deltids array in an order attuned to where the |
8142 | | * greatest number of promising TIDs are to be found, and determine how |
8143 | | * many blocks from the start of sorted array should be considered |
8144 | | * favorable. This will also shrink the deltids array in order to |
8145 | | * eliminate completely unfavorable blocks up front. |
8146 | | */ |
8147 | 0 | if (delstate->bottomup) |
8148 | 0 | nblocksfavorable = bottomup_sort_and_shrink(delstate); |
8149 | |
8150 | 0 | #ifdef USE_PREFETCH |
8151 | | /* Initialize prefetch state. */ |
8152 | 0 | prefetch_state.cur_hblkno = InvalidBlockNumber; |
8153 | 0 | prefetch_state.next_item = 0; |
8154 | 0 | prefetch_state.ndeltids = delstate->ndeltids; |
8155 | 0 | prefetch_state.deltids = delstate->deltids; |
8156 | | |
8157 | | /* |
8158 | | * Determine the prefetch distance that we will attempt to maintain. |
8159 | | * |
8160 | | * Since the caller holds a buffer lock somewhere in rel, we'd better make |
8161 | | * sure that isn't a catalog relation before we call code that does |
8162 | | * syscache lookups, to avoid risk of deadlock. |
8163 | | */ |
8164 | 0 | if (IsCatalogRelation(rel)) |
8165 | 0 | prefetch_distance = maintenance_io_concurrency; |
8166 | 0 | else |
8167 | 0 | prefetch_distance = |
8168 | 0 | get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); |
8169 | | |
8170 | | /* Cap initial prefetch distance for bottom-up deletion caller */ |
8171 | 0 | if (delstate->bottomup) |
8172 | 0 | { |
8173 | 0 | Assert(nblocksfavorable >= 1); |
8174 | 0 | Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS); |
8175 | 0 | prefetch_distance = Min(prefetch_distance, nblocksfavorable); |
8176 | 0 | } |
8177 | | |
8178 | | /* Start prefetching. */ |
8179 | 0 | index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance); |
8180 | 0 | #endif |
8181 | | |
8182 | | /* Iterate over deltids, determine which to delete, check their horizon */ |
8183 | 0 | Assert(delstate->ndeltids > 0); |
8184 | 0 | for (int i = 0; i < delstate->ndeltids; i++) |
8185 | 0 | { |
8186 | 0 | TM_IndexDelete *ideltid = &delstate->deltids[i]; |
8187 | 0 | TM_IndexStatus *istatus = delstate->status + ideltid->id; |
8188 | 0 | ItemPointer htid = &ideltid->tid; |
8189 | 0 | OffsetNumber offnum; |
8190 | | |
8191 | | /* |
8192 | | * Read buffer, and perform required extra steps each time a new block |
8193 | | * is encountered. Avoid refetching if it's the same block as the one |
8194 | | * from the last htid. |
8195 | | */ |
8196 | 0 | if (blkno == InvalidBlockNumber || |
8197 | 0 | ItemPointerGetBlockNumber(htid) != blkno) |
8198 | 0 | { |
8199 | | /* |
8200 | | * Consider giving up early for bottom-up index deletion caller |
8201 | | * first. (Only prefetch next-next block afterwards, when it |
8202 | | * becomes clear that we're at least going to access the next |
8203 | | * block in line.) |
8204 | | * |
8205 | | * Sometimes the first block frees so much space for bottom-up |
8206 | | * caller that the deletion process can end without accessing any |
8207 | | * more blocks. It is usually necessary to access 2 or 3 blocks |
8208 | | * per bottom-up deletion operation, though. |
8209 | | */ |
8210 | 0 | if (delstate->bottomup) |
8211 | 0 | { |
8212 | | /* |
8213 | | * We often allow caller to delete a few additional items |
8214 | | * whose entries we reached after the point that the space target |
8215 | | * from the caller was satisfied. The cost of accessing the page |
8216 | | * was already paid at that point, so it made sense to finish |
8217 | | * it off. When that happened, we finalize everything here |
8218 | | * (by finishing off the whole bottom-up deletion operation |
8219 | | * without needlessly paying the cost of accessing any more |
8220 | | * blocks). |
8221 | | */ |
8222 | 0 | if (bottomup_final_block) |
8223 | 0 | break; |
8224 | | |
8225 | | /* |
8226 | | * Give up when we didn't enable our caller to free any |
8227 | | * additional space as a result of processing the page that we |
8228 | | * just finished up with. This rule is the main way in which |
8229 | | * we keep the cost of bottom-up deletion under control. |
8230 | | */ |
8231 | 0 | if (nblocksaccessed >= 1 && actualfreespace == lastfreespace) |
8232 | 0 | break; |
8233 | 0 | lastfreespace = actualfreespace; /* for next time */ |
8234 | | |
8235 | | /* |
8236 | | * Deletion operation (which is bottom-up) will definitely |
8237 | | * access the next block in line. Prepare for that now. |
8238 | | * |
8239 | | * Decay target free space so that we don't hang on for too |
8240 | | * long with a marginal case. (Space target is only truly |
8241 | | * helpful when it allows us to recognize that we don't need |
8242 | | * to access more than 1 or 2 blocks to satisfy caller due to |
8243 | | * agreeable workload characteristics.) |
8244 | | * |
8245 | | * We are a bit more patient when we encounter contiguous |
8246 | | * blocks, though: these are treated as favorable blocks. The |
8247 | | * decay process is only applied when the next block in line |
8248 | | * is not a favorable/contiguous block. This is not an |
8249 | | * exception to the general rule; we still insist on finding |
8250 | | * at least one deletable item per block accessed. See |
8251 | | * bottomup_nblocksfavorable() for full details of the theory |
8252 | | * behind favorable blocks and heap block locality in general. |
8253 | | * |
8254 | | * Note: The first block in line is always treated as a |
8255 | | * favorable block, so the earliest possible point that the |
8256 | | * decay can be applied is just before we access the second |
8257 | | * block in line. The Assert() verifies this for us. |
8258 | | */ |
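				/*
				 * Editorial illustration (not part of heapam.c): assuming,
				 * purely for example, a caller-supplied space target of
				 * 1024 bytes and no favorable blocks remaining, the next two
				 * non-favorable blocks in line halve curtargetfreespace to
				 * 512 and then 256, so a marginal pass gives up sooner.
				 */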
8259 | 0 | Assert(nblocksaccessed > 0 || nblocksfavorable > 0); |
8260 | 0 | if (nblocksfavorable > 0) |
8261 | 0 | nblocksfavorable--; |
8262 | 0 | else |
8263 | 0 | curtargetfreespace /= 2; |
8264 | 0 | } |
8265 | | |
8266 | | /* release old buffer */ |
8267 | 0 | if (BufferIsValid(buf)) |
8268 | 0 | UnlockReleaseBuffer(buf); |
8269 | |
|
8270 | 0 | blkno = ItemPointerGetBlockNumber(htid); |
8271 | 0 | buf = ReadBuffer(rel, blkno); |
8272 | 0 | nblocksaccessed++; |
8273 | 0 | Assert(!delstate->bottomup || |
8274 | 0 | nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS); |
8275 | |
|
8276 | 0 | #ifdef USE_PREFETCH |
8277 | | |
8278 | | /* |
8279 | | * To maintain the prefetch distance, prefetch one more page for |
8280 | | * each page we read. |
8281 | | */ |
8282 | 0 | index_delete_prefetch_buffer(rel, &prefetch_state, 1); |
8283 | 0 | #endif |
8284 | |
|
8285 | 0 | LockBuffer(buf, BUFFER_LOCK_SHARE); |
8286 | |
|
8287 | 0 | page = BufferGetPage(buf); |
8288 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
8289 | 0 | } |
8290 | | |
8291 | | /* |
8292 | | * In passing, detect index corruption involving an index page with a |
8293 | | * TID that points to a location in the heap that couldn't possibly be |
8294 | | * correct. We only do this with actual TIDs from caller's index page |
8295 | | * (not items reached by traversing through a HOT chain). |
8296 | | */ |
8297 | 0 | index_delete_check_htid(delstate, page, maxoff, htid, istatus); |
8298 | |
|
8299 | 0 | if (istatus->knowndeletable) |
8300 | 0 | Assert(!delstate->bottomup && !istatus->promising); |
8301 | 0 | else |
8302 | 0 | { |
8303 | 0 | ItemPointerData tmp = *htid; |
8304 | 0 | HeapTupleData heapTuple; |
8305 | | |
8306 | | /* Are any tuples from this HOT chain non-vacuumable? */ |
8307 | 0 | if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable, |
8308 | 0 | &heapTuple, NULL, true)) |
8309 | 0 | continue; /* can't delete entry */ |
8310 | | |
8311 | | /* Caller will delete, since whole HOT chain is vacuumable */ |
8312 | 0 | istatus->knowndeletable = true; |
8313 | | |
8314 | | /* Maintain index free space info for bottom-up deletion case */ |
8315 | 0 | if (delstate->bottomup) |
8316 | 0 | { |
8317 | 0 | Assert(istatus->freespace > 0); |
8318 | 0 | actualfreespace += istatus->freespace; |
8319 | 0 | if (actualfreespace >= curtargetfreespace) |
8320 | 0 | bottomup_final_block = true; |
8321 | 0 | } |
8322 | 0 | } |
8323 | | |
8324 | | /* |
8325 | | * Maintain snapshotConflictHorizon value for deletion operation as a |
8326 | | * whole by advancing current value using heap tuple headers. This is |
8327 | | * loosely based on the logic for pruning a HOT chain. |
8328 | | */ |
8329 | 0 | offnum = ItemPointerGetOffsetNumber(htid); |
8330 | 0 | priorXmax = InvalidTransactionId; /* cannot check first XMIN */ |
8331 | 0 | for (;;) |
8332 | 0 | { |
8333 | 0 | ItemId lp; |
8334 | 0 | HeapTupleHeader htup; |
8335 | | |
8336 | | /* Sanity check (pure paranoia) */ |
8337 | 0 | if (offnum < FirstOffsetNumber) |
8338 | 0 | break; |
8339 | | |
8340 | | /* |
8341 | | * An offset past the end of the page's line pointer array is |
8342 | | * possible when the array has been truncated. |
8343 | | */ |
8344 | 0 | if (offnum > maxoff) |
8345 | 0 | break; |
8346 | | |
8347 | 0 | lp = PageGetItemId(page, offnum); |
8348 | 0 | if (ItemIdIsRedirected(lp)) |
8349 | 0 | { |
8350 | 0 | offnum = ItemIdGetRedirect(lp); |
8351 | 0 | continue; |
8352 | 0 | } |
8353 | | |
8354 | | /* |
8355 | | * We'll often encounter LP_DEAD line pointers (especially with an |
8356 | | * entry marked knowndeletable by our caller up front). No heap |
8357 | | * tuple headers get examined for an htid that leads us to an |
8358 | | * LP_DEAD item. This is okay because the earlier pruning |
8359 | | * operation that made the line pointer LP_DEAD in the first place |
8360 | | * must have considered the original tuple header as part of |
8361 | | * generating its own snapshotConflictHorizon value. |
8362 | | * |
8363 | | * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is |
8364 | | * the same strategy that index vacuuming uses in all cases. Index |
8365 | | * VACUUM WAL records don't even have a snapshotConflictHorizon |
8366 | | * field of their own for this reason. |
8367 | | */ |
8368 | 0 | if (!ItemIdIsNormal(lp)) |
8369 | 0 | break; |
8370 | | |
8371 | 0 | htup = (HeapTupleHeader) PageGetItem(page, lp); |
8372 | | |
8373 | | /* |
8374 | | * Check the tuple XMIN against prior XMAX, if any |
8375 | | */ |
8376 | 0 | if (TransactionIdIsValid(priorXmax) && |
8377 | 0 | !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) |
8378 | 0 | break; |
8379 | | |
8380 | 0 | HeapTupleHeaderAdvanceConflictHorizon(htup, |
8381 | 0 | &snapshotConflictHorizon); |
8382 | | |
8383 | | /* |
8384 | | * If the tuple is not HOT-updated, then we are at the end of this |
8385 | | * HOT-chain. No need to visit later tuples from the same update |
8386 | | * chain (they get their own index entries) -- just move on to |
8387 | | * next htid from index AM caller. |
8388 | | */ |
8389 | 0 | if (!HeapTupleHeaderIsHotUpdated(htup)) |
8390 | 0 | break; |
8391 | | |
8392 | | /* Advance to next HOT chain member */ |
8393 | 0 | Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno); |
8394 | 0 | offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); |
8395 | 0 | priorXmax = HeapTupleHeaderGetUpdateXid(htup); |
8396 | 0 | } |
8397 | | |
8398 | | /* Enable further/final shrinking of deltids for caller */ |
8399 | 0 | finalndeltids = i + 1; |
8400 | 0 | } |
8401 | |
|
8402 | 0 | UnlockReleaseBuffer(buf); |
8403 | | |
8404 | | /* |
8405 | | * Shrink deltids array to exclude non-deletable entries at the end. This |
8406 | | * is not just a minor optimization. Final deltids array size might be |
8407 | | * zero for a bottom-up caller. Index AM is explicitly allowed to rely on |
8408 | | * ndeltids being zero in all cases with zero total deletable entries. |
8409 | | */ |
8410 | 0 | Assert(finalndeltids > 0 || delstate->bottomup); |
8411 | 0 | delstate->ndeltids = finalndeltids; |
8412 | |
|
8413 | 0 | return snapshotConflictHorizon; |
8414 | 0 | } |
8415 | | |
8416 | | /* |
8417 | | * Specialized inlineable comparison function for index_delete_sort() |
8418 | | */ |
8419 | | static inline int |
8420 | | index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2) |
8421 | 0 | { |
8422 | 0 | ItemPointer tid1 = &deltid1->tid; |
8423 | 0 | ItemPointer tid2 = &deltid2->tid; |
8424 | |
|
8425 | 0 | { |
8426 | 0 | BlockNumber blk1 = ItemPointerGetBlockNumber(tid1); |
8427 | 0 | BlockNumber blk2 = ItemPointerGetBlockNumber(tid2); |
8428 | |
|
8429 | 0 | if (blk1 != blk2) |
8430 | 0 | return (blk1 < blk2) ? -1 : 1; |
8431 | 0 | } |
8432 | 0 | { |
8433 | 0 | OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1); |
8434 | 0 | OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2); |
8435 | |
|
8436 | 0 | if (pos1 != pos2) |
8437 | 0 | return (pos1 < pos2) ? -1 : 1; |
8438 | 0 | } |
8439 | | |
8440 | 0 | Assert(false); |
8441 | |
|
8442 | 0 | return 0; |
8443 | 0 | } |
8444 | | |
8445 | | /* |
8446 | | * Sort deltids array from delstate by TID. This prepares it for further |
8447 | | * processing by heap_index_delete_tuples(). |
8448 | | * |
8449 | | * This operation becomes a noticeable consumer of CPU cycles with some |
8450 | | * workloads, so we go to the trouble of specialization/micro optimization. |
8451 | | * We use shellsort for this because it's easy to specialize, compiles to |
8452 | | * relatively few instructions, and is adaptive to presorted inputs/subsets |
8453 | | * (which are typical here). |
8454 | | */ |
8455 | | static void |
8456 | | index_delete_sort(TM_IndexDeleteOp *delstate) |
8457 | 0 | { |
8458 | 0 | TM_IndexDelete *deltids = delstate->deltids; |
8459 | 0 | int ndeltids = delstate->ndeltids; |
8460 | | |
8461 | | /* |
8462 | | * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). |
8463 | | * |
8464 | | * This implementation is fast with array sizes up to ~4500. This covers |
8465 | | * all supported BLCKSZ values. |
8466 | | */ |
8467 | 0 | const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; |
8468 | | |
8469 | | /* Think carefully before changing anything here -- keep swaps cheap */ |
8470 | 0 | StaticAssertDecl(sizeof(TM_IndexDelete) <= 8, |
8471 | 0 | "element size exceeds 8 bytes"); |
8472 | |
|
8473 | 0 | for (int g = 0; g < lengthof(gaps); g++) |
8474 | 0 | { |
8475 | 0 | for (int hi = gaps[g], i = hi; i < ndeltids; i++) |
8476 | 0 | { |
8477 | 0 | TM_IndexDelete d = deltids[i]; |
8478 | 0 | int j = i; |
8479 | |
|
8480 | 0 | while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0) |
8481 | 0 | { |
8482 | 0 | deltids[j] = deltids[j - hi]; |
8483 | 0 | j -= hi; |
8484 | 0 | } |
8485 | 0 | deltids[j] = d; |
8486 | 0 | } |
8487 | 0 | } |
8488 | 0 | } |
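[Editorial illustration, not part of heapam.c] The comment above notes that
index_delete_sort() uses a specialized shellsort with a Sedgewick-Incerpi gap
sequence. The following minimal, self-contained sketch applies the same gap
sequence and loop structure to a plain array of ints; the function name is
made up, and it exists only so the technique can be studied in isolation.

#include <stddef.h>

/* Shellsort an int array with the same gaps used by index_delete_sort() */
static void
example_shellsort_int(int *a, size_t n)
{
	static const int gaps[] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};

	for (size_t g = 0; g < sizeof(gaps) / sizeof(gaps[0]); g++)
	{
		for (size_t hi = gaps[g], i = hi; i < n; i++)
		{
			int		d = a[i];
			size_t	j = i;

			/* gapped insertion sort: shift larger elements up by "hi" */
			while (j >= hi && a[j - hi] >= d)
			{
				a[j] = a[j - hi];
				j -= hi;
			}
			a[j] = d;
		}
	}
}

As with the real thing, the early (large) gaps are simply skipped for small
inputs, and the final gap of 1 degenerates into a plain insertion sort over an
almost-sorted array.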
8489 | | |
8490 | | /* |
8491 | | * Returns how many blocks should be considered favorable/contiguous for a |
8492 | | * bottom-up index deletion pass. This is a number of heap blocks that starts |
8493 | | * from and includes the first block in line. |
8494 | | * |
8495 | | * There is always at least one favorable block during bottom-up index |
8496 | | * deletion. In the worst case (i.e. with totally random heap blocks) the |
8497 | | * first block in line (the only favorable block) can be thought of as a |
8498 | | * degenerate array of contiguous blocks that consists of a single block. |
8499 | | * heap_index_delete_tuples() will expect this. |
8500 | | * |
8501 | | * Caller passes blockgroups, a description of the final order that deltids |
8502 | | * will be sorted in for heap_index_delete_tuples() bottom-up index deletion |
8503 | | * processing. Note that deltids need not actually be sorted just yet (caller |
8504 | | * only passes deltids to us so that we can interpret blockgroups). |
8505 | | * |
8506 | | * You might guess that the existence of contiguous blocks cannot matter much, |
8507 | | * since in general the main factor that determines which blocks we visit is |
8508 | | * the number of promising TIDs, which is a fixed hint from the index AM. |
8509 | | * We're not really targeting the general case, though -- the actual goal is |
8510 | | * to adapt our behavior to a wide variety of naturally occurring conditions. |
8511 | | * The effects of most of the heuristics we apply are only noticeable in the |
8512 | | * aggregate, over time and across many _related_ bottom-up index deletion |
8513 | | * passes. |
8514 | | * |
8515 | | * Deeming certain blocks favorable allows heapam to recognize and adapt to |
8516 | | * workloads where heap blocks visited during bottom-up index deletion can be |
8517 | | * accessed contiguously, in the sense that each newly visited block is the |
8518 | | * neighbor of the block that bottom-up deletion just finished processing (or |
8519 | | * close enough to it). It will likely be cheaper to access more favorable |
8520 | | * blocks sooner rather than later (e.g. in this pass, not across a series of |
8521 | | * related bottom-up passes). Either way it is probably only a matter of time |
8522 | | * (or a matter of further correlated version churn) before all blocks that |
8523 | | * appear together as a single large batch of favorable blocks get accessed by |
8524 | | * _some_ bottom-up pass. Large batches of favorable blocks tend to either |
8525 | | * appear almost constantly or not even once (it all depends on per-index |
8526 | | * workload characteristics). |
8527 | | * |
8528 | | * Note that the blockgroups sort order applies a power-of-two bucketing |
8529 | | * scheme that creates opportunities for contiguous groups of blocks to get |
8530 | | * batched together, at least with workloads that are naturally amenable to |
8531 | | * being driven by heap block locality. This doesn't just enhance the spatial |
8532 | | * locality of bottom-up heap block processing in the obvious way. It also |
8533 | | * enables temporal locality of access, since sorting by heap block number |
8534 | | * naturally tends to make the bottom-up processing order deterministic. |
8535 | | * |
8536 | | * Consider the following example to get a sense of how temporal locality |
8537 | | * might matter: There is a heap relation with several indexes, each of which |
8538 | | * is low to medium cardinality. It is subject to constant non-HOT updates. |
8539 | | * The updates are skewed (in one part of the primary key, perhaps). None of |
8540 | | * the indexes are logically modified by the UPDATE statements (if they were |
8541 | | * then bottom-up index deletion would not be triggered in the first place). |
8542 | | * Naturally, each new round of index tuples (for each heap tuple that gets a |
8543 | | * heap_update() call) will have the same heap TID in each and every index. |
8544 | | * Since these indexes are low cardinality and never get logically modified, |
8545 | | * heapam processing during bottom-up deletion passes will access heap blocks |
8546 | | * in approximately sequential order. Temporal locality of access occurs due |
8547 | | * to bottom-up deletion passes behaving very similarly across each of the |
8548 | | * indexes at any given moment. This keeps the number of buffer misses needed |
8549 | | * to visit heap blocks to a minimum. |
8550 | | */ |
8551 | | static int |
8552 | | bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, |
8553 | | TM_IndexDelete *deltids) |
8554 | 0 | { |
8555 | 0 | int64 lastblock = -1; |
8556 | 0 | int nblocksfavorable = 0; |
8557 | |
|
8558 | 0 | Assert(nblockgroups >= 1); |
8559 | 0 | Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS); |
8560 | | |
8561 | | /* |
8562 | | * We tolerate heap blocks that will be accessed only slightly out of |
8563 | | * physical order. Small blips occur when a pair of almost-contiguous |
8564 | | * blocks happen to fall into different buckets (perhaps due only to a |
8565 | | * small difference in npromisingtids that the bucketing scheme didn't |
8566 | | * quite manage to ignore). We effectively ignore these blips by applying |
8567 | | * a small tolerance. The precise tolerance we use is a little arbitrary, |
8568 | | * but it works well enough in practice. |
8569 | | */ |
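	/*
	 * Editorial illustration (not part of heapam.c): if the groups' first
	 * heap blocks come out as 200, 202, 203, and then 450, and the tolerance
	 * constant allows a gap of at least 2 blocks (an assumption made only
	 * for this example), the first three groups count as favorable and the
	 * jump to block 450 stops the count, giving nblocksfavorable = 3.
	 */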
8570 | 0 | for (int b = 0; b < nblockgroups; b++) |
8571 | 0 | { |
8572 | 0 | IndexDeleteCounts *group = blockgroups + b; |
8573 | 0 | TM_IndexDelete *firstdtid = deltids + group->ifirsttid; |
8574 | 0 | BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid); |
8575 | |
|
8576 | 0 | if (lastblock != -1 && |
8577 | 0 | ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS || |
8578 | 0 | (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS)) |
8579 | 0 | break; |
8580 | | |
8581 | 0 | nblocksfavorable++; |
8582 | 0 | lastblock = block; |
8583 | 0 | } |
8584 | | |
8585 | | /* Always indicate that there is at least 1 favorable block */ |
8586 | 0 | Assert(nblocksfavorable >= 1); |
8587 | |
|
8588 | 0 | return nblocksfavorable; |
8589 | 0 | } |
8590 | | |
8591 | | /* |
8592 | | * qsort comparison function for bottomup_sort_and_shrink() |
8593 | | */ |
8594 | | static int |
8595 | | bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2) |
8596 | 0 | { |
8597 | 0 | const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1; |
8598 | 0 | const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2; |
8599 | | |
8600 | | /* |
8601 | | * The most significant field is npromisingtids (whose comparison we |
8602 | | * invert so as to sort in desc order). |
8603 | | * |
8604 | | * Caller should have already normalized npromisingtids fields into |
8605 | | * power-of-two values (buckets). |
8606 | | */ |
8607 | 0 | if (group1->npromisingtids > group2->npromisingtids) |
8608 | 0 | return -1; |
8609 | 0 | if (group1->npromisingtids < group2->npromisingtids) |
8610 | 0 | return 1; |
8611 | | |
8612 | | /* |
8613 | | * Tiebreak: desc ntids sort order. |
8614 | | * |
8615 | | * We cannot expect power-of-two values for ntids fields. We should |
8616 | | * behave as if they were already rounded up for us instead. |
8617 | | */ |
8618 | 0 | if (group1->ntids != group2->ntids) |
8619 | 0 | { |
8620 | 0 | uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids); |
8621 | 0 | uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids); |
8622 | |
|
8623 | 0 | if (ntids1 > ntids2) |
8624 | 0 | return -1; |
8625 | 0 | if (ntids1 < ntids2) |
8626 | 0 | return 1; |
8627 | 0 | } |
8628 | | |
8629 | | /* |
8630 | | * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for |
8631 | | * block in deltids array) order. |
8632 | | * |
8633 | | * This is equivalent to sorting in ascending heap block number order |
8634 | | * (among otherwise equal subsets of the array). This approach allows us |
8635 | | * to avoid accessing the out-of-line TID. (We rely on the assumption |
8636 | | * that the deltids array was sorted in ascending heap TID order when |
8637 | | * these offsets to the first TID from each heap block group were formed.) |
8638 | | */ |
8639 | 0 | if (group1->ifirsttid > group2->ifirsttid) |
8640 | 0 | return 1; |
8641 | 0 | if (group1->ifirsttid < group2->ifirsttid) |
8642 | 0 | return -1; |
8643 | | |
8644 | 0 | pg_unreachable(); |
8645 | | |
8646 | 0 | return 0; |
8647 | 0 | } |
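/*
 * Worked example (editorial note, not part of heapam.c): given three block
 * groups whose (npromisingtids, ntids, ifirsttid) fields are (16, 20, 300),
 * (16, 5, 100), and (4, 50, 0) -- npromisingtids already bucketed to powers
 * of two by the caller -- bottomup_sort_and_shrink_cmp() orders them as
 * (16, 20, 300), (16, 5, 100), (4, 50, 0): the npromisingtids tie is broken
 * by pg_nextpower2_32(20) = 32 beating pg_nextpower2_32(5) = 8, and the
 * group with only 4 promising TIDs sorts last.
 */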
8648 | | |
8649 | | /* |
8650 | | * heap_index_delete_tuples() helper function for bottom-up deletion callers. |
8651 | | * |
8652 | | * Sorts deltids array in the order needed for useful processing by bottom-up |
8653 | | * deletion. The array should already be sorted in TID order when we're |
8654 | | * called. The sort process groups heap TIDs from deltids into heap block |
8655 | | * groupings. Earlier/more-promising groups/blocks are usually those that are |
8656 | | * known to have the most "promising" TIDs. |
8657 | | * |
8658 | | * Sets new size of deltids array (ndeltids) in state. deltids will only have |
8659 | | * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we |
8660 | | * return. This often means that deltids will be shrunk to a small fraction |
8661 | | * of its original size (we eliminate many heap blocks from consideration for |
8662 | | * caller up front). |
8663 | | * |
8664 | | * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable() |
8665 | | * for a definition and full details. |
8666 | | */ |
8667 | | static int |
8668 | | bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) |
8669 | 0 | { |
8670 | 0 | IndexDeleteCounts *blockgroups; |
8671 | 0 | TM_IndexDelete *reordereddeltids; |
8672 | 0 | BlockNumber curblock = InvalidBlockNumber; |
8673 | 0 | int nblockgroups = 0; |
8674 | 0 | int ncopied = 0; |
8675 | 0 | int nblocksfavorable = 0; |
8676 | |
|
8677 | 0 | Assert(delstate->bottomup); |
8678 | 0 | Assert(delstate->ndeltids > 0); |
8679 | | |
8680 | | /* Calculate per-heap-block count of TIDs */ |
8681 | 0 | blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids); |
8682 | 0 | for (int i = 0; i < delstate->ndeltids; i++) |
8683 | 0 | { |
8684 | 0 | TM_IndexDelete *ideltid = &delstate->deltids[i]; |
8685 | 0 | TM_IndexStatus *istatus = delstate->status + ideltid->id; |
8686 | 0 | ItemPointer htid = &ideltid->tid; |
8687 | 0 | bool promising = istatus->promising; |
8688 | |
|
8689 | 0 | if (curblock != ItemPointerGetBlockNumber(htid)) |
8690 | 0 | { |
8691 | | /* New block group */ |
8692 | 0 | nblockgroups++; |
8693 | |
|
8694 | 0 | Assert(curblock < ItemPointerGetBlockNumber(htid) || |
8695 | 0 | !BlockNumberIsValid(curblock)); |
8696 | |
|
8697 | 0 | curblock = ItemPointerGetBlockNumber(htid); |
8698 | 0 | blockgroups[nblockgroups - 1].ifirsttid = i; |
8699 | 0 | blockgroups[nblockgroups - 1].ntids = 1; |
8700 | 0 | blockgroups[nblockgroups - 1].npromisingtids = 0; |
8701 | 0 | } |
8702 | 0 | else |
8703 | 0 | { |
8704 | 0 | blockgroups[nblockgroups - 1].ntids++; |
8705 | 0 | } |
8706 | |
|
8707 | 0 | if (promising) |
8708 | 0 | blockgroups[nblockgroups - 1].npromisingtids++; |
8709 | 0 | } |
8710 | | |
8711 | | /* |
8712 | | * We're about ready to sort block groups to determine the optimal order |
8713 | | * for visiting heap blocks. But before we do, round the number of |
8714 | | * promising tuples for each block group up to the next power-of-two, |
8715 | | * unless it is very low (4 or less), in which case we just use 4. |
8716 | | * npromisingtids is far too noisy to trust when choosing between a pair |
8717 | | * of block groups that both have very low values. |
8718 | | * |
8719 | | * This scheme divides heap blocks/block groups into buckets. Each bucket |
8720 | | * contains blocks that have _approximately_ the same number of promising |
8721 | | * TIDs as each other. The goal is to ignore relatively small differences |
8722 | | * in the total number of promising entries, so that the whole process can |
8723 | | * give a little weight to heapam factors (like heap block locality) |
8724 | | * instead. This isn't a trade-off, really -- we have nothing to lose. It |
8725 | | * would be foolish to interpret small differences in npromisingtids |
8726 | | * values as anything more than noise. |
8727 | | * |
8728 | | * We tiebreak on nhtids when sorting block group subsets that have the |
8729 | | * same npromisingtids, but this has the same issues as npromisingtids, |
8730 | | * and so nhtids is subject to the same power-of-two bucketing scheme. The |
8731 | | * only reason that we don't fix nhtids in the same way here too is that |
8732 | | * we'll need accurate nhtids values after the sort. We handle nhtids |
8733 | | * bucketization dynamically instead (in the sort comparator). |
8734 | | * |
8735 | | * See bottomup_nblocksfavorable() for a full explanation of when and how |
8736 | | * heap locality/favorable blocks can significantly influence when and how |
8737 | | * heap blocks are accessed. |
8738 | | */ |
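	/*
	 * Worked example (editorial note, not part of heapam.c): raw
	 * npromisingtids counts of 0, 2, and 4 all land in the lowest bucket
	 * (4), while 5 becomes 8, 11 becomes 16, and 17 becomes 32 via
	 * pg_nextpower2_32(). Groups that differ only within a bucket are then
	 * ordered by the comparator's tiebreakers instead.
	 */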
8739 | 0 | for (int b = 0; b < nblockgroups; b++) |
8740 | 0 | { |
8741 | 0 | IndexDeleteCounts *group = blockgroups + b; |
8742 | | |
8743 | | /* Better off falling back on nhtids with low npromisingtids */ |
8744 | 0 | if (group->npromisingtids <= 4) |
8745 | 0 | group->npromisingtids = 4; |
8746 | 0 | else |
8747 | 0 | group->npromisingtids = |
8748 | 0 | pg_nextpower2_32((uint32) group->npromisingtids); |
8749 | 0 | } |
8750 | | |
8751 | | /* Sort groups and rearrange caller's deltids array */ |
8752 | 0 | qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts), |
8753 | 0 | bottomup_sort_and_shrink_cmp); |
8754 | 0 | reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete)); |
8755 | |
|
8756 | 0 | nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups); |
8757 | | /* Determine number of favorable blocks at the start of final deltids */ |
8758 | 0 | nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups, |
8759 | 0 | delstate->deltids); |
8760 | |
|
8761 | 0 | for (int b = 0; b < nblockgroups; b++) |
8762 | 0 | { |
8763 | 0 | IndexDeleteCounts *group = blockgroups + b; |
8764 | 0 | TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid; |
8765 | |
|
8766 | 0 | memcpy(reordereddeltids + ncopied, firstdtid, |
8767 | 0 | sizeof(TM_IndexDelete) * group->ntids); |
8768 | 0 | ncopied += group->ntids; |
8769 | 0 | } |
8770 | | |
8771 | | /* Copy final grouped and sorted TIDs back into start of caller's array */ |
8772 | 0 | memcpy(delstate->deltids, reordereddeltids, |
8773 | 0 | sizeof(TM_IndexDelete) * ncopied); |
8774 | 0 | delstate->ndeltids = ncopied; |
8775 | |
|
8776 | 0 | pfree(reordereddeltids); |
8777 | 0 | pfree(blockgroups); |
8778 | |
|
8779 | 0 | return nblocksfavorable; |
8780 | 0 | } |
8781 | | |
8782 | | /* |
8783 | | * Perform XLogInsert for a heap-visible operation. 'block' is the block |
8784 | | * being marked all-visible, and vm_buffer is the buffer containing the |
8785 | | * corresponding visibility map block. Both should have already been modified |
8786 | | * and dirtied. |
8787 | | * |
8788 | | * snapshotConflictHorizon comes from the largest xmin on the page being |
8789 | | * marked all-visible. REDO routine uses it to generate recovery conflicts. |
8790 | | * |
8791 | | * If checksums or wal_log_hints are enabled, we may also generate a full-page |
8792 | | * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying |
8793 | | * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not* |
8794 | | * update the heap page's LSN. |
8795 | | */ |
8796 | | XLogRecPtr |
8797 | | log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, |
8798 | | TransactionId snapshotConflictHorizon, uint8 vmflags) |
8799 | 0 | { |
8800 | 0 | xl_heap_visible xlrec; |
8801 | 0 | XLogRecPtr recptr; |
8802 | 0 | uint8 flags; |
8803 | |
|
8804 | 0 | Assert(BufferIsValid(heap_buffer)); |
8805 | 0 | Assert(BufferIsValid(vm_buffer)); |
8806 | |
|
8807 | 0 | xlrec.snapshotConflictHorizon = snapshotConflictHorizon; |
8808 | 0 | xlrec.flags = vmflags; |
8809 | 0 | if (RelationIsAccessibleInLogicalDecoding(rel)) |
8810 | 0 | xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL; |
8811 | 0 | XLogBeginInsert(); |
8812 | 0 | XLogRegisterData(&xlrec, SizeOfHeapVisible); |
8813 | |
|
8814 | 0 | XLogRegisterBuffer(0, vm_buffer, 0); |
8815 | |
|
8816 | 0 | flags = REGBUF_STANDARD; |
8817 | 0 | if (!XLogHintBitIsNeeded()) |
8818 | 0 | flags |= REGBUF_NO_IMAGE; |
8819 | 0 | XLogRegisterBuffer(1, heap_buffer, flags); |
8820 | |
|
8821 | 0 | recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE); |
8822 | |
|
8823 | 0 | return recptr; |
8824 | 0 | } |
8825 | | |
8826 | | /* |
8827 | | * Perform XLogInsert for a heap-update operation. Caller must already |
8828 | | * have modified the buffer(s) and marked them dirty. |
8829 | | */ |
8830 | | static XLogRecPtr |
8831 | | log_heap_update(Relation reln, Buffer oldbuf, |
8832 | | Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, |
8833 | | HeapTuple old_key_tuple, |
8834 | | bool all_visible_cleared, bool new_all_visible_cleared) |
8835 | 0 | { |
8836 | 0 | xl_heap_update xlrec; |
8837 | 0 | xl_heap_header xlhdr; |
8838 | 0 | xl_heap_header xlhdr_idx; |
8839 | 0 | uint8 info; |
8840 | 0 | uint16 prefix_suffix[2]; |
8841 | 0 | uint16 prefixlen = 0, |
8842 | 0 | suffixlen = 0; |
8843 | 0 | XLogRecPtr recptr; |
8844 | 0 | Page page = BufferGetPage(newbuf); |
8845 | 0 | bool need_tuple_data = RelationIsLogicallyLogged(reln); |
8846 | 0 | bool init; |
8847 | 0 | int bufflags; |
8848 | | |
8849 | | /* Caller should not call me on a non-WAL-logged relation */ |
8850 | 0 | Assert(RelationNeedsWAL(reln)); |
8851 | |
|
8852 | 0 | XLogBeginInsert(); |
8853 | |
|
8854 | 0 | if (HeapTupleIsHeapOnly(newtup)) |
8855 | 0 | info = XLOG_HEAP_HOT_UPDATE; |
8856 | 0 | else |
8857 | 0 | info = XLOG_HEAP_UPDATE; |
8858 | | |
8859 | | /* |
8860 | | * If the old and new tuple are on the same page, we only need to log the |
8861 | | * parts of the new tuple that were changed. That saves on the amount of |
8862 | | * WAL we need to write. Currently, we just count any unchanged bytes in |
8863 | | * the beginning and end of the tuple. That's quick to check, and |
8864 | | * perfectly covers the common case that only one field is updated. |
8865 | | * |
8866 | | * We could do this even if the old and new tuple are on different pages, |
8867 | | * but only if we don't make a full-page image of the old page, which is |
8868 | | * difficult to know in advance. Also, if the old tuple is corrupt for |
8869 | | * some reason, this would allow the corruption to propagate to the new |
8870 | | * page, so it seems best to avoid it. Under the general assumption that most |
8871 | | * updates tend to create the new tuple version on the same page, there |
8872 | | * isn't much to be gained by doing this across pages anyway. |
8873 | | * |
8874 | | * Skip this if we're taking a full-page image of the new page, as we |
8875 | | * don't include the new tuple in the WAL record in that case. Also |
8876 | | * disable if wal_level='logical', as logical decoding needs to be able to |
8877 | | * read the new tuple in whole from the WAL record alone. |
8878 | | */ |
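	/*
	 * Worked example (editorial note, not part of heapam.c): if the old
	 * tuple's data portion is "AAAAAAAA1234BBBB" and the new one's is
	 * "AAAAAAAA56789BBBB", the loops below find prefixlen = 8 and
	 * suffixlen = 4, so only the 5 changed middle bytes (plus two 2-byte
	 * length fields) are registered instead of all 17 bytes of new data.
	 * A standalone sketch of the same computation follows this function.
	 */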
8879 | 0 | if (oldbuf == newbuf && !need_tuple_data && |
8880 | 0 | !XLogCheckBufferNeedsBackup(newbuf)) |
8881 | 0 | { |
8882 | 0 | char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff; |
8883 | 0 | char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff; |
8884 | 0 | int oldlen = oldtup->t_len - oldtup->t_data->t_hoff; |
8885 | 0 | int newlen = newtup->t_len - newtup->t_data->t_hoff; |
8886 | | |
8887 | | /* Check for common prefix between old and new tuple */ |
8888 | 0 | for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++) |
8889 | 0 | { |
8890 | 0 | if (newp[prefixlen] != oldp[prefixlen]) |
8891 | 0 | break; |
8892 | 0 | } |
8893 | | |
8894 | | /* |
8895 | | * Storing the length of the prefix takes 2 bytes, so we need to save |
8896 | | * at least 3 bytes or there's no point. |
8897 | | */ |
8898 | 0 | if (prefixlen < 3) |
8899 | 0 | prefixlen = 0; |
8900 | | |
8901 | | /* Same for suffix */ |
8902 | 0 | for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++) |
8903 | 0 | { |
8904 | 0 | if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1]) |
8905 | 0 | break; |
8906 | 0 | } |
8907 | 0 | if (suffixlen < 3) |
8908 | 0 | suffixlen = 0; |
8909 | 0 | } |
8910 | | |
8911 | | /* Prepare main WAL data chain */ |
8912 | 0 | xlrec.flags = 0; |
8913 | 0 | if (all_visible_cleared) |
8914 | 0 | xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED; |
8915 | 0 | if (new_all_visible_cleared) |
8916 | 0 | xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED; |
8917 | 0 | if (prefixlen > 0) |
8918 | 0 | xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD; |
8919 | 0 | if (suffixlen > 0) |
8920 | 0 | xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD; |
8921 | 0 | if (need_tuple_data) |
8922 | 0 | { |
8923 | 0 | xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE; |
8924 | 0 | if (old_key_tuple) |
8925 | 0 | { |
8926 | 0 | if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL) |
8927 | 0 | xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE; |
8928 | 0 | else |
8929 | 0 | xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY; |
8930 | 0 | } |
8931 | 0 | } |
8932 | | |
8933 | | /* If the new tuple is the first and only tuple on the page... */ |
8934 | 0 | if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber && |
8935 | 0 | PageGetMaxOffsetNumber(page) == FirstOffsetNumber) |
8936 | 0 | { |
8937 | 0 | info |= XLOG_HEAP_INIT_PAGE; |
8938 | 0 | init = true; |
8939 | 0 | } |
8940 | 0 | else |
8941 | 0 | init = false; |
8942 | | |
8943 | | /* Prepare WAL data for the old page */ |
8944 | 0 | xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self); |
8945 | 0 | xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data); |
8946 | 0 | xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask, |
8947 | 0 | oldtup->t_data->t_infomask2); |
8948 | | |
8949 | | /* Prepare WAL data for the new page */ |
8950 | 0 | xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self); |
8951 | 0 | xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data); |
8952 | |
|
8953 | 0 | bufflags = REGBUF_STANDARD; |
8954 | 0 | if (init) |
8955 | 0 | bufflags |= REGBUF_WILL_INIT; |
8956 | 0 | if (need_tuple_data) |
8957 | 0 | bufflags |= REGBUF_KEEP_DATA; |
8958 | |
|
8959 | 0 | XLogRegisterBuffer(0, newbuf, bufflags); |
8960 | 0 | if (oldbuf != newbuf) |
8961 | 0 | XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD); |
8962 | |
|
8963 | 0 | XLogRegisterData(&xlrec, SizeOfHeapUpdate); |
8964 | | |
8965 | | /* |
8966 | | * Prepare WAL data for the new tuple. |
8967 | | */ |
8968 | 0 | if (prefixlen > 0 || suffixlen > 0) |
8969 | 0 | { |
8970 | 0 | if (prefixlen > 0 && suffixlen > 0) |
8971 | 0 | { |
8972 | 0 | prefix_suffix[0] = prefixlen; |
8973 | 0 | prefix_suffix[1] = suffixlen; |
8974 | 0 | XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2); |
8975 | 0 | } |
8976 | 0 | else if (prefixlen > 0) |
8977 | 0 | { |
8978 | 0 | XLogRegisterBufData(0, &prefixlen, sizeof(uint16)); |
8979 | 0 | } |
8980 | 0 | else |
8981 | 0 | { |
8982 | 0 | XLogRegisterBufData(0, &suffixlen, sizeof(uint16)); |
8983 | 0 | } |
8984 | 0 | } |
8985 | |
|
8986 | 0 | xlhdr.t_infomask2 = newtup->t_data->t_infomask2; |
8987 | 0 | xlhdr.t_infomask = newtup->t_data->t_infomask; |
8988 | 0 | xlhdr.t_hoff = newtup->t_data->t_hoff; |
8989 | 0 | Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len); |
8990 | | |
8991 | | /* |
8992 | | * PG73FORMAT: write bitmap [+ padding] [+ oid] + data |
8993 | | * |
8994 | | * The 'data' doesn't include the common prefix or suffix. |
8995 | | */ |
8996 | 0 | XLogRegisterBufData(0, &xlhdr, SizeOfHeapHeader); |
8997 | 0 | if (prefixlen == 0) |
8998 | 0 | { |
8999 | 0 | XLogRegisterBufData(0, |
9000 | 0 | (char *) newtup->t_data + SizeofHeapTupleHeader, |
9001 | 0 | newtup->t_len - SizeofHeapTupleHeader - suffixlen); |
9002 | 0 | } |
9003 | 0 | else |
9004 | 0 | { |
9005 | | /* |
9006 | | * Have to write the null bitmap and data after the common prefix as |
9007 | | * two separate rdata entries. |
9008 | | */ |
9009 | | /* bitmap [+ padding] [+ oid] */ |
9010 | 0 | if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0) |
9011 | 0 | { |
9012 | 0 | XLogRegisterBufData(0, |
9013 | 0 | (char *) newtup->t_data + SizeofHeapTupleHeader, |
9014 | 0 | newtup->t_data->t_hoff - SizeofHeapTupleHeader); |
9015 | 0 | } |
9016 | | |
9017 | | /* data after common prefix */ |
9018 | 0 | XLogRegisterBufData(0, |
9019 | 0 | (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen, |
9020 | 0 | newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen); |
9021 | 0 | } |
9022 | | |
9023 | | /* We need to log a tuple identity */ |
9024 | 0 | if (need_tuple_data && old_key_tuple) |
9025 | 0 | { |
9026 | | /* not strictly needed, but it makes the record easier to decode */ |
9027 | 0 | xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2; |
9028 | 0 | xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask; |
9029 | 0 | xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff; |
9030 | |
|
9031 | 0 | XLogRegisterData(&xlhdr_idx, SizeOfHeapHeader); |
9032 | | |
9033 | | /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */ |
9034 | 0 | XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader, |
9035 | 0 | old_key_tuple->t_len - SizeofHeapTupleHeader); |
9036 | 0 | } |
9037 | | |
9038 | | /* filtering by origin on a row level is much more efficient */ |
9039 | 0 | XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN); |
9040 | |
|
9041 | 0 | recptr = XLogInsert(RM_HEAP_ID, info); |
9042 | |
|
9043 | 0 | return recptr; |
9044 | 0 | } |
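[Editorial illustration, not part of heapam.c] The prefix/suffix suppression
near the top of log_heap_update() is easy to study on its own. The sketch
below reproduces just that computation for two plain byte buffers, including
the rule that a common prefix or suffix shorter than 3 bytes is not encoded
(storing its length already costs 2 bytes). The helper name and signature are
made up for illustration.

#include <stddef.h>

/* Hypothetical helper: common prefix/suffix lengths of old vs. new data */
static void
example_prefix_suffix(const char *oldp, size_t oldlen,
					  const char *newp, size_t newlen,
					  size_t *prefixlen, size_t *suffixlen)
{
	size_t		minlen = (oldlen < newlen) ? oldlen : newlen;
	size_t		p = 0;
	size_t		s = 0;

	/* Count matching leading bytes */
	while (p < minlen && oldp[p] == newp[p])
		p++;
	/* Storing the prefix length takes 2 bytes; save at least 3 or skip it */
	if (p < 3)
		p = 0;

	/* Count matching trailing bytes, without overlapping the prefix */
	while (s < minlen - p && oldp[oldlen - s - 1] == newp[newlen - s - 1])
		s++;
	if (s < 3)
		s = 0;

	*prefixlen = p;
	*suffixlen = s;
}

With the buffers from the worked example comment inside log_heap_update()
above (16 bytes of old data, 17 bytes of new data), this returns
prefixlen = 8 and suffixlen = 4, leaving 5 bytes of new tuple data to write.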
9045 | | |
9046 | | /* |
9047 | | * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record |
9048 | | * |
9049 | | * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog |
9050 | | * tuples. |
9051 | | */ |
9052 | | static XLogRecPtr |
9053 | | log_heap_new_cid(Relation relation, HeapTuple tup) |
9054 | 0 | { |
9055 | 0 | xl_heap_new_cid xlrec; |
9056 | |
|
9057 | 0 | XLogRecPtr recptr; |
9058 | 0 | HeapTupleHeader hdr = tup->t_data; |
9059 | |
|
9060 | 0 | Assert(ItemPointerIsValid(&tup->t_self)); |
9061 | 0 | Assert(tup->t_tableOid != InvalidOid); |
9062 | |
|
9063 | 0 | xlrec.top_xid = GetTopTransactionId(); |
9064 | 0 | xlrec.target_locator = relation->rd_locator; |
9065 | 0 | xlrec.target_tid = tup->t_self; |
9066 | | |
9067 | | /* |
9068 | | * If the tuple got inserted & deleted in the same TX we definitely have a |
9069 | | * combo CID, set cmin and cmax. |
9070 | | */ |
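	/*
	 * Editorial illustration (not part of heapam.c): a catalog tuple
	 * inserted at command id 2 and deleted at command id 5 by the same
	 * transaction carries a combo CID, so this branch logs cmin = 2,
	 * cmax = 5, and the raw combo CID itself, letting logical decoding
	 * reconstruct both command ids later.
	 */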
9071 | 0 | if (hdr->t_infomask & HEAP_COMBOCID) |
9072 | 0 | { |
9073 | 0 | Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID)); |
9074 | 0 | Assert(!HeapTupleHeaderXminInvalid(hdr)); |
9075 | 0 | xlrec.cmin = HeapTupleHeaderGetCmin(hdr); |
9076 | 0 | xlrec.cmax = HeapTupleHeaderGetCmax(hdr); |
9077 | 0 | xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr); |
9078 | 0 | } |
9079 | | /* No combo CID, so only cmin or cmax can be set by this TX */ |
9080 | 0 | else |
9081 | 0 | { |
9082 | | /* |
9083 | | * Tuple inserted. |
9084 | | * |
9085 | | * We need to check for LOCK ONLY because multixacts might be |
9086 | | * transferred to the new tuple in the case of FOR KEY SHARE updates, |
9087 | | * in which case there will be an xmax even though the tuple was only |
9088 | | * just inserted. |
9089 | | */ |
9090 | 0 | if (hdr->t_infomask & HEAP_XMAX_INVALID || |
9091 | 0 | HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask)) |
9092 | 0 | { |
9093 | 0 | xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr); |
9094 | 0 | xlrec.cmax = InvalidCommandId; |
9095 | 0 | } |
9096 | | /* Tuple from a different tx updated or deleted. */ |
9097 | 0 | else |
9098 | 0 | { |
9099 | 0 | xlrec.cmin = InvalidCommandId; |
9100 | 0 | xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr); |
9101 | 0 | } |
9102 | 0 | xlrec.combocid = InvalidCommandId; |
9103 | 0 | } |
9104 | | |
9105 | | /* |
9106 | | * Note that we don't need to register the buffer here, because this |
9107 | | * operation does not modify the page. The insert/update/delete that |
9108 | | * called us certainly did, but that's WAL-logged separately. |
9109 | | */ |
9110 | 0 | XLogBeginInsert(); |
9111 | 0 | XLogRegisterData(&xlrec, SizeOfHeapNewCid); |
9112 | | |
9113 | | /* will be looked at irrespective of origin */ |
9114 | |
|
9115 | 0 | recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID); |
9116 | |
|
9117 | 0 | return recptr; |
9118 | 0 | } |
9119 | | |
9120 | | /* |
9121 | | * Build a heap tuple representing the configured REPLICA IDENTITY to represent |
9122 | | * the old tuple in an UPDATE or DELETE. |
9123 | | * |
9124 | | * Returns NULL if there's no need to log an identity or if there's no suitable |
9125 | | * key defined. |
9126 | | * |
9127 | | * Pass key_required true if any replica identity columns changed value, or if |
9128 | | * any of them have any external data. Delete must always pass true. |
9129 | | * |
9130 | | * *copy is set to true if the returned tuple is a modified copy rather than |
9131 | | * the same tuple that was passed in. |
9132 | | */ |
9133 | | static HeapTuple |
9134 | | ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, |
9135 | | bool *copy) |
9136 | 0 | { |
9137 | 0 | TupleDesc desc = RelationGetDescr(relation); |
9138 | 0 | char replident = relation->rd_rel->relreplident; |
9139 | 0 | Bitmapset *idattrs; |
9140 | 0 | HeapTuple key_tuple; |
9141 | 0 | bool nulls[MaxHeapAttributeNumber]; |
9142 | 0 | Datum values[MaxHeapAttributeNumber]; |
9143 | |
|
9144 | 0 | *copy = false; |
9145 | |
|
9146 | 0 | if (!RelationIsLogicallyLogged(relation)) |
9147 | 0 | return NULL; |
9148 | | |
9149 | 0 | if (replident == REPLICA_IDENTITY_NOTHING) |
9150 | 0 | return NULL; |
9151 | | |
9152 | 0 | if (replident == REPLICA_IDENTITY_FULL) |
9153 | 0 | { |
9154 | | /* |
9155 | | * When logging the entire old tuple, it very well could contain |
9156 | | * toasted columns. If so, force them to be inlined. |
9157 | | */ |
9158 | 0 | if (HeapTupleHasExternal(tp)) |
9159 | 0 | { |
9160 | 0 | *copy = true; |
9161 | 0 | tp = toast_flatten_tuple(tp, desc); |
9162 | 0 | } |
9163 | 0 | return tp; |
9164 | 0 | } |
9165 | | |
9166 | | /* if the key isn't required and we're only logging the key, we're done */ |
9167 | 0 | if (!key_required) |
9168 | 0 | return NULL; |
9169 | | |
9170 | | /* find out the replica identity columns */ |
9171 | 0 | idattrs = RelationGetIndexAttrBitmap(relation, |
9172 | 0 | INDEX_ATTR_BITMAP_IDENTITY_KEY); |
9173 | | |
9174 | | /* |
9175 | | * If there's no defined replica identity columns, treat as !key_required. |
9176 | | * (This case should not be reachable from heap_update, since that should |
9177 | | * calculate key_required accurately. But heap_delete just passes |
9178 | | * constant true for key_required, so we can hit this case in deletes.) |
9179 | | */ |
9180 | 0 | if (bms_is_empty(idattrs)) |
9181 | 0 | return NULL; |
9182 | | |
9183 | | /* |
9184 | | * Construct a new tuple containing only the replica identity columns, |
9185 | | * with nulls elsewhere. While we're at it, assert that the replica |
9186 | | * identity columns aren't null. |
9187 | | */ |
9188 | 0 | heap_deform_tuple(tp, desc, values, nulls); |
9189 | |
|
9190 | 0 | for (int i = 0; i < desc->natts; i++) |
9191 | 0 | { |
9192 | 0 | if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber, |
9193 | 0 | idattrs)) |
9194 | 0 | Assert(!nulls[i]); |
9195 | 0 | else |
9196 | 0 | nulls[i] = true; |
9197 | 0 | } |
9198 | |
|
9199 | 0 | key_tuple = heap_form_tuple(desc, values, nulls); |
9200 | 0 | *copy = true; |
9201 | |
|
9202 | 0 | bms_free(idattrs); |
9203 | | |
9204 | | /* |
9205 | | * If the tuple, which by here only contains indexed columns, still has |
9206 | | * toasted columns, force them to be inlined. This is somewhat unlikely |
9207 | | * since there are limits on the size of indexed columns, so we don't |
9208 | | * duplicate toast_flatten_tuple()'s functionality in the above loop over |
9209 | | * the indexed columns, even if it would be more efficient. |
9210 | | */ |
9211 | 0 | if (HeapTupleHasExternal(key_tuple)) |
9212 | 0 | { |
9213 | 0 | HeapTuple oldtup = key_tuple; |
9214 | |
|
9215 | 0 | key_tuple = toast_flatten_tuple(oldtup, desc); |
9216 | 0 | heap_freetuple(oldtup); |
9217 | 0 | } |
9218 | |
|
9219 | 0 | return key_tuple; |
9220 | 0 | } |
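/*
 * Editorial illustration (not part of heapam.c): for a hypothetical table
 * t(a int PRIMARY KEY, b text) with the default replica identity and
 * wal_level = logical, a DELETE calls ExtractReplicaIdentity() with
 * key_required = true; the function deforms the old tuple, keeps only
 * column a (the identity key), sets b to NULL, and returns the re-formed
 * key-only tuple with *copy set to true.
 */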
9221 | | |
9222 | | /* |
9223 | | * HeapCheckForSerializableConflictOut |
9224 | | * We are reading a tuple. If it's not visible, there may be a |
9225 | | * rw-conflict out with the inserter. Otherwise, if it is visible to us |
9226 | | * but has been deleted, there may be a rw-conflict out with the deleter. |
9227 | | * |
9228 | | * We will determine the top level xid of the writing transaction with which |
9229 | | * we may be in conflict, and ask CheckForSerializableConflictOut() to check |
9230 | | * for overlap with our own transaction. |
9231 | | * |
9232 | | * This function should be called just about anywhere in heapam.c where a |
9233 | | * tuple has been read. The caller must hold at least a shared lock on the |
9234 | | * buffer, because this function might set hint bits on the tuple. There is |
9235 | | * currently no known reason to call this function from an index AM. |
9236 | | */ |
9237 | | void |
9238 | | HeapCheckForSerializableConflictOut(bool visible, Relation relation, |
9239 | | HeapTuple tuple, Buffer buffer, |
9240 | | Snapshot snapshot) |
9241 | 0 | { |
9242 | 0 | TransactionId xid; |
9243 | 0 | HTSV_Result htsvResult; |
9244 | |
|
9245 | 0 | if (!CheckForSerializableConflictOutNeeded(relation, snapshot)) |
9246 | 0 | return; |
9247 | | |
9248 | | /* |
9249 | | * Check to see whether the tuple has been written to by a concurrent |
9250 | | * transaction, either to create it not visible to us, or to delete it |
9251 | | * while it is visible to us. The "visible" bool indicates whether the |
9252 | | * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else |
9253 | | * is going on with it. |
9254 | | * |
9255 | | * In the event of a concurrently inserted tuple that also happens to have |
9256 | | * been concurrently updated (by a separate transaction), the xmin of the |
9257 | | * tuple will be used -- not the updater's xid. |
9258 | | */ |
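	/*
	 * Editorial illustration (not part of heapam.c): a tuple deleted by a
	 * still-in-progress transaction but visible to us comes back as
	 * HEAPTUPLE_DELETE_IN_PROGRESS with visible = true, so the deleter's
	 * update xid is the potential rw-conflict; an insert that is still in
	 * progress and not visible to us comes back as
	 * HEAPTUPLE_INSERT_IN_PROGRESS, and the inserter's xmin is used instead.
	 */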
9259 | 0 | htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer); |
9260 | 0 | switch (htsvResult) |
9261 | 0 | { |
9262 | 0 | case HEAPTUPLE_LIVE: |
9263 | 0 | if (visible) |
9264 | 0 | return; |
9265 | 0 | xid = HeapTupleHeaderGetXmin(tuple->t_data); |
9266 | 0 | break; |
9267 | 0 | case HEAPTUPLE_RECENTLY_DEAD: |
9268 | 0 | case HEAPTUPLE_DELETE_IN_PROGRESS: |
9269 | 0 | if (visible) |
9270 | 0 | xid = HeapTupleHeaderGetUpdateXid(tuple->t_data); |
9271 | 0 | else |
9272 | 0 | xid = HeapTupleHeaderGetXmin(tuple->t_data); |
9273 | |
|
9274 | 0 | if (TransactionIdPrecedes(xid, TransactionXmin)) |
9275 | 0 | { |
9276 | | /* This is like the HEAPTUPLE_DEAD case */ |
9277 | 0 | Assert(!visible); |
9278 | 0 | return; |
9279 | 0 | } |
9280 | 0 | break; |
9281 | 0 | case HEAPTUPLE_INSERT_IN_PROGRESS: |
9282 | 0 | xid = HeapTupleHeaderGetXmin(tuple->t_data); |
9283 | 0 | break; |
9284 | 0 | case HEAPTUPLE_DEAD: |
9285 | 0 | Assert(!visible); |
9286 | 0 | return; |
9287 | 0 | default: |
9288 | | |
9289 | | /* |
9290 | | * The only way to get to this default clause is if a new value is |
9291 | | * added to the enum type without adding it to this switch |
9292 | | * statement. That's a bug, so elog. |
9293 | | */ |
9294 | 0 | elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult); |
9295 | | |
9296 | | /* |
9297 | | * In spite of having all enum values covered and calling elog on |
9298 | | * this default, some compilers think this is a code path which |
9299 | | * allows xid to be used below without initialization. Silence |
9300 | | * that warning. |
9301 | | */ |
9302 | 0 | xid = InvalidTransactionId; |
9303 | 0 | } |
9304 | | |
9305 | 0 | Assert(TransactionIdIsValid(xid)); |
9306 | 0 | Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); |
9307 | | |
9308 | | /* |
9309 | | * Find top level xid. Bail out if xid is too early to be a conflict, or |
9310 | | * if it's our own xid. |
9311 | | */ |
9312 | 0 | if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) |
9313 | 0 | return; |
9314 | 0 | xid = SubTransGetTopmostTransaction(xid); |
9315 | 0 | if (TransactionIdPrecedes(xid, TransactionXmin)) |
9316 | 0 | return; |
9317 | | |
9318 | 0 | CheckForSerializableConflictOut(relation, xid, snapshot); |
9319 | 0 | } |