/src/postgres/src/backend/access/nbtree/nbtpage.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * nbtpage.c |
4 | | * BTree-specific page management code for the Postgres btree access |
5 | | * method. |
6 | | * |
7 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
8 | | * Portions Copyright (c) 1994, Regents of the University of California |
9 | | * |
10 | | * |
11 | | * IDENTIFICATION |
12 | | * src/backend/access/nbtree/nbtpage.c |
13 | | * |
14 | | * NOTES |
15 | | * Postgres btree pages look like ordinary relation pages. The opaque |
16 | | * data at high addresses includes pointers to left and right siblings |
17 | | * and flag data describing page state. The first page in a btree, page |
18 | | * zero, is special -- it stores meta-information describing the tree. |
19 | | * Pages one and higher store the actual tree data. |
20 | | * |
21 | | *------------------------------------------------------------------------- |
22 | | */ |
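To make the NOTES above concrete, here is a minimal illustrative sketch (not part of this file) of how the per-page opaque data is reached and what it carries; it assumes a pinned, locked buffer buf obtained through the nbtree wrappers defined later in this file.

	/* Illustrative sketch only: inspect a btree page's opaque data */
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);	/* special space, high addresses */
	BlockNumber left = opaque->btpo_prev;	/* left sibling, or P_NONE */
	BlockNumber right = opaque->btpo_next;	/* right sibling, or P_NONE */
	bool		isleaf = P_ISLEAF(opaque);	/* flag bits describe page state */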
23 | | #include "postgres.h" |
24 | | |
25 | | #include "access/nbtree.h" |
26 | | #include "access/nbtxlog.h" |
27 | | #include "access/tableam.h" |
28 | | #include "access/transam.h" |
29 | | #include "access/xlog.h" |
30 | | #include "access/xloginsert.h" |
31 | | #include "common/int.h" |
32 | | #include "miscadmin.h" |
33 | | #include "storage/indexfsm.h" |
34 | | #include "storage/predicate.h" |
35 | | #include "storage/procarray.h" |
36 | | #include "utils/memdebug.h" |
37 | | #include "utils/memutils.h" |
38 | | #include "utils/snapmgr.h" |
39 | | |
40 | | static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf); |
41 | | static void _bt_delitems_delete(Relation rel, Buffer buf, |
42 | | TransactionId snapshotConflictHorizon, |
43 | | bool isCatalogRel, |
44 | | OffsetNumber *deletable, int ndeletable, |
45 | | BTVacuumPosting *updatable, int nupdatable); |
46 | | static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, |
47 | | OffsetNumber *updatedoffsets, |
48 | | Size *updatedbuflen, bool needswal); |
49 | | static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, |
50 | | Buffer leafbuf, BTStack stack); |
51 | | static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, |
52 | | BlockNumber scanblkno, |
53 | | bool *rightsib_empty, |
54 | | BTVacState *vstate); |
55 | | static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel, |
56 | | BlockNumber child, BTStack stack, |
57 | | Buffer *subtreeparent, OffsetNumber *poffset, |
58 | | BlockNumber *topparent, |
59 | | BlockNumber *topparentrightsib); |
60 | | static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, |
61 | | FullTransactionId safexid); |
62 | | |
63 | | /* |
64 | | * _bt_initmetapage() -- Fill a page buffer with a correct metapage image |
65 | | */ |
66 | | void |
67 | | _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, |
68 | | bool allequalimage) |
69 | 0 | { |
70 | 0 | BTMetaPageData *metad; |
71 | 0 | BTPageOpaque metaopaque; |
72 | |
|
73 | 0 | _bt_pageinit(page, BLCKSZ); |
74 | |
|
75 | 0 | metad = BTPageGetMeta(page); |
76 | 0 | metad->btm_magic = BTREE_MAGIC; |
77 | 0 | metad->btm_version = BTREE_VERSION; |
78 | 0 | metad->btm_root = rootbknum; |
79 | 0 | metad->btm_level = level; |
80 | 0 | metad->btm_fastroot = rootbknum; |
81 | 0 | metad->btm_fastlevel = level; |
82 | 0 | metad->btm_last_cleanup_num_delpages = 0; |
83 | 0 | metad->btm_last_cleanup_num_heap_tuples = -1.0; |
84 | 0 | metad->btm_allequalimage = allequalimage; |
85 | |
|
86 | 0 | metaopaque = BTPageGetOpaque(page); |
87 | 0 | metaopaque->btpo_flags = BTP_META; |
88 | | |
89 | | /* |
90 | | * Set pd_lower just past the end of the metadata. This is essential, |
91 | | * because without doing so, metadata will be lost if xlog.c compresses |
92 | | * the page. |
93 | | */ |
94 | 0 | ((PageHeader) page)->pd_lower = |
95 | 0 | ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; |
96 | 0 | } |
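Usage sketch (not taken from this file; roughly what an empty-index build path might do, with allequalimage assumed to be supplied by the caller): the metapage image can be assembled in local memory before it is ever written out.

	/* Sketch: build an empty-index metapage image in a scratch page */
	Page		metapage = (Page) palloc(BLCKSZ);

	/* no root exists yet, so record P_NONE at level 0 */
	_bt_initmetapage(metapage, P_NONE, 0, allequalimage);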
97 | | |
98 | | /* |
99 | | * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version |
100 | | * 3, the last version that can be updated without broadly affecting |
101 | | * on-disk compatibility. (A REINDEX is required to upgrade to v4.) |
102 | | * |
103 | | * This routine does purely in-memory image upgrade. Caller is |
104 | | * responsible for locking, WAL-logging etc. |
105 | | */ |
106 | | void |
107 | | _bt_upgrademetapage(Page page) |
108 | 0 | { |
109 | 0 | BTMetaPageData *metad; |
110 | 0 | BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY; |
111 | |
|
112 | 0 | metad = BTPageGetMeta(page); |
113 | 0 | metaopaque = BTPageGetOpaque(page); |
114 | | |
115 | | /* It must be really a meta page of upgradable version */ |
116 | 0 | Assert(metaopaque->btpo_flags & BTP_META); |
117 | 0 | Assert(metad->btm_version < BTREE_NOVAC_VERSION); |
118 | 0 | Assert(metad->btm_version >= BTREE_MIN_VERSION); |
119 | | |
120 | | /* Set version number and fill extra fields added into version 3 */ |
121 | 0 | metad->btm_version = BTREE_NOVAC_VERSION; |
122 | 0 | metad->btm_last_cleanup_num_delpages = 0; |
123 | 0 | metad->btm_last_cleanup_num_heap_tuples = -1.0; |
124 | | /* Only a REINDEX can set this field */ |
125 | 0 | Assert(!metad->btm_allequalimage); |
126 | 0 | metad->btm_allequalimage = false; |
127 | | |
128 | | /* Adjust pd_lower (see _bt_initmetapage() for details) */ |
129 | 0 | ((PageHeader) page)->pd_lower = |
130 | 0 | ((char *) metad + sizeof(BTMetaPageData)) - (char *) page; |
131 | 0 | } |
132 | | |
133 | | /* |
134 | | * Get metadata from share-locked buffer containing metapage, while performing |
135 | | * standard sanity checks. |
136 | | * |
137 | | * Callers that cache data returned here in local cache should note that an |
138 | | * on-the-fly upgrade using _bt_upgrademetapage() can change the version field |
139 | | * and BTREE_NOVAC_VERSION specific fields without invalidating local cache. |
140 | | */ |
141 | | static BTMetaPageData * |
142 | | _bt_getmeta(Relation rel, Buffer metabuf) |
143 | 0 | { |
144 | 0 | Page metapg; |
145 | 0 | BTPageOpaque metaopaque; |
146 | 0 | BTMetaPageData *metad; |
147 | |
|
148 | 0 | metapg = BufferGetPage(metabuf); |
149 | 0 | metaopaque = BTPageGetOpaque(metapg); |
150 | 0 | metad = BTPageGetMeta(metapg); |
151 | | |
152 | | /* sanity-check the metapage */ |
153 | 0 | if (!P_ISMETA(metaopaque) || |
154 | 0 | metad->btm_magic != BTREE_MAGIC) |
155 | 0 | ereport(ERROR, |
156 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
157 | 0 | errmsg("index \"%s\" is not a btree", |
158 | 0 | RelationGetRelationName(rel)))); |
159 | | |
160 | 0 | if (metad->btm_version < BTREE_MIN_VERSION || |
161 | 0 | metad->btm_version > BTREE_VERSION) |
162 | 0 | ereport(ERROR, |
163 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
164 | 0 | errmsg("version mismatch in index \"%s\": file version %d, " |
165 | 0 | "current version %d, minimal supported version %d", |
166 | 0 | RelationGetRelationName(rel), |
167 | 0 | metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); |
168 | | |
169 | 0 | return metad; |
170 | 0 | } |
171 | | |
172 | | /* |
173 | | * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup |
174 | | * |
175 | | * Called by btvacuumcleanup when btbulkdelete was never called because no |
176 | | * index tuples needed to be deleted. |
177 | | */ |
178 | | bool |
179 | | _bt_vacuum_needs_cleanup(Relation rel) |
180 | 0 | { |
181 | 0 | Buffer metabuf; |
182 | 0 | Page metapg; |
183 | 0 | BTMetaPageData *metad; |
184 | 0 | uint32 btm_version; |
185 | 0 | BlockNumber prev_num_delpages; |
186 | | |
187 | | /* |
188 | | * Copy details from metapage to local variables quickly. |
189 | | * |
190 | | * Note that we deliberately avoid using cached version of metapage here. |
191 | | */ |
192 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
193 | 0 | metapg = BufferGetPage(metabuf); |
194 | 0 | metad = BTPageGetMeta(metapg); |
195 | 0 | btm_version = metad->btm_version; |
196 | |
|
197 | 0 | if (btm_version < BTREE_NOVAC_VERSION) |
198 | 0 | { |
199 | | /* |
200 | | * Metapage needs to be dynamically upgraded to store fields that are |
201 | | * only present when btm_version >= BTREE_NOVAC_VERSION |
202 | | */ |
203 | 0 | _bt_relbuf(rel, metabuf); |
204 | 0 | return true; |
205 | 0 | } |
206 | | |
207 | 0 | prev_num_delpages = metad->btm_last_cleanup_num_delpages; |
208 | 0 | _bt_relbuf(rel, metabuf); |
209 | | |
210 | | /* |
211 | | * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the |
212 | | * total size of the index. We can reasonably expect (though are not |
213 | | * guaranteed) to be able to recycle this many pages if we decide to do a |
214 | | * btvacuumscan call during the ongoing btvacuumcleanup. For further |
215 | | * details see the nbtree/README section on placing deleted pages in the |
216 | | * FSM. |
217 | | */ |
218 | 0 | if (prev_num_delpages > 0 && |
219 | 0 | prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20) |
220 | 0 | return true; |
221 | | |
222 | 0 | return false; |
223 | 0 | } |
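To illustrate the trigger condition with concrete numbers (example values, not from the source): a 10,000-block index has a threshold of 10,000 / 20 = 500 pages, so cleanup is requested only once more than 500 previously deleted pages are recorded in the metapage. The same test, spelled out:

	/* Sketch: cleanup triggers when deleted pages exceed 5% of the index */
	BlockNumber threshold = RelationGetNumberOfBlocks(rel) / 20;
	bool		cleanup = (prev_num_delpages > 0 && prev_num_delpages > threshold);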
224 | | |
225 | | /* |
226 | | * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup. |
227 | | * |
228 | | * Called at the end of btvacuumcleanup, when num_delpages value has been |
229 | | * finalized. |
230 | | */ |
231 | | void |
232 | | _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages) |
233 | 0 | { |
234 | 0 | Buffer metabuf; |
235 | 0 | Page metapg; |
236 | 0 | BTMetaPageData *metad; |
237 | | |
238 | | /* |
239 | | * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage |
240 | | * field started out as a TransactionId field called btm_oldest_btpo_xact. |
241 | | * Both "versions" are just uint32 fields. It was convenient to repurpose |
242 | | * the field when we began to use 64-bit XIDs in deleted pages. |
243 | | * |
244 | | * It's possible that a pg_upgrade'd database will contain an XID value in |
245 | | * what is now recognized as the metapage's btm_last_cleanup_num_delpages |
246 | | * field. _bt_vacuum_needs_cleanup() may even believe that this value |
247 | | * indicates that there are lots of pages that it needs to recycle, when |
248 | | * in reality there are only one or two. The worst that can happen is |
249 | | * that there will be a call to btvacuumscan a little earlier, which will |
250 | | * set btm_last_cleanup_num_delpages to a sane value when we're called. |
251 | | * |
252 | | * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is |
253 | | * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just |
254 | | * to be consistent. |
255 | | */ |
256 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
257 | 0 | metapg = BufferGetPage(metabuf); |
258 | 0 | metad = BTPageGetMeta(metapg); |
259 | | |
260 | | /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */ |
261 | 0 | if (metad->btm_version >= BTREE_NOVAC_VERSION && |
262 | 0 | metad->btm_last_cleanup_num_delpages == num_delpages) |
263 | 0 | { |
264 | | /* Usually means index continues to have num_delpages of 0 */ |
265 | 0 | _bt_relbuf(rel, metabuf); |
266 | 0 | return; |
267 | 0 | } |
268 | | |
269 | | /* trade in our read lock for a write lock */ |
270 | 0 | _bt_unlockbuf(rel, metabuf); |
271 | 0 | _bt_lockbuf(rel, metabuf, BT_WRITE); |
272 | |
|
273 | 0 | START_CRIT_SECTION(); |
274 | | |
275 | | /* upgrade meta-page if needed */ |
276 | 0 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
277 | 0 | _bt_upgrademetapage(metapg); |
278 | | |
279 | | /* update cleanup-related information */ |
280 | 0 | metad->btm_last_cleanup_num_delpages = num_delpages; |
281 | 0 | metad->btm_last_cleanup_num_heap_tuples = -1.0; |
282 | 0 | MarkBufferDirty(metabuf); |
283 | | |
284 | | /* write wal record if needed */ |
285 | 0 | if (RelationNeedsWAL(rel)) |
286 | 0 | { |
287 | 0 | xl_btree_metadata md; |
288 | 0 | XLogRecPtr recptr; |
289 | |
|
290 | 0 | XLogBeginInsert(); |
291 | 0 | XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); |
292 | |
|
293 | 0 | Assert(metad->btm_version >= BTREE_NOVAC_VERSION); |
294 | 0 | md.version = metad->btm_version; |
295 | 0 | md.root = metad->btm_root; |
296 | 0 | md.level = metad->btm_level; |
297 | 0 | md.fastroot = metad->btm_fastroot; |
298 | 0 | md.fastlevel = metad->btm_fastlevel; |
299 | 0 | md.last_cleanup_num_delpages = num_delpages; |
300 | 0 | md.allequalimage = metad->btm_allequalimage; |
301 | |
|
302 | 0 | XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata)); |
303 | |
|
304 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP); |
305 | |
|
306 | 0 | PageSetLSN(metapg, recptr); |
307 | 0 | } |
308 | |
|
309 | 0 | END_CRIT_SECTION(); |
310 | |
|
311 | 0 | _bt_relbuf(rel, metabuf); |
312 | 0 | } |
313 | | |
314 | | /* |
315 | | * _bt_getroot() -- Get the root page of the btree. |
316 | | * |
317 | | * Since the root page can move around the btree file, we have to read |
318 | | * its location from the metadata page, and then read the root page |
319 | | * itself. If no root page exists yet, we have to create one. |
320 | | * |
321 | | * The access type parameter (BT_READ or BT_WRITE) controls whether |
322 | | * a new root page will be created or not. If access = BT_READ, |
323 | | * and no root page exists, we just return InvalidBuffer. For |
324 | | * BT_WRITE, we try to create the root page if it doesn't exist. |
325 | | * NOTE that the returned root page will have only a read lock set |
326 | | * on it even if access = BT_WRITE! |
327 | | * |
328 | | * If access = BT_WRITE, heaprel must be set; otherwise caller can just |
329 | | * pass NULL. See _bt_allocbuf for an explanation. |
330 | | * |
331 | | * The returned page is not necessarily the true root --- it could be |
332 | | * a "fast root" (a page that is alone in its level due to deletions). |
333 | | * Also, if the root page is split while we are "in flight" to it, |
334 | | * what we will return is the old root, which is now just the leftmost |
335 | | * page on a probably-not-very-wide level. For most purposes this is |
336 | | * as good as or better than the true root, so we do not bother to |
337 | | * insist on finding the true root. We do, however, guarantee to |
338 | | * return a live (not deleted or half-dead) page. |
339 | | * |
340 | | * On successful return, the root page is pinned and read-locked. |
341 | | * The metadata page is not locked or pinned on exit. |
342 | | */ |
343 | | Buffer |
344 | | _bt_getroot(Relation rel, Relation heaprel, int access) |
345 | 0 | { |
346 | 0 | Buffer metabuf; |
347 | 0 | Buffer rootbuf; |
348 | 0 | Page rootpage; |
349 | 0 | BTPageOpaque rootopaque; |
350 | 0 | BlockNumber rootblkno; |
351 | 0 | uint32 rootlevel; |
352 | 0 | BTMetaPageData *metad; |
353 | |
|
354 | 0 | Assert(access == BT_READ || heaprel != NULL); |
355 | | |
356 | | /* |
357 | | * Try to use previously-cached metapage data to find the root. This |
358 | | * normally saves one buffer access per index search, which is a very |
359 | | * helpful savings in bufmgr traffic and hence contention. |
360 | | */ |
361 | 0 | if (rel->rd_amcache != NULL) |
362 | 0 | { |
363 | 0 | metad = (BTMetaPageData *) rel->rd_amcache; |
364 | | /* We shouldn't have cached it if any of these fail */ |
365 | 0 | Assert(metad->btm_magic == BTREE_MAGIC); |
366 | 0 | Assert(metad->btm_version >= BTREE_MIN_VERSION); |
367 | 0 | Assert(metad->btm_version <= BTREE_VERSION); |
368 | 0 | Assert(!metad->btm_allequalimage || |
369 | 0 | metad->btm_version > BTREE_NOVAC_VERSION); |
370 | 0 | Assert(metad->btm_root != P_NONE); |
371 | |
|
372 | 0 | rootblkno = metad->btm_fastroot; |
373 | 0 | Assert(rootblkno != P_NONE); |
374 | 0 | rootlevel = metad->btm_fastlevel; |
375 | |
|
376 | 0 | rootbuf = _bt_getbuf(rel, rootblkno, BT_READ); |
377 | 0 | rootpage = BufferGetPage(rootbuf); |
378 | 0 | rootopaque = BTPageGetOpaque(rootpage); |
379 | | |
380 | | /* |
381 | | * Since the cache might be stale, we check the page more carefully |
382 | | * here than normal. We *must* check that it's not deleted. If it's |
383 | | * not alone on its level, then we reject too --- this may be overly |
384 | | * paranoid but better safe than sorry. Note we don't check P_ISROOT, |
385 | | * because that's not set in a "fast root". |
386 | | */ |
387 | 0 | if (!P_IGNORE(rootopaque) && |
388 | 0 | rootopaque->btpo_level == rootlevel && |
389 | 0 | P_LEFTMOST(rootopaque) && |
390 | 0 | P_RIGHTMOST(rootopaque)) |
391 | 0 | { |
392 | | /* OK, accept cached page as the root */ |
393 | 0 | return rootbuf; |
394 | 0 | } |
395 | 0 | _bt_relbuf(rel, rootbuf); |
396 | | /* Cache is stale, throw it away */ |
397 | 0 | if (rel->rd_amcache) |
398 | 0 | pfree(rel->rd_amcache); |
399 | 0 | rel->rd_amcache = NULL; |
400 | 0 | } |
401 | | |
402 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
403 | 0 | metad = _bt_getmeta(rel, metabuf); |
404 | | |
405 | | /* if no root page initialized yet, do it */ |
406 | 0 | if (metad->btm_root == P_NONE) |
407 | 0 | { |
408 | 0 | Page metapg; |
409 | | |
410 | | /* If access = BT_READ, caller doesn't want us to create root yet */ |
411 | 0 | if (access == BT_READ) |
412 | 0 | { |
413 | 0 | _bt_relbuf(rel, metabuf); |
414 | 0 | return InvalidBuffer; |
415 | 0 | } |
416 | | |
417 | | /* trade in our read lock for a write lock */ |
418 | 0 | _bt_unlockbuf(rel, metabuf); |
419 | 0 | _bt_lockbuf(rel, metabuf, BT_WRITE); |
420 | | |
421 | | /* |
422 | | * Race condition: if someone else initialized the metadata between |
423 | | * the time we released the read lock and acquired the write lock, we |
424 | | * must avoid doing it again. |
425 | | */ |
426 | 0 | if (metad->btm_root != P_NONE) |
427 | 0 | { |
428 | | /* |
429 | | * Metadata initialized by someone else. In order to guarantee no |
430 | | * deadlocks, we have to release the metadata page and start all |
431 | | * over again. (Is that really true? But it's hardly worth trying |
432 | | * to optimize this case.) |
433 | | */ |
434 | 0 | _bt_relbuf(rel, metabuf); |
435 | 0 | return _bt_getroot(rel, heaprel, access); |
436 | 0 | } |
437 | | |
438 | | /* |
439 | | * Get, initialize, write, and leave a lock of the appropriate type on |
440 | | * the new root page. Since this is the first page in the tree, it's |
441 | | * a leaf as well as the root. |
442 | | */ |
443 | 0 | rootbuf = _bt_allocbuf(rel, heaprel); |
444 | 0 | rootblkno = BufferGetBlockNumber(rootbuf); |
445 | 0 | rootpage = BufferGetPage(rootbuf); |
446 | 0 | rootopaque = BTPageGetOpaque(rootpage); |
447 | 0 | rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; |
448 | 0 | rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT); |
449 | 0 | rootopaque->btpo_level = 0; |
450 | 0 | rootopaque->btpo_cycleid = 0; |
451 | | /* Get raw page pointer for metapage */ |
452 | 0 | metapg = BufferGetPage(metabuf); |
453 | | |
454 | | /* NO ELOG(ERROR) till meta is updated */ |
455 | 0 | START_CRIT_SECTION(); |
456 | | |
457 | | /* upgrade metapage if needed */ |
458 | 0 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
459 | 0 | _bt_upgrademetapage(metapg); |
460 | |
|
461 | 0 | metad->btm_root = rootblkno; |
462 | 0 | metad->btm_level = 0; |
463 | 0 | metad->btm_fastroot = rootblkno; |
464 | 0 | metad->btm_fastlevel = 0; |
465 | 0 | metad->btm_last_cleanup_num_delpages = 0; |
466 | 0 | metad->btm_last_cleanup_num_heap_tuples = -1.0; |
467 | |
|
468 | 0 | MarkBufferDirty(rootbuf); |
469 | 0 | MarkBufferDirty(metabuf); |
470 | | |
471 | | /* XLOG stuff */ |
472 | 0 | if (RelationNeedsWAL(rel)) |
473 | 0 | { |
474 | 0 | xl_btree_newroot xlrec; |
475 | 0 | XLogRecPtr recptr; |
476 | 0 | xl_btree_metadata md; |
477 | |
|
478 | 0 | XLogBeginInsert(); |
479 | 0 | XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); |
480 | 0 | XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); |
481 | |
|
482 | 0 | Assert(metad->btm_version >= BTREE_NOVAC_VERSION); |
483 | 0 | md.version = metad->btm_version; |
484 | 0 | md.root = rootblkno; |
485 | 0 | md.level = 0; |
486 | 0 | md.fastroot = rootblkno; |
487 | 0 | md.fastlevel = 0; |
488 | 0 | md.last_cleanup_num_delpages = 0; |
489 | 0 | md.allequalimage = metad->btm_allequalimage; |
490 | |
|
491 | 0 | XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata)); |
492 | |
|
493 | 0 | xlrec.rootblk = rootblkno; |
494 | 0 | xlrec.level = 0; |
495 | |
|
496 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeNewroot); |
497 | |
|
498 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); |
499 | |
|
500 | 0 | PageSetLSN(rootpage, recptr); |
501 | 0 | PageSetLSN(metapg, recptr); |
502 | 0 | } |
503 | |
|
504 | 0 | END_CRIT_SECTION(); |
505 | | |
506 | | /* |
507 | | * swap root write lock for read lock. There is no danger of anyone |
508 | | * else accessing the new root page while it's unlocked, since no one |
509 | | * else knows where it is yet. |
510 | | */ |
511 | 0 | _bt_unlockbuf(rel, rootbuf); |
512 | 0 | _bt_lockbuf(rel, rootbuf, BT_READ); |
513 | | |
514 | | /* okay, metadata is correct, release lock on it without caching */ |
515 | 0 | _bt_relbuf(rel, metabuf); |
516 | 0 | } |
517 | 0 | else |
518 | 0 | { |
519 | 0 | rootblkno = metad->btm_fastroot; |
520 | 0 | Assert(rootblkno != P_NONE); |
521 | 0 | rootlevel = metad->btm_fastlevel; |
522 | | |
523 | | /* |
524 | | * Cache the metapage data for next time |
525 | | */ |
526 | 0 | rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, |
527 | 0 | sizeof(BTMetaPageData)); |
528 | 0 | memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); |
529 | | |
530 | | /* |
531 | | * We are done with the metapage; arrange to release it via first |
532 | | * _bt_relandgetbuf call |
533 | | */ |
534 | 0 | rootbuf = metabuf; |
535 | |
|
536 | 0 | for (;;) |
537 | 0 | { |
538 | 0 | rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); |
539 | 0 | rootpage = BufferGetPage(rootbuf); |
540 | 0 | rootopaque = BTPageGetOpaque(rootpage); |
541 | |
|
542 | 0 | if (!P_IGNORE(rootopaque)) |
543 | 0 | break; |
544 | | |
545 | | /* it's dead, Jim. step right one page */ |
546 | 0 | if (P_RIGHTMOST(rootopaque)) |
547 | 0 | elog(ERROR, "no live root page found in index \"%s\"", |
548 | 0 | RelationGetRelationName(rel)); |
549 | 0 | rootblkno = rootopaque->btpo_next; |
550 | 0 | } |
551 | | |
552 | 0 | if (rootopaque->btpo_level != rootlevel) |
553 | 0 | elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", |
554 | 0 | rootblkno, RelationGetRelationName(rel), |
555 | 0 | rootopaque->btpo_level, rootlevel); |
556 | 0 | } |
557 | | |
558 | | /* |
559 | | * By here, we have a pin and read lock on the root page, and no lock set |
560 | | * on the metadata page. Return the root page's buffer. |
561 | | */ |
562 | 0 | return rootbuf; |
563 | 0 | } |
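A sketch of the typical caller pattern for a read-only descent (variable names are illustrative, not from this file); note that heaprel may be NULL because access is BT_READ:

	/* Sketch: fetch the (fast) root page for a read-only descent */
	Buffer		rootbuf = _bt_getroot(rel, NULL, BT_READ);

	if (!BufferIsValid(rootbuf))
		return;					/* empty index: no root page exists yet */

	/* ... descend from the root while holding its pin and read lock ... */

	_bt_relbuf(rel, rootbuf);	/* drop the read lock and pin when done */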
564 | | |
565 | | /* |
566 | | * _bt_gettrueroot() -- Get the true root page of the btree. |
567 | | * |
568 | | * This is the same as the BT_READ case of _bt_getroot(), except |
569 | | * we follow the true-root link not the fast-root link. |
570 | | * |
571 | | * By the time we acquire lock on the root page, it might have been split and |
572 | | * not be the true root anymore. This is okay for the present uses of this |
573 | | * routine; we only really need to be able to move up at least one tree level |
574 | | * from whatever non-root page we were at. If we ever do need to lock the |
575 | | * one true root page, we could loop here, re-reading the metapage on each |
576 | | * failure. (Note that it wouldn't do to hold the lock on the metapage while |
577 | | * moving to the root --- that'd deadlock against any concurrent root split.) |
578 | | */ |
579 | | Buffer |
580 | | _bt_gettrueroot(Relation rel) |
581 | 0 | { |
582 | 0 | Buffer metabuf; |
583 | 0 | Page metapg; |
584 | 0 | BTPageOpaque metaopaque; |
585 | 0 | Buffer rootbuf; |
586 | 0 | Page rootpage; |
587 | 0 | BTPageOpaque rootopaque; |
588 | 0 | BlockNumber rootblkno; |
589 | 0 | uint32 rootlevel; |
590 | 0 | BTMetaPageData *metad; |
591 | | |
592 | | /* |
593 | | * We don't try to use cached metapage data here, since (a) this path is |
594 | | * not performance-critical, and (b) if we are here it suggests our cache |
595 | | * is out-of-date anyway. In light of point (b), it's probably safest to |
596 | | * actively flush any cached metapage info. |
597 | | */ |
598 | 0 | if (rel->rd_amcache) |
599 | 0 | pfree(rel->rd_amcache); |
600 | 0 | rel->rd_amcache = NULL; |
601 | |
|
602 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
603 | 0 | metapg = BufferGetPage(metabuf); |
604 | 0 | metaopaque = BTPageGetOpaque(metapg); |
605 | 0 | metad = BTPageGetMeta(metapg); |
606 | |
|
607 | 0 | if (!P_ISMETA(metaopaque) || |
608 | 0 | metad->btm_magic != BTREE_MAGIC) |
609 | 0 | ereport(ERROR, |
610 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
611 | 0 | errmsg("index \"%s\" is not a btree", |
612 | 0 | RelationGetRelationName(rel)))); |
613 | | |
614 | 0 | if (metad->btm_version < BTREE_MIN_VERSION || |
615 | 0 | metad->btm_version > BTREE_VERSION) |
616 | 0 | ereport(ERROR, |
617 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
618 | 0 | errmsg("version mismatch in index \"%s\": file version %d, " |
619 | 0 | "current version %d, minimal supported version %d", |
620 | 0 | RelationGetRelationName(rel), |
621 | 0 | metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION))); |
622 | | |
623 | | /* if no root page initialized yet, fail */ |
624 | 0 | if (metad->btm_root == P_NONE) |
625 | 0 | { |
626 | 0 | _bt_relbuf(rel, metabuf); |
627 | 0 | return InvalidBuffer; |
628 | 0 | } |
629 | | |
630 | 0 | rootblkno = metad->btm_root; |
631 | 0 | rootlevel = metad->btm_level; |
632 | | |
633 | | /* |
634 | | * We are done with the metapage; arrange to release it via first |
635 | | * _bt_relandgetbuf call |
636 | | */ |
637 | 0 | rootbuf = metabuf; |
638 | |
|
639 | 0 | for (;;) |
640 | 0 | { |
641 | 0 | rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ); |
642 | 0 | rootpage = BufferGetPage(rootbuf); |
643 | 0 | rootopaque = BTPageGetOpaque(rootpage); |
644 | |
|
645 | 0 | if (!P_IGNORE(rootopaque)) |
646 | 0 | break; |
647 | | |
648 | | /* it's dead, Jim. step right one page */ |
649 | 0 | if (P_RIGHTMOST(rootopaque)) |
650 | 0 | elog(ERROR, "no live root page found in index \"%s\"", |
651 | 0 | RelationGetRelationName(rel)); |
652 | 0 | rootblkno = rootopaque->btpo_next; |
653 | 0 | } |
654 | | |
655 | 0 | if (rootopaque->btpo_level != rootlevel) |
656 | 0 | elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u", |
657 | 0 | rootblkno, RelationGetRelationName(rel), |
658 | 0 | rootopaque->btpo_level, rootlevel); |
659 | | |
660 | 0 | return rootbuf; |
661 | 0 | } |
662 | | |
663 | | /* |
664 | | * _bt_getrootheight() -- Get the height of the btree search tree. |
665 | | * |
666 | | * We return the level (counting from zero) of the current fast root. |
667 | | * This represents the number of tree levels we'd have to descend through |
668 | | * to start any btree index search. |
669 | | * |
670 | | * This is used by the planner for cost-estimation purposes. Since it's |
671 | | * only an estimate, slightly-stale data is fine, hence we don't worry |
672 | | * about updating previously cached data. |
673 | | */ |
674 | | int |
675 | | _bt_getrootheight(Relation rel) |
676 | 0 | { |
677 | 0 | BTMetaPageData *metad; |
678 | |
|
679 | 0 | if (rel->rd_amcache == NULL) |
680 | 0 | { |
681 | 0 | Buffer metabuf; |
682 | |
|
683 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
684 | 0 | metad = _bt_getmeta(rel, metabuf); |
685 | | |
686 | | /* |
687 | | * If there's no root page yet, _bt_getroot() doesn't expect a cache |
688 | | * to be made, so just stop here and report the index height is zero. |
689 | | * (XXX perhaps _bt_getroot() should be changed to allow this case.) |
690 | | */ |
691 | 0 | if (metad->btm_root == P_NONE) |
692 | 0 | { |
693 | 0 | _bt_relbuf(rel, metabuf); |
694 | 0 | return 0; |
695 | 0 | } |
696 | | |
697 | | /* |
698 | | * Cache the metapage data for next time |
699 | | */ |
700 | 0 | rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, |
701 | 0 | sizeof(BTMetaPageData)); |
702 | 0 | memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); |
703 | 0 | _bt_relbuf(rel, metabuf); |
704 | 0 | } |
705 | | |
706 | | /* Get cached page */ |
707 | 0 | metad = (BTMetaPageData *) rel->rd_amcache; |
708 | | /* We shouldn't have cached it if any of these fail */ |
709 | 0 | Assert(metad->btm_magic == BTREE_MAGIC); |
710 | 0 | Assert(metad->btm_version >= BTREE_MIN_VERSION); |
711 | 0 | Assert(metad->btm_version <= BTREE_VERSION); |
712 | 0 | Assert(!metad->btm_allequalimage || |
713 | 0 | metad->btm_version > BTREE_NOVAC_VERSION); |
714 | 0 | Assert(metad->btm_fastroot != P_NONE); |
715 | |
|
716 | 0 | return metad->btm_fastlevel; |
717 | 0 | } |
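For context, a sketch of how a cost-estimation caller might consume the result (the constant below is purely illustrative, and cpu_operator_cost is assumed to come from optimizer/cost.h; the planner's real formula lives elsewhere):

	/* Sketch: treat the fast-root level as the number of levels to descend */
	int			tree_height = _bt_getrootheight(rel);
	double		descent_cost = (tree_height + 1) * 50.0 * cpu_operator_cost;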
718 | | |
719 | | /* |
720 | | * _bt_metaversion() -- Get version/status info from metapage. |
721 | | * |
722 | | * Sets caller's *heapkeyspace and *allequalimage arguments using data |
723 | | * from the B-Tree metapage (could be locally-cached version). This |
724 | | * information needs to be stashed in insertion scankey, so we provide a |
725 | | * single function that fetches both at once. |
726 | | * |
727 | | * This is used to determine the rules that must be used to descend a |
728 | | * btree. Version 4 indexes treat heap TID as a tiebreaker attribute. |
729 | | * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable |
730 | | * performance when inserting a new BTScanInsert-wise duplicate tuple |
731 | | * among many leaf pages already full of such duplicates. |
732 | | * |
733 | | * Also sets allequalimage field, which indicates whether or not it is |
734 | | * safe to apply deduplication. We rely on the assumption that |
735 | | * btm_allequalimage will be zero'ed on heapkeyspace indexes that were |
736 | | * pg_upgrade'd from Postgres 12. |
737 | | */ |
738 | | void |
739 | | _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage) |
740 | 0 | { |
741 | 0 | BTMetaPageData *metad; |
742 | |
|
743 | 0 | if (rel->rd_amcache == NULL) |
744 | 0 | { |
745 | 0 | Buffer metabuf; |
746 | |
|
747 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ); |
748 | 0 | metad = _bt_getmeta(rel, metabuf); |
749 | | |
750 | | /* |
751 | | * If there's no root page yet, _bt_getroot() doesn't expect a cache |
752 | | * to be made, so just stop here. (XXX perhaps _bt_getroot() should |
753 | | * be changed to allow this case.) |
754 | | */ |
755 | 0 | if (metad->btm_root == P_NONE) |
756 | 0 | { |
757 | 0 | *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; |
758 | 0 | *allequalimage = metad->btm_allequalimage; |
759 | |
|
760 | 0 | _bt_relbuf(rel, metabuf); |
761 | 0 | return; |
762 | 0 | } |
763 | | |
764 | | /* |
765 | | * Cache the metapage data for next time |
766 | | * |
767 | | * An on-the-fly version upgrade performed by _bt_upgrademetapage() |
768 | | * can change the nbtree version for an index without invalidating any |
769 | | * local cache. This is okay because it can only happen when moving |
770 | | * from version 2 to version 3, both of which are !heapkeyspace |
771 | | * versions. |
772 | | */ |
773 | 0 | rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt, |
774 | 0 | sizeof(BTMetaPageData)); |
775 | 0 | memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData)); |
776 | 0 | _bt_relbuf(rel, metabuf); |
777 | 0 | } |
778 | | |
779 | | /* Get cached page */ |
780 | 0 | metad = (BTMetaPageData *) rel->rd_amcache; |
781 | | /* We shouldn't have cached it if any of these fail */ |
782 | 0 | Assert(metad->btm_magic == BTREE_MAGIC); |
783 | 0 | Assert(metad->btm_version >= BTREE_MIN_VERSION); |
784 | 0 | Assert(metad->btm_version <= BTREE_VERSION); |
785 | 0 | Assert(!metad->btm_allequalimage || |
786 | 0 | metad->btm_version > BTREE_NOVAC_VERSION); |
787 | 0 | Assert(metad->btm_fastroot != P_NONE); |
788 | |
|
789 | 0 | *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION; |
790 | 0 | *allequalimage = metad->btm_allequalimage; |
791 | 0 | } |
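Usage sketch (illustrative; in practice the two flags would be copied into the insertion scankey being built, whose field names are assumed from nbtree.h):

	/* Sketch: fetch both flags once, before building an insertion scankey */
	bool		heapkeyspace;
	bool		allequalimage;

	_bt_metaversion(rel, &heapkeyspace, &allequalimage);
	/* e.g. stash heapkeyspace/allequalimage in the BTScanInsert under construction */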
792 | | |
793 | | /* |
794 | | * _bt_checkpage() -- Verify that a freshly-read page looks sane. |
795 | | */ |
796 | | void |
797 | | _bt_checkpage(Relation rel, Buffer buf) |
798 | 0 | { |
799 | 0 | Page page = BufferGetPage(buf); |
800 | | |
801 | | /* |
802 | | * ReadBuffer verifies that every newly-read page passes |
803 | | * PageHeaderIsValid, which means it either contains a reasonably sane |
804 | | * page header or is all-zero. We have to defend against the all-zero |
805 | | * case, however. |
806 | | */ |
807 | 0 | if (PageIsNew(page)) |
808 | 0 | ereport(ERROR, |
809 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
810 | 0 | errmsg("index \"%s\" contains unexpected zero page at block %u", |
811 | 0 | RelationGetRelationName(rel), |
812 | 0 | BufferGetBlockNumber(buf)), |
813 | 0 | errhint("Please REINDEX it."))); |
814 | | |
815 | | /* |
816 | | * Additionally check that the special area looks sane. |
817 | | */ |
818 | 0 | if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData))) |
819 | 0 | ereport(ERROR, |
820 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
821 | 0 | errmsg("index \"%s\" contains corrupted page at block %u", |
822 | 0 | RelationGetRelationName(rel), |
823 | 0 | BufferGetBlockNumber(buf)), |
824 | 0 | errhint("Please REINDEX it."))); |
825 | 0 | } |
826 | | |
827 | | /* |
828 | | * _bt_getbuf() -- Get an existing block in a buffer, for read or write. |
829 | | * |
830 | | * The general rule in nbtree is that it's never okay to access a |
831 | | * page without holding both a buffer pin and a buffer lock on |
832 | | * the page's buffer. |
833 | | * |
834 | | * When this routine returns, the appropriate lock is set on the |
835 | | * requested buffer and its reference count has been incremented |
836 | | * (ie, the buffer is "locked and pinned"). Also, we apply |
837 | | * _bt_checkpage to sanity-check the page, and perform Valgrind |
838 | | * client requests that help Valgrind detect unsafe page accesses. |
839 | | * |
840 | | * Note: raw LockBuffer() calls are disallowed in nbtree; all |
841 | | * buffer lock requests need to go through wrapper functions such |
842 | | * as _bt_lockbuf(). |
843 | | */ |
844 | | Buffer |
845 | | _bt_getbuf(Relation rel, BlockNumber blkno, int access) |
846 | 0 | { |
847 | 0 | Buffer buf; |
848 | |
|
849 | 0 | Assert(BlockNumberIsValid(blkno)); |
850 | | |
851 | | /* Read an existing block of the relation */ |
852 | 0 | buf = ReadBuffer(rel, blkno); |
853 | 0 | _bt_lockbuf(rel, buf, access); |
854 | 0 | _bt_checkpage(rel, buf); |
855 | |
|
856 | 0 | return buf; |
857 | 0 | } |
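Usage sketch (illustrative variables, not from this file): read-lock an existing block, inspect it, and release it again with _bt_relbuf():

	/* Sketch: examine an existing page under a read lock */
	Buffer		buf = _bt_getbuf(rel, blkno, BT_READ);
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = BTPageGetOpaque(page);

	if (P_ISLEAF(opaque))
	{
		/* ... read leaf items while the pin and lock are held ... */
	}

	_bt_relbuf(rel, buf);		/* releases both the lock and the pin */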
858 | | |
859 | | /* |
860 | | * _bt_allocbuf() -- Allocate a new block/page. |
861 | | * |
862 | | * Returns a write-locked buffer containing an unallocated nbtree page. |
863 | | * |
864 | | * Callers are required to pass a valid heaprel. We need heaprel so that we |
865 | | * can handle generating a snapshotConflictHorizon that makes reusing a page |
866 | | * from the FSM safe for queries that may be running on standbys. |
867 | | */ |
868 | | Buffer |
869 | | _bt_allocbuf(Relation rel, Relation heaprel) |
870 | 0 | { |
871 | 0 | Buffer buf; |
872 | 0 | BlockNumber blkno; |
873 | 0 | Page page; |
874 | |
|
875 | 0 | Assert(heaprel != NULL); |
876 | | |
877 | | /* |
878 | | * First see if the FSM knows of any free pages. |
879 | | * |
880 | | * We can't trust the FSM's report unreservedly; we have to check that the |
881 | | * page is still free. (For example, an already-free page could have been |
882 | | * re-used between the time the last VACUUM scanned it and the time the |
883 | | * VACUUM made its FSM updates.) |
884 | | * |
885 | | * In fact, it's worse than that: we can't even assume that it's safe to |
886 | | * take a lock on the reported page. If somebody else has a lock on it, |
887 | | * or even worse our own caller does, we could deadlock. (The own-caller |
888 | | * scenario is actually not improbable. Consider an index on a serial or |
889 | | * timestamp column. Nearly all splits will be at the rightmost page, so |
890 | | * it's entirely likely that _bt_split will call us while holding a lock |
891 | | * on the page most recently acquired from FSM. A VACUUM running |
892 | | * concurrently with the previous split could well have placed that page |
893 | | * back in FSM.) |
894 | | * |
895 | | * To get around that, we ask for only a conditional lock on the reported |
896 | | * page. If we fail, then someone else is using the page, and we may |
897 | | * reasonably assume it's not free. (If we happen to be wrong, the worst |
898 | | * consequence is the page will be lost to use till the next VACUUM, which |
899 | | * is no big problem.) |
900 | | */ |
901 | 0 | for (;;) |
902 | 0 | { |
903 | 0 | blkno = GetFreeIndexPage(rel); |
904 | 0 | if (blkno == InvalidBlockNumber) |
905 | 0 | break; |
906 | 0 | buf = ReadBuffer(rel, blkno); |
907 | 0 | if (_bt_conditionallockbuf(rel, buf)) |
908 | 0 | { |
909 | 0 | page = BufferGetPage(buf); |
910 | | |
911 | | /* |
912 | | * It's possible to find an all-zeroes page in an index. For |
913 | | * example, a backend might successfully extend the relation one |
914 | | * page and then crash before it is able to make a WAL entry for |
915 | | * adding the page. If we find a zeroed page then reclaim it |
916 | | * immediately. |
917 | | */ |
918 | 0 | if (PageIsNew(page)) |
919 | 0 | { |
920 | | /* Okay to use page. Initialize and return it. */ |
921 | 0 | _bt_pageinit(page, BufferGetPageSize(buf)); |
922 | 0 | return buf; |
923 | 0 | } |
924 | | |
925 | 0 | if (BTPageIsRecyclable(page, heaprel)) |
926 | 0 | { |
927 | | /* |
928 | | * If we are generating WAL for Hot Standby then create a WAL |
929 | | * record that will allow us to conflict with queries running |
930 | | * on standby, in case they have snapshots older than safexid |
931 | | * value |
932 | | */ |
933 | 0 | if (RelationNeedsWAL(rel) && XLogStandbyInfoActive()) |
934 | 0 | { |
935 | 0 | xl_btree_reuse_page xlrec_reuse; |
936 | | |
937 | | /* |
938 | | * Note that we don't register the buffer with the record, |
939 | | * because this operation doesn't modify the page (that |
940 | | * already happened, back when VACUUM deleted the page). |
941 | | * This record only exists to provide a conflict point for |
942 | | * Hot Standby. See record REDO routine comments. |
943 | | */ |
944 | 0 | xlrec_reuse.locator = rel->rd_locator; |
945 | 0 | xlrec_reuse.block = blkno; |
946 | 0 | xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page); |
947 | 0 | xlrec_reuse.isCatalogRel = |
948 | 0 | RelationIsAccessibleInLogicalDecoding(heaprel); |
949 | |
|
950 | 0 | XLogBeginInsert(); |
951 | 0 | XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage); |
952 | |
|
953 | 0 | XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE); |
954 | 0 | } |
955 | | |
956 | | /* Okay to use page. Re-initialize and return it. */ |
957 | 0 | _bt_pageinit(page, BufferGetPageSize(buf)); |
958 | 0 | return buf; |
959 | 0 | } |
960 | 0 | elog(DEBUG2, "FSM returned nonrecyclable page"); |
961 | 0 | _bt_relbuf(rel, buf); |
962 | 0 | } |
963 | 0 | else |
964 | 0 | { |
965 | 0 | elog(DEBUG2, "FSM returned nonlockable page"); |
966 | | /* couldn't get lock, so just drop pin */ |
967 | 0 | ReleaseBuffer(buf); |
968 | 0 | } |
969 | 0 | } |
970 | | |
971 | | /* |
972 | | * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we |
973 | | * risk a race condition against btvacuumscan --- see comments therein. |
974 | | * This forces us to repeat the valgrind request that _bt_lockbuf() |
975 | | * otherwise would make, as we can't use _bt_lockbuf() without introducing |
976 | | * a race. |
977 | | */ |
978 | 0 | buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST); |
979 | 0 | if (!RelationUsesLocalBuffers(rel)) |
980 | 0 | VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); |
981 | | |
982 | | /* Initialize the new page before returning it */ |
983 | 0 | page = BufferGetPage(buf); |
984 | 0 | Assert(PageIsNew(page)); |
985 | 0 | _bt_pageinit(page, BufferGetPageSize(buf)); |
986 | |
|
987 | 0 | return buf; |
988 | 0 | } |
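Sketch of the caller's side (for example, a page split allocating its new right sibling; names are illustrative):

	/* Sketch: obtain a fresh, already-initialized, write-locked page */
	Buffer		newbuf = _bt_allocbuf(rel, heaprel);
	BlockNumber newblkno = BufferGetBlockNumber(newbuf);
	Page		newpage = BufferGetPage(newbuf);

	/* fill newpage and WAL-log the change, then release lock and pin */
	_bt_relbuf(rel, newbuf);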
989 | | |
990 | | /* |
991 | | * _bt_relandgetbuf() -- release a locked buffer and get another one. |
992 | | * |
993 | | * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is |
994 | | * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case |
995 | | * simplifies some callers. |
996 | | * |
997 | | * The original motivation for using this was to avoid two entries to the |
998 | | * bufmgr when one would do. However, now it's mainly just a notational |
999 | | * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf |
1000 | | * is when the target page is the same one already in the buffer. |
1001 | | */ |
1002 | | Buffer |
1003 | | _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access) |
1004 | 0 | { |
1005 | 0 | Buffer buf; |
1006 | |
|
1007 | 0 | Assert(BlockNumberIsValid(blkno)); |
1008 | 0 | if (BufferIsValid(obuf)) |
1009 | 0 | _bt_unlockbuf(rel, obuf); |
1010 | 0 | buf = ReleaseAndReadBuffer(obuf, rel, blkno); |
1011 | 0 | _bt_lockbuf(rel, buf, access); |
1012 | |
|
1013 | 0 | _bt_checkpage(rel, buf); |
1014 | 0 | return buf; |
1015 | 0 | } |
1016 | | |
1017 | | /* |
1018 | | * _bt_relbuf() -- release a locked buffer. |
1019 | | * |
1020 | | * Lock and pin (refcount) are both dropped. |
1021 | | */ |
1022 | | void |
1023 | | _bt_relbuf(Relation rel, Buffer buf) |
1024 | 0 | { |
1025 | 0 | _bt_unlockbuf(rel, buf); |
1026 | 0 | ReleaseBuffer(buf); |
1027 | 0 | } |
1028 | | |
1029 | | /* |
1030 | | * _bt_lockbuf() -- lock a pinned buffer. |
1031 | | * |
1032 | | * Lock is acquired without acquiring another pin. This is like a raw |
1033 | | * LockBuffer() call, but performs extra steps needed by Valgrind. |
1034 | | * |
1035 | | * Note: Caller may need to call _bt_checkpage() with buf when pin on buf |
1036 | | * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). |
1037 | | */ |
1038 | | void |
1039 | | _bt_lockbuf(Relation rel, Buffer buf, int access) |
1040 | 0 | { |
1041 | | /* LockBuffer() asserts that pin is held by this backend */ |
1042 | 0 | LockBuffer(buf, access); |
1043 | | |
1044 | | /* |
1045 | | * It doesn't matter that _bt_unlockbuf() won't get called in the event of |
1046 | | * an nbtree error (e.g. a unique violation error). That won't cause |
1047 | | * Valgrind false positives. |
1048 | | * |
1049 | | * The nbtree client requests are superimposed on top of the bufmgr.c |
1050 | | * buffer pin client requests. In the event of an nbtree error the buffer |
1051 | | * will certainly get marked as defined when the backend once again |
1052 | | * acquires its first pin on the buffer. (Of course, if the backend never |
1053 | | * touches the buffer again then it doesn't matter that it remains |
1054 | | * non-accessible to Valgrind.) |
1055 | | * |
1056 | | * Note: When an IndexTuple C pointer gets computed using an ItemId read |
1057 | | * from a page while a lock was held, the C pointer becomes unsafe to |
1058 | | * dereference forever as soon as the lock is released. Valgrind can only |
1059 | | * detect cases where the pointer gets dereferenced with no _current_ |
1060 | | * lock/pin held, though. |
1061 | | */ |
1062 | 0 | if (!RelationUsesLocalBuffers(rel)) |
1063 | 0 | VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); |
1064 | 0 | } |
1065 | | |
1066 | | /* |
1067 | | * _bt_unlockbuf() -- unlock a pinned buffer. |
1068 | | */ |
1069 | | void |
1070 | | _bt_unlockbuf(Relation rel, Buffer buf) |
1071 | 0 | { |
1072 | | /* |
1073 | | * Buffer is pinned and locked, which means that it is expected to be |
1074 | | * defined and addressable. Check that proactively. |
1075 | | */ |
1076 | 0 | VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); |
1077 | | |
1078 | | /* LockBuffer() asserts that pin is held by this backend */ |
1079 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
1080 | |
|
1081 | 0 | if (!RelationUsesLocalBuffers(rel)) |
1082 | 0 | VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ); |
1083 | 0 | } |
1084 | | |
1085 | | /* |
1086 | | * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned |
1087 | | * buffer. |
1088 | | * |
1089 | | * Note: Caller may need to call _bt_checkpage() with buf when pin on buf |
1090 | | * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf(). |
1091 | | */ |
1092 | | bool |
1093 | | _bt_conditionallockbuf(Relation rel, Buffer buf) |
1094 | 0 | { |
1095 | | /* ConditionalLockBuffer() asserts that pin is held by this backend */ |
1096 | 0 | if (!ConditionalLockBuffer(buf)) |
1097 | 0 | return false; |
1098 | | |
1099 | 0 | if (!RelationUsesLocalBuffers(rel)) |
1100 | 0 | VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); |
1101 | |
|
1102 | 0 | return true; |
1103 | 0 | } |
1104 | | |
1105 | | /* |
1106 | | * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock. |
1107 | | */ |
1108 | | void |
1109 | | _bt_upgradelockbufcleanup(Relation rel, Buffer buf) |
1110 | 0 | { |
1111 | | /* |
1112 | | * Buffer is pinned and locked, which means that it is expected to be |
1113 | | * defined and addressable. Check that proactively. |
1114 | | */ |
1115 | 0 | VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); |
1116 | | |
1117 | | /* LockBuffer() asserts that pin is held by this backend */ |
1118 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
1119 | 0 | LockBufferForCleanup(buf); |
1120 | 0 | } |
1121 | | |
1122 | | /* |
1123 | | * _bt_pageinit() -- Initialize a new page. |
1124 | | * |
1125 | | * On return, the page header is initialized; data space is empty; |
1126 | | * special space is zeroed out. |
1127 | | */ |
1128 | | void |
1129 | | _bt_pageinit(Page page, Size size) |
1130 | 0 | { |
1131 | 0 | PageInit(page, size, sizeof(BTPageOpaqueData)); |
1132 | 0 | } |
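The special-space size established here is exactly what _bt_checkpage() verifies above; a small sketch of that relationship (illustrative only, assuming a page buffer of BLCKSZ bytes):

	/* Sketch: after _bt_pageinit, the special area holds a BTPageOpaqueData */
	_bt_pageinit(page, BLCKSZ);
	Assert(PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData)));
	BTPageOpaque opaque = BTPageGetOpaque(page);	/* all fields start out zeroed */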
1133 | | |
1134 | | /* |
1135 | | * Delete item(s) from a btree leaf page during VACUUM. |
1136 | | * |
1137 | | * This routine assumes that the caller already has a full cleanup lock on |
1138 | | * the buffer. Also, the given deletable and updatable arrays *must* be |
1139 | | * sorted in ascending order. |
1140 | | * |
1141 | | * Routine deals with deleting TIDs when some (but not all) of the heap TIDs |
1142 | | * in an existing posting list item are to be removed. This works by |
1143 | | * updating/overwriting an existing item with caller's new version of the item |
1144 | | * (a version that lacks the TIDs that are to be deleted). |
1145 | | * |
1146 | | * We record VACUUMs and b-tree deletes differently in WAL. Deletes must |
1147 | | * generate their own snapshotConflictHorizon directly from the tableam, |
1148 | | * whereas VACUUMs rely on the initial VACUUM table scan performing |
1149 | | * WAL-logging that takes care of the issue for the table's indexes |
1150 | | * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree |
1151 | | * deletes don't do. |
1152 | | */ |
1153 | | void |
1154 | | _bt_delitems_vacuum(Relation rel, Buffer buf, |
1155 | | OffsetNumber *deletable, int ndeletable, |
1156 | | BTVacuumPosting *updatable, int nupdatable) |
1157 | 0 | { |
1158 | 0 | Page page = BufferGetPage(buf); |
1159 | 0 | BTPageOpaque opaque; |
1160 | 0 | bool needswal = RelationNeedsWAL(rel); |
1161 | 0 | char *updatedbuf = NULL; |
1162 | 0 | Size updatedbuflen = 0; |
1163 | 0 | OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; |
1164 | | |
1165 | | /* Shouldn't be called unless there's something to do */ |
1166 | 0 | Assert(ndeletable > 0 || nupdatable > 0); |
1167 | | |
1168 | | /* Generate new version of posting lists without deleted TIDs */ |
1169 | 0 | if (nupdatable > 0) |
1170 | 0 | updatedbuf = _bt_delitems_update(updatable, nupdatable, |
1171 | 0 | updatedoffsets, &updatedbuflen, |
1172 | 0 | needswal); |
1173 | | |
1174 | | /* No ereport(ERROR) until changes are logged */ |
1175 | 0 | START_CRIT_SECTION(); |
1176 | | |
1177 | | /* |
1178 | | * Handle posting tuple updates. |
1179 | | * |
1180 | | * Deliberately do this before handling simple deletes. If we did it the |
1181 | | * other way around (i.e. WAL record order -- simple deletes before |
1182 | | * updates) then we'd have to make compensating changes to the 'updatable' |
1183 | | * array of offset numbers. |
1184 | | * |
1185 | | * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it |
1186 | | * happens to already be set. It's important that we not interfere with |
1187 | | * any future simple index tuple deletion operations. |
1188 | | */ |
1189 | 0 | for (int i = 0; i < nupdatable; i++) |
1190 | 0 | { |
1191 | 0 | OffsetNumber updatedoffset = updatedoffsets[i]; |
1192 | 0 | IndexTuple itup; |
1193 | 0 | Size itemsz; |
1194 | |
|
1195 | 0 | itup = updatable[i]->itup; |
1196 | 0 | itemsz = MAXALIGN(IndexTupleSize(itup)); |
1197 | 0 | if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, |
1198 | 0 | itemsz)) |
1199 | 0 | elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", |
1200 | 0 | BufferGetBlockNumber(buf), RelationGetRelationName(rel)); |
1201 | 0 | } |
1202 | | |
1203 | | /* Now handle simple deletes of entire tuples */ |
1204 | 0 | if (ndeletable > 0) |
1205 | 0 | PageIndexMultiDelete(page, deletable, ndeletable); |
1206 | | |
1207 | | /* |
1208 | | * We can clear the vacuum cycle ID since this page has certainly been |
1209 | | * processed by the current vacuum scan. |
1210 | | */ |
1211 | 0 | opaque = BTPageGetOpaque(page); |
1212 | 0 | opaque->btpo_cycleid = 0; |
1213 | | |
1214 | | /* |
1215 | | * Clear the BTP_HAS_GARBAGE page flag. |
1216 | | * |
1217 | | * This flag indicates the presence of LP_DEAD items on the page (though |
1218 | | * not reliably). Note that we only rely on it with pg_upgrade'd |
1219 | | * !heapkeyspace indexes. That's why clearing it here won't usually |
1220 | | * interfere with simple index tuple deletion. |
1221 | | */ |
1222 | 0 | opaque->btpo_flags &= ~BTP_HAS_GARBAGE; |
1223 | |
|
1224 | 0 | MarkBufferDirty(buf); |
1225 | | |
1226 | | /* XLOG stuff */ |
1227 | 0 | if (needswal) |
1228 | 0 | { |
1229 | 0 | XLogRecPtr recptr; |
1230 | 0 | xl_btree_vacuum xlrec_vacuum; |
1231 | |
|
1232 | 0 | xlrec_vacuum.ndeleted = ndeletable; |
1233 | 0 | xlrec_vacuum.nupdated = nupdatable; |
1234 | |
|
1235 | 0 | XLogBeginInsert(); |
1236 | 0 | XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
1237 | 0 | XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum); |
1238 | |
|
1239 | 0 | if (ndeletable > 0) |
1240 | 0 | XLogRegisterBufData(0, deletable, |
1241 | 0 | ndeletable * sizeof(OffsetNumber)); |
1242 | |
|
1243 | 0 | if (nupdatable > 0) |
1244 | 0 | { |
1245 | 0 | XLogRegisterBufData(0, updatedoffsets, |
1246 | 0 | nupdatable * sizeof(OffsetNumber)); |
1247 | 0 | XLogRegisterBufData(0, updatedbuf, updatedbuflen); |
1248 | 0 | } |
1249 | |
|
1250 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM); |
1251 | |
|
1252 | 0 | PageSetLSN(page, recptr); |
1253 | 0 | } |
1254 | |
|
1255 | 0 | END_CRIT_SECTION(); |
1256 | | |
1257 | | /* can't leak memory here */ |
1258 | 0 | if (updatedbuf != NULL) |
1259 | 0 | pfree(updatedbuf); |
1260 | | /* free tuples allocated within _bt_delitems_update() */ |
1261 | 0 | for (int i = 0; i < nupdatable; i++) |
1262 | 0 | pfree(updatable[i]->itup); |
1263 | 0 | } |
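A much-simplified sketch of how a VACUUM-side caller might assemble the arguments (the real logic lives in btvacuumpage(); tuple_is_dead() below is a hypothetical stand-in for the per-tuple callback, and the caller is assumed to already hold a cleanup lock on buf):

	/* Sketch: collect deletable offsets in ascending order, then apply them */
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int			ndeletable = 0;

	for (OffsetNumber off = P_FIRSTDATAKEY(opaque);
		 off <= PageGetMaxOffsetNumber(page);
		 off = OffsetNumberNext(off))
	{
		if (tuple_is_dead(off))		/* hypothetical per-tuple check */
			deletable[ndeletable++] = off;
	}

	if (ndeletable > 0)
		_bt_delitems_vacuum(rel, buf, deletable, ndeletable, NULL, 0);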
1264 | | |
1265 | | /* |
1266 | | * Delete item(s) from a btree leaf page during single-page cleanup. |
1267 | | * |
1268 | | * This routine assumes that the caller has pinned and write locked the |
1269 | | * buffer. Also, the given deletable and updatable arrays *must* be sorted in |
1270 | | * ascending order. |
1271 | | * |
1272 | | * Routine deals with deleting TIDs when some (but not all) of the heap TIDs |
1273 | | * in an existing posting list item are to be removed. This works by |
1274 | | * updating/overwriting an existing item with caller's new version of the item |
1275 | | * (a version that lacks the TIDs that are to be deleted). |
1276 | | * |
1277 | | * This is nearly the same as _bt_delitems_vacuum as far as what it does to |
1278 | | * the page, but it needs its own snapshotConflictHorizon and isCatalogRel |
1279 | | * (from the tableam). This is used by the REDO routine to generate recovery |
1280 | | * conflicts. The other difference is that only _bt_delitems_vacuum will |
1281 | | * clear page's VACUUM cycle ID. |
1282 | | */ |
1283 | | static void |
1284 | | _bt_delitems_delete(Relation rel, Buffer buf, |
1285 | | TransactionId snapshotConflictHorizon, bool isCatalogRel, |
1286 | | OffsetNumber *deletable, int ndeletable, |
1287 | | BTVacuumPosting *updatable, int nupdatable) |
1288 | 0 | { |
1289 | 0 | Page page = BufferGetPage(buf); |
1290 | 0 | BTPageOpaque opaque; |
1291 | 0 | bool needswal = RelationNeedsWAL(rel); |
1292 | 0 | char *updatedbuf = NULL; |
1293 | 0 | Size updatedbuflen = 0; |
1294 | 0 | OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; |
1295 | | |
1296 | | /* Shouldn't be called unless there's something to do */ |
1297 | 0 | Assert(ndeletable > 0 || nupdatable > 0); |
1298 | | |
1299 | | /* Generate new versions of posting lists without deleted TIDs */ |
1300 | 0 | if (nupdatable > 0) |
1301 | 0 | updatedbuf = _bt_delitems_update(updatable, nupdatable, |
1302 | 0 | updatedoffsets, &updatedbuflen, |
1303 | 0 | needswal); |
1304 | | |
1305 | | /* No ereport(ERROR) until changes are logged */ |
1306 | 0 | START_CRIT_SECTION(); |
1307 | | |
1308 | | /* Handle updates and deletes just like _bt_delitems_vacuum */ |
1309 | 0 | for (int i = 0; i < nupdatable; i++) |
1310 | 0 | { |
1311 | 0 | OffsetNumber updatedoffset = updatedoffsets[i]; |
1312 | 0 | IndexTuple itup; |
1313 | 0 | Size itemsz; |
1314 | |
|
1315 | 0 | itup = updatable[i]->itup; |
1316 | 0 | itemsz = MAXALIGN(IndexTupleSize(itup)); |
1317 | 0 | if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup, |
1318 | 0 | itemsz)) |
1319 | 0 | elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"", |
1320 | 0 | BufferGetBlockNumber(buf), RelationGetRelationName(rel)); |
1321 | 0 | } |
1322 | | |
1323 | 0 | if (ndeletable > 0) |
1324 | 0 | PageIndexMultiDelete(page, deletable, ndeletable); |
1325 | | |
1326 | | /* |
1327 | | * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at |
1328 | | * this point. The VACUUM command alone controls vacuum cycle IDs. |
1329 | | */ |
1330 | 0 | opaque = BTPageGetOpaque(page); |
1331 | | |
1332 | | /* |
1333 | | * Clear the BTP_HAS_GARBAGE page flag. |
1334 | | * |
1335 | | * This flag indicates the presence of LP_DEAD items on the page (though |
1336 | | * not reliably). Note that we only rely on it with pg_upgrade'd |
1337 | | * !heapkeyspace indexes. |
1338 | | */ |
1339 | 0 | opaque->btpo_flags &= ~BTP_HAS_GARBAGE; |
1340 | |
|
1341 | 0 | MarkBufferDirty(buf); |
1342 | | |
1343 | | /* XLOG stuff */ |
1344 | 0 | if (needswal) |
1345 | 0 | { |
1346 | 0 | XLogRecPtr recptr; |
1347 | 0 | xl_btree_delete xlrec_delete; |
1348 | |
|
1349 | 0 | xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon; |
1350 | 0 | xlrec_delete.ndeleted = ndeletable; |
1351 | 0 | xlrec_delete.nupdated = nupdatable; |
1352 | 0 | xlrec_delete.isCatalogRel = isCatalogRel; |
1353 | |
|
1354 | 0 | XLogBeginInsert(); |
1355 | 0 | XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
1356 | 0 | XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete); |
1357 | |
|
1358 | 0 | if (ndeletable > 0) |
1359 | 0 | XLogRegisterBufData(0, deletable, |
1360 | 0 | ndeletable * sizeof(OffsetNumber)); |
1361 | |
|
1362 | 0 | if (nupdatable > 0) |
1363 | 0 | { |
1364 | 0 | XLogRegisterBufData(0, updatedoffsets, |
1365 | 0 | nupdatable * sizeof(OffsetNumber)); |
1366 | 0 | XLogRegisterBufData(0, updatedbuf, updatedbuflen); |
1367 | 0 | } |
1368 | |
|
1369 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE); |
1370 | |
|
1371 | 0 | PageSetLSN(page, recptr); |
1372 | 0 | } |
1373 | |
|
1374 | 0 | END_CRIT_SECTION(); |
1375 | | |
1376 | | /* can't leak memory here */ |
1377 | 0 | if (updatedbuf != NULL) |
1378 | 0 | pfree(updatedbuf); |
1379 | | /* free tuples allocated within _bt_delitems_update() */ |
1380 | 0 | for (int i = 0; i < nupdatable; i++) |
1381 | 0 | pfree(updatable[i]->itup); |
1382 | 0 | } |
1383 | | |
1384 | | /* |
1385 | | * Set up state needed to delete TIDs from posting list tuples via "updating" |
1386 | | * the tuple. Performs steps common to both _bt_delitems_vacuum and |
1387 | | * _bt_delitems_delete. These steps must take place before each function's |
1388 | | * critical section begins. |
1389 | | * |
1390 | | * updatable and nupdatable are inputs, though note that we will use |
1391 | | * _bt_update_posting() to replace the original itup with a pointer to a final |
1392 | | * version in palloc()'d memory. Caller should free the tuples when it's done. |
1393 | | * |
1394 | | * The first nupdatable entries from updatedoffsets are set to the page offset |
1395 | | * number for posting list tuples that caller updates. This is mostly useful |
1396 | | * because caller may need to WAL-log the page offsets (though we always do |
1397 | | * this for caller out of convenience). |
1398 | | * |
1399 | | * Returns buffer consisting of an array of xl_btree_update structs that |
1400 | | * describe the steps we perform here for caller (though only when needswal is |
1401 | | * true). Also sets *updatedbuflen to the final size of the buffer. This |
1402 | | * buffer is used by caller when WAL logging is required. |
1403 | | */ |
1404 | | static char * |
1405 | | _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, |
1406 | | OffsetNumber *updatedoffsets, Size *updatedbuflen, |
1407 | | bool needswal) |
1408 | 0 | { |
1409 | 0 | char *updatedbuf = NULL; |
1410 | 0 | Size buflen = 0; |
1411 | | |
1412 | | /* Shouldn't be called unless there's something to do */ |
1413 | 0 | Assert(nupdatable > 0); |
1414 | |
|
1415 | 0 | for (int i = 0; i < nupdatable; i++) |
1416 | 0 | { |
1417 | 0 | BTVacuumPosting vacposting = updatable[i]; |
1418 | 0 | Size itemsz; |
1419 | | |
1420 | | /* Replace work area IndexTuple with updated version */ |
1421 | 0 | _bt_update_posting(vacposting); |
1422 | | |
1423 | | /* Keep track of size of xl_btree_update for updatedbuf in passing */ |
1424 | 0 | itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16); |
1425 | 0 | buflen += itemsz; |
1426 | | |
1427 | | /* Build updatedoffsets buffer in passing */ |
1428 | 0 | updatedoffsets[i] = vacposting->updatedoffset; |
1429 | 0 | } |
1430 | | |
1431 | | /* XLOG stuff */ |
1432 | 0 | if (needswal) |
1433 | 0 | { |
1434 | 0 | Size offset = 0; |
1435 | | |
1436 | | /* Allocate, set final size for caller */ |
1437 | 0 | updatedbuf = palloc(buflen); |
1438 | 0 | *updatedbuflen = buflen; |
1439 | 0 | for (int i = 0; i < nupdatable; i++) |
1440 | 0 | { |
1441 | 0 | BTVacuumPosting vacposting = updatable[i]; |
1442 | 0 | Size itemsz; |
1443 | 0 | xl_btree_update update; |
1444 | |
|
1445 | 0 | update.ndeletedtids = vacposting->ndeletedtids; |
1446 | 0 | memcpy(updatedbuf + offset, &update.ndeletedtids, |
1447 | 0 | SizeOfBtreeUpdate); |
1448 | 0 | offset += SizeOfBtreeUpdate; |
1449 | |
|
1450 | 0 | itemsz = update.ndeletedtids * sizeof(uint16); |
1451 | 0 | memcpy(updatedbuf + offset, vacposting->deletetids, itemsz); |
1452 | 0 | offset += itemsz; |
1453 | 0 | } |
1454 | 0 | } |
1455 | |
|
1456 | 0 | return updatedbuf; |
1457 | 0 | } |
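 | | /* |
 | |  * A sketch of the updatedbuf layout built above, assuming two posting list |
 | |  * tuples with 2 and 5 deleted TIDs respectively: |
 | |  * |
 | |  *   [xl_btree_update {ndeletedtids = 2}] [uint16 deletetids[2]] |
 | |  *   [xl_btree_update {ndeletedtids = 5}] [uint16 deletetids[5]] |
 | |  * |
 | |  * Entries are variable length, so anything consuming the buffer has to |
 | |  * walk it using each ndeletedtids value to locate the next entry. |
 | |  */ |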
1458 | | |
1459 | | /* |
1460 | | * Comparator used by _bt_delitems_delete_check() to restore deltids array |
1461 | | * back to its original leaf-page-wise sort order |
1462 | | */ |
1463 | | static int |
1464 | | _bt_delitems_cmp(const void *a, const void *b) |
1465 | 0 | { |
1466 | 0 | TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a; |
1467 | 0 | TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b; |
1468 | |
|
1469 | 0 | Assert(indexdelete1->id != indexdelete2->id); |
1470 | |
|
1471 | 0 | return pg_cmp_s16(indexdelete1->id, indexdelete2->id); |
1472 | 0 | } |
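 | | /* |
 | |  * For example, if deltids was originally built with ids 0..n-1 assigned in |
 | |  * leaf-page-wise order, then however the tableam reorders the array, a |
 | |  * qsort with _bt_delitems_cmp puts entry 0 first again, then 1, and so on, |
 | |  * which is all that _bt_delitems_delete_check() relies on. |
 | |  */ |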
1473 | | |
1474 | | /* |
1475 | | * Try to delete item(s) from a btree leaf page during single-page cleanup. |
1476 | | * |
1477 | | * nbtree interface to table_index_delete_tuples(). Deletes a subset of index |
1478 | | * tuples from caller's deltids array: those whose TIDs are found safe to |
1479 | | * delete by the tableam (or already marked LP_DEAD in index, and so already |
1480 | | * known to be deletable by our simple index deletion caller). We physically |
1481 | | * delete index tuples from buf leaf page last of all (for index tuples where |
1482 | | * that is known to be safe following our table_index_delete_tuples() call). |
1483 | | * |
1484 | | * Simple index deletion caller only includes TIDs from index tuples marked |
1485 | | * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be |
1486 | | * included without increasing the total number of distinct table blocks for |
1487 | | * the deletion operation as a whole. This approach often allows us to delete |
1488 | | * some extra index tuples that were practically free for tableam to check in |
1489 | | * passing (when they actually turn out to be safe to delete). It probably |
1490 | | * only makes sense for the tableam to go ahead with these extra checks when |
1491 | | * it is block-oriented (otherwise the checks probably won't be practically |
1492 | | * free, which we rely on). The tableam interface requires the tableam side |
1493 | | * to handle the problem, though, so this is okay (we as an index AM are free |
1494 | | * to make the simplifying assumption that all tableams must be block-based). |
1495 | | * |
1496 | | * Bottom-up index deletion caller provides all the TIDs from the leaf page, |
1497 | | * without expecting that tableam will check most of them. The tableam has |
1498 | | * considerable discretion around which entries/blocks it checks. Our role in |
1499 | | * costing the bottom-up deletion operation is strictly advisory. |
1500 | | * |
1501 | | * Note: Caller must have added deltids entries (i.e. entries that go in |
1502 | | * delstate's main array) in leaf-page-wise order: page offset number order, |
1503 | | * TID order among entries taken from the same posting list tuple (tiebreak on |
1504 | | * TID). This order is convenient to work with here. |
1505 | | * |
1506 | | * Note: We also rely on the id field of each deltids element "capturing" this |
1507 | | * original leaf-page-wise order. That is, we expect to be able to get back |
1508 | | * to the original leaf-page-wise order just by sorting deltids on the id |
1509 | | * field (tableam will sort deltids for its own reasons, so we'll need to put |
1510 | | * it back in leaf-page-wise order afterwards). |
1511 | | */ |
1512 | | void |
1513 | | _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, |
1514 | | TM_IndexDeleteOp *delstate) |
1515 | 0 | { |
1516 | 0 | Page page = BufferGetPage(buf); |
1517 | 0 | TransactionId snapshotConflictHorizon; |
1518 | 0 | bool isCatalogRel; |
1519 | 0 | OffsetNumber postingidxoffnum = InvalidOffsetNumber; |
1520 | 0 | int ndeletable = 0, |
1521 | 0 | nupdatable = 0; |
1522 | 0 | OffsetNumber deletable[MaxIndexTuplesPerPage]; |
1523 | 0 | BTVacuumPosting updatable[MaxIndexTuplesPerPage]; |
1524 | | |
1525 | | /* Use tableam interface to determine which tuples to delete first */ |
1526 | 0 | snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate); |
1527 | 0 | isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel); |
1528 | | |
1529 | | /* Should not WAL-log snapshotConflictHorizon unless it's required */ |
1530 | 0 | if (!XLogStandbyInfoActive()) |
1531 | 0 | snapshotConflictHorizon = InvalidTransactionId; |
1532 | | |
1533 | | /* |
1534 | | * Construct a leaf-page-wise description of what _bt_delitems_delete() |
1535 | | * needs to do to physically delete index tuples from the page. |
1536 | | * |
1537 | | * Must sort deltids array to restore leaf-page-wise order (original order |
1538 | | * before call to tableam). This is the order that the loop expects. |
1539 | | * |
1540 | | * Note that deltids array might be a lot smaller now. It might even have |
1541 | | * no entries at all (with bottom-up deletion caller), in which case there |
1542 | | * is nothing left to do. |
1543 | | */ |
1544 | 0 | qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete), |
1545 | 0 | _bt_delitems_cmp); |
1546 | 0 | if (delstate->ndeltids == 0) |
1547 | 0 | { |
1548 | 0 | Assert(delstate->bottomup); |
1549 | 0 | return; |
1550 | 0 | } |
1551 | | |
1552 | | /* We definitely have to delete at least one index tuple (or one TID) */ |
1553 | 0 | for (int i = 0; i < delstate->ndeltids; i++) |
1554 | 0 | { |
1555 | 0 | TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id; |
1556 | 0 | OffsetNumber idxoffnum = dstatus->idxoffnum; |
1557 | 0 | ItemId itemid = PageGetItemId(page, idxoffnum); |
1558 | 0 | IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); |
1559 | 0 | int nestedi, |
1560 | 0 | nitem; |
1561 | 0 | BTVacuumPosting vacposting; |
1562 | |
|
1563 | 0 | Assert(OffsetNumberIsValid(idxoffnum)); |
1564 | |
|
1565 | 0 | if (idxoffnum == postingidxoffnum) |
1566 | 0 | { |
1567 | | /* |
1568 | | * This deltid entry is a TID from a posting list tuple that has |
1569 | | * already been completely processed |
1570 | | */ |
1571 | 0 | Assert(BTreeTupleIsPosting(itup)); |
1572 | 0 | Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup), |
1573 | 0 | &delstate->deltids[i].tid) < 0); |
1574 | 0 | Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup), |
1575 | 0 | &delstate->deltids[i].tid) >= 0); |
1576 | 0 | continue; |
1577 | 0 | } |
1578 | | |
1579 | 0 | if (!BTreeTupleIsPosting(itup)) |
1580 | 0 | { |
1581 | | /* Plain non-pivot tuple */ |
1582 | 0 | Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid)); |
1583 | 0 | if (dstatus->knowndeletable) |
1584 | 0 | deletable[ndeletable++] = idxoffnum; |
1585 | 0 | continue; |
1586 | 0 | } |
1587 | | |
1588 | | /* |
1589 | | * itup is a posting list tuple whose lowest deltids entry (which may |
1590 | | * or may not be for the first TID from itup) is considered here now. |
1591 | | * We should process all of the deltids entries for the posting list |
1592 | | * together now, though (not just the lowest). Remember to skip over |
1593 | | * later itup-related entries during later iterations of outermost |
1594 | | * loop. |
1595 | | */ |
1596 | 0 | postingidxoffnum = idxoffnum; /* Remember work in outermost loop */ |
1597 | 0 | nestedi = i; /* Initialize for first itup deltids entry */ |
1598 | 0 | vacposting = NULL; /* Describes final action for itup */ |
1599 | 0 | nitem = BTreeTupleGetNPosting(itup); |
1600 | 0 | for (int p = 0; p < nitem; p++) |
1601 | 0 | { |
1602 | 0 | ItemPointer ptid = BTreeTupleGetPostingN(itup, p); |
1603 | 0 | int ptidcmp = -1; |
1604 | | |
1605 | | /* |
1606 | | * This nested loop reuses work across ptid TIDs taken from itup. |
1607 | | * We take advantage of the fact that both itup's TIDs and deltids |
1608 | | * entries (within a single itup/posting list grouping) must |
1609 | | * be in ascending TID order. |
1610 | | */ |
1611 | 0 | for (; nestedi < delstate->ndeltids; nestedi++) |
1612 | 0 | { |
1613 | 0 | TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi]; |
1614 | 0 | TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id); |
1615 | | |
1616 | | /* Stop once we get past all itup related deltids entries */ |
1617 | 0 | Assert(tdstatus->idxoffnum >= idxoffnum); |
1618 | 0 | if (tdstatus->idxoffnum != idxoffnum) |
1619 | 0 | break; |
1620 | | |
1621 | | /* Skip past non-deletable itup related entries up front */ |
1622 | 0 | if (!tdstatus->knowndeletable) |
1623 | 0 | continue; |
1624 | | |
1625 | | /* Entry is first partial ptid match (or an exact match)? */ |
1626 | 0 | ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid); |
1627 | 0 | if (ptidcmp >= 0) |
1628 | 0 | { |
1629 | | /* Greater than or equal (partial or exact) match... */ |
1630 | 0 | break; |
1631 | 0 | } |
1632 | 0 | } |
1633 | | |
1634 | | /* ...exact ptid match to a deletable deltids entry? */ |
1635 | 0 | if (ptidcmp != 0) |
1636 | 0 | continue; |
1637 | | |
1638 | | /* Exact match for deletable deltids entry -- ptid gets deleted */ |
1639 | 0 | if (vacposting == NULL) |
1640 | 0 | { |
1641 | 0 | vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + |
1642 | 0 | nitem * sizeof(uint16)); |
1643 | 0 | vacposting->itup = itup; |
1644 | 0 | vacposting->updatedoffset = idxoffnum; |
1645 | 0 | vacposting->ndeletedtids = 0; |
1646 | 0 | } |
1647 | 0 | vacposting->deletetids[vacposting->ndeletedtids++] = p; |
1648 | 0 | } |
1649 | | |
1650 | | /* Final decision on itup, a posting list tuple */ |
1651 | |
|
1652 | 0 | if (vacposting == NULL) |
1653 | 0 | { |
1654 | | /* No TIDs to delete from itup -- do nothing */ |
1655 | 0 | } |
1656 | 0 | else if (vacposting->ndeletedtids == nitem) |
1657 | 0 | { |
1658 | | /* Straight delete of itup (to delete all TIDs) */ |
1659 | 0 | deletable[ndeletable++] = idxoffnum; |
1660 | | /* Turns out we won't need granular information */ |
1661 | 0 | pfree(vacposting); |
1662 | 0 | } |
1663 | 0 | else |
1664 | 0 | { |
1665 | | /* Delete some (but not all) TIDs from itup */ |
1666 | 0 | Assert(vacposting->ndeletedtids > 0 && |
1667 | 0 | vacposting->ndeletedtids < nitem); |
1668 | 0 | updatable[nupdatable++] = vacposting; |
1669 | 0 | } |
1670 | 0 | } |
1671 | | |
1672 | | /* Physically delete tuples (or TIDs) using deletable (or updatable) */ |
1673 | 0 | _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel, |
1674 | 0 | deletable, ndeletable, updatable, nupdatable); |
1675 | | |
1676 | | /* be tidy */ |
1677 | 0 | for (int i = 0; i < nupdatable; i++) |
1678 | 0 | pfree(updatable[i]); |
1679 | 0 | } |
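 | | /* |
 | |  * Rough illustration of the posting list handling above: given a posting |
 | |  * list tuple with TIDs (t1, t2, t3) where only t1 and t3 turn out to be |
 | |  * deletable, we build a BTVacuumPosting with ndeletedtids = 2 and append |
 | |  * it to updatable[]; had all three been deletable, we'd append the tuple's |
 | |  * offset to deletable[] instead and discard the work area. |
 | |  */ |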
1680 | | |
1681 | | /* |
1682 | | * Check that leftsib page (the btpo_prev of target page) is not marked with |
1683 | | * INCOMPLETE_SPLIT flag. Used during page deletion. |
1684 | | * |
1685 | | * Returning true indicates that page flag is set in leftsib (which is |
1686 | | * definitely still the left sibling of target). When that happens, the |
1687 | | * target doesn't have a downlink in parent, and the page deletion algorithm |
1688 | | * isn't prepared to handle that. Deletion of the target page (or the whole |
1689 | | * subtree that contains the target page) cannot take place. |
1690 | | * |
1691 | | * Caller should not have a lock on the target page itself, since pages on the |
1692 | | * same level must always be locked left to right to avoid deadlocks. |
1693 | | */ |
1694 | | static bool |
1695 | | _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target) |
1696 | 0 | { |
1697 | 0 | Buffer buf; |
1698 | 0 | Page page; |
1699 | 0 | BTPageOpaque opaque; |
1700 | 0 | bool result; |
1701 | | |
1702 | | /* Easy case: No left sibling */ |
1703 | 0 | if (leftsib == P_NONE) |
1704 | 0 | return false; |
1705 | | |
1706 | 0 | buf = _bt_getbuf(rel, leftsib, BT_READ); |
1707 | 0 | page = BufferGetPage(buf); |
1708 | 0 | opaque = BTPageGetOpaque(page); |
1709 | | |
1710 | | /* |
1711 | | * If the left sibling was concurrently split, so that its next-pointer |
1712 | | * doesn't point to the current page anymore, the split that created |
1713 | | * target must be completed. Caller can reasonably expect that there will |
1714 | | * be a downlink to the target page that it can relocate using its stack. |
1715 | | * (We don't allow splitting an incompletely split page again until the |
1716 | | * previous split has been completed.) |
1717 | | */ |
1718 | 0 | result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque)); |
1719 | 0 | _bt_relbuf(rel, buf); |
1720 | |
|
1721 | 0 | return result; |
1722 | 0 | } |
1723 | | |
1724 | | /* |
1725 | | * Check that leafrightsib page (the btpo_next of target leaf page) is not |
1726 | | * marked with ISHALFDEAD flag. Used during page deletion. |
1727 | | * |
1728 | | * Returning true indicates that page flag is set in leafrightsib, so page |
1729 | | * deletion cannot go ahead. Our caller is not prepared to deal with the case |
1730 | | * where the parent page does not have a pivot tuples whose downlink points to |
1731 | | * where the parent page does not have a pivot tuple whose downlink points to |
1732 | | * seem worth going to the trouble of teaching our caller to deal with it. |
1733 | | * The situation will be resolved after VACUUM finishes the deletion of the |
1734 | | * half-dead page (when a future VACUUM operation reaches the target page |
1735 | | * again). |
1736 | | * |
1737 | | * _bt_leftsib_splitflag() is called for both leaf pages and internal pages. |
1738 | | * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is |
1739 | | * okay because of the restriction on deleting pages that are the rightmost |
1740 | | * page of their parent (i.e. that such deletions can only take place when the |
1741 | | * entire subtree must be deleted). The leaf level check made here will apply |
1742 | | * to a right "cousin" leaf page rather than a simple right sibling leaf page |
1743 | | * in cases where caller actually goes on to attempt deleting pages that are |
1744 | | * above the leaf page. The right cousin leaf page is representative of the |
1745 | | * left edge of the subtree to the right of the to-be-deleted subtree as a |
1746 | | * whole, which is exactly the condition that our caller cares about. |
1747 | | * (Besides, internal pages are never marked half-dead, so it isn't even |
1748 | | * possible to _directly_ assess if an internal page is part of some other |
1749 | | * to-be-deleted subtree.) |
1750 | | */ |
1751 | | static bool |
1752 | | _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) |
1753 | 0 | { |
1754 | 0 | Buffer buf; |
1755 | 0 | Page page; |
1756 | 0 | BTPageOpaque opaque; |
1757 | 0 | bool result; |
1758 | |
|
1759 | 0 | Assert(leafrightsib != P_NONE); |
1760 | |
|
1761 | 0 | buf = _bt_getbuf(rel, leafrightsib, BT_READ); |
1762 | 0 | page = BufferGetPage(buf); |
1763 | 0 | opaque = BTPageGetOpaque(page); |
1764 | |
|
1765 | 0 | Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque)); |
1766 | 0 | result = P_ISHALFDEAD(opaque); |
1767 | 0 | _bt_relbuf(rel, buf); |
1768 | |
|
1769 | 0 | return result; |
1770 | 0 | } |
1771 | | |
1772 | | /* |
1773 | | * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so. |
1774 | | * |
1775 | | * This action unlinks the leaf page from the b-tree structure, removing all |
1776 | | * pointers leading to it --- but not touching its own left and right links. |
1777 | | * The page cannot be physically reclaimed right away, since other processes |
1778 | | * may currently be trying to follow links leading to the page; they have to |
1779 | | * be allowed to use its right-link to recover. See nbtree/README. |
1780 | | * |
1781 | | * On entry, the target buffer must be pinned and locked (either read or write |
1782 | | * lock is OK). The page must be an empty leaf page, which may be half-dead |
1783 | | * already (a half-dead page should only be passed to us when an earlier |
1784 | | * VACUUM operation was interrupted, though). Note in particular that caller |
1785 | | * should never pass a buffer containing an existing deleted page here. The |
1786 | | * lock and pin on caller's buffer will be dropped before we return. |
1787 | | * |
1788 | | * Maintains bulk delete stats for caller, which are taken from vstate. We |
1789 | | * need to cooperate closely with caller here so that whole VACUUM operation |
1790 | | * reliably avoids any double counting of subsidiary-to-leafbuf pages that we |
1791 | | * delete in passing. If such pages happen to be from a block number that is |
1792 | | * ahead of the current scanblkno position, then caller is expected to count |
1793 | | * them directly later on. It's simpler for us to understand caller's |
1794 | | * requirements than it would be for caller to understand when or how a |
1795 | | * deleted page became deleted after the fact. |
1796 | | * |
1797 | | * NOTE: this leaks memory. Rather than trying to clean up everything |
1798 | | * carefully, it's better to run it in a temp context that can be reset |
1799 | | * frequently. |
1800 | | */ |
1801 | | void |
1802 | | _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) |
1803 | 0 | { |
1804 | 0 | BlockNumber rightsib; |
1805 | 0 | bool rightsib_empty; |
1806 | 0 | Page page; |
1807 | 0 | BTPageOpaque opaque; |
1808 | | |
1809 | | /* |
1810 | | * Save original leafbuf block number from caller. Only deleted blocks |
1811 | | * that are <= scanblkno are added to bulk delete stat's pages_deleted |
1812 | | * count. |
1813 | | */ |
1814 | 0 | BlockNumber scanblkno = BufferGetBlockNumber(leafbuf); |
1815 | | |
1816 | | /* |
1817 | | * "stack" is a search stack leading (approximately) to the target page. |
1818 | | * It is initially NULL, but when iterating, we keep it to avoid |
1819 | | * duplicated search effort. |
1820 | | * |
1821 | | * Also, when "stack" is not NULL, we have already checked that the |
1822 | | * current page is not the right half of an incomplete split, i.e. the |
1823 | | * left sibling does not have its INCOMPLETE_SPLIT flag set, including |
1824 | | * when the current target page is to the right of caller's initial page |
1825 | | * (the scanblkno page). |
1826 | | */ |
1827 | 0 | BTStack stack = NULL; |
1828 | |
|
1829 | 0 | for (;;) |
1830 | 0 | { |
1831 | 0 | page = BufferGetPage(leafbuf); |
1832 | 0 | opaque = BTPageGetOpaque(page); |
1833 | | |
1834 | | /* |
1835 | | * Internal pages are never deleted directly, only as part of deleting |
1836 | | * the whole subtree all the way down to leaf level. |
1837 | | * |
1838 | | * Also check for deleted pages here. Caller never passes us a fully |
1839 | | * deleted page. Only VACUUM can delete pages, so there can't have |
1840 | | * been a concurrent deletion. Assume that we reached any deleted |
1841 | | * page encountered here by following a sibling link, and that the |
1842 | | * index is corrupt. |
1843 | | */ |
1844 | 0 | Assert(!P_ISDELETED(opaque)); |
1845 | 0 | if (!P_ISLEAF(opaque) || P_ISDELETED(opaque)) |
1846 | 0 | { |
1847 | | /* |
1848 | | * Pre-9.4 page deletion only marked internal pages as half-dead, |
1849 | | * but now we only use that flag on leaf pages. The old algorithm |
1850 | | * was never supposed to leave half-dead pages in the tree; it was |
1851 | | * just a transient state, but it was nevertheless possible in |
1852 | | * error scenarios. We don't know how to deal with them here. They |
1853 | | * are harmless as far as searches are concerned, but inserts |
1854 | | * into the deleted keyspace could add out-of-order downlinks in |
1855 | | * the upper levels. Log a notice; hopefully the admin will notice |
1856 | | * and reindex. |
1857 | | */ |
1858 | 0 | if (P_ISHALFDEAD(opaque)) |
1859 | 0 | ereport(LOG, |
1860 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
1861 | 0 | errmsg("index \"%s\" contains a half-dead internal page", |
1862 | 0 | RelationGetRelationName(rel)), |
1863 | 0 | errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it."))); |
1864 | | |
1865 | 0 | if (P_ISDELETED(opaque)) |
1866 | 0 | ereport(LOG, |
1867 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
1868 | 0 | errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"", |
1869 | 0 | BufferGetBlockNumber(leafbuf), |
1870 | 0 | scanblkno, |
1871 | 0 | RelationGetRelationName(rel)))); |
1872 | | |
1873 | 0 | _bt_relbuf(rel, leafbuf); |
1874 | 0 | return; |
1875 | 0 | } |
1876 | | |
1877 | | /* |
1878 | | * We can never delete rightmost pages nor root pages. While at it, |
1879 | | * check that page is empty, since it's possible that the leafbuf page |
1880 | | * was empty a moment ago, but has since had some inserts. |
1881 | | * |
1882 | | * To keep the algorithm simple, we also never delete an incompletely |
1883 | | * split page (they should be rare enough that this doesn't make any |
1884 | | * meaningful difference to disk usage): |
1885 | | * |
1886 | | * The INCOMPLETE_SPLIT flag on the page tells us if the page is the |
1887 | | * left half of an incomplete split, but ensuring that it's not the |
1888 | | * right half is more complicated. For that, we have to check that |
1889 | | * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using |
1890 | | * _bt_leftsib_splitflag(). On the first iteration, we temporarily |
1891 | | * release the lock on scanblkno/leafbuf, check the left sibling, and |
1892 | | * construct a search stack to scanblkno. On subsequent iterations, |
1893 | | * we know we stepped right from a page that passed these tests, so |
1894 | | * it's OK. |
1895 | | */ |
1896 | 0 | if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || |
1897 | 0 | P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || |
1898 | 0 | P_INCOMPLETE_SPLIT(opaque)) |
1899 | 0 | { |
1900 | | /* Should never fail to delete a half-dead page */ |
1901 | 0 | Assert(!P_ISHALFDEAD(opaque)); |
1902 | |
|
1903 | 0 | _bt_relbuf(rel, leafbuf); |
1904 | 0 | return; |
1905 | 0 | } |
1906 | | |
1907 | | /* |
1908 | | * First, remove downlink pointing to the page (or a parent of the |
1909 | | * page, if we are going to delete a taller subtree), and mark the |
1910 | | * leafbuf page half-dead |
1911 | | */ |
1912 | 0 | if (!P_ISHALFDEAD(opaque)) |
1913 | 0 | { |
1914 | | /* |
1915 | | * We need an approximate pointer to the page's parent page. We |
1916 | | * use a variant of the standard search mechanism to search for |
1917 | | * the page's high key; this will give us a link to either the |
1918 | | * current parent or someplace to its left (if there are multiple |
1919 | | * equal high keys, which is possible with !heapkeyspace indexes). |
1920 | | * |
1921 | | * Also check if this is the right-half of an incomplete split |
1922 | | * (see comment above). |
1923 | | */ |
1924 | 0 | if (!stack) |
1925 | 0 | { |
1926 | 0 | BTScanInsert itup_key; |
1927 | 0 | ItemId itemid; |
1928 | 0 | IndexTuple targetkey; |
1929 | 0 | BlockNumber leftsib, |
1930 | 0 | leafblkno; |
1931 | 0 | Buffer sleafbuf; |
1932 | |
|
1933 | 0 | itemid = PageGetItemId(page, P_HIKEY); |
1934 | 0 | targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); |
1935 | |
|
1936 | 0 | leftsib = opaque->btpo_prev; |
1937 | 0 | leafblkno = BufferGetBlockNumber(leafbuf); |
1938 | | |
1939 | | /* |
1940 | | * To avoid deadlocks, we'd better drop the leaf page lock |
1941 | | * before going further. |
1942 | | */ |
1943 | 0 | _bt_unlockbuf(rel, leafbuf); |
1944 | | |
1945 | | /* |
1946 | | * Check that the left sibling of leafbuf (if any) is not |
1947 | | * marked with INCOMPLETE_SPLIT flag before proceeding |
1948 | | */ |
1949 | 0 | Assert(leafblkno == scanblkno); |
1950 | 0 | if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) |
1951 | 0 | { |
1952 | 0 | ReleaseBuffer(leafbuf); |
1953 | 0 | return; |
1954 | 0 | } |
1955 | | |
1956 | | /* |
1957 | | * We need an insertion scan key, so build one. |
1958 | | * |
1959 | | * _bt_search searches for the leaf page that contains any |
1960 | | * matching non-pivot tuples, but we need it to "search" for |
1961 | | * the high key pivot from the page that we're set to delete. |
1962 | | * Compensate for the mismatch by having _bt_search locate the |
1963 | | * last position < equal-to-untruncated-prefix non-pivots. |
1964 | | */ |
1965 | 0 | itup_key = _bt_mkscankey(rel, targetkey); |
1966 | | |
1967 | | /* Set up a BTLessStrategyNumber-like insertion scan key */ |
1968 | 0 | itup_key->nextkey = false; |
1969 | 0 | itup_key->backward = true; |
1970 | 0 | stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ); |
1971 | | /* won't need a second lock or pin on leafbuf */ |
1972 | 0 | _bt_relbuf(rel, sleafbuf); |
1973 | | |
1974 | | /* |
1975 | | * Re-lock the leaf page, and start over to use our stack |
1976 | | * within _bt_mark_page_halfdead. We must do it that way |
1977 | | * because it's possible that leafbuf can no longer be |
1978 | | * deleted. We need to recheck. |
1979 | | * |
1980 | | * Note: We can't simply hold on to the sleafbuf lock instead, |
1981 | | * because it's barely possible that sleafbuf is not the same |
1982 | | * page as leafbuf. This happens when leafbuf split after our |
1983 | | * original lock was dropped, but before _bt_search finished |
1984 | | * its descent. We rely on the assumption that we'll find |
1985 | | * leafbuf isn't safe to delete anymore in this scenario. |
1986 | | * (Page deletion can cope with the stack being to the left of |
1987 | | * leafbuf, but not to the right of leafbuf.) |
1988 | | */ |
1989 | 0 | _bt_lockbuf(rel, leafbuf, BT_WRITE); |
1990 | 0 | continue; |
1991 | 0 | } |
1992 | | |
1993 | | /* |
1994 | | * See if it's safe to delete the leaf page, and determine how |
1995 | | * many parent/internal pages above the leaf level will be |
1996 | | * deleted. If it's safe then _bt_mark_page_halfdead will also |
1997 | | * perform the first phase of deletion, which includes marking the |
1998 | | * leafbuf page half-dead. |
1999 | | */ |
2000 | 0 | Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque)); |
2001 | 0 | if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf, |
2002 | 0 | stack)) |
2003 | 0 | { |
2004 | 0 | _bt_relbuf(rel, leafbuf); |
2005 | 0 | return; |
2006 | 0 | } |
2007 | 0 | } |
2008 | | |
2009 | | /* |
2010 | | * Then unlink it from its siblings. Each call to |
2011 | | * _bt_unlink_halfdead_page unlinks the topmost page from the subtree, |
2012 | | * making it shallower. Iterate until the leafbuf page is deleted. |
2013 | | */ |
2014 | 0 | rightsib_empty = false; |
2015 | 0 | Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque)); |
2016 | 0 | while (P_ISHALFDEAD(opaque)) |
2017 | 0 | { |
2018 | | /* Check for interrupts in _bt_unlink_halfdead_page */ |
2019 | 0 | if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, |
2020 | 0 | &rightsib_empty, vstate)) |
2021 | 0 | { |
2022 | | /* |
2023 | | * _bt_unlink_halfdead_page should never fail, since we |
2024 | | * established that deletion is generally safe in |
2025 | | * _bt_mark_page_halfdead -- index must be corrupt. |
2026 | | * |
2027 | | * Note that _bt_unlink_halfdead_page already released the |
2028 | | * lock and pin on leafbuf for us. |
2029 | | */ |
2030 | 0 | Assert(false); |
2031 | 0 | return; |
2032 | 0 | } |
2033 | 0 | } |
2034 | | |
2035 | 0 | Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque)); |
2036 | |
|
2037 | 0 | rightsib = opaque->btpo_next; |
2038 | |
|
2039 | 0 | _bt_relbuf(rel, leafbuf); |
2040 | | |
2041 | | /* |
2042 | | * Check here, as calling loops will have locks held, preventing |
2043 | | * interrupts from being processed. |
2044 | | */ |
2045 | 0 | CHECK_FOR_INTERRUPTS(); |
2046 | | |
2047 | | /* |
2048 | | * The page has now been deleted. If its right sibling is completely |
2049 | | * empty, it's possible that the reason we haven't deleted it earlier |
2050 | | * is that it was the rightmost child of the parent. Now that we |
2051 | | * removed the downlink for this page, the right sibling might now be |
2052 | | * the only child of the parent, and could be removed. It would be |
2053 | | * picked up by the next vacuum anyway, but might as well try to |
2054 | | * remove it now, so loop back to process the right sibling. |
2055 | | * |
2056 | | * Note: This relies on the assumption that _bt_getstackbuf() will be |
2057 | | * able to reuse our original descent stack with a different child |
2058 | | * block (provided that the child block is to the right of the |
2059 | | * original leaf page reached by _bt_search()). It will even update |
2060 | | * the descent stack each time we loop around, avoiding repeated work. |
2061 | | */ |
2062 | 0 | if (!rightsib_empty) |
2063 | 0 | break; |
2064 | | |
2065 | 0 | leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); |
2066 | 0 | } |
2067 | 0 | } |
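 | | /* |
 | |  * In outline, each iteration of the loop above does three things: (1) if |
 | |  * leafbuf isn't half-dead yet, locate its parent via a high-key descent |
 | |  * (building the search stack once) and have _bt_mark_page_halfdead remove |
 | |  * the downlink and mark the leaf half-dead; (2) call |
 | |  * _bt_unlink_halfdead_page until leafbuf itself is unlinked and marked |
 | |  * deleted; (3) if the right sibling is now empty, loop around with it as |
 | |  * the new leafbuf and try to delete it too. |
 | |  */ |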
2068 | | |
2069 | | /* |
2070 | | * First stage of page deletion. |
2071 | | * |
2072 | | * Establish the height of the to-be-deleted subtree with leafbuf at its |
2073 | | * lowest level, remove the downlink to the subtree, and mark leafbuf |
2074 | | * half-dead. The final to-be-deleted subtree is usually just leafbuf itself, |
2075 | | * but may include additional internal pages (at most one per level of the |
2076 | | * tree below the root). |
2077 | | * |
2078 | | * Caller must pass a valid heaprel, since it's just about possible that our |
2079 | | * call to _bt_lock_subtree_parent will need to allocate a new index page to |
2080 | | * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel. |
2081 | | * |
2082 | | * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is |
2083 | | * the rightmost child of its parent (and parent has more than one downlink). |
2084 | | * Returns 'true' when the first stage of page deletion completed |
2085 | | * successfully. |
2086 | | */ |
2087 | | static bool |
2088 | | _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, |
2089 | | BTStack stack) |
2090 | 0 | { |
2091 | 0 | BlockNumber leafblkno; |
2092 | 0 | BlockNumber leafrightsib; |
2093 | 0 | BlockNumber topparent; |
2094 | 0 | BlockNumber topparentrightsib; |
2095 | 0 | ItemId itemid; |
2096 | 0 | Page page; |
2097 | 0 | BTPageOpaque opaque; |
2098 | 0 | Buffer subtreeparent; |
2099 | 0 | OffsetNumber poffset; |
2100 | 0 | OffsetNumber nextoffset; |
2101 | 0 | IndexTuple itup; |
2102 | 0 | IndexTupleData trunctuple; |
2103 | |
|
2104 | 0 | page = BufferGetPage(leafbuf); |
2105 | 0 | opaque = BTPageGetOpaque(page); |
2106 | |
|
2107 | 0 | Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) && |
2108 | 0 | P_ISLEAF(opaque) && !P_IGNORE(opaque) && |
2109 | 0 | P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); |
2110 | 0 | Assert(heaprel != NULL); |
2111 | | |
2112 | | /* |
2113 | | * Save info about the leaf page. |
2114 | | */ |
2115 | 0 | leafblkno = BufferGetBlockNumber(leafbuf); |
2116 | 0 | leafrightsib = opaque->btpo_next; |
2117 | | |
2118 | | /* |
2119 | | * Before attempting to lock the parent page, check that the right sibling |
2120 | | * is not in half-dead state. A half-dead right sibling would have no |
2121 | | * downlink in the parent, which would be highly confusing later when we |
2122 | | * delete the downlink. It would fail the "right sibling of target page |
2123 | | * is also the next child in parent page" cross-check below. |
2124 | | */ |
2125 | 0 | if (_bt_rightsib_halfdeadflag(rel, leafrightsib)) |
2126 | 0 | { |
2127 | 0 | elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead", |
2128 | 0 | leafblkno, leafrightsib); |
2129 | 0 | return false; |
2130 | 0 | } |
2131 | | |
2132 | | /* |
2133 | | * We cannot delete a page that is the rightmost child of its immediate |
2134 | | * parent, unless it is the only child --- in which case the parent has to |
2135 | | * be deleted too, and the same condition applies recursively to it. We |
2136 | | * have to check this condition all the way up before trying to delete, |
2137 | | * and lock the parent of the root of the to-be-deleted subtree (the |
2138 | | * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent |
2139 | | * for us. We remove the downlink to the "top parent" page (subtree root |
2140 | | * page) from the subtree parent page below. |
2141 | | * |
2142 | | * Initialize topparent to be leafbuf page now. The final to-be-deleted |
2143 | | * subtree is often a degenerate one page subtree consisting only of the |
2144 | | * leafbuf page. When that happens, the leafbuf page is the final subtree |
2145 | | * root page/top parent page. |
2146 | | */ |
2147 | 0 | topparent = leafblkno; |
2148 | 0 | topparentrightsib = leafrightsib; |
2149 | 0 | if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack, |
2150 | 0 | &subtreeparent, &poffset, |
2151 | 0 | &topparent, &topparentrightsib)) |
2152 | 0 | return false; |
2153 | | |
2154 | 0 | page = BufferGetPage(subtreeparent); |
2155 | 0 | opaque = BTPageGetOpaque(page); |
2156 | |
|
2157 | | #ifdef USE_ASSERT_CHECKING |
2158 | | |
2159 | | /* |
2160 | | * This is just an assertion because _bt_lock_subtree_parent should have |
2161 | | * guaranteed tuple has the expected contents |
2162 | | */ |
2163 | | itemid = PageGetItemId(page, poffset); |
2164 | | itup = (IndexTuple) PageGetItem(page, itemid); |
2165 | | Assert(BTreeTupleGetDownLink(itup) == topparent); |
2166 | | #endif |
2167 | |
|
2168 | 0 | nextoffset = OffsetNumberNext(poffset); |
2169 | 0 | itemid = PageGetItemId(page, nextoffset); |
2170 | 0 | itup = (IndexTuple) PageGetItem(page, itemid); |
2171 | | |
2172 | | /* |
2173 | | * Check that the parent-page index items we're about to delete/overwrite |
2174 | | * in subtree parent page contain what we expect. This can fail if the |
2175 | | * index has become corrupt for some reason. When that happens we back |
2176 | | * out of deletion of the leafbuf subtree. (This is just like the case |
2177 | | * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.) |
2178 | | */ |
2179 | 0 | if (BTreeTupleGetDownLink(itup) != topparentrightsib) |
2180 | 0 | { |
2181 | 0 | ereport(LOG, |
2182 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2183 | 0 | errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"", |
2184 | 0 | topparentrightsib, topparent, |
2185 | 0 | BTreeTupleGetDownLink(itup), |
2186 | 0 | BufferGetBlockNumber(subtreeparent), |
2187 | 0 | RelationGetRelationName(rel)))); |
2188 | | |
2189 | 0 | _bt_relbuf(rel, subtreeparent); |
2190 | 0 | Assert(false); |
2191 | 0 | return false; |
2192 | 0 | } |
2193 | | |
2194 | | /* |
2195 | | * Any insert which would have gone on the leaf block will now go to its |
2196 | | * right sibling. In other words, the key space moves right. |
2197 | | */ |
2198 | 0 | PredicateLockPageCombine(rel, leafblkno, leafrightsib); |
2199 | | |
2200 | | /* No ereport(ERROR) until changes are logged */ |
2201 | 0 | START_CRIT_SECTION(); |
2202 | | |
2203 | | /* |
2204 | | * Update parent of subtree. We want to delete the downlink to the top |
2205 | | * parent page/root of the subtree, and the *following* key. Easiest way |
2206 | | * is to copy the right sibling's downlink over the downlink that points |
2207 | | * to top parent page, and then delete the right sibling's original pivot |
2208 | | * tuple. |
2209 | | * |
2210 | | * Lanin and Shasha make the key space move left when deleting a page, |
2211 | | * whereas the key space moves right here. That's why we cannot simply |
2212 | | * delete the pivot tuple with the downlink to the top parent page. See |
2213 | | * nbtree/README. |
2214 | | */ |
2215 | 0 | page = BufferGetPage(subtreeparent); |
2216 | 0 | opaque = BTPageGetOpaque(page); |
2217 | |
|
2218 | 0 | itemid = PageGetItemId(page, poffset); |
2219 | 0 | itup = (IndexTuple) PageGetItem(page, itemid); |
2220 | 0 | BTreeTupleSetDownLink(itup, topparentrightsib); |
2221 | |
|
2222 | 0 | nextoffset = OffsetNumberNext(poffset); |
2223 | 0 | PageIndexTupleDelete(page, nextoffset); |
2224 | | |
2225 | | /* |
2226 | | * Mark the leaf page as half-dead, and stamp it with a link to the top |
2227 | | * parent page. When the leaf page is also the top parent page, the link |
2228 | | * is set to InvalidBlockNumber. |
2229 | | */ |
2230 | 0 | page = BufferGetPage(leafbuf); |
2231 | 0 | opaque = BTPageGetOpaque(page); |
2232 | 0 | opaque->btpo_flags |= BTP_HALF_DEAD; |
2233 | |
|
2234 | 0 | Assert(PageGetMaxOffsetNumber(page) == P_HIKEY); |
2235 | 0 | MemSet(&trunctuple, 0, sizeof(IndexTupleData)); |
2236 | 0 | trunctuple.t_info = sizeof(IndexTupleData); |
2237 | 0 | if (topparent != leafblkno) |
2238 | 0 | BTreeTupleSetTopParent(&trunctuple, topparent); |
2239 | 0 | else |
2240 | 0 | BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber); |
2241 | |
|
2242 | 0 | if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple, |
2243 | 0 | IndexTupleSize(&trunctuple))) |
2244 | 0 | elog(ERROR, "could not overwrite high key in half-dead page"); |
2245 | | |
2246 | | /* Must mark buffers dirty before XLogInsert */ |
2247 | 0 | MarkBufferDirty(subtreeparent); |
2248 | 0 | MarkBufferDirty(leafbuf); |
2249 | | |
2250 | | /* XLOG stuff */ |
2251 | 0 | if (RelationNeedsWAL(rel)) |
2252 | 0 | { |
2253 | 0 | xl_btree_mark_page_halfdead xlrec; |
2254 | 0 | XLogRecPtr recptr; |
2255 | |
|
2256 | 0 | xlrec.poffset = poffset; |
2257 | 0 | xlrec.leafblk = leafblkno; |
2258 | 0 | if (topparent != leafblkno) |
2259 | 0 | xlrec.topparent = topparent; |
2260 | 0 | else |
2261 | 0 | xlrec.topparent = InvalidBlockNumber; |
2262 | |
|
2263 | 0 | XLogBeginInsert(); |
2264 | 0 | XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT); |
2265 | 0 | XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD); |
2266 | |
|
2267 | 0 | page = BufferGetPage(leafbuf); |
2268 | 0 | opaque = BTPageGetOpaque(page); |
2269 | 0 | xlrec.leftblk = opaque->btpo_prev; |
2270 | 0 | xlrec.rightblk = opaque->btpo_next; |
2271 | |
|
2272 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead); |
2273 | |
|
2274 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD); |
2275 | |
|
2276 | 0 | page = BufferGetPage(subtreeparent); |
2277 | 0 | PageSetLSN(page, recptr); |
2278 | 0 | page = BufferGetPage(leafbuf); |
2279 | 0 | PageSetLSN(page, recptr); |
2280 | 0 | } |
2281 | |
|
2282 | 0 | END_CRIT_SECTION(); |
2283 | |
|
2284 | 0 | _bt_relbuf(rel, subtreeparent); |
2285 | 0 | return true; |
2286 | 0 | } |
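 | | /* |
 | |  * A sketch of the subtree parent update performed above, where T is the |
 | |  * top parent (subtree root) and R is its right sibling: |
 | |  * |
 | |  *   before:  ... [key_i, downlink -> T] [key_j, downlink -> R] ... |
 | |  *   after:   ... [key_i, downlink -> R] ... |
 | |  * |
 | |  * That is, the downlink to T is overwritten with R's downlink and the |
 | |  * following pivot tuple is deleted, which is how the key space moves |
 | |  * right rather than left. |
 | |  */ |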
2287 | | |
2288 | | /* |
2289 | | * Second stage of page deletion. |
2290 | | * |
2291 | | * Unlinks a single page (in the subtree undergoing deletion) from its |
2292 | | * siblings. Also marks the page deleted. |
2293 | | * |
2294 | | * To get rid of the whole subtree, including the leaf page itself, call here |
2295 | | * until the leaf page is deleted. The original "top parent" established in |
2296 | | * the first stage of deletion is deleted in the first call here, while the |
2297 | | * leaf page is deleted in the last call here. Note that the leaf page itself |
2298 | | * is often the initial top parent page. |
2299 | | * |
2300 | | * Returns 'false' if the page could not be unlinked (shouldn't happen). If |
2301 | | * the right sibling of the current target page is empty, *rightsib_empty is |
2302 | | * set to true, allowing caller to delete the target's right sibling page in |
2303 | | * passing. Note that *rightsib_empty is only actually used by caller when |
2304 | | * target page is leafbuf, following last call here for leafbuf/the subtree |
2305 | | * containing leafbuf. (We always set *rightsib_empty for caller, just to be |
2306 | | * consistent.) |
2307 | | * |
2308 | | * Must hold pin and lock on leafbuf at entry (read or write doesn't matter). |
2309 | | * On success exit, we'll be holding pin and write lock. On failure exit, |
2310 | | * we'll release both pin and lock before returning (we define it that way |
2311 | | * to avoid having to reacquire a lock we already released). |
2312 | | */ |
2313 | | static bool |
2314 | | _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, |
2315 | | bool *rightsib_empty, BTVacState *vstate) |
2316 | 0 | { |
2317 | 0 | BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); |
2318 | 0 | IndexBulkDeleteResult *stats = vstate->stats; |
2319 | 0 | BlockNumber leafleftsib; |
2320 | 0 | BlockNumber leafrightsib; |
2321 | 0 | BlockNumber target; |
2322 | 0 | BlockNumber leftsib; |
2323 | 0 | BlockNumber rightsib; |
2324 | 0 | Buffer lbuf = InvalidBuffer; |
2325 | 0 | Buffer buf; |
2326 | 0 | Buffer rbuf; |
2327 | 0 | Buffer metabuf = InvalidBuffer; |
2328 | 0 | Page metapg = NULL; |
2329 | 0 | BTMetaPageData *metad = NULL; |
2330 | 0 | ItemId itemid; |
2331 | 0 | Page page; |
2332 | 0 | BTPageOpaque opaque; |
2333 | 0 | FullTransactionId safexid; |
2334 | 0 | bool rightsib_is_rightmost; |
2335 | 0 | uint32 targetlevel; |
2336 | 0 | IndexTuple leafhikey; |
2337 | 0 | BlockNumber leaftopparent; |
2338 | |
|
2339 | 0 | page = BufferGetPage(leafbuf); |
2340 | 0 | opaque = BTPageGetOpaque(page); |
2341 | |
|
2342 | 0 | Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque)); |
2343 | | |
2344 | | /* |
2345 | | * Remember some information about the leaf page. |
2346 | | */ |
2347 | 0 | itemid = PageGetItemId(page, P_HIKEY); |
2348 | 0 | leafhikey = (IndexTuple) PageGetItem(page, itemid); |
2349 | 0 | target = BTreeTupleGetTopParent(leafhikey); |
2350 | 0 | leafleftsib = opaque->btpo_prev; |
2351 | 0 | leafrightsib = opaque->btpo_next; |
2352 | |
|
2353 | 0 | _bt_unlockbuf(rel, leafbuf); |
2354 | | |
2355 | | /* |
2356 | | * Check here, as calling loops will have locks held, preventing |
2357 | | * interrupts from being processed. |
2358 | | */ |
2359 | 0 | CHECK_FOR_INTERRUPTS(); |
2360 | | |
2361 | | /* Unlink the current top parent of the subtree */ |
2362 | 0 | if (!BlockNumberIsValid(target)) |
2363 | 0 | { |
2364 | | /* Target is leaf page (or leaf page is top parent, if you prefer) */ |
2365 | 0 | target = leafblkno; |
2366 | |
|
2367 | 0 | buf = leafbuf; |
2368 | 0 | leftsib = leafleftsib; |
2369 | 0 | targetlevel = 0; |
2370 | 0 | } |
2371 | 0 | else |
2372 | 0 | { |
2373 | | /* Target is the internal page taken from leaf's top parent link */ |
2374 | 0 | Assert(target != leafblkno); |
2375 | | |
2376 | | /* Fetch the block number of the target's left sibling */ |
2377 | 0 | buf = _bt_getbuf(rel, target, BT_READ); |
2378 | 0 | page = BufferGetPage(buf); |
2379 | 0 | opaque = BTPageGetOpaque(page); |
2380 | 0 | leftsib = opaque->btpo_prev; |
2381 | 0 | targetlevel = opaque->btpo_level; |
2382 | 0 | Assert(targetlevel > 0); |
2383 | | |
2384 | | /* |
2385 | | * To avoid deadlocks, we'd better drop the target page lock before |
2386 | | * going further. |
2387 | | */ |
2388 | 0 | _bt_unlockbuf(rel, buf); |
2389 | 0 | } |
2390 | | |
2391 | | /* |
2392 | | * We have to lock the pages we need to modify in the standard order: |
2393 | | * moving right, then up. Else we will deadlock against other writers. |
2394 | | * |
2395 | | * So, first lock the leaf page, if it's not the target. Then find and |
2396 | | * write-lock the current left sibling of the target page. The sibling |
2397 | | * that was current a moment ago could have split, so we may have to move |
2398 | | * right. |
2399 | | */ |
2400 | 0 | if (target != leafblkno) |
2401 | 0 | _bt_lockbuf(rel, leafbuf, BT_WRITE); |
2402 | 0 | if (leftsib != P_NONE) |
2403 | 0 | { |
2404 | 0 | lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); |
2405 | 0 | page = BufferGetPage(lbuf); |
2406 | 0 | opaque = BTPageGetOpaque(page); |
2407 | 0 | while (P_ISDELETED(opaque) || opaque->btpo_next != target) |
2408 | 0 | { |
2409 | 0 | bool leftsibvalid = true; |
2410 | | |
2411 | | /* |
2412 | | * Before we follow the link from the page that was the left |
2413 | | * sibling mere moments ago, validate its right link. This |
2414 | | * reduces the opportunities for the loop to fail to ever make any |
2415 | | * progress in the presence of index corruption. |
2416 | | * |
2417 | | * Note: we rely on the assumption that there can only be one |
2418 | | * vacuum process running at a time (against the same index). |
2419 | | */ |
2420 | 0 | if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) || |
2421 | 0 | leftsib == opaque->btpo_next) |
2422 | 0 | leftsibvalid = false; |
2423 | |
|
2424 | 0 | leftsib = opaque->btpo_next; |
2425 | 0 | _bt_relbuf(rel, lbuf); |
2426 | |
|
2427 | 0 | if (!leftsibvalid) |
2428 | 0 | { |
2429 | | /* |
2430 | | * This is known to fail in the field; sibling link corruption |
2431 | | * is relatively common. Press on with vacuuming rather than |
2432 | | * just throwing an ERROR. |
2433 | | */ |
2434 | 0 | ereport(LOG, |
2435 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2436 | 0 | errmsg_internal("valid left sibling for deletion target could not be located: " |
2437 | 0 | "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"", |
2438 | 0 | leftsib, target, leafblkno, scanblkno, |
2439 | 0 | targetlevel, RelationGetRelationName(rel)))); |
2440 | | |
2441 | | /* Must release all pins and locks on failure exit */ |
2442 | 0 | ReleaseBuffer(buf); |
2443 | 0 | if (target != leafblkno) |
2444 | 0 | _bt_relbuf(rel, leafbuf); |
2445 | |
|
2446 | 0 | return false; |
2447 | 0 | } |
2448 | | |
2449 | 0 | CHECK_FOR_INTERRUPTS(); |
2450 | | |
2451 | | /* step right one page */ |
2452 | 0 | lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); |
2453 | 0 | page = BufferGetPage(lbuf); |
2454 | 0 | opaque = BTPageGetOpaque(page); |
2455 | 0 | } |
2456 | 0 | } |
2457 | 0 | else |
2458 | 0 | lbuf = InvalidBuffer; |
2459 | | |
2460 | | /* Next write-lock the target page itself */ |
2461 | 0 | _bt_lockbuf(rel, buf, BT_WRITE); |
2462 | 0 | page = BufferGetPage(buf); |
2463 | 0 | opaque = BTPageGetOpaque(page); |
2464 | | |
2465 | | /* |
2466 | | * Check page is still empty etc, else abandon deletion. This is just for |
2467 | | * paranoia's sake; a half-dead page cannot resurrect because there can be |
2468 | | * only one vacuum process running at a time. |
2469 | | */ |
2470 | 0 | if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque)) |
2471 | 0 | elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"", |
2472 | 0 | target, RelationGetRelationName(rel)); |
2473 | | |
2474 | 0 | if (opaque->btpo_prev != leftsib) |
2475 | 0 | ereport(ERROR, |
2476 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2477 | 0 | errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"", |
2478 | 0 | leftsib, opaque->btpo_prev, target, |
2479 | 0 | RelationGetRelationName(rel)))); |
2480 | | |
2481 | 0 | if (target == leafblkno) |
2482 | 0 | { |
2483 | 0 | if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) || |
2484 | 0 | !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque)) |
2485 | 0 | elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"", |
2486 | 0 | target, RelationGetRelationName(rel)); |
2487 | | |
2488 | | /* Leaf page is also target page: don't set leaftopparent */ |
2489 | 0 | leaftopparent = InvalidBlockNumber; |
2490 | 0 | } |
2491 | 0 | else |
2492 | 0 | { |
2493 | 0 | IndexTuple finaldataitem; |
2494 | |
|
2495 | 0 | if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) || |
2496 | 0 | P_ISLEAF(opaque)) |
2497 | 0 | elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"", |
2498 | 0 | targetlevel, target, RelationGetRelationName(rel)); |
2499 | | |
2500 | | /* Target is internal: set leaftopparent for next call here... */ |
2501 | 0 | itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque)); |
2502 | 0 | finaldataitem = (IndexTuple) PageGetItem(page, itemid); |
2503 | 0 | leaftopparent = BTreeTupleGetDownLink(finaldataitem); |
2504 | | /* ...except when it would be a redundant pointer-to-self */ |
2505 | 0 | if (leaftopparent == leafblkno) |
2506 | 0 | leaftopparent = InvalidBlockNumber; |
2507 | 0 | } |
2508 | | |
2509 | | /* No leaftopparent for level 0 (leaf page) or level 1 target */ |
2510 | 0 | Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1); |
2511 | | |
2512 | | /* |
2513 | | * And next write-lock the (current) right sibling. |
2514 | | */ |
2515 | 0 | rightsib = opaque->btpo_next; |
2516 | 0 | rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); |
2517 | 0 | page = BufferGetPage(rbuf); |
2518 | 0 | opaque = BTPageGetOpaque(page); |
2519 | | |
2520 | | /* |
2521 | | * Validate target's right sibling page. Its left link must point back to |
2522 | | * the target page. |
2523 | | */ |
2524 | 0 | if (opaque->btpo_prev != target) |
2525 | 0 | { |
2526 | | /* |
2527 | | * This is known to fail in the field; sibling link corruption is |
2528 | | * relatively common. Press on with vacuuming rather than just |
2529 | | * throwing an ERROR (same approach used for left-sibling's-right-link |
2530 | | * validation check a moment ago). |
2531 | | */ |
2532 | 0 | ereport(LOG, |
2533 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2534 | 0 | errmsg_internal("right sibling's left-link doesn't match: " |
2535 | 0 | "right sibling %u of target %u with leafblkno %u " |
2536 | 0 | "and scanblkno %u spuriously links to non-target %u " |
2537 | 0 | "on level %u of index \"%s\"", |
2538 | 0 | rightsib, target, leafblkno, |
2539 | 0 | scanblkno, opaque->btpo_prev, |
2540 | 0 | targetlevel, RelationGetRelationName(rel)))); |
2541 | | |
2542 | | /* Must release all pins and locks on failure exit */ |
2543 | 0 | if (BufferIsValid(lbuf)) |
2544 | 0 | _bt_relbuf(rel, lbuf); |
2545 | 0 | _bt_relbuf(rel, rbuf); |
2546 | 0 | _bt_relbuf(rel, buf); |
2547 | 0 | if (target != leafblkno) |
2548 | 0 | _bt_relbuf(rel, leafbuf); |
2549 | |
|
2550 | 0 | return false; |
2551 | 0 | } |
2552 | | |
2553 | 0 | rightsib_is_rightmost = P_RIGHTMOST(opaque); |
2554 | 0 | *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); |
2555 | | |
2556 | | /* |
2557 | | * If we are deleting the next-to-last page on the target's level, then |
2558 | | * the rightsib is a candidate to become the new fast root. (In theory, it |
2559 | | * might be possible to push the fast root even further down, but the odds |
2560 | | * of doing so are slim, and the locking considerations daunting.) |
2561 | | * |
2562 | | * We can safely acquire a lock on the metapage here --- see comments for |
2563 | | * _bt_newlevel(). |
2564 | | */ |
2565 | 0 | if (leftsib == P_NONE && rightsib_is_rightmost) |
2566 | 0 | { |
2567 | 0 | page = BufferGetPage(rbuf); |
2568 | 0 | opaque = BTPageGetOpaque(page); |
2569 | 0 | if (P_RIGHTMOST(opaque)) |
2570 | 0 | { |
2571 | | /* rightsib will be the only one left on the level */ |
2572 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
2573 | 0 | metapg = BufferGetPage(metabuf); |
2574 | 0 | metad = BTPageGetMeta(metapg); |
2575 | | |
2576 | | /* |
2577 | | * The expected case here is btm_fastlevel == targetlevel+1; if |
2578 | | * the fastlevel is <= targetlevel, something is wrong, and we |
2579 | | * choose to overwrite it to fix it. |
2580 | | */ |
2581 | 0 | if (metad->btm_fastlevel > targetlevel + 1) |
2582 | 0 | { |
2583 | | /* no update wanted */ |
2584 | 0 | _bt_relbuf(rel, metabuf); |
2585 | 0 | metabuf = InvalidBuffer; |
2586 | 0 | } |
2587 | 0 | } |
2588 | 0 | } |
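 | | /* |
 | |  * For example: if the target's level had only two pages and we are |
 | |  * deleting the non-rightmost one, rightsib is about to become the only |
 | |  * page on that level, so pointing btm_fastroot at it lets future descents |
 | |  * skip the single-child levels above it. |
 | |  */ |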
2589 | | |
2590 | | /* |
2591 | | * Here we begin doing the deletion. |
2592 | | */ |
2593 | | |
2594 | | /* No ereport(ERROR) until changes are logged */ |
2595 | 0 | START_CRIT_SECTION(); |
2596 | | |
2597 | | /* |
2598 | | * Update siblings' side-links. Note the target page's side-links will |
2599 | | * continue to point to the siblings. Asserts here are just rechecking |
2600 | | * things we already verified above. |
2601 | | */ |
2602 | 0 | if (BufferIsValid(lbuf)) |
2603 | 0 | { |
2604 | 0 | page = BufferGetPage(lbuf); |
2605 | 0 | opaque = BTPageGetOpaque(page); |
2606 | 0 | Assert(opaque->btpo_next == target); |
2607 | 0 | opaque->btpo_next = rightsib; |
2608 | 0 | } |
2609 | 0 | page = BufferGetPage(rbuf); |
2610 | 0 | opaque = BTPageGetOpaque(page); |
2611 | 0 | Assert(opaque->btpo_prev == target); |
2612 | 0 | opaque->btpo_prev = leftsib; |
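 | | /* |
 | |  * Sketch of the state after the two assignments above, with target page X |
 | |  * between siblings L and R: L.btpo_next and R.btpo_prev now bypass X, |
 | |  * while X itself keeps pointing at L and R, so a concurrent scan that |
 | |  * already followed a stale link to X can still use X's right-link to |
 | |  * step off of it. |
 | |  */ |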
2613 | | |
2614 | | /* |
2615 | | * If we deleted a parent of the targeted leaf page, instead of the leaf |
2616 | | * itself, update the leaf to point to the next remaining child in the |
2617 | | * subtree. |
2618 | | * |
2619 | | * Note: We rely on the fact that a buffer pin on the leaf page has been |
2620 | | * held since leafhikey was initialized. This is safe, though only |
2621 | | * because the page was already half-dead at that point. The leaf page |
2622 | | * cannot have been modified by any other backend during the period when |
2623 | | * no lock was held. |
2624 | | */ |
2625 | 0 | if (target != leafblkno) |
2626 | 0 | BTreeTupleSetTopParent(leafhikey, leaftopparent); |
2627 | | |
2628 | | /* |
2629 | | * Mark the page itself deleted. It can be recycled when all current |
2630 | | * transactions are gone. Storing GetTopTransactionId() would work, but |
2631 | | * we're in VACUUM and would not otherwise have an XID. Having already |
2632 | | * updated links to the target, ReadNextFullTransactionId() suffices as an |
2633 | | * upper bound. Any scan having retained a now-stale link is advertising |
2634 | | * in its PGPROC an xmin less than or equal to the value we read here. It |
2635 | | * will continue to do so, holding back the xmin horizon, for the duration |
2636 | | * of that scan. |
2637 | | */ |
2638 | 0 | page = BufferGetPage(buf); |
2639 | 0 | opaque = BTPageGetOpaque(page); |
2640 | 0 | Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque)); |
2641 | | |
2642 | | /* |
2643 | | * Store upper bound XID that's used to determine when deleted page is no |
2644 | | * longer needed as a tombstone |
2645 | | */ |
2646 | 0 | safexid = ReadNextFullTransactionId(); |
2647 | 0 | BTPageSetDeleted(page, safexid); |
2648 | 0 | opaque->btpo_cycleid = 0; |
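 | | /* |
 | |  * Note that the page is not recycled here; it only becomes safe to hand |
 | |  * to the FSM once safexid is older than every transaction that might |
 | |  * still hold a stale link to it (the _bt_pendingfsm_* functions and |
 | |  * nbtree/README cover the recycling details). |
 | |  */ |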
2649 | | |
2650 | | /* And update the metapage, if needed */ |
2651 | 0 | if (BufferIsValid(metabuf)) |
2652 | 0 | { |
2653 | | /* upgrade metapage if needed */ |
2654 | 0 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
2655 | 0 | _bt_upgrademetapage(metapg); |
2656 | 0 | metad->btm_fastroot = rightsib; |
2657 | 0 | metad->btm_fastlevel = targetlevel; |
2658 | 0 | MarkBufferDirty(metabuf); |
2659 | 0 | } |
2660 | | |
2661 | | /* Must mark buffers dirty before XLogInsert */ |
2662 | 0 | MarkBufferDirty(rbuf); |
2663 | 0 | MarkBufferDirty(buf); |
2664 | 0 | if (BufferIsValid(lbuf)) |
2665 | 0 | MarkBufferDirty(lbuf); |
2666 | 0 | if (target != leafblkno) |
2667 | 0 | MarkBufferDirty(leafbuf); |
2668 | | |
2669 | | /* XLOG stuff */ |
2670 | 0 | if (RelationNeedsWAL(rel)) |
2671 | 0 | { |
2672 | 0 | xl_btree_unlink_page xlrec; |
2673 | 0 | xl_btree_metadata xlmeta; |
2674 | 0 | uint8 xlinfo; |
2675 | 0 | XLogRecPtr recptr; |
2676 | |
2677 | 0 | XLogBeginInsert(); |
2678 | |
2679 | 0 | XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT); |
2680 | 0 | if (BufferIsValid(lbuf)) |
2681 | 0 | XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); |
2682 | 0 | XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); |
2683 | 0 | if (target != leafblkno) |
2684 | 0 | XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT); |
2685 | | |
2686 | | /* information stored on the target/to-be-unlinked block */ |
2687 | 0 | xlrec.leftsib = leftsib; |
2688 | 0 | xlrec.rightsib = rightsib; |
2689 | 0 | xlrec.level = targetlevel; |
2690 | 0 | xlrec.safexid = safexid; |
2691 | | |
2692 | | /* information needed to recreate the leaf block (if not the target) */ |
2693 | 0 | xlrec.leafleftsib = leafleftsib; |
2694 | 0 | xlrec.leafrightsib = leafrightsib; |
2695 | 0 | xlrec.leaftopparent = leaftopparent; |
2696 | |
2697 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage); |
2698 | |
2699 | 0 | if (BufferIsValid(metabuf)) |
2700 | 0 | { |
2701 | 0 | XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); |
2702 | |
2703 | 0 | Assert(metad->btm_version >= BTREE_NOVAC_VERSION); |
2704 | 0 | xlmeta.version = metad->btm_version; |
2705 | 0 | xlmeta.root = metad->btm_root; |
2706 | 0 | xlmeta.level = metad->btm_level; |
2707 | 0 | xlmeta.fastroot = metad->btm_fastroot; |
2708 | 0 | xlmeta.fastlevel = metad->btm_fastlevel; |
2709 | 0 | xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; |
2710 | 0 | xlmeta.allequalimage = metad->btm_allequalimage; |
2711 | |
2712 | 0 | XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata)); |
2713 | 0 | xlinfo = XLOG_BTREE_UNLINK_PAGE_META; |
2714 | 0 | } |
2715 | 0 | else |
2716 | 0 | xlinfo = XLOG_BTREE_UNLINK_PAGE; |
2717 | |
2718 | 0 | recptr = XLogInsert(RM_BTREE_ID, xlinfo); |
2719 | |
2720 | 0 | if (BufferIsValid(metabuf)) |
2721 | 0 | { |
2722 | 0 | PageSetLSN(metapg, recptr); |
2723 | 0 | } |
2724 | 0 | page = BufferGetPage(rbuf); |
2725 | 0 | PageSetLSN(page, recptr); |
2726 | 0 | page = BufferGetPage(buf); |
2727 | 0 | PageSetLSN(page, recptr); |
2728 | 0 | if (BufferIsValid(lbuf)) |
2729 | 0 | { |
2730 | 0 | page = BufferGetPage(lbuf); |
2731 | 0 | PageSetLSN(page, recptr); |
2732 | 0 | } |
2733 | 0 | if (target != leafblkno) |
2734 | 0 | { |
2735 | 0 | page = BufferGetPage(leafbuf); |
2736 | 0 | PageSetLSN(page, recptr); |
2737 | 0 | } |
2738 | 0 | } |
2739 | |
2740 | 0 | END_CRIT_SECTION(); |
2741 | | |
2742 | | /* release metapage */ |
2743 | 0 | if (BufferIsValid(metabuf)) |
2744 | 0 | _bt_relbuf(rel, metabuf); |
2745 | | |
2746 | | /* release siblings */ |
2747 | 0 | if (BufferIsValid(lbuf)) |
2748 | 0 | _bt_relbuf(rel, lbuf); |
2749 | 0 | _bt_relbuf(rel, rbuf); |
2750 | | |
2751 | | /* If the target is not leafbuf, we're done with it now -- release it */ |
2752 | 0 | if (target != leafblkno) |
2753 | 0 | _bt_relbuf(rel, buf); |
2754 | | |
2755 | | /* |
2756 | | * Maintain pages_newly_deleted, which is simply the number of pages |
2757 | | * deleted by the ongoing VACUUM operation. |
2758 | | * |
2759 | | * Maintain pages_deleted in a way that takes into account how |
2760 | | * btvacuumpage() will count deleted pages that have yet to become |
2761 | | * scanblkno -- only count the page when it's not going to get that
2762 | | * treatment later on.
2763 | | */ |
2764 | 0 | stats->pages_newly_deleted++; |
2765 | 0 | if (target <= scanblkno) |
2766 | 0 | stats->pages_deleted++; |
2767 | | |
2768 | | /* |
2769 | | * Remember information about the target page (now a newly deleted page) |
2770 | | * in dedicated vstate space for later. The page will be considered as a |
2771 | | * candidate to place in the FSM at the end of the current btvacuumscan() |
2772 | | * call. |
2773 | | */ |
2774 | 0 | _bt_pendingfsm_add(vstate, target, safexid); |
2775 | | |
2776 | | /* Success - hold on to lock on leafbuf (might also have been target) */ |
2777 | 0 | return true; |
2778 | 0 | } |
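
/*
 * Illustrative sketch (not part of nbtpage.c): the safexid stamped onto the
 * deleted page above acts as a tombstone.  The page may only be recycled
 * once no running transaction could still be following a stale sibling link
 * to it, which conceptually reduces to the one-line comparison below.  The
 * function name and the "horizon" argument are hypothetical; the real check
 * is BTPageIsRecyclable(), which goes through
 * GlobalVisCheckRemovableFullXid().
 */
#ifdef NOT_USED
static bool
deleted_page_recyclable_sketch(FullTransactionId safexid,
							   FullTransactionId horizon)
{
	/* recyclable once safexid precedes the oldest XID anyone still needs */
	return FullTransactionIdPrecedes(safexid, horizon);
}
#endif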
2779 | | |
2780 | | /* |
2781 | | * Establish how tall the to-be-deleted subtree will be during the first stage |
2782 | | * of page deletion. |
2783 | | * |
2784 | | * Caller's child argument is the block number of the page caller wants to |
2785 | | * delete (this is leafbuf's block number, except when we're called |
2786 | | * recursively). stack is a search stack leading to it. Note that we will |
2787 | | * update the stack entry(s) to reflect current downlink positions --- this is |
2788 | | * similar to the corresponding point in page split handling. |
2789 | | * |
2790 | | * If "first stage" caller cannot go ahead with deleting _any_ pages, returns |
2791 | | * false. Returns true on success, in which case caller can use certain |
2792 | | * details established here to perform the first stage of deletion. This |
2793 | | * function is the last point at which page deletion may be deemed unsafe |
2794 | | * (barring index corruption, or unexpected concurrent page deletions). |
2795 | | * |
2796 | | * We write lock the parent of the root of the to-be-deleted subtree for |
2797 | | * caller on success (i.e. we leave our lock on the *subtreeparent buffer for |
2798 | | * caller). Caller will have to remove a downlink from *subtreeparent. We |
2799 | | * also set a *subtreeparent offset number in *poffset, to indicate the |
2800 | | * location of the pivot tuple that contains the relevant downlink. |
2801 | | * |
2802 | | * The root of the to-be-deleted subtree is called the "top parent". Note |
2803 | | * that the leafbuf page is often the final "top parent" page (you can think |
2804 | | * of the leafbuf page as a degenerate single page subtree when that happens). |
2805 | | * Caller should initialize *topparent to the target leafbuf page block number |
2806 | | * (while *topparentrightsib should be set to leafbuf's right sibling block |
2807 | | * number). We will update *topparent (and *topparentrightsib) for caller |
2808 | | * here, though only when it turns out that caller will delete at least one |
2809 | | * internal page (i.e. only when caller needs to store a valid link to the top |
2810 | | * parent block in the leafbuf page using BTreeTupleSetTopParent()). |
2811 | | */ |
2812 | | static bool |
2813 | | _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child, |
2814 | | BTStack stack, Buffer *subtreeparent, |
2815 | | OffsetNumber *poffset, BlockNumber *topparent, |
2816 | | BlockNumber *topparentrightsib) |
2817 | 0 | { |
2818 | 0 | BlockNumber parent, |
2819 | 0 | leftsibparent; |
2820 | 0 | OffsetNumber parentoffset, |
2821 | 0 | maxoff; |
2822 | 0 | Buffer pbuf; |
2823 | 0 | Page page; |
2824 | 0 | BTPageOpaque opaque; |
2825 | | |
2826 | | /* |
2827 | | * Locate the pivot tuple whose downlink points to "child". Write lock |
2828 | | * the parent page itself. |
2829 | | */ |
2830 | 0 | pbuf = _bt_getstackbuf(rel, heaprel, stack, child); |
2831 | 0 | if (pbuf == InvalidBuffer) |
2832 | 0 | { |
2833 | | /* |
2834 | | * Failed to "re-find" a pivot tuple whose downlink matched our child |
2835 | | * block number on the parent level -- the index must be corrupt. |
2836 | | * Don't even try to delete the leafbuf subtree. Just report the |
2837 | | * issue and press on with vacuuming the index. |
2838 | | * |
2839 | | * Note: _bt_getstackbuf() recovers from concurrent page splits that |
2840 | | * take place on the parent level. Its approach is a near-exhaustive |
2841 | | * linear search. This also gives it a surprisingly good chance of |
2842 | | * recovering in the event of a buggy or inconsistent opclass. But we |
2843 | | * don't rely on that here. |
2844 | | */ |
2845 | 0 | ereport(LOG, |
2846 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2847 | 0 | errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u", |
2848 | 0 | RelationGetRelationName(rel), child))); |
2849 | 0 | Assert(false); |
2850 | 0 | return false; |
2851 | 0 | } |
2852 | | |
2853 | 0 | parent = stack->bts_blkno; |
2854 | 0 | parentoffset = stack->bts_offset; |
2855 | |
2856 | 0 | page = BufferGetPage(pbuf); |
2857 | 0 | opaque = BTPageGetOpaque(page); |
2858 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
2859 | 0 | leftsibparent = opaque->btpo_prev; |
2860 | | |
2861 | | /* |
2862 | | * _bt_getstackbuf() completes page splits on returned parent buffer when |
2863 | | * required. |
2864 | | * |
2865 | | * In general it's a bad idea for VACUUM to use up more disk space, which |
2866 | | * is why page deletion does not finish incomplete page splits most of the |
2867 | | * time. We allow this limited exception because the risk is much lower, |
2868 | | * and the potential downside of not proceeding is much higher: A single |
2869 | | * internal page with the INCOMPLETE_SPLIT flag set might otherwise |
2870 | | * prevent us from deleting hundreds of empty leaf pages from one level |
2871 | | * down. |
2872 | | */ |
2873 | 0 | Assert(!P_INCOMPLETE_SPLIT(opaque)); |
2874 | |
2875 | 0 | if (parentoffset < maxoff) |
2876 | 0 | { |
2877 | | /* |
2878 | | * Child is not the rightmost child in parent, so it's safe to delete |
2879 | | * the subtree whose root/topparent is child page |
2880 | | */ |
2881 | 0 | *subtreeparent = pbuf; |
2882 | 0 | *poffset = parentoffset; |
2883 | 0 | return true; |
2884 | 0 | } |
2885 | | |
2886 | | /* |
2887 | | * Child is the rightmost child of parent. |
2888 | | * |
2889 | | * Since it's the rightmost child of parent, deleting the child (or |
2890 | | * deleting the subtree whose root/topparent is the child page) is only |
2891 | | * safe when it's also possible to delete the parent. |
2892 | | */ |
2893 | 0 | Assert(parentoffset == maxoff); |
2894 | 0 | if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque)) |
2895 | 0 | { |
2896 | | /* |
2897 | | * Child isn't parent's only child, or parent is rightmost on its |
2898 | | * entire level. Definitely cannot delete any pages. |
2899 | | */ |
2900 | 0 | _bt_relbuf(rel, pbuf); |
2901 | 0 | return false; |
2902 | 0 | } |
2903 | | |
2904 | | /* |
2905 | | * Now make sure that the parent deletion is itself safe by examining the |
2906 | | * child's grandparent page. Recurse, passing the parent page as the |
2907 | | * child page (child's grandparent is the parent on the next level up). If |
2908 | | * parent deletion is unsafe, then child deletion must also be unsafe (in |
2909 | | * which case caller cannot delete any pages at all). |
2910 | | */ |
2911 | 0 | *topparent = parent; |
2912 | 0 | *topparentrightsib = opaque->btpo_next; |
2913 | | |
2914 | | /* |
2915 | | * Release lock on parent before recursing. |
2916 | | * |
2917 | | * It's OK to release page locks on parent before recursive call locks |
2918 | | * grandparent. An internal page can only acquire an entry if the child |
2919 | | * is split, but that cannot happen as long as we still hold a lock on the |
2920 | | * leafbuf page. |
2921 | | */ |
2922 | 0 | _bt_relbuf(rel, pbuf); |
2923 | | |
2924 | | /* |
2925 | | * Before recursing, check that the left sibling of parent (if any) is not |
2926 | | * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the |
2927 | | * parent lock). |
2928 | | * |
2929 | | * Note: We deliberately avoid completing incomplete splits here. |
2930 | | */ |
2931 | 0 | if (_bt_leftsib_splitflag(rel, leftsibparent, parent)) |
2932 | 0 | return false; |
2933 | | |
2934 | | /* Recurse to examine child page's grandparent page */ |
2935 | 0 | return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent, |
2936 | 0 | subtreeparent, poffset, |
2937 | 0 | topparent, topparentrightsib); |
2938 | 0 | } |
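
/*
 * Illustrative sketch (not part of nbtpage.c): the shape of the decision
 * _bt_lock_subtree_parent() above makes, replayed over a toy in-memory
 * parent chain instead of real buffers and stack entries.  A child may be
 * deleted directly when its downlink is not the last one on the parent
 * page; when it is the parent's only downlink (and the parent is not
 * rightmost on its level), the parent joins the to-be-deleted subtree and
 * the same question moves one level up.  All names below are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct ToyPage
{
	int			ndownlinks;		/* downlinks on this parent page */
	int			childpos;		/* 0-based position of our downlink */
	bool		rightmost;		/* rightmost page on its level? */
	struct ToyPage *parent;		/* next level up, NULL above the root */
} ToyPage;

/* Returns true when deletion is safe; *topparent ends at the subtree root */
static bool
toy_subtree_deletable(ToyPage *parent, ToyPage **topparent)
{
	if (parent == NULL)
		return false;			/* ran out of levels */
	if (parent->childpos < parent->ndownlinks - 1)
		return true;			/* not the last downlink: safe */
	if (parent->ndownlinks > 1 || parent->rightmost)
		return false;			/* last-but-not-only child, or rightmost parent */
	*topparent = parent;		/* parent joins the subtree; recurse upwards */
	return toy_subtree_deletable(parent->parent, topparent);
}

int
main(void)
{
	ToyPage		grandparent = {3, 1, false, NULL};		/* room to spare */
	ToyPage		parent = {1, 0, false, &grandparent};	/* only one downlink */
	ToyPage    *topparent = NULL;

	printf("deletable: %d\n", toy_subtree_deletable(&parent, &topparent));
	printf("parent joins subtree: %d\n", topparent == &parent);
	return 0;
}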
2939 | | |
2940 | | /* |
2941 | | * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize |
2942 | | * optimization. |
2943 | | * |
2944 | | * Called at the start of a btvacuumscan(). Caller's cleanuponly argument |
2945 | | * indicates if ongoing VACUUM has not (and will not) call btbulkdelete(). |
2946 | | * |
2947 | | * We expect to allocate memory inside VACUUM's top-level memory context here. |
2948 | | * The working buffer is subject to a limit based on work_mem. Our strategy |
2949 | | * when the array can no longer grow within the bounds of that limit is to |
2950 | | * stop saving additional newly deleted pages, while proceeding as usual with |
2951 | | * the pages that we can fit. |
2952 | | */ |
2953 | | void |
2954 | | _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly) |
2955 | 0 | { |
2956 | 0 | Size maxbufsize; |
2957 | | |
2958 | | /* |
2959 | | * Don't bother with optimization in cleanup-only case -- we don't expect |
2960 | | * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan() |
2961 | | * can only take place because this optimization didn't work out during |
2962 | | * the last VACUUM. |
2963 | | */ |
2964 | 0 | if (cleanuponly) |
2965 | 0 | return; |
2966 | | |
2967 | | /* |
2968 | | * Cap maximum size of array so that we always respect work_mem. Avoid |
2969 | | * int overflow here. |
2970 | | */ |
2971 | 0 | vstate->bufsize = 256; |
2972 | 0 | maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM); |
2973 | 0 | maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM)); |
2974 | | /* BTVacState.maxbufsize has type int */ |
2975 | 0 | maxbufsize = Min(maxbufsize, INT_MAX); |
2976 | | /* Stay sane with small work_mem */ |
2977 | 0 | maxbufsize = Max(maxbufsize, vstate->bufsize); |
2978 | 0 | vstate->maxbufsize = (int) maxbufsize; |
2979 | | |
2980 | | /* Allocate buffer, indicate that there are currently 0 pending pages */ |
2981 | 0 | vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize); |
2982 | 0 | vstate->npendingpages = 0; |
2983 | 0 | } |
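
/*
 * Illustrative sketch (not part of nbtpage.c): a worked example of the
 * clamping arithmetic above, written as a standalone program so the numbers
 * can be checked in isolation.  The 16-byte entry size (a typical
 * sizeof(BTPendingFSM) on 64-bit builds) and the 4 MB default work_mem are
 * assumptions for the example; MAX_ALLOC_SIZE mirrors MaxAllocSize.
 */
#include <limits.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_ALLOC_SIZE ((size_t) 0x3fffffff)	/* ~1 GB, as in PostgreSQL */

int
main(void)
{
	size_t		entrysize = 16;		/* assumed sizeof(BTPendingFSM) */
	size_t		work_mem_kb = 4096; /* default work_mem: 4 MB */
	size_t		maxbufsize;

	maxbufsize = (work_mem_kb * 1024) / entrysize;	/* 262144 */
	if (maxbufsize > MAX_ALLOC_SIZE / entrysize)	/* 67108863 */
		maxbufsize = MAX_ALLOC_SIZE / entrysize;
	if (maxbufsize > INT_MAX)
		maxbufsize = INT_MAX;
	if (maxbufsize < 256)			/* never below the initial bufsize */
		maxbufsize = 256;

	printf("maxbufsize = %zu entries\n", maxbufsize);	/* prints 262144 */
	return 0;
}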
2984 | | |
2985 | | /* |
2986 | | * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during |
2987 | | * the ongoing VACUUM operation) into the free space map -- though only when |
2988 | | * it is actually safe to do so by now. |
2989 | | * |
2990 | | * Called at the end of a btvacuumscan(), just before free space map vacuuming |
2991 | | * takes place. |
2992 | | * |
2993 | | * Frees memory allocated by _bt_pendingfsm_init(), if any. |
2994 | | */ |
2995 | | void |
2996 | | _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate) |
2997 | 0 | { |
2998 | 0 | IndexBulkDeleteResult *stats = vstate->stats; |
2999 | 0 | Relation heaprel = vstate->info->heaprel; |
3000 | |
3001 | 0 | Assert(stats->pages_newly_deleted >= vstate->npendingpages); |
3002 | 0 | Assert(heaprel != NULL); |
3003 | |
3004 | 0 | if (vstate->npendingpages == 0) |
3005 | 0 | { |
3006 | | /* Just free memory when nothing to do */ |
3007 | 0 | if (vstate->pendingpages) |
3008 | 0 | pfree(vstate->pendingpages); |
3009 | |
3010 | 0 | return; |
3011 | 0 | } |
3012 | | |
3013 | | #ifdef DEBUG_BTREE_PENDING_FSM |
3014 | | |
3015 | | /* |
3016 | | * Debugging aid: Sleep for 5 seconds to greatly increase the chances of |
3017 | | * placing pending pages in the FSM. Note that the optimization will |
3018 | | * never be effective without some other backend concurrently consuming an |
3019 | | * XID. |
3020 | | */ |
3021 | | pg_usleep(5000000L); |
3022 | | #endif |
3023 | | |
3024 | | /* |
3025 | | * Recompute VACUUM XID boundaries. |
3026 | | * |
3027 | | * We don't actually care about the oldest non-removable XID. Computing |
3028 | | * the oldest such XID has a useful side-effect that we rely on: it |
3029 | | * forcibly updates the XID horizon state for this backend. This step is |
3030 | | * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize |
3031 | | * that it is now safe to recycle newly deleted pages without this step. |
3032 | | */ |
3033 | 0 | GetOldestNonRemovableTransactionId(heaprel); |
3034 | |
3035 | 0 | for (int i = 0; i < vstate->npendingpages; i++) |
3036 | 0 | { |
3037 | 0 | BlockNumber target = vstate->pendingpages[i].target; |
3038 | 0 | FullTransactionId safexid = vstate->pendingpages[i].safexid; |
3039 | | |
3040 | | /* |
3041 | | * Do the equivalent of checking BTPageIsRecyclable(), but without |
3042 | | * accessing the page a second time.
3043 | | *
3044 | | * Stop upon finding the first non-recyclable page -- all later pages
3045 | | * must be non-recyclable too, since _bt_pendingfsm_add() adds pages |
3046 | | * to the array in safexid order. |
3047 | | */ |
3048 | 0 | if (!GlobalVisCheckRemovableFullXid(heaprel, safexid)) |
3049 | 0 | break; |
3050 | | |
3051 | 0 | RecordFreeIndexPage(rel, target); |
3052 | 0 | stats->pages_free++; |
3053 | 0 | } |
3054 | |
3055 | 0 | pfree(vstate->pendingpages); |
3056 | 0 | } |
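
/*
 * Illustrative sketch (not part of nbtpage.c): why the loop in
 * _bt_pendingfsm_finalize() may stop at the first non-recyclable entry.
 * _bt_pendingfsm_add() appends pages in ascending safexid order, so
 * "recyclable" is monotone over the array -- once one safexid is still
 * needed, every later (greater or equal) safexid is too.  Standalone
 * demonstration, with plain integers standing in for FullTransactionIds and
 * a fixed horizon.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t	safexid[] = {100, 105, 110, 200, 205};	/* ascending order */
	uint64_t	horizon = 120;	/* stand-in for the removable-XID horizon */
	int			nfreed = 0;

	for (int i = 0; i < 5; i++)
	{
		bool		recyclable = (safexid[i] < horizon);	/* stand-in check */

		if (!recyclable)
			break;				/* later entries are >= safexid[i], so stop */
		nfreed++;				/* here nbtpage.c calls RecordFreeIndexPage() */
	}
	printf("recycled %d of 5 pending pages\n", nfreed);	/* prints 3 */
	return 0;
}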
3057 | | |
3058 | | /* |
3059 | | * Maintain array of pages that were deleted during current btvacuumscan() |
3060 | | * call, for use in _bt_pendingfsm_finalize() |
3061 | | */ |
3062 | | static void |
3063 | | _bt_pendingfsm_add(BTVacState *vstate, |
3064 | | BlockNumber target, |
3065 | | FullTransactionId safexid) |
3066 | 0 | { |
3067 | 0 | Assert(vstate->npendingpages <= vstate->bufsize); |
3068 | 0 | Assert(vstate->bufsize <= vstate->maxbufsize); |
3069 | |
3070 | | #ifdef USE_ASSERT_CHECKING |
3071 | | |
3072 | | /* |
3073 | | * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the |
3074 | | * array will always be in safexid order (since that is the order that we |
3075 | | * save them in here) |
3076 | | */ |
3077 | | if (vstate->npendingpages > 0) |
3078 | | { |
3079 | | FullTransactionId lastsafexid = |
3080 | | vstate->pendingpages[vstate->npendingpages - 1].safexid; |
3081 | | |
3082 | | Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid)); |
3083 | | } |
3084 | | #endif |
3085 | | |
3086 | | /* |
3087 | | * If the temp buffer has reached its maxbufsize/work_mem capacity,
3088 | | * then we discard information about this page.
3089 | | * |
3090 | | * Note that this also covers the case where we opted to not use the |
3091 | | * optimization in _bt_pendingfsm_init(). |
3092 | | */ |
3093 | 0 | if (vstate->npendingpages == vstate->maxbufsize) |
3094 | 0 | return; |
3095 | | |
3096 | | /* Consider enlarging buffer */ |
3097 | 0 | if (vstate->npendingpages == vstate->bufsize) |
3098 | 0 | { |
3099 | 0 | int newbufsize = vstate->bufsize * 2; |
3100 | | |
3101 | | /* Respect work_mem */ |
3102 | 0 | if (newbufsize > vstate->maxbufsize) |
3103 | 0 | newbufsize = vstate->maxbufsize; |
3104 | |
3105 | 0 | vstate->bufsize = newbufsize; |
3106 | 0 | vstate->pendingpages = |
3107 | 0 | repalloc(vstate->pendingpages, |
3108 | 0 | sizeof(BTPendingFSM) * vstate->bufsize); |
3109 | 0 | } |
3110 | | |
3111 | | /* Save metadata for newly deleted page */ |
3112 | 0 | vstate->pendingpages[vstate->npendingpages].target = target; |
3113 | 0 | vstate->pendingpages[vstate->npendingpages].safexid = safexid; |
3114 | 0 | vstate->npendingpages++; |
3115 | 0 | } |
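
/*
 * Illustrative sketch (not part of nbtpage.c): the buffer in
 * _bt_pendingfsm_add() grows geometrically from the initial 256 entries and
 * is clamped at maxbufsize, so repalloc() runs only O(log N) times no
 * matter how many pages get deleted.  Standalone demonstration of the
 * growth schedule for an assumed maxbufsize of 262144 (the 4 MB work_mem
 * case worked out after _bt_pendingfsm_init() above).
 */
#include <stdio.h>

int
main(void)
{
	int			bufsize = 256;
	int			maxbufsize = 262144;	/* assumed cap derived from work_mem */
	int			ngrowths = 0;

	while (bufsize < maxbufsize)
	{
		bufsize *= 2;
		if (bufsize > maxbufsize)
			bufsize = maxbufsize;
		ngrowths++;
	}
	printf("%d doublings to reach %d entries\n", ngrowths, bufsize);	/* 10 */
	return 0;
}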