/src/postgres/src/backend/storage/freespace/fsmpage.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * fsmpage.c |
4 | | * routines to search and manipulate one FSM page. |
5 | | * |
6 | | * |
7 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
8 | | * Portions Copyright (c) 1994, Regents of the University of California |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/storage/freespace/fsmpage.c |
12 | | * |
13 | | * NOTES: |
14 | | * |
15 | | * The public functions in this file form an API that hides the internal |
16 | | * structure of a FSM page. This allows freespace.c to treat each FSM page |
17 | | * as a black box with SlotsPerPage "slots". fsm_set_avail() and |
18 | | * fsm_get_avail() let you get/set the value of a slot, and |
19 | | * fsm_search_avail() lets you search for a slot with value >= X. |
20 | | * |
21 | | *------------------------------------------------------------------------- |
22 | | */ |
23 | | #include "postgres.h" |
24 | | |
25 | | #include "storage/bufmgr.h" |
26 | | #include "storage/fsm_internals.h" |
27 | | |
28 | | /* Macros to navigate the tree within a page. Root has index zero. */ |
29 | 0 | #define leftchild(x) (2 * (x) + 1) |
30 | | #define rightchild(x) (2 * (x) + 2) |
31 | 0 | #define parentof(x) (((x) - 1) / 2) |
32 | | |
33 | | /* |
34 | | * Find right neighbor of x, wrapping around within the level |
35 | | */ |
36 | | static int |
37 | | rightneighbor(int x) |
38 | 0 | { |
39 | | /* |
40 | | * Move right. This might wrap around, stepping to the leftmost node at |
41 | | * the next level. |
42 | | */ |
43 | 0 | x++; |
44 | | |
45 | | /* |
46 | | * Check if we stepped to the leftmost node at next level, and correct if |
47 | | * so. The leftmost nodes at each level are numbered x = 2^level - 1, so |
48 | | * check if (x + 1) is a power of two, using a standard |
49 | | * twos-complement-arithmetic trick. |
50 | | */ |
51 | 0 | if (((x + 1) & x) == 0) |
52 | 0 | x = parentof(x); |
53 | |
|
54 | 0 | return x; |
55 | 0 | } |
56 | | |
57 | | /* |
58 | | * Sets the value of a slot on page. Returns true if the page was modified. |
59 | | * |
60 | | * The caller must hold an exclusive lock on the page. |
61 | | */ |
62 | | bool |
63 | | fsm_set_avail(Page page, int slot, uint8 value) |
64 | 0 | { |
65 | 0 | int nodeno = NonLeafNodesPerPage + slot; |
66 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
67 | 0 | uint8 oldvalue; |
68 | |
|
69 | 0 | Assert(slot < LeafNodesPerPage); |
70 | |
|
71 | 0 | oldvalue = fsmpage->fp_nodes[nodeno]; |
72 | | |
73 | | /* If the value hasn't changed, we don't need to do anything */ |
74 | 0 | if (oldvalue == value && value <= fsmpage->fp_nodes[0]) |
75 | 0 | return false; |
76 | | |
77 | 0 | fsmpage->fp_nodes[nodeno] = value; |
78 | | |
79 | | /* |
80 | | * Propagate up, until we hit the root or a node that doesn't need to be |
81 | | * updated. |
82 | | */ |
83 | 0 | do |
84 | 0 | { |
85 | 0 | uint8 newvalue = 0; |
86 | 0 | int lchild; |
87 | 0 | int rchild; |
88 | |
|
89 | 0 | nodeno = parentof(nodeno); |
90 | 0 | lchild = leftchild(nodeno); |
91 | 0 | rchild = lchild + 1; |
92 | |
|
93 | 0 | newvalue = fsmpage->fp_nodes[lchild]; |
94 | 0 | if (rchild < NodesPerPage) |
95 | 0 | newvalue = Max(newvalue, |
96 | 0 | fsmpage->fp_nodes[rchild]); |
97 | |
|
98 | 0 | oldvalue = fsmpage->fp_nodes[nodeno]; |
99 | 0 | if (oldvalue == newvalue) |
100 | 0 | break; |
101 | | |
102 | 0 | fsmpage->fp_nodes[nodeno] = newvalue; |
103 | 0 | } while (nodeno > 0); |
104 | | |
105 | | /* |
106 | | * sanity check: if the new value is (still) higher than the value at the |
107 | | * top, the tree is corrupt. If so, rebuild. |
108 | | */ |
109 | 0 | if (value > fsmpage->fp_nodes[0]) |
110 | 0 | fsm_rebuild_page(page); |
111 | |
|
112 | 0 | return true; |
113 | 0 | } |
114 | | |
115 | | /* |
116 | | * Returns the value of given slot on page. |
117 | | * |
118 | | * Since this is just a read-only access of a single byte, the page doesn't |
119 | | * need to be locked. |
120 | | */ |
121 | | uint8 |
122 | | fsm_get_avail(Page page, int slot) |
123 | 0 | { |
124 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
125 | |
|
126 | 0 | Assert(slot < LeafNodesPerPage); |
127 | |
|
128 | 0 | return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; |
129 | 0 | } |
130 | | |
131 | | /* |
132 | | * Returns the value at the root of a page. |
133 | | * |
134 | | * Since this is just a read-only access of a single byte, the page doesn't |
135 | | * need to be locked. |
136 | | */ |
137 | | uint8 |
138 | | fsm_get_max_avail(Page page) |
139 | 0 | { |
140 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
141 | |
|
142 | 0 | return fsmpage->fp_nodes[0]; |
143 | 0 | } |
144 | | |
145 | | /* |
146 | | * Searches for a slot with category at least minvalue. |
147 | | * Returns slot number, or -1 if none found. |
148 | | * |
149 | | * The caller must hold at least a shared lock on the page, and this |
150 | | * function can unlock and lock the page again in exclusive mode if it |
151 | | * needs to be updated. exclusive_lock_held should be set to true if the |
152 | | * caller is already holding an exclusive lock, to avoid extra work. |
153 | | * |
154 | | * If advancenext is false, fp_next_slot is set to point to the returned |
155 | | * slot, and if it's true, to the slot after the returned slot. |
156 | | */ |
157 | | int |
158 | | fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, |
159 | | bool exclusive_lock_held) |
160 | 0 | { |
161 | 0 | Page page = BufferGetPage(buf); |
162 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
163 | 0 | int nodeno; |
164 | 0 | int target; |
165 | 0 | uint16 slot; |
166 | |
|
167 | 0 | restart: |
168 | | |
169 | | /* |
170 | | * Check the root first, and exit quickly if there's no leaf with enough |
171 | | * free space |
172 | | */ |
173 | 0 | if (fsmpage->fp_nodes[0] < minvalue) |
174 | 0 | return -1; |
175 | | |
176 | | /* |
177 | | * Start search using fp_next_slot. It's just a hint, so check that it's |
178 | | * sane. (This also handles wrapping around when the prior call returned |
179 | | * the last slot on the page.) |
180 | | */ |
181 | 0 | target = fsmpage->fp_next_slot; |
182 | 0 | if (target < 0 || target >= LeafNodesPerPage) |
183 | 0 | target = 0; |
184 | 0 | target += NonLeafNodesPerPage; |
185 | | |
186 | | /*---------- |
187 | | * Start the search from the target slot. At every step, move one |
188 | | * node to the right, then climb up to the parent. Stop when we reach |
189 | | * a node with enough free space (as we must, since the root has enough |
190 | | * space). |
191 | | * |
192 | | * The idea is to gradually expand our "search triangle", that is, all |
193 | | * nodes covered by the current node, and to be sure we search to the |
194 | | * right from the start point. At the first step, only the target slot |
195 | | * is examined. When we move up from a left child to its parent, we are |
196 | | * adding the right-hand subtree of that parent to the search triangle. |
197 | | * When we move right then up from a right child, we are dropping the |
198 | | * current search triangle (which we know doesn't contain any suitable |
199 | | * page) and instead looking at the next-larger-size triangle to its |
200 | | * right. So we never look left from our original start point, and at |
201 | | * each step the size of the search triangle doubles, ensuring it takes |
202 | | * only log2(N) work to search N pages. |
203 | | * |
204 | | * The "move right" operation will wrap around if it hits the right edge |
205 | | * of the tree, so the behavior is still good if we start near the right. |
206 | | * Note also that the move-and-climb behavior ensures that we can't end |
207 | | * up on one of the missing nodes at the right of the leaf level. |
208 | | * |
209 | | * For example, consider this tree: |
210 | | * |
211 | | * 7 |
212 | | * 7 6 |
213 | | * 5 7 6 5 |
214 | | * 4 5 5 7 2 6 5 2 |
215 | | * T |
216 | | * |
217 | | * Assume that the target node is the node indicated by the letter T, |
218 | | * and we're searching for a node with value of 6 or higher. The search |
219 | | * begins at T. At the first iteration, we move to the right, then to the |
220 | | * parent, arriving at the rightmost 5. At the second iteration, we move |
221 | | * to the right, wrapping around, then climb up, arriving at the 7 on the |
222 | | * third level. 7 satisfies our search, so we descend down to the bottom, |
223 | | * following the path of sevens. This is in fact the first suitable page |
224 | | * to the right of (allowing for wraparound) our start point. |
225 | | *---------- |
226 | | */ |
227 | 0 | nodeno = target; |
228 | 0 | while (nodeno > 0) |
229 | 0 | { |
230 | 0 | if (fsmpage->fp_nodes[nodeno] >= minvalue) |
231 | 0 | break; |
232 | | |
233 | | /* |
234 | | * Move to the right, wrapping around on same level if necessary, then |
235 | | * climb up. |
236 | | */ |
237 | 0 | nodeno = parentof(rightneighbor(nodeno)); |
238 | 0 | } |
239 | | |
240 | | /* |
241 | | * We're now at a node with enough free space, somewhere in the middle of |
242 | | * the tree. Descend to the bottom, following a path with enough free |
243 | | * space, preferring to move left if there's a choice. |
244 | | */ |
245 | 0 | while (nodeno < NonLeafNodesPerPage) |
246 | 0 | { |
247 | 0 | int childnodeno = leftchild(nodeno); |
248 | |
|
249 | 0 | if (childnodeno < NodesPerPage && |
250 | 0 | fsmpage->fp_nodes[childnodeno] >= minvalue) |
251 | 0 | { |
252 | 0 | nodeno = childnodeno; |
253 | 0 | continue; |
254 | 0 | } |
255 | 0 | childnodeno++; /* point to right child */ |
256 | 0 | if (childnodeno < NodesPerPage && |
257 | 0 | fsmpage->fp_nodes[childnodeno] >= minvalue) |
258 | 0 | { |
259 | 0 | nodeno = childnodeno; |
260 | 0 | } |
261 | 0 | else |
262 | 0 | { |
263 | | /* |
264 | | * Oops. The parent node promised that either left or right child |
265 | | * has enough space, but neither actually did. This can happen in |
266 | | * case of a "torn page", IOW if we crashed earlier while writing |
267 | | * the page to disk, and only part of the page made it to disk. |
268 | | * |
269 | | * Fix the corruption and restart. |
270 | | */ |
271 | 0 | RelFileLocator rlocator; |
272 | 0 | ForkNumber forknum; |
273 | 0 | BlockNumber blknum; |
274 | |
|
275 | 0 | BufferGetTag(buf, &rlocator, &forknum, &blknum); |
276 | 0 | elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", |
277 | 0 | blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber); |
278 | | |
279 | | /* make sure we hold an exclusive lock */ |
280 | 0 | if (!exclusive_lock_held) |
281 | 0 | { |
282 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
283 | 0 | LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
284 | 0 | exclusive_lock_held = true; |
285 | 0 | } |
286 | 0 | fsm_rebuild_page(page); |
287 | 0 | MarkBufferDirtyHint(buf, false); |
288 | 0 | goto restart; |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | | /* We're now at the bottom level, at a node with enough space. */ |
293 | 0 | slot = nodeno - NonLeafNodesPerPage; |
294 | | |
295 | | /* |
296 | | * Update the next-target pointer. Note that we do this even if we're only |
297 | | * holding a shared lock, on the grounds that it's better to use a shared |
298 | | * lock and get a garbled next pointer every now and then, than take the |
299 | | * concurrency hit of an exclusive lock. |
300 | | * |
301 | | * Wrap-around is handled at the beginning of this function. |
302 | | */ |
303 | 0 | fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); |
304 | |
|
305 | 0 | return slot; |
306 | 0 | } |
307 | | |
308 | | /* |
309 | | * Sets the available space to zero for all slots numbered >= nslots. |
310 | | * Returns true if the page was modified. |
311 | | */ |
312 | | bool |
313 | | fsm_truncate_avail(Page page, int nslots) |
314 | 0 | { |
315 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
316 | 0 | uint8 *ptr; |
317 | 0 | bool changed = false; |
318 | |
|
319 | 0 | Assert(nslots >= 0 && nslots < LeafNodesPerPage); |
320 | | |
321 | | /* Clear all truncated leaf nodes */ |
322 | 0 | ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; |
323 | 0 | for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) |
324 | 0 | { |
325 | 0 | if (*ptr != 0) |
326 | 0 | changed = true; |
327 | 0 | *ptr = 0; |
328 | 0 | } |
329 | | |
330 | | /* Fix upper nodes. */ |
331 | 0 | if (changed) |
332 | 0 | fsm_rebuild_page(page); |
333 | |
|
334 | 0 | return changed; |
335 | 0 | } |
336 | | |
337 | | /* |
338 | | * Reconstructs the upper levels of a page. Returns true if the page |
339 | | * was modified. |
340 | | */ |
341 | | bool |
342 | | fsm_rebuild_page(Page page) |
343 | 0 | { |
344 | 0 | FSMPage fsmpage = (FSMPage) PageGetContents(page); |
345 | 0 | bool changed = false; |
346 | 0 | int nodeno; |
347 | | |
348 | | /* |
349 | | * Start from the lowest non-leaf level, at last node, working our way |
350 | | * backwards, through all non-leaf nodes at all levels, up to the root. |
351 | | */ |
352 | 0 | for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) |
353 | 0 | { |
354 | 0 | int lchild = leftchild(nodeno); |
355 | 0 | int rchild = lchild + 1; |
356 | 0 | uint8 newvalue = 0; |
357 | | |
358 | | /* The first few nodes we examine might have zero or one child. */ |
359 | 0 | if (lchild < NodesPerPage) |
360 | 0 | newvalue = fsmpage->fp_nodes[lchild]; |
361 | |
|
362 | 0 | if (rchild < NodesPerPage) |
363 | 0 | newvalue = Max(newvalue, |
364 | 0 | fsmpage->fp_nodes[rchild]); |
365 | |
|
366 | 0 | if (fsmpage->fp_nodes[nodeno] != newvalue) |
367 | 0 | { |
368 | 0 | fsmpage->fp_nodes[nodeno] = newvalue; |
369 | 0 | changed = true; |
370 | 0 | } |
371 | 0 | } |
372 | |
|
373 | 0 | return changed; |
374 | 0 | } |