/src/postgres/src/backend/access/transam/multixact.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * multixact.c |
4 | | * PostgreSQL multi-transaction-log manager |
5 | | * |
6 | | * The pg_multixact manager is a pg_xact-like manager that stores an array of |
7 | | * MultiXactMember for each MultiXactId. It is a fundamental part of the |
8 | | * shared-row-lock implementation. Each MultiXactMember is comprised of a |
9 | | * TransactionId and a set of flag bits. The name is a bit historical: |
10 | | * originally, a MultiXactId consisted of more than one TransactionId (except |
11 | | * in rare corner cases), hence "multi". Nowadays, however, it's perfectly |
12 | | * legitimate to have MultiXactIds that only include a single Xid. |
13 | | * |
14 | | * The meaning of the flag bits is opaque to this module, but they are mostly |
15 | | * used in heapam.c to identify lock modes that each of the member transactions |
16 | | * is holding on any given tuple. This module just contains support to store |
17 | | * and retrieve the arrays. |
18 | | * |
19 | | * We use two SLRU areas: one stores, for each MultiXactId, the offset at |
20 | | * which its data starts in the other one. This trick allows us to |
21 | | * store variable length arrays of TransactionIds. (We could alternatively |
22 | | * use one area containing counts and TransactionIds, with valid MultiXactId |
23 | | * values pointing at slots containing counts; but that way seems less robust |
24 | | * since it would get completely confused if someone inquired about a bogus |
25 | | * MultiXactId that pointed to an intermediate slot containing an XID.) |
26 | | * |
27 | | * XLOG interactions: this module generates a record whenever a new OFFSETs or |
28 | | * MEMBERs page is initialized to zeroes, as well as an |
29 | | * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined. |
30 | | * This module ignores the WAL rule "write xlog before data," because it |
31 | | * suffices that actions recording a MultiXactId in a heap xmax do follow that |
32 | | * rule. The only way for the MXID to be referenced from any data page is for |
33 | | * heap_lock_tuple() or heap_update() to have put it there, and each generates |
34 | | * an XLOG record that must follow ours. The normal LSN interlock between the |
35 | | * data page and that XLOG record will ensure that our XLOG record reaches |
36 | | * disk first. If the SLRU members/offsets data reaches disk sooner than the |
37 | | * XLOG records, we do not care; after recovery, no xmax will refer to it. On |
38 | | * the flip side, to ensure that all referenced entries _do_ reach disk, this |
39 | | * module's XLOG records completely rebuild the data entered since the last |
40 | | * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk |
41 | | * before each checkpoint is considered complete. |
42 | | * |
43 | | * Like clog.c, and unlike subtrans.c, we have to preserve state across |
44 | | * crashes and ensure that MXID and offset numbering increases monotonically |
45 | | * across a crash. We do this in the same way as it's done for transaction |
46 | | * IDs: the WAL record is guaranteed to contain evidence of every MXID we |
47 | | * could need to worry about, and we just make sure that at the end of |
48 | | * replay, the next-MXID and next-offset counters are at least as large as |
49 | | * anything we saw during replay. |
50 | | * |
51 | | * We are able to remove segments no longer necessary by carefully tracking |
52 | | * each table's used values: during vacuum, any multixact older than a certain |
53 | | * value is removed; the cutoff value is stored in pg_class. The minimum value |
54 | | * across all tables in each database is stored in pg_database, and the global |
55 | | * minimum across all databases is part of pg_control and is kept in shared |
56 | | * memory. Whenever that minimum is advanced, the SLRUs are truncated. |
57 | | * |
58 | | * When new MultiXactId values are to be created, care is taken that the |
59 | | * counter does not fall within the wraparound horizon relative to the global |
60 | | * minimum value. |
61 | | * |
62 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
63 | | * Portions Copyright (c) 1994, Regents of the University of California |
64 | | * |
65 | | * src/backend/access/transam/multixact.c |
66 | | * |
67 | | *------------------------------------------------------------------------- |
68 | | */ |
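
The offsets-into-members scheme described above can be illustrated with a small standalone sketch. This is a toy model only, using plain in-memory arrays and hypothetical "toy_" names; the real lookup path is GetMultiXactIdMembers() further down in this file, which reads SLRU pages and additionally handles caching, locking, and wraparound.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t ToyMultiXactId;
typedef uint32_t ToyOffset;
typedef uint32_t ToyTransactionId;

/* Toy stand-ins for the two SLRU areas. */
static ToyOffset        toy_offsets[16];    /* indexed by MultiXactId */
static ToyTransactionId toy_members[64];    /* flat member storage */

/*
 * Members of multi "m" live in the half-open range
 * [toy_offsets[m], toy_offsets[m + 1]) of the members area; the length is
 * implied by the next multi's starting offset, exactly as in the scheme
 * described above.  (The real code must also cope with the newest multi,
 * whose end is given by the shared nextOffset counter.)
 */
static int
toy_get_members(ToyMultiXactId m, const ToyTransactionId **xids)
{
    *xids = &toy_members[toy_offsets[m]];
    return (int) (toy_offsets[m + 1] - toy_offsets[m]);
}

int
main(void)
{
    const ToyTransactionId *xids;
    int         n;

    toy_offsets[1] = 0;         /* multi 1: members {100, 101} */
    toy_members[0] = 100;
    toy_members[1] = 101;
    toy_offsets[2] = 2;         /* multi 2: member {102} */
    toy_members[2] = 102;
    toy_offsets[3] = 3;

    n = toy_get_members(1, &xids);
    printf("multi 1 has %d members, first xid %u\n", n, (unsigned) xids[0]);
    return 0;
}
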
69 | | #include "postgres.h" |
70 | | |
71 | | #include "access/multixact.h" |
72 | | #include "access/slru.h" |
73 | | #include "access/transam.h" |
74 | | #include "access/twophase.h" |
75 | | #include "access/twophase_rmgr.h" |
76 | | #include "access/xact.h" |
77 | | #include "access/xlog.h" |
78 | | #include "access/xloginsert.h" |
79 | | #include "access/xlogutils.h" |
80 | | #include "commands/dbcommands.h" |
81 | | #include "funcapi.h" |
82 | | #include "lib/ilist.h" |
83 | | #include "miscadmin.h" |
84 | | #include "pg_trace.h" |
85 | | #include "pgstat.h" |
86 | | #include "postmaster/autovacuum.h" |
87 | | #include "storage/pmsignal.h" |
88 | | #include "storage/proc.h" |
89 | | #include "storage/procarray.h" |
90 | | #include "utils/fmgrprotos.h" |
91 | | #include "utils/guc_hooks.h" |
92 | | #include "utils/injection_point.h" |
93 | | #include "utils/memutils.h" |
94 | | |
95 | | |
96 | | /* |
97 | | * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is |
98 | | * used everywhere else in Postgres. |
99 | | * |
100 | | * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, |
101 | | * MultiXact page numbering also wraps around at |
102 | | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at |
103 | | * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need |
104 | | * take no explicit notice of that fact in this module, except when comparing |
105 | | * segment and page numbers in TruncateMultiXact (see |
106 | | * MultiXactOffsetPagePrecedes). |
107 | | */ |
108 | | |
109 | | /* We need four bytes per offset */ |
110 | 0 | #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) |
111 | | |
112 | | static inline int64 |
113 | | MultiXactIdToOffsetPage(MultiXactId multi) |
114 | 0 | { |
115 | 0 | return multi / MULTIXACT_OFFSETS_PER_PAGE; |
116 | 0 | } |
117 | | |
118 | | static inline int |
119 | | MultiXactIdToOffsetEntry(MultiXactId multi) |
120 | 0 | { |
121 | 0 | return multi % MULTIXACT_OFFSETS_PER_PAGE; |
122 | 0 | } |
123 | | |
124 | | static inline int64 |
125 | | MultiXactIdToOffsetSegment(MultiXactId multi) |
126 | 0 | { |
127 | 0 | return MultiXactIdToOffsetPage(multi) / SLRU_PAGES_PER_SEGMENT; |
128 | 0 | } |
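
As a concrete check of the conversions above: with the default 8 kB BLCKSZ a 4-byte MultiXactOffset gives 2048 offsets per page, and with the usual SLRU_PAGES_PER_SEGMENT of 32 (both are assumptions about the build configuration), MultiXactId 100000 lands on page 48, entry 1696, segment 1. A self-contained sketch using local stand-in constants:

#include <assert.h>
#include <stdint.h>

/* Local stand-ins mirroring the usual build-time values (assumptions). */
#define TOY_BLCKSZ                  8192
#define TOY_SLRU_PAGES_PER_SEGMENT  32
#define TOY_OFFSETS_PER_PAGE        (TOY_BLCKSZ / sizeof(uint32_t))     /* 2048 */

int
main(void)
{
    uint32_t    multi = 100000;

    assert(multi / TOY_OFFSETS_PER_PAGE == 48);     /* page */
    assert(multi % TOY_OFFSETS_PER_PAGE == 1696);   /* entry within page */
    assert(multi / TOY_OFFSETS_PER_PAGE / TOY_SLRU_PAGES_PER_SEGMENT == 1);     /* segment */
    return 0;
}
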
129 | | |
130 | | /* |
131 | | * The situation for members is a bit more complex: we store one byte of |
132 | | * additional flag bits for each TransactionId. To do this without getting |
133 | | * into alignment issues, we store four bytes of flags, and then the |
134 | | * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and |
135 | | * groups are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups |
136 | | * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and |
137 | | * performance) trumps space efficiency here. |
138 | | * |
139 | | * Note that the "offset" macros work with byte offset, not array indexes, so |
140 | | * arithmetic must be done using "char *" pointers. |
141 | | */ |
142 | | /* We need eight bits per xact, so one xact fits in a byte */ |
143 | 0 | #define MXACT_MEMBER_BITS_PER_XACT 8 |
144 | 0 | #define MXACT_MEMBER_FLAGS_PER_BYTE 1 |
145 | 0 | #define MXACT_MEMBER_XACT_BITMASK ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) |
146 | | |
147 | | /* how many full bytes of flags are there in a group? */ |
148 | 0 | #define MULTIXACT_FLAGBYTES_PER_GROUP 4 |
149 | | #define MULTIXACT_MEMBERS_PER_MEMBERGROUP \ |
150 | 0 | (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE) |
151 | | /* size in bytes of a complete group */ |
152 | | #define MULTIXACT_MEMBERGROUP_SIZE \ |
153 | 0 | (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) |
154 | 0 | #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) |
155 | | #define MULTIXACT_MEMBERS_PER_PAGE \ |
156 | 0 | (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) |
157 | | |
158 | | /* |
159 | | * Because the number of items per page is not a divisor of the last item |
160 | | * number (member 0xFFFFFFFF), the last segment does not use the maximum number |
161 | | * of pages, and moreover the last used page therein does not use the same |
162 | | * number of items as previous pages. (Another way to say it is that the |
163 | | * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page |
164 | | * has some empty space after that item.) |
165 | | * |
166 | | * This constant is the number of members in the last page of the last segment. |
167 | | */ |
168 | | #define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \ |
169 | 0 | ((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1)) |
170 | | |
171 | | /* page in which a member is to be found */ |
172 | | static inline int64 |
173 | | MXOffsetToMemberPage(MultiXactOffset offset) |
174 | 0 | { |
175 | 0 | return offset / MULTIXACT_MEMBERS_PER_PAGE; |
176 | 0 | } |
177 | | |
178 | | static inline int64 |
179 | | MXOffsetToMemberSegment(MultiXactOffset offset) |
180 | 0 | { |
181 | 0 | return MXOffsetToMemberPage(offset) / SLRU_PAGES_PER_SEGMENT; |
182 | 0 | } |
183 | | |
184 | | /* Location (byte offset within page) of flag word for a given member */ |
185 | | static inline int |
186 | | MXOffsetToFlagsOffset(MultiXactOffset offset) |
187 | 0 | { |
188 | 0 | MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP; |
189 | 0 | int grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE; |
190 | 0 | int byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE; |
191 | |
|
192 | 0 | return byteoff; |
193 | 0 | } |
194 | | |
195 | | static inline int |
196 | | MXOffsetToFlagsBitShift(MultiXactOffset offset) |
197 | 0 | { |
198 | 0 | int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; |
199 | 0 | int bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT; |
200 | |
|
201 | 0 | return bshift; |
202 | 0 | } |
203 | | |
204 | | /* Location (byte offset within page) of TransactionId of given member */ |
205 | | static inline int |
206 | | MXOffsetToMemberOffset(MultiXactOffset offset) |
207 | 0 | { |
208 | 0 | int member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP; |
209 | |
|
210 | 0 | return MXOffsetToFlagsOffset(offset) + |
211 | 0 | MULTIXACT_FLAGBYTES_PER_GROUP + |
212 | 0 | member_in_group * sizeof(TransactionId); |
213 | 0 | } |
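
A worked example of the group layout, under the same 8 kB page assumption: each group is 20 bytes (4 flag bytes plus four 4-byte XIDs), 409 groups fit on a page, and member offset 5 falls in group 1 of its page, so its flag byte starts at page offset 20, its status occupies bits 8..15 of that flag word, and its XID starts at page offset 28. A standalone sketch with local stand-in constants:

#include <assert.h>
#include <stdint.h>

/* Local stand-ins for the constants above (assumption: 8 kB pages). */
#define TOY_FLAGBYTES_PER_GROUP 4
#define TOY_MEMBERS_PER_GROUP   4
#define TOY_GROUP_SIZE          (sizeof(uint32_t) * TOY_MEMBERS_PER_GROUP + TOY_FLAGBYTES_PER_GROUP)    /* 20 */
#define TOY_GROUPS_PER_PAGE     (8192 / TOY_GROUP_SIZE)     /* 409 */

int
main(void)
{
    uint32_t    offset = 5;                                         /* sixth member overall */
    uint32_t    group = offset / TOY_MEMBERS_PER_GROUP;             /* group 1 */
    uint32_t    flagsoff = (group % TOY_GROUPS_PER_PAGE) * TOY_GROUP_SIZE;  /* byte 20 */
    uint32_t    bshift = (offset % TOY_MEMBERS_PER_GROUP) * 8;      /* bit 8 */
    uint32_t    memberoff = flagsoff + TOY_FLAGBYTES_PER_GROUP +
        (offset % TOY_MEMBERS_PER_GROUP) * sizeof(uint32_t);        /* byte 28 */

    assert(flagsoff == 20 && bshift == 8 && memberoff == 28);
    return 0;
}
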
214 | | |
215 | | /* Multixact members wraparound thresholds. */ |
216 | 0 | #define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 2) |
217 | | #define MULTIXACT_MEMBER_DANGER_THRESHOLD \ |
218 | 0 | (MaxMultiXactOffset - MaxMultiXactOffset / 4) |
219 | | |
220 | | static inline MultiXactId |
221 | | PreviousMultiXactId(MultiXactId multi) |
222 | 0 | { |
223 | 0 | return multi == FirstMultiXactId ? MaxMultiXactId : multi - 1; |
224 | 0 | } |
225 | | |
226 | | /* |
227 | | * Links to shared-memory data structures for MultiXact control |
228 | | */ |
229 | | static SlruCtlData MultiXactOffsetCtlData; |
230 | | static SlruCtlData MultiXactMemberCtlData; |
231 | | |
232 | 0 | #define MultiXactOffsetCtl (&MultiXactOffsetCtlData) |
233 | 0 | #define MultiXactMemberCtl (&MultiXactMemberCtlData) |
234 | | |
235 | | /* |
236 | | * MultiXact state shared across all backends. All this state is protected |
237 | | * by MultiXactGenLock. (We also use SLRU bank's lock of MultiXactOffset and |
238 | | * MultiXactMember to guard accesses to the two sets of SLRU buffers. For |
239 | | * concurrency's sake, we avoid holding more than one of these locks at a |
240 | | * time.) |
241 | | */ |
242 | | typedef struct MultiXactStateData |
243 | | { |
244 | | /* next-to-be-assigned MultiXactId */ |
245 | | MultiXactId nextMXact; |
246 | | |
247 | | /* next-to-be-assigned offset */ |
248 | | MultiXactOffset nextOffset; |
249 | | |
250 | | /* Have we completed multixact startup? */ |
251 | | bool finishedStartup; |
252 | | |
253 | | /* |
254 | | * Oldest multixact that is still potentially referenced by a relation. |
255 | | * Anything older than this should not be consulted. These values are |
256 | | * updated by vacuum. |
257 | | */ |
258 | | MultiXactId oldestMultiXactId; |
259 | | Oid oldestMultiXactDB; |
260 | | |
261 | | /* |
262 | | * Oldest multixact offset that is potentially referenced by a multixact |
263 | | * referenced by a relation. We don't always know this value, so there's |
264 | | * a flag here to indicate whether or not we currently do. |
265 | | */ |
266 | | MultiXactOffset oldestOffset; |
267 | | bool oldestOffsetKnown; |
268 | | |
269 | | /* support for anti-wraparound measures */ |
270 | | MultiXactId multiVacLimit; |
271 | | MultiXactId multiWarnLimit; |
272 | | MultiXactId multiStopLimit; |
273 | | MultiXactId multiWrapLimit; |
274 | | |
275 | | /* support for members anti-wraparound measures */ |
276 | | MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ |
277 | | |
278 | | /* |
279 | | * This is used to sleep until a multixact offset is written when we want |
280 | | * to create the next one. |
281 | | */ |
282 | | ConditionVariable nextoff_cv; |
283 | | |
284 | | /* |
285 | | * Per-backend data starts here. We have two arrays stored in the area |
286 | | * immediately following the MultiXactStateData struct. Each is indexed by |
287 | | * ProcNumber. |
288 | | * |
289 | | * In both arrays, there's a slot for all normal backends |
290 | | * (0..MaxBackends-1) followed by a slot for max_prepared_xacts prepared |
291 | | * transactions. |
292 | | * |
293 | | * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current |
294 | | * transaction(s) could possibly be a member of, or InvalidMultiXactId |
295 | | * when the backend has no live transaction that could possibly be a |
296 | | * member of a MultiXact. Each backend sets its entry to the current |
297 | | * nextMXact counter just before first acquiring a shared lock in a given |
298 | | * transaction, and clears it at transaction end. (This works because only |
299 | | * during or after acquiring a shared lock could an XID possibly become a |
300 | | * member of a MultiXact, and that MultiXact would have to be created |
301 | | * during or after the lock acquisition.) |
302 | | * |
303 | | * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's |
304 | | * current transaction(s) think is potentially live, or InvalidMultiXactId |
305 | | * when not in a transaction or not in a transaction that's paid any |
306 | | * attention to MultiXacts yet. This is computed when first needed in a |
307 | | * given transaction, and cleared at transaction end. We can compute it |
308 | | * as the minimum of the valid OldestMemberMXactId[] entries at the time |
309 | | * we compute it (using nextMXact if none are valid). Each backend is |
310 | | * required not to attempt to access any SLRU data for MultiXactIds older |
311 | | * than its own OldestVisibleMXactId[] setting; this is necessary because |
312 | | * the relevant SLRU data can be concurrently truncated away. |
313 | | * |
314 | | * The oldest valid value among all of the OldestMemberMXactId[] and |
315 | | * OldestVisibleMXactId[] entries is considered by vacuum as the earliest |
316 | | * possible value still having any live member transaction -- OldestMxact. |
317 | | * Any value older than that is typically removed from tuple headers, or |
318 | | * "frozen" via being replaced with a new xmax. VACUUM can sometimes even |
319 | | * remove an individual MultiXact xmax whose value is >= its OldestMxact |
320 | | * cutoff, though typically only when no individual member XID is still |
321 | | * running. See FreezeMultiXactId for full details. |
322 | | * |
323 | | * Whenever VACUUM advances relminmxid, then either its OldestMxact cutoff |
324 | | * or the oldest extant Multi remaining in the table is used as the new |
325 | | * pg_class.relminmxid value (whichever is earlier). The minimum of all |
326 | | * relminmxid values in each database is stored in pg_database.datminmxid. |
327 | | * In turn, the minimum of all of those values is stored in pg_control. |
328 | | * This is used as the truncation point for pg_multixact when unneeded |
329 | | * segments get removed by vac_truncate_clog() during vacuuming. |
330 | | */ |
331 | | MultiXactId perBackendXactIds[FLEXIBLE_ARRAY_MEMBER]; |
332 | | } MultiXactStateData; |
333 | | |
334 | | /* |
335 | | * Size of OldestMemberMXactId and OldestVisibleMXactId arrays. |
336 | | */ |
337 | 0 | #define MaxOldestSlot (MaxBackends + max_prepared_xacts) |
338 | | |
339 | | /* Pointers to the state data in shared memory */ |
340 | | static MultiXactStateData *MultiXactState; |
341 | | static MultiXactId *OldestMemberMXactId; |
342 | | static MultiXactId *OldestVisibleMXactId; |
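
The two pointers above are expected to be set up by this file's shared-memory initialization code (not part of this excerpt) so that they address the two back-to-back arrays living in MultiXactStateData's flexible array member, one slot per ProcNumber. A toy model of that layout, with hypothetical names:

#include <stdint.h>
#include <stdlib.h>

typedef uint32_t ToyMultiXactId;

/* Toy analogue of MultiXactStateData with its trailing per-backend area. */
typedef struct ToyState
{
    ToyMultiXactId nextMXact;
    ToyMultiXactId perBackendXactIds[];     /* 2 * nslots entries */
} ToyState;

int
main(void)
{
    int         nslots = 8;     /* stands in for MaxOldestSlot */
    ToyState   *state = calloc(1, sizeof(ToyState) +
                               2 * nslots * sizeof(ToyMultiXactId));

    if (state == NULL)
        return 1;

    /* The two logical arrays live back to back in the flexible member. */
    ToyMultiXactId *oldestMember = state->perBackendXactIds;
    ToyMultiXactId *oldestVisible = oldestMember + nslots;

    oldestMember[3] = 42;       /* slot for the backend with ProcNumber 3 */
    oldestVisible[3] = 42;

    free(state);
    return 0;
}
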
343 | | |
344 | | |
345 | | /* |
346 | | * Definitions for the backend-local MultiXactId cache. |
347 | | * |
348 | | * We use this cache to store known MultiXacts, so we don't need to go to |
349 | | * SLRU areas every time. |
350 | | * |
351 | | * The cache lasts for the duration of a single transaction, the rationale |
352 | | * for this being that most entries will contain our own TransactionId and |
353 | | * so they will be uninteresting by the time our next transaction starts. |
354 | | * (XXX not clear that this is correct --- other members of the MultiXact |
355 | | * could hang around longer than we did. However, it's not clear what a |
356 | | * better policy for flushing old cache entries would be.) FIXME actually |
357 | | * this is plain wrong now that multixact's may contain update Xids. |
358 | | * |
359 | | * We allocate the cache entries in a memory context that is deleted at |
360 | | * transaction end, so we don't need to do retail freeing of entries. |
361 | | */ |
362 | | typedef struct mXactCacheEnt |
363 | | { |
364 | | MultiXactId multi; |
365 | | int nmembers; |
366 | | dlist_node node; |
367 | | MultiXactMember members[FLEXIBLE_ARRAY_MEMBER]; |
368 | | } mXactCacheEnt; |
369 | | |
370 | 0 | #define MAX_CACHE_ENTRIES 256 |
371 | | static dclist_head MXactCache = DCLIST_STATIC_INIT(MXactCache); |
372 | | static MemoryContext MXactContext = NULL; |
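
The cache declared above holds recently seen multixacts, capped at MAX_CACHE_ENTRIES; the accessors (mXactCacheGetById and friends, declared below and defined later in this file) walk it and hand back palloc'd copies allocated in the transaction-scoped context. The idea, reduced to a toy singly linked list with hypothetical names:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t ToyMultiXactId;
typedef uint32_t ToyTransactionId;

/* Toy analogue of mXactCacheEnt. */
typedef struct ToyCacheEnt
{
    struct ToyCacheEnt *next;
    ToyMultiXactId multi;
    int         nmembers;
    ToyTransactionId xids[4];
} ToyCacheEnt;

/* Return the member count for "multi", or -1 on a cache miss. */
static int
toy_cache_get(const ToyCacheEnt *head, ToyMultiXactId multi,
              const ToyTransactionId **xids)
{
    for (const ToyCacheEnt *e = head; e != NULL; e = e->next)
    {
        if (e->multi == multi)
        {
            *xids = e->xids;
            return e->nmembers;
        }
    }
    *xids = NULL;
    return -1;
}

int
main(void)
{
    ToyCacheEnt entry = {NULL, 7, 2, {100, 101}};
    const ToyTransactionId *xids;

    return toy_cache_get(&entry, 7, &xids) == 2 ? 0 : 1;
}
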
373 | | |
374 | | #ifdef MULTIXACT_DEBUG |
375 | | #define debug_elog2(a,b) elog(a,b) |
376 | | #define debug_elog3(a,b,c) elog(a,b,c) |
377 | | #define debug_elog4(a,b,c,d) elog(a,b,c,d) |
378 | | #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e) |
379 | | #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f) |
380 | | #else |
381 | | #define debug_elog2(a,b) |
382 | | #define debug_elog3(a,b,c) |
383 | | #define debug_elog4(a,b,c,d) |
384 | | #define debug_elog5(a,b,c,d,e) |
385 | | #define debug_elog6(a,b,c,d,e,f) |
386 | | #endif |
387 | | |
388 | | /* internal MultiXactId management */ |
389 | | static void MultiXactIdSetOldestVisible(void); |
390 | | static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
391 | | int nmembers, MultiXactMember *members); |
392 | | static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); |
393 | | |
394 | | /* MultiXact cache management */ |
395 | | static int mxactMemberComparator(const void *arg1, const void *arg2); |
396 | | static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members); |
397 | | static int mXactCacheGetById(MultiXactId multi, MultiXactMember **members); |
398 | | static void mXactCachePut(MultiXactId multi, int nmembers, |
399 | | MultiXactMember *members); |
400 | | |
401 | | static char *mxstatus_to_string(MultiXactStatus status); |
402 | | |
403 | | /* management of SLRU infrastructure */ |
404 | | static int ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog); |
405 | | static int ZeroMultiXactMemberPage(int64 pageno, bool writeXlog); |
406 | | static bool MultiXactOffsetPagePrecedes(int64 page1, int64 page2); |
407 | | static bool MultiXactMemberPagePrecedes(int64 page1, int64 page2); |
408 | | static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, |
409 | | MultiXactOffset offset2); |
410 | | static void ExtendMultiXactOffset(MultiXactId multi); |
411 | | static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); |
412 | | static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, |
413 | | MultiXactOffset start, uint32 distance); |
414 | | static bool SetOffsetVacuumLimit(bool is_startup); |
415 | | static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); |
416 | | static void WriteMZeroPageXlogRec(int64 pageno, uint8 info); |
417 | | static void WriteMTruncateXlogRec(Oid oldestMultiDB, |
418 | | MultiXactId startTruncOff, |
419 | | MultiXactId endTruncOff, |
420 | | MultiXactOffset startTruncMemb, |
421 | | MultiXactOffset endTruncMemb); |
422 | | |
423 | | |
424 | | /* |
425 | | * MultiXactIdCreate |
426 | | * Construct a MultiXactId representing two TransactionIds. |
427 | | * |
428 | | * The two XIDs must be different, or be requesting different statuses. |
429 | | * |
430 | | * NB - we don't worry about our local MultiXactId cache here, because that |
431 | | * is handled by the lower-level routines. |
432 | | */ |
433 | | MultiXactId |
434 | | MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, |
435 | | TransactionId xid2, MultiXactStatus status2) |
436 | 0 | { |
437 | 0 | MultiXactId newMulti; |
438 | 0 | MultiXactMember members[2]; |
439 | |
|
440 | 0 | Assert(TransactionIdIsValid(xid1)); |
441 | 0 | Assert(TransactionIdIsValid(xid2)); |
442 | |
|
443 | 0 | Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2)); |
444 | | |
445 | | /* MultiXactIdSetOldestMember() must have been called already. */ |
446 | 0 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber])); |
447 | | |
448 | | /* |
449 | | * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs |
450 | | * are still running. In typical usage, xid2 will be our own XID and the |
451 | | * caller just did a check on xid1, so it'd be wasted effort. |
452 | | */ |
453 | |
|
454 | 0 | members[0].xid = xid1; |
455 | 0 | members[0].status = status1; |
456 | 0 | members[1].xid = xid2; |
457 | 0 | members[1].status = status2; |
458 | |
|
459 | 0 | newMulti = MultiXactIdCreateFromMembers(2, members); |
460 | |
|
461 | 0 | debug_elog3(DEBUG2, "Create: %s", |
462 | 0 | mxid_to_string(newMulti, 2, members)); |
463 | |
|
464 | 0 | return newMulti; |
465 | 0 | } |
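
For orientation, a hedged, non-compilable fragment of how a caller such as the tuple-locking path in heapam.c is expected to use this function; the variable names here are hypothetical, while the status values and GetCurrentTransactionId() come from multixact.h and xact.h:

    /* Fragment only -- these are backend-internal APIs, not a standalone program. */
    MultiXactIdSetOldestMember();       /* must precede any multi creation */
    new_xmax = MultiXactIdCreate(existing_locker_xid, MultiXactStatusForShare,
                                 GetCurrentTransactionId(), MultiXactStatusForShare);
    /* new_xmax is then stored in the tuple's xmax with HEAP_XMAX_IS_MULTI set. */
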
466 | | |
467 | | /* |
468 | | * MultiXactIdExpand |
469 | | * Add a TransactionId to a pre-existing MultiXactId. |
470 | | * |
471 | | * If the TransactionId is already a member of the passed MultiXactId with the |
472 | | * same status, just return it as-is. |
473 | | * |
474 | | * Note that we do NOT actually modify the membership of a pre-existing |
475 | | * MultiXactId; instead we create a new one. This is necessary to avoid |
476 | | * a race condition against code trying to wait for one MultiXactId to finish; |
477 | | * see notes in heapam.c. |
478 | | * |
479 | | * NB - we don't worry about our local MultiXactId cache here, because that |
480 | | * is handled by the lower-level routines. |
481 | | * |
482 | | * Note: It is critical that MultiXactIds that come from an old cluster (i.e. |
483 | | * one upgraded by pg_upgrade from a cluster older than this feature) are not |
484 | | * passed in. |
485 | | */ |
486 | | MultiXactId |
487 | | MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) |
488 | 0 | { |
489 | 0 | MultiXactId newMulti; |
490 | 0 | MultiXactMember *members; |
491 | 0 | MultiXactMember *newMembers; |
492 | 0 | int nmembers; |
493 | 0 | int i; |
494 | 0 | int j; |
495 | |
|
496 | 0 | Assert(MultiXactIdIsValid(multi)); |
497 | 0 | Assert(TransactionIdIsValid(xid)); |
498 | | |
499 | | /* MultiXactIdSetOldestMember() must have been called already. */ |
500 | 0 | Assert(MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber])); |
501 | |
|
502 | 0 | debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s", |
503 | 0 | multi, xid, mxstatus_to_string(status)); |
504 | | |
505 | | /* |
506 | | * Note: we don't allow for old multis here. The reason is that the only |
507 | | * caller of this function does a check that the multixact is no longer |
508 | | * running. |
509 | | */ |
510 | 0 | nmembers = GetMultiXactIdMembers(multi, &members, false, false); |
511 | |
|
512 | 0 | if (nmembers < 0) |
513 | 0 | { |
514 | 0 | MultiXactMember member; |
515 | | |
516 | | /* |
517 | | * The MultiXactId is obsolete. This can only happen if all the |
518 | | * MultiXactId members stop running between the caller checking and |
519 | | * passing it to us. It would be better to return that fact to the |
520 | | * caller, but it would complicate the API and it's unlikely to happen |
521 | | * too often, so just deal with it by creating a singleton MultiXact. |
522 | | */ |
523 | 0 | member.xid = xid; |
524 | 0 | member.status = status; |
525 | 0 | newMulti = MultiXactIdCreateFromMembers(1, &member); |
526 | |
|
527 | 0 | debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", |
528 | 0 | multi, newMulti); |
529 | 0 | return newMulti; |
530 | 0 | } |
531 | | |
532 | | /* |
533 | | * If the TransactionId is already a member of the MultiXactId with the |
534 | | * same status, just return the existing MultiXactId. |
535 | | */ |
536 | 0 | for (i = 0; i < nmembers; i++) |
537 | 0 | { |
538 | 0 | if (TransactionIdEquals(members[i].xid, xid) && |
539 | 0 | (members[i].status == status)) |
540 | 0 | { |
541 | 0 | debug_elog4(DEBUG2, "Expand: %u is already a member of %u", |
542 | 0 | xid, multi); |
543 | 0 | pfree(members); |
544 | 0 | return multi; |
545 | 0 | } |
546 | 0 | } |
547 | | |
548 | | /* |
549 | | * Determine which of the members of the MultiXactId are still of |
550 | | * interest. This is any running transaction, and also any transaction |
551 | | * that grabbed something stronger than just a lock and was committed. (An |
552 | | * update that aborted is of no interest here; and having more than one |
553 | | * update Xid in a multixact would cause errors elsewhere.) |
554 | | * |
555 | | * Removing dead members is not just an optimization: freezing of tuples |
556 | | * whose Xmax are multis depends on this behavior. |
557 | | * |
558 | | * Note we have the same race condition here as above: j could be 0 at the |
559 | | * end of the loop. |
560 | | */ |
561 | 0 | newMembers = (MultiXactMember *) |
562 | 0 | palloc(sizeof(MultiXactMember) * (nmembers + 1)); |
563 | |
|
564 | 0 | for (i = 0, j = 0; i < nmembers; i++) |
565 | 0 | { |
566 | 0 | if (TransactionIdIsInProgress(members[i].xid) || |
567 | 0 | (ISUPDATE_from_mxstatus(members[i].status) && |
568 | 0 | TransactionIdDidCommit(members[i].xid))) |
569 | 0 | { |
570 | 0 | newMembers[j].xid = members[i].xid; |
571 | 0 | newMembers[j++].status = members[i].status; |
572 | 0 | } |
573 | 0 | } |
574 | |
|
575 | 0 | newMembers[j].xid = xid; |
576 | 0 | newMembers[j++].status = status; |
577 | 0 | newMulti = MultiXactIdCreateFromMembers(j, newMembers); |
578 | |
|
579 | 0 | pfree(members); |
580 | 0 | pfree(newMembers); |
581 | |
|
582 | 0 | debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti); |
583 | |
|
584 | 0 | return newMulti; |
585 | 0 | } |
586 | | |
587 | | /* |
588 | | * MultiXactIdIsRunning |
589 | | * Returns whether a MultiXactId is "running". |
590 | | * |
591 | | * We return true if at least one member of the given MultiXactId is still |
592 | | * running. Note that a "false" result is certain not to change, |
593 | | * because it is not legal to add members to an existing MultiXactId. |
594 | | * |
595 | | * Caller is expected to have verified that the multixact does not come from |
596 | | * a pg_upgraded share-locked tuple. |
597 | | */ |
598 | | bool |
599 | | MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) |
600 | 0 | { |
601 | 0 | MultiXactMember *members; |
602 | 0 | int nmembers; |
603 | 0 | int i; |
604 | |
|
605 | 0 | debug_elog3(DEBUG2, "IsRunning %u?", multi); |
606 | | |
607 | | /* |
608 | | * "false" here means we assume our callers have checked that the given |
609 | | * multi cannot possibly come from a pg_upgraded database. |
610 | | */ |
611 | 0 | nmembers = GetMultiXactIdMembers(multi, &members, false, isLockOnly); |
612 | |
|
613 | 0 | if (nmembers <= 0) |
614 | 0 | { |
615 | 0 | debug_elog2(DEBUG2, "IsRunning: no members"); |
616 | 0 | return false; |
617 | 0 | } |
618 | | |
619 | | /* |
620 | | * Checking for myself is cheap compared to looking in shared memory; |
621 | | * return true if any live subtransaction of the current top-level |
622 | | * transaction is a member. |
623 | | * |
624 | | * This is not needed for correctness, it's just a fast path. |
625 | | */ |
626 | 0 | for (i = 0; i < nmembers; i++) |
627 | 0 | { |
628 | 0 | if (TransactionIdIsCurrentTransactionId(members[i].xid)) |
629 | 0 | { |
630 | 0 | debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i); |
631 | 0 | pfree(members); |
632 | 0 | return true; |
633 | 0 | } |
634 | 0 | } |
635 | | |
636 | | /* |
637 | | * This could be made faster by having another entry point in procarray.c, |
638 | | * walking the PGPROC array only once for all the members. But in most |
639 | | * cases nmembers should be small enough that it doesn't much matter. |
640 | | */ |
641 | 0 | for (i = 0; i < nmembers; i++) |
642 | 0 | { |
643 | 0 | if (TransactionIdIsInProgress(members[i].xid)) |
644 | 0 | { |
645 | 0 | debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", |
646 | 0 | i, members[i].xid); |
647 | 0 | pfree(members); |
648 | 0 | return true; |
649 | 0 | } |
650 | 0 | } |
651 | | |
652 | 0 | pfree(members); |
653 | |
|
654 | 0 | debug_elog3(DEBUG2, "IsRunning: %u is not running", multi); |
655 | |
|
656 | 0 | return false; |
657 | 0 | } |
658 | | |
659 | | /* |
660 | | * MultiXactIdSetOldestMember |
661 | | * Save the oldest MultiXactId this transaction could be a member of. |
662 | | * |
663 | | * We set the OldestMemberMXactId for a given transaction the first time it's |
664 | | * going to do some operation that might require a MultiXactId (tuple lock, |
665 | | * update or delete). We need to do this even if we end up using a |
666 | | * TransactionId instead of a MultiXactId, because there is a chance that |
667 | | * another transaction would add our XID to a MultiXactId. |
668 | | * |
669 | | * The value to set is the next-to-be-assigned MultiXactId, so this is meant to |
670 | | * be called just before doing any such possibly-MultiXactId-able operation. |
671 | | */ |
672 | | void |
673 | | MultiXactIdSetOldestMember(void) |
674 | 0 | { |
675 | 0 | if (!MultiXactIdIsValid(OldestMemberMXactId[MyProcNumber])) |
676 | 0 | { |
677 | 0 | MultiXactId nextMXact; |
678 | | |
679 | | /* |
680 | | * You might think we don't need to acquire a lock here, since |
681 | | * fetching and storing of TransactionIds is probably atomic, but in |
682 | | * fact we do: suppose we pick up nextMXact and then lose the CPU for |
683 | | * a long time. Someone else could advance nextMXact, and then |
684 | | * another someone else could compute an OldestVisibleMXactId that |
685 | | * would be after the value we are going to store when we get control |
686 | | * back. Which would be wrong. |
687 | | * |
688 | | * Note that a shared lock is sufficient, because it's enough to stop |
689 | | * someone from advancing nextMXact; and nobody else could be trying |
690 | | * to write to our OldestMember entry, only reading (and we assume |
691 | | * storing it is atomic.) |
692 | | */ |
693 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
694 | | |
695 | | /* |
696 | | * We have to beware of the possibility that nextMXact is in the |
697 | | * wrapped-around state. We don't fix the counter itself here, but we |
698 | | * must be sure to store a valid value in our array entry. |
699 | | */ |
700 | 0 | nextMXact = MultiXactState->nextMXact; |
701 | 0 | if (nextMXact < FirstMultiXactId) |
702 | 0 | nextMXact = FirstMultiXactId; |
703 | |
|
704 | 0 | OldestMemberMXactId[MyProcNumber] = nextMXact; |
705 | |
|
706 | 0 | LWLockRelease(MultiXactGenLock); |
707 | |
|
708 | 0 | debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u", |
709 | 0 | MyProcNumber, nextMXact); |
710 | 0 | } |
711 | 0 | } |
712 | | |
713 | | /* |
714 | | * MultiXactIdSetOldestVisible |
715 | | * Save the oldest MultiXactId this transaction considers possibly live. |
716 | | * |
717 | | * We set the OldestVisibleMXactId for a given transaction the first time |
718 | | * it's going to inspect any MultiXactId. Once we have set this, we are |
719 | | * guaranteed that SLRU data for MultiXactIds >= our own OldestVisibleMXactId |
720 | | * won't be truncated away. |
721 | | * |
722 | | * The value to set is the oldest of nextMXact and all the valid per-backend |
723 | | * OldestMemberMXactId[] entries. Because of the locking we do, we can be |
724 | | * certain that no subsequent call to MultiXactIdSetOldestMember can set |
725 | | * an OldestMemberMXactId[] entry older than what we compute here. Therefore |
726 | | * there is no live transaction, now or later, that can be a member of any |
727 | | * MultiXactId older than the OldestVisibleMXactId we compute here. |
728 | | */ |
729 | | static void |
730 | | MultiXactIdSetOldestVisible(void) |
731 | 0 | { |
732 | 0 | if (!MultiXactIdIsValid(OldestVisibleMXactId[MyProcNumber])) |
733 | 0 | { |
734 | 0 | MultiXactId oldestMXact; |
735 | 0 | int i; |
736 | |
|
737 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
738 | | |
739 | | /* |
740 | | * We have to beware of the possibility that nextMXact is in the |
741 | | * wrapped-around state. We don't fix the counter itself here, but we |
742 | | * must be sure to store a valid value in our array entry. |
743 | | */ |
744 | 0 | oldestMXact = MultiXactState->nextMXact; |
745 | 0 | if (oldestMXact < FirstMultiXactId) |
746 | 0 | oldestMXact = FirstMultiXactId; |
747 | |
|
748 | 0 | for (i = 0; i < MaxOldestSlot; i++) |
749 | 0 | { |
750 | 0 | MultiXactId thisoldest = OldestMemberMXactId[i]; |
751 | |
|
752 | 0 | if (MultiXactIdIsValid(thisoldest) && |
753 | 0 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
754 | 0 | oldestMXact = thisoldest; |
755 | 0 | } |
756 | |
|
757 | 0 | OldestVisibleMXactId[MyProcNumber] = oldestMXact; |
758 | |
|
759 | 0 | LWLockRelease(MultiXactGenLock); |
760 | |
|
761 | 0 | debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u", |
762 | 0 | MyProcNumber, oldestMXact); |
763 | 0 | } |
764 | 0 | } |
765 | | |
766 | | /* |
767 | | * ReadNextMultiXactId |
768 | | * Return the next MultiXactId to be assigned, but don't allocate it |
769 | | */ |
770 | | MultiXactId |
771 | | ReadNextMultiXactId(void) |
772 | 0 | { |
773 | 0 | MultiXactId mxid; |
774 | | |
775 | | /* XXX we could presumably do this without a lock. */ |
776 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
777 | 0 | mxid = MultiXactState->nextMXact; |
778 | 0 | LWLockRelease(MultiXactGenLock); |
779 | |
|
780 | 0 | if (mxid < FirstMultiXactId) |
781 | 0 | mxid = FirstMultiXactId; |
782 | |
|
783 | 0 | return mxid; |
784 | 0 | } |
785 | | |
786 | | /* |
787 | | * ReadMultiXactIdRange |
788 | | * Get the range of IDs that may still be referenced by a relation. |
789 | | */ |
790 | | void |
791 | | ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next) |
792 | 0 | { |
793 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
794 | 0 | *oldest = MultiXactState->oldestMultiXactId; |
795 | 0 | *next = MultiXactState->nextMXact; |
796 | 0 | LWLockRelease(MultiXactGenLock); |
797 | |
|
798 | 0 | if (*oldest < FirstMultiXactId) |
799 | 0 | *oldest = FirstMultiXactId; |
800 | 0 | if (*next < FirstMultiXactId) |
801 | 0 | *next = FirstMultiXactId; |
802 | 0 | } |
803 | | |
804 | | |
805 | | /* |
806 | | * MultiXactIdCreateFromMembers |
807 | | * Make a new MultiXactId from the specified set of members |
808 | | * |
809 | | * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the |
810 | | * given TransactionIds as members. Returns the newly created MultiXactId. |
811 | | * |
812 | | * NB: the passed members[] array will be sorted in-place. |
813 | | */ |
814 | | MultiXactId |
815 | | MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) |
816 | 0 | { |
817 | 0 | MultiXactId multi; |
818 | 0 | MultiXactOffset offset; |
819 | 0 | xl_multixact_create xlrec; |
820 | |
|
821 | 0 | debug_elog3(DEBUG2, "Create: %s", |
822 | 0 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
823 | | |
824 | | /* |
825 | | * See if the same set of members already exists in our cache; if so, just |
826 | | * re-use that MultiXactId. (Note: it might seem that looking in our |
827 | | * cache is insufficient, and we ought to search disk to see if a |
828 | | * duplicate definition already exists. But since we only ever create |
829 | | * MultiXacts containing our own XID, in most cases any such MultiXacts |
830 | | * were in fact created by us, and so will be in our cache. There are |
831 | | * corner cases where someone else added us to a MultiXact without our |
832 | | * knowledge, but it's not worth checking for.) |
833 | | */ |
834 | 0 | multi = mXactCacheGetBySet(nmembers, members); |
835 | 0 | if (MultiXactIdIsValid(multi)) |
836 | 0 | { |
837 | 0 | debug_elog2(DEBUG2, "Create: in cache!"); |
838 | 0 | return multi; |
839 | 0 | } |
840 | | |
841 | | /* Verify that there is a single update Xid among the given members. */ |
842 | 0 | { |
843 | 0 | int i; |
844 | 0 | bool has_update = false; |
845 | |
|
846 | 0 | for (i = 0; i < nmembers; i++) |
847 | 0 | { |
848 | 0 | if (ISUPDATE_from_mxstatus(members[i].status)) |
849 | 0 | { |
850 | 0 | if (has_update) |
851 | 0 | elog(ERROR, "new multixact has more than one updating member: %s", |
852 | 0 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
853 | 0 | has_update = true; |
854 | 0 | } |
855 | 0 | } |
856 | 0 | } |
857 | | |
858 | | /* Load the injection point before entering the critical section */ |
859 | 0 | INJECTION_POINT_LOAD("multixact-create-from-members"); |
860 | | |
861 | | /* |
862 | | * Assign the MXID and offsets range to use, and make sure there is space |
863 | | * in the OFFSETs and MEMBERs files. NB: this routine does |
864 | | * START_CRIT_SECTION(). |
865 | | * |
866 | | * Note: unlike MultiXactIdCreate and MultiXactIdExpand, we do not check |
867 | | * that we've called MultiXactIdSetOldestMember here. This is because |
868 | | * this routine is used in some places to create new MultiXactIds of which |
869 | | * the current backend is not a member, notably during freezing of multis |
870 | | * in vacuum. During vacuum, in particular, it would be unacceptable to |
871 | | * keep OldestMulti set, in case it runs for long. |
872 | | */ |
873 | 0 | multi = GetNewMultiXactId(nmembers, &offset); |
874 | |
|
875 | 0 | INJECTION_POINT_CACHED("multixact-create-from-members", NULL); |
876 | | |
877 | | /* Make an XLOG entry describing the new MXID. */ |
878 | 0 | xlrec.mid = multi; |
879 | 0 | xlrec.moff = offset; |
880 | 0 | xlrec.nmembers = nmembers; |
881 | | |
882 | | /* |
883 | | * XXX Note: there's a lot of padding space in MultiXactMember. We could |
884 | | * find a more compact representation of this Xlog record -- perhaps all |
885 | | * the status flags in one XLogRecData, then all the xids in another one? |
886 | | * Not clear that it's worth the trouble though. |
887 | | */ |
888 | 0 | XLogBeginInsert(); |
889 | 0 | XLogRegisterData(&xlrec, SizeOfMultiXactCreate); |
890 | 0 | XLogRegisterData(members, nmembers * sizeof(MultiXactMember)); |
891 | |
|
892 | 0 | (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); |
893 | | |
894 | | /* Now enter the information into the OFFSETs and MEMBERs logs */ |
895 | 0 | RecordNewMultiXact(multi, offset, nmembers, members); |
896 | | |
897 | | /* Done with critical section */ |
898 | 0 | END_CRIT_SECTION(); |
899 | | |
900 | | /* Store the new MultiXactId in the local cache, too */ |
901 | 0 | mXactCachePut(multi, nmembers, members); |
902 | |
|
903 | 0 | debug_elog2(DEBUG2, "Create: all done"); |
904 | |
|
905 | 0 | return multi; |
906 | 0 | } |
907 | | |
908 | | /* |
909 | | * RecordNewMultiXact |
910 | | * Write info about a new multixact into the offsets and members files |
911 | | * |
912 | | * This is broken out of MultiXactIdCreateFromMembers so that xlog replay can |
913 | | * use it. |
914 | | */ |
915 | | static void |
916 | | RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, |
917 | | int nmembers, MultiXactMember *members) |
918 | 0 | { |
919 | 0 | int64 pageno; |
920 | 0 | int64 prev_pageno; |
921 | 0 | int entryno; |
922 | 0 | int slotno; |
923 | 0 | MultiXactOffset *offptr; |
924 | 0 | int i; |
925 | 0 | LWLock *lock; |
926 | 0 | LWLock *prevlock = NULL; |
927 | |
|
928 | 0 | pageno = MultiXactIdToOffsetPage(multi); |
929 | 0 | entryno = MultiXactIdToOffsetEntry(multi); |
930 | |
|
931 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
932 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
933 | | |
934 | | /* |
935 | | * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" |
936 | | * to complain about if there's any I/O error. This is kinda bogus, but |
937 | | * since the errors will always give the full pathname, it should be clear |
938 | | * enough that a MultiXactId is really involved. Perhaps someday we'll |
939 | | * take the trouble to generalize the slru.c error reporting code. |
940 | | */ |
941 | 0 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
942 | 0 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
943 | 0 | offptr += entryno; |
944 | |
|
945 | 0 | *offptr = offset; |
946 | |
|
947 | 0 | MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
948 | | |
949 | | /* Release MultiXactOffset SLRU lock. */ |
950 | 0 | LWLockRelease(lock); |
951 | | |
952 | | /* |
953 | | * If anybody was waiting to know the offset of this multixact ID we just |
954 | | * wrote, they can read it now, so wake them up. |
955 | | */ |
956 | 0 | ConditionVariableBroadcast(&MultiXactState->nextoff_cv); |
957 | |
|
958 | 0 | prev_pageno = -1; |
959 | |
|
960 | 0 | for (i = 0; i < nmembers; i++, offset++) |
961 | 0 | { |
962 | 0 | TransactionId *memberptr; |
963 | 0 | uint32 *flagsptr; |
964 | 0 | uint32 flagsval; |
965 | 0 | int bshift; |
966 | 0 | int flagsoff; |
967 | 0 | int memberoff; |
968 | |
|
969 | 0 | Assert(members[i].status <= MultiXactStatusUpdate); |
970 | |
|
971 | 0 | pageno = MXOffsetToMemberPage(offset); |
972 | 0 | memberoff = MXOffsetToMemberOffset(offset); |
973 | 0 | flagsoff = MXOffsetToFlagsOffset(offset); |
974 | 0 | bshift = MXOffsetToFlagsBitShift(offset); |
975 | |
|
976 | 0 | if (pageno != prev_pageno) |
977 | 0 | { |
978 | | /* |
979 | | * The MultiXactMember SLRU page has changed, so check whether the new |
980 | | * page falls into a different SLRU bank; if so, release the old bank's |
981 | | * lock and acquire the lock on the new bank. |
982 | | */ |
983 | 0 | lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); |
984 | 0 | if (lock != prevlock) |
985 | 0 | { |
986 | 0 | if (prevlock != NULL) |
987 | 0 | LWLockRelease(prevlock); |
988 | |
|
989 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
990 | 0 | prevlock = lock; |
991 | 0 | } |
992 | 0 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
993 | 0 | prev_pageno = pageno; |
994 | 0 | } |
995 | |
|
996 | 0 | memberptr = (TransactionId *) |
997 | 0 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
998 | |
|
999 | 0 | *memberptr = members[i].xid; |
1000 | |
|
1001 | 0 | flagsptr = (uint32 *) |
1002 | 0 | (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
1003 | |
|
1004 | 0 | flagsval = *flagsptr; |
1005 | 0 | flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); |
1006 | 0 | flagsval |= (members[i].status << bshift); |
1007 | 0 | *flagsptr = flagsval; |
1008 | |
|
1009 | 0 | MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
1010 | 0 | } |
1011 | |
|
1012 | 0 | if (prevlock != NULL) |
1013 | 0 | LWLockRelease(prevlock); |
1014 | 0 | } |
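
The flag-word manipulation in the member loop above packs one 8-bit status per member into a shared uint32. A standalone check of that bit arithmetic (the status value 3 and the neighbouring bit pattern are arbitrary here):

#include <assert.h>
#include <stdint.h>

#define TOY_BITS_PER_XACT 8

int
main(void)
{
    uint32_t    flagsval = 0xFFFFFFFF;          /* pretend other members' bits are set */
    int         bshift = 2 * TOY_BITS_PER_XACT; /* member 2 of its group */

    /* Clear this member's byte, then store its status, as the loop above does. */
    flagsval &= ~(((1U << TOY_BITS_PER_XACT) - 1) << bshift);
    flagsval |= (3U << bshift);

    assert(((flagsval >> bshift) & ((1U << TOY_BITS_PER_XACT) - 1)) == 3);
    assert((flagsval & 0xFFFFU) == 0xFFFFU);    /* neighbours untouched */
    return 0;
}
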
1015 | | |
1016 | | /* |
1017 | | * GetNewMultiXactId |
1018 | | * Get the next MultiXactId. |
1019 | | * |
1020 | | * Also, reserve the needed amount of space in the "members" area. The |
1021 | | * starting offset of the reserved space is returned in *offset. |
1022 | | * |
1023 | | * This may generate XLOG records for expansion of the offsets and/or members |
1024 | | * files. Unfortunately, we have to do that while holding MultiXactGenLock |
1025 | | * to avoid race conditions --- the XLOG record for zeroing a page must appear |
1026 | | * before any backend can possibly try to store data in that page! |
1027 | | * |
1028 | | * We start a critical section before advancing the shared counters. The |
1029 | | * caller must end the critical section after writing SLRU data. |
1030 | | */ |
1031 | | static MultiXactId |
1032 | | GetNewMultiXactId(int nmembers, MultiXactOffset *offset) |
1033 | 0 | { |
1034 | 0 | MultiXactId result; |
1035 | 0 | MultiXactOffset nextOffset; |
1036 | |
|
1037 | 0 | debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers); |
1038 | | |
1039 | | /* safety check, we should never get this far in a HS standby */ |
1040 | 0 | if (RecoveryInProgress()) |
1041 | 0 | elog(ERROR, "cannot assign MultiXactIds during recovery"); |
1042 | | |
1043 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
1044 | | |
1045 | | /* Handle wraparound of the nextMXact counter */ |
1046 | 0 | if (MultiXactState->nextMXact < FirstMultiXactId) |
1047 | 0 | MultiXactState->nextMXact = FirstMultiXactId; |
1048 | | |
1049 | | /* Assign the MXID */ |
1050 | 0 | result = MultiXactState->nextMXact; |
1051 | | |
1052 | | /*---------- |
1053 | | * Check to see if it's safe to assign another MultiXactId. This protects |
1054 | | * against catastrophic data loss due to multixact wraparound. The basic |
1055 | | * rules are: |
1056 | | * |
1057 | | * If we're past multiVacLimit or the safe threshold for member storage |
1058 | | * space, or we don't know what the safe threshold for member storage is, |
1059 | | * start trying to force autovacuum cycles. |
1060 | | * If we're past multiWarnLimit, start issuing warnings. |
1061 | | * If we're past multiStopLimit, refuse to create new MultiXactIds. |
1062 | | * |
1063 | | * Note these are pretty much the same protections as in GetNewTransactionId. |
1064 | | *---------- |
1065 | | */ |
1066 | 0 | if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) |
1067 | 0 | { |
1068 | | /* |
1069 | | * For safety's sake, we release MultiXactGenLock while sending |
1070 | | * signals, warnings, etc. This is not so much because we care about |
1071 | | * preserving concurrency in this situation, as to avoid any |
1072 | | * possibility of deadlock while doing get_database_name(). First, |
1073 | | * copy all the shared values we'll need in this path. |
1074 | | */ |
1075 | 0 | MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit; |
1076 | 0 | MultiXactId multiStopLimit = MultiXactState->multiStopLimit; |
1077 | 0 | MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit; |
1078 | 0 | Oid oldest_datoid = MultiXactState->oldestMultiXactDB; |
1079 | |
|
1080 | 0 | LWLockRelease(MultiXactGenLock); |
1081 | |
|
1082 | 0 | if (IsUnderPostmaster && |
1083 | 0 | !MultiXactIdPrecedes(result, multiStopLimit)) |
1084 | 0 | { |
1085 | 0 | char *oldest_datname = get_database_name(oldest_datoid); |
1086 | | |
1087 | | /* |
1088 | | * Immediately kick autovacuum into action as we're already in |
1089 | | * ERROR territory. |
1090 | | */ |
1091 | 0 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
1092 | | |
1093 | | /* complain even if that DB has disappeared */ |
1094 | 0 | if (oldest_datname) |
1095 | 0 | ereport(ERROR, |
1096 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
1097 | 0 | errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database \"%s\"", |
1098 | 0 | oldest_datname), |
1099 | 0 | errhint("Execute a database-wide VACUUM in that database.\n" |
1100 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
1101 | 0 | else |
1102 | 0 | ereport(ERROR, |
1103 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
1104 | 0 | errmsg("database is not accepting commands that assign new MultiXactIds to avoid wraparound data loss in database with OID %u", |
1105 | 0 | oldest_datoid), |
1106 | 0 | errhint("Execute a database-wide VACUUM in that database.\n" |
1107 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
1108 | 0 | } |
1109 | | |
1110 | | /* |
1111 | | * To avoid swamping the postmaster with signals, we issue the autovac |
1112 | | * request only once per 64K multis generated. This still gives |
1113 | | * plenty of chances before we get into real trouble. |
1114 | | */ |
1115 | 0 | if (IsUnderPostmaster && (result % 65536) == 0) |
1116 | 0 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
1117 | |
|
1118 | 0 | if (!MultiXactIdPrecedes(result, multiWarnLimit)) |
1119 | 0 | { |
1120 | 0 | char *oldest_datname = get_database_name(oldest_datoid); |
1121 | | |
1122 | | /* complain even if that DB has disappeared */ |
1123 | 0 | if (oldest_datname) |
1124 | 0 | ereport(WARNING, |
1125 | 0 | (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", |
1126 | 0 | "database \"%s\" must be vacuumed before %u more MultiXactIds are used", |
1127 | 0 | multiWrapLimit - result, |
1128 | 0 | oldest_datname, |
1129 | 0 | multiWrapLimit - result), |
1130 | 0 | errhint("Execute a database-wide VACUUM in that database.\n" |
1131 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
1132 | 0 | else |
1133 | 0 | ereport(WARNING, |
1134 | 0 | (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", |
1135 | 0 | "database with OID %u must be vacuumed before %u more MultiXactIds are used", |
1136 | 0 | multiWrapLimit - result, |
1137 | 0 | oldest_datoid, |
1138 | 0 | multiWrapLimit - result), |
1139 | 0 | errhint("Execute a database-wide VACUUM in that database.\n" |
1140 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
1141 | 0 | } |
1142 | | |
1143 | | /* Re-acquire lock and start over */ |
1144 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
1145 | 0 | result = MultiXactState->nextMXact; |
1146 | 0 | if (result < FirstMultiXactId) |
1147 | 0 | result = FirstMultiXactId; |
1148 | 0 | } |
1149 | | |
1150 | | /* Make sure there is room for the MXID in the file. */ |
1151 | 0 | ExtendMultiXactOffset(result); |
1152 | | |
1153 | | /* |
1154 | | * Reserve the members space, similarly to above. Also, be careful not to |
1155 | | * return zero as the starting offset for any multixact. See |
1156 | | * GetMultiXactIdMembers() for motivation. |
1157 | | */ |
1158 | 0 | nextOffset = MultiXactState->nextOffset; |
1159 | 0 | if (nextOffset == 0) |
1160 | 0 | { |
1161 | 0 | *offset = 1; |
1162 | 0 | nmembers++; /* allocate member slot 0 too */ |
1163 | 0 | } |
1164 | 0 | else |
1165 | 0 | *offset = nextOffset; |
1166 | | |
1167 | | /*---------- |
1168 | | * Protect against overrun of the members space as well, with the |
1169 | | * following rules: |
1170 | | * |
1171 | | * If we're past offsetStopLimit, refuse to generate more multis. |
1172 | | * If we're close to offsetStopLimit, emit a warning. |
1173 | | * |
1174 | | * Arbitrarily, we start emitting warnings when we're 20 segments or less |
1175 | | * from offsetStopLimit. |
1176 | | * |
1177 | | * Note we haven't updated the shared state yet, so if we fail at this |
1178 | | * point, the multixact ID we grabbed can still be used by the next guy. |
1179 | | * |
1180 | | * Note that there is no point in forcing autovacuum runs here: the |
1181 | | * multixact freeze settings would have to be reduced for that to have any |
1182 | | * effect. |
1183 | | *---------- |
1184 | | */ |
1185 | 0 | #define OFFSET_WARN_SEGMENTS 20 |
1186 | 0 | if (MultiXactState->oldestOffsetKnown && |
1187 | 0 | MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, |
1188 | 0 | nmembers)) |
1189 | 0 | { |
1190 | | /* see comment in the corresponding offsets wraparound case */ |
1191 | 0 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
1192 | |
|
1193 | 0 | ereport(ERROR, |
1194 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
1195 | 0 | errmsg("multixact \"members\" limit exceeded"), |
1196 | 0 | errdetail_plural("This command would create a multixact with %u members, but the remaining space is only enough for %u member.", |
1197 | 0 | "This command would create a multixact with %u members, but the remaining space is only enough for %u members.", |
1198 | 0 | MultiXactState->offsetStopLimit - nextOffset - 1, |
1199 | 0 | nmembers, |
1200 | 0 | MultiXactState->offsetStopLimit - nextOffset - 1), |
1201 | 0 | errhint("Execute a database-wide VACUUM in database with OID %u with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings.", |
1202 | 0 | MultiXactState->oldestMultiXactDB))); |
1203 | 0 | } |
1204 | | |
1205 | | /* |
1206 | | * Check whether we should kick autovacuum into action, to prevent members |
1207 | | * wraparound. NB we use a much larger window to trigger autovacuum than |
1208 | | * just the warning limit. The warning is just a measure of last resort - |
1209 | | * this is in line with GetNewTransactionId's behaviour. |
1210 | | */ |
1211 | 0 | if (!MultiXactState->oldestOffsetKnown || |
1212 | 0 | (MultiXactState->nextOffset - MultiXactState->oldestOffset |
1213 | 0 | > MULTIXACT_MEMBER_SAFE_THRESHOLD)) |
1214 | 0 | { |
1215 | | /* |
1216 | | * To avoid swamping the postmaster with signals, we issue the autovac |
1217 | | * request only when crossing a segment boundary. With default |
1218 | | * compilation settings that's roughly after 50k members. This still |
1219 | | * gives plenty of chances before we get into real trouble. |
1220 | | */ |
1221 | 0 | if ((MXOffsetToMemberPage(nextOffset) / SLRU_PAGES_PER_SEGMENT) != |
1222 | 0 | (MXOffsetToMemberPage(nextOffset + nmembers) / SLRU_PAGES_PER_SEGMENT)) |
1223 | 0 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
1224 | 0 | } |
1225 | |
|
1226 | 0 | if (MultiXactState->oldestOffsetKnown && |
1227 | 0 | MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, |
1228 | 0 | nextOffset, |
1229 | 0 | nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) |
1230 | 0 | ereport(WARNING, |
1231 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
1232 | 0 | errmsg_plural("database with OID %u must be vacuumed before %d more multixact member is used", |
1233 | 0 | "database with OID %u must be vacuumed before %d more multixact members are used", |
1234 | 0 | MultiXactState->offsetStopLimit - nextOffset + nmembers, |
1235 | 0 | MultiXactState->oldestMultiXactDB, |
1236 | 0 | MultiXactState->offsetStopLimit - nextOffset + nmembers), |
1237 | 0 | errhint("Execute a database-wide VACUUM in that database with reduced \"vacuum_multixact_freeze_min_age\" and \"vacuum_multixact_freeze_table_age\" settings."))); |
1238 | | |
1239 | 0 | ExtendMultiXactMember(nextOffset, nmembers); |
1240 | | |
1241 | | /* |
1242 | | * Critical section from here until caller has written the data into the |
1243 | | * just-reserved SLRU space; we don't want to error out with a partly |
1244 | | * written MultiXact structure. (In particular, failing to write our |
1245 | | * start offset after advancing nextMXact would effectively corrupt the |
1246 | | * previous MultiXact.) |
1247 | | */ |
1248 | 0 | START_CRIT_SECTION(); |
1249 | | |
1250 | | /* |
1251 | | * Advance counters. As in GetNewTransactionId(), this must not happen |
1252 | | * until after file extension has succeeded! |
1253 | | * |
1254 | | * We don't care about MultiXactId wraparound here; it will be handled by |
1255 | | * the next iteration. But note that nextMXact may be InvalidMultiXactId |
1256 | | * or the first value on a segment-beginning page after this routine |
1257 | | * exits, so anyone else looking at the variable must be prepared to deal |
1258 | | * with either case. Similarly, nextOffset may be zero, but we won't use |
1259 | | * that as the actual start offset of the next multixact. |
1260 | | */ |
1261 | 0 | (MultiXactState->nextMXact)++; |
1262 | |
|
1263 | 0 | MultiXactState->nextOffset += nmembers; |
1264 | |
|
1265 | 0 | LWLockRelease(MultiXactGenLock); |
1266 | |
|
1267 | 0 | debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset); |
1268 | 0 | return result; |
1269 | 0 | } |
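/*
 * A minimal sketch of what the caveat above demands of readers of these
 * counters (an illustration, not code from this file): a wrapped nextMXact
 * must be normalized before doing arithmetic with it, much as
 * GetOldestMultiXactId() does further below:
 *
 *     MultiXactId next = MultiXactState->nextMXact;
 *     if (next < FirstMultiXactId)
 *         next = FirstMultiXactId;
 *
 * The same caution applies to nextOffset, which may legitimately read as
 * zero here even though zero is never used as an actual start offset.
 */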
1270 | | |
1271 | | /* |
1272 | | * GetMultiXactIdMembers |
1273 | | * Return the set of MultiXactMembers that make up a MultiXactId |
1274 | | * |
1275 | | * Return value is the number of members found, or -1 if there are none, |
1276 | | * and *members is set to a newly palloc'ed array of members. It's the |
1277 | | * caller's responsibility to free it when done with it. |
1278 | | * |
1279 | | * from_pgupgrade must be passed as true if and only if the multixact |
1280 | | * corresponds to a value from a tuple that was locked in a 9.2-or-older |
1281 | | * installation and later pg_upgrade'd (that is, the infomask is |
1282 | | * HEAP_LOCKED_UPGRADED). In this case, we know for certain that no members |
1283 | | * can still be running, so we return -1 just like for an empty multixact |
1284 | | * without any further checking. It would be wrong to try to resolve such a |
1285 | | * multixact: either the multixact is within the current valid multixact |
1286 | | * range, in which case the returned result would be bogus, or outside that |
1287 | | * range, in which case an error would be raised. |
1288 | | * |
1289 | | * In all other cases, the passed multixact must be within the known valid |
1290 | | * range, that is, greater than or equal to oldestMultiXactId, and less than |
1291 | | * nextMXact. Otherwise, an error is raised. |
1292 | | * |
1293 | | * isLockOnly must be set to true if the caller is certain that the given multi |
1294 | | * is used only to lock tuples; it can be false without loss of correctness, |
1295 | | * but passing true means we can return quickly without checking for |
1296 | | * old updates. |
1297 | | */ |
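/*
 * A hypothetical usage sketch of the contract described above (the caller
 * shown here is an assumption, not code from this file): a negative return
 * means there is nothing to resolve, and a non-negative return hands back a
 * palloc'd array that the caller must free.
 *
 *     MultiXactMember *members;
 *     int         nmembers;
 *
 *     nmembers = GetMultiXactIdMembers(multi, &members, false, false);
 *     if (nmembers >= 0)
 *     {
 *         for (int i = 0; i < nmembers; i++)
 *             elog(DEBUG2, "xid %u status %d",
 *                  members[i].xid, (int) members[i].status);
 *         pfree(members);
 *     }
 */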
1298 | | int |
1299 | | GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, |
1300 | | bool from_pgupgrade, bool isLockOnly) |
1301 | 0 | { |
1302 | 0 | int64 pageno; |
1303 | 0 | int64 prev_pageno; |
1304 | 0 | int entryno; |
1305 | 0 | int slotno; |
1306 | 0 | MultiXactOffset *offptr; |
1307 | 0 | MultiXactOffset offset; |
1308 | 0 | int length; |
1309 | 0 | int truelength; |
1310 | 0 | MultiXactId oldestMXact; |
1311 | 0 | MultiXactId nextMXact; |
1312 | 0 | MultiXactId tmpMXact; |
1313 | 0 | MultiXactOffset nextOffset; |
1314 | 0 | MultiXactMember *ptr; |
1315 | 0 | LWLock *lock; |
1316 | 0 | bool slept = false; |
1317 | |
|
1318 | 0 | debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); |
1319 | |
|
1320 | 0 | if (!MultiXactIdIsValid(multi) || from_pgupgrade) |
1321 | 0 | { |
1322 | 0 | *members = NULL; |
1323 | 0 | return -1; |
1324 | 0 | } |
1325 | | |
1326 | | /* See if the MultiXactId is in the local cache */ |
1327 | 0 | length = mXactCacheGetById(multi, members); |
1328 | 0 | if (length >= 0) |
1329 | 0 | { |
1330 | 0 | debug_elog3(DEBUG2, "GetMembers: found %s in the cache", |
1331 | 0 | mxid_to_string(multi, length, *members)); |
1332 | 0 | return length; |
1333 | 0 | } |
1334 | | |
1335 | | /* Set our OldestVisibleMXactId[] entry if we didn't already */ |
1336 | 0 | MultiXactIdSetOldestVisible(); |
1337 | | |
1338 | | /* |
1339 | | * If we know the multi is used only for locking and not for updates, then |
1340 | | * we can skip checking if the value is older than our oldest visible |
1341 | | * multi. It cannot possibly still be running. |
1342 | | */ |
1343 | 0 | if (isLockOnly && |
1344 | 0 | MultiXactIdPrecedes(multi, OldestVisibleMXactId[MyProcNumber])) |
1345 | 0 | { |
1346 | 0 | debug_elog2(DEBUG2, "GetMembers: a locker-only multi is too old"); |
1347 | 0 | *members = NULL; |
1348 | 0 | return -1; |
1349 | 0 | } |
1350 | | |
1351 | | /* |
1352 | | * We check known limits on MultiXact before resorting to the SLRU area. |
1353 | | * |
1354 | | * An ID older than MultiXactState->oldestMultiXactId cannot possibly be |
1355 | | * useful; it has already been removed, or will be removed shortly, by |
1356 | | * truncation. If one is passed, an error is raised. |
1357 | | * |
1358 | | * Also, an ID >= nextMXact shouldn't ever be seen here; if it is seen, it |
1359 | | * implies undetected ID wraparound has occurred. This raises a hard |
1360 | | * error. |
1361 | | * |
1362 | | * Shared lock is enough here since we aren't modifying any global state. |
1363 | | * Acquire it just long enough to grab the current counter values. We may |
1364 | | * need both nextMXact and nextOffset; see below. |
1365 | | */ |
1366 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
1367 | |
|
1368 | 0 | oldestMXact = MultiXactState->oldestMultiXactId; |
1369 | 0 | nextMXact = MultiXactState->nextMXact; |
1370 | 0 | nextOffset = MultiXactState->nextOffset; |
1371 | |
|
1372 | 0 | LWLockRelease(MultiXactGenLock); |
1373 | |
|
1374 | 0 | if (MultiXactIdPrecedes(multi, oldestMXact)) |
1375 | 0 | ereport(ERROR, |
1376 | 0 | (errcode(ERRCODE_INTERNAL_ERROR), |
1377 | 0 | errmsg("MultiXactId %u no longer exists -- apparent wraparound", |
1378 | 0 | multi))); |
1379 | | |
1380 | 0 | if (!MultiXactIdPrecedes(multi, nextMXact)) |
1381 | 0 | ereport(ERROR, |
1382 | 0 | (errcode(ERRCODE_INTERNAL_ERROR), |
1383 | 0 | errmsg("MultiXactId %u has not been created yet -- apparent wraparound", |
1384 | 0 | multi))); |
1385 | | |
1386 | | /* |
1387 | | * Find out the offset at which we need to start reading MultiXactMembers |
1388 | | * and the number of members in the multixact. We determine the latter as |
1389 | | * the difference between this multixact's starting offset and the next |
1390 | | * one's. However, there are some corner cases to worry about: |
1391 | | * |
1392 | | * 1. This multixact may be the latest one created, in which case there is |
1393 | | * no next one to look at. In this case the nextOffset value we just |
1394 | | * saved is the correct endpoint. |
1395 | | * |
1396 | | * 2. The next multixact may still be in process of being filled in: that |
1397 | | * is, another process may have done GetNewMultiXactId but not yet written |
1398 | | * the offset entry for that ID. In that scenario, it is guaranteed that |
1399 | | * the offset entry for that multixact exists (because GetNewMultiXactId |
1400 | | * won't release MultiXactGenLock until it does) but contains zero |
1401 | | * (because we are careful to pre-zero offset pages). Because |
1402 | | * GetNewMultiXactId will never return zero as the starting offset for a |
1403 | | * multixact, when we read zero as the next multixact's offset, we know we |
1404 | | * have this case. We handle this by sleeping on the condition variable |
1405 | | * we have just for this; the process in charge will signal the CV as soon |
1406 | | * as it has finished writing the multixact offset. |
1407 | | * |
1408 | | * 3. Because GetNewMultiXactId increments offset zero to offset one to |
1409 | | * handle case #2, there is an ambiguity near the point of offset |
1410 | | * wraparound. If we see next multixact's offset is one, is that our |
1411 | | * multixact's actual endpoint, or did it end at zero with a subsequent |
1412 | | * increment? We handle this using the knowledge that if the zero'th |
1413 | | * member slot wasn't filled, it'll contain zero, and zero isn't a valid |
1414 | | * transaction ID so it can't be a multixact member. Therefore, if we |
1415 | | * read a zero from the members array, just ignore it. |
1416 | | * |
1417 | | * This is all pretty messy, but the mess occurs only in infrequent corner |
1418 | | * cases, so it seems better than holding the MultiXactGenLock for a long |
1419 | | * time on every multixact creation. |
1420 | | */ |
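/*
 * A worked example of corner case 3 with illustrative numbers: suppose this
 * multixact starts at offset 4294967290 and has 6 members, occupying offsets
 * 4294967290..4294967295.  The next multixact cannot start at offset 0, so
 * GetNewMultiXactId assigns it offset 1.  The length computed below is then
 * 1 - 4294967290 = 7 under unsigned wraparound, and the member loop visits
 * the unused slot at offset 0, finds an invalid (zero) XID there, and skips
 * it, leaving the correct 6 members.
 */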
1421 | 0 | retry: |
1422 | 0 | pageno = MultiXactIdToOffsetPage(multi); |
1423 | 0 | entryno = MultiXactIdToOffsetEntry(multi); |
1424 | | |
1425 | | /* Acquire the bank lock for the page we need. */ |
1426 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
1427 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
1428 | |
|
1429 | 0 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); |
1430 | 0 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
1431 | 0 | offptr += entryno; |
1432 | 0 | offset = *offptr; |
1433 | |
|
1434 | 0 | Assert(offset != 0); |
1435 | | |
1436 | | /* |
1437 | | * Use the same increment rule as GetNewMultiXactId(), that is, don't |
1438 | | * handle wraparound explicitly until needed. |
1439 | | */ |
1440 | 0 | tmpMXact = multi + 1; |
1441 | |
|
1442 | 0 | if (nextMXact == tmpMXact) |
1443 | 0 | { |
1444 | | /* Corner case 1: there is no next multixact */ |
1445 | 0 | length = nextOffset - offset; |
1446 | 0 | } |
1447 | 0 | else |
1448 | 0 | { |
1449 | 0 | MultiXactOffset nextMXOffset; |
1450 | | |
1451 | | /* handle wraparound if needed */ |
1452 | 0 | if (tmpMXact < FirstMultiXactId) |
1453 | 0 | tmpMXact = FirstMultiXactId; |
1454 | |
|
1455 | 0 | prev_pageno = pageno; |
1456 | |
|
1457 | 0 | pageno = MultiXactIdToOffsetPage(tmpMXact); |
1458 | 0 | entryno = MultiXactIdToOffsetEntry(tmpMXact); |
1459 | |
|
1460 | 0 | if (pageno != prev_pageno) |
1461 | 0 | { |
1462 | 0 | LWLock *newlock; |
1463 | | |
1464 | | /* |
1465 | | * Since we're going to access a different SLRU page, if this page |
1466 | | * falls under a different bank, release the old bank's lock and |
1467 | | * acquire the lock of the new bank. |
1468 | | */ |
1469 | 0 | newlock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
1470 | 0 | if (newlock != lock) |
1471 | 0 | { |
1472 | 0 | LWLockRelease(lock); |
1473 | 0 | LWLockAcquire(newlock, LW_EXCLUSIVE); |
1474 | 0 | lock = newlock; |
1475 | 0 | } |
1476 | 0 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); |
1477 | 0 | } |
1478 | |
|
1479 | 0 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
1480 | 0 | offptr += entryno; |
1481 | 0 | nextMXOffset = *offptr; |
1482 | |
|
1483 | 0 | if (nextMXOffset == 0) |
1484 | 0 | { |
1485 | | /* Corner case 2: next multixact is still being filled in */ |
1486 | 0 | LWLockRelease(lock); |
1487 | 0 | CHECK_FOR_INTERRUPTS(); |
1488 | |
|
1489 | 0 | INJECTION_POINT("multixact-get-members-cv-sleep", NULL); |
1490 | |
|
1491 | 0 | ConditionVariableSleep(&MultiXactState->nextoff_cv, |
1492 | 0 | WAIT_EVENT_MULTIXACT_CREATION); |
1493 | 0 | slept = true; |
1494 | 0 | goto retry; |
1495 | 0 | } |
1496 | | |
1497 | 0 | length = nextMXOffset - offset; |
1498 | 0 | } |
1499 | | |
1500 | 0 | LWLockRelease(lock); |
1501 | 0 | lock = NULL; |
1502 | | |
1503 | | /* |
1504 | | * If we slept above, clean up state; it's no longer needed. |
1505 | | */ |
1506 | 0 | if (slept) |
1507 | 0 | ConditionVariableCancelSleep(); |
1508 | |
|
1509 | 0 | ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); |
1510 | |
|
1511 | 0 | truelength = 0; |
1512 | 0 | prev_pageno = -1; |
1513 | 0 | for (int i = 0; i < length; i++, offset++) |
1514 | 0 | { |
1515 | 0 | TransactionId *xactptr; |
1516 | 0 | uint32 *flagsptr; |
1517 | 0 | int flagsoff; |
1518 | 0 | int bshift; |
1519 | 0 | int memberoff; |
1520 | |
|
1521 | 0 | pageno = MXOffsetToMemberPage(offset); |
1522 | 0 | memberoff = MXOffsetToMemberOffset(offset); |
1523 | |
|
1524 | 0 | if (pageno != prev_pageno) |
1525 | 0 | { |
1526 | 0 | LWLock *newlock; |
1527 | | |
1528 | | /* |
1529 | | * Since we're going to access a different SLRU page, if this page |
1530 | | * falls under a different bank, release the old bank's lock and |
1531 | | * acquire the lock of the new bank. |
1532 | | */ |
1533 | 0 | newlock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); |
1534 | 0 | if (newlock != lock) |
1535 | 0 | { |
1536 | 0 | if (lock) |
1537 | 0 | LWLockRelease(lock); |
1538 | 0 | LWLockAcquire(newlock, LW_EXCLUSIVE); |
1539 | 0 | lock = newlock; |
1540 | 0 | } |
1541 | |
|
1542 | 0 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); |
1543 | 0 | prev_pageno = pageno; |
1544 | 0 | } |
1545 | |
|
1546 | 0 | xactptr = (TransactionId *) |
1547 | 0 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
1548 | |
|
1549 | 0 | if (!TransactionIdIsValid(*xactptr)) |
1550 | 0 | { |
1551 | | /* Corner case 3: we must be looking at unused slot zero */ |
1552 | 0 | Assert(offset == 0); |
1553 | 0 | continue; |
1554 | 0 | } |
1555 | | |
1556 | 0 | flagsoff = MXOffsetToFlagsOffset(offset); |
1557 | 0 | bshift = MXOffsetToFlagsBitShift(offset); |
1558 | 0 | flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); |
1559 | |
|
1560 | 0 | ptr[truelength].xid = *xactptr; |
1561 | 0 | ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; |
1562 | 0 | truelength++; |
1563 | 0 | } |
1564 | |
|
1565 | 0 | LWLockRelease(lock); |
1566 | | |
1567 | | /* A multixid with zero members should not happen */ |
1568 | 0 | Assert(truelength > 0); |
1569 | | |
1570 | | /* |
1571 | | * Copy the result into the local cache. |
1572 | | */ |
1573 | 0 | mXactCachePut(multi, truelength, ptr); |
1574 | |
|
1575 | 0 | debug_elog3(DEBUG2, "GetMembers: no cache for %s", |
1576 | 0 | mxid_to_string(multi, truelength, ptr)); |
1577 | 0 | *members = ptr; |
1578 | 0 | return truelength; |
1579 | 0 | } |
1580 | | |
1581 | | /* |
1582 | | * mxactMemberComparator |
1583 | | * qsort comparison function for MultiXactMember |
1584 | | * |
1585 | | * We can't use wraparound comparison for XIDs because that does not respect |
1586 | | * the triangle inequality! Any old sort order will do. |
1587 | | */ |
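/*
 * An illustration of why wraparound comparison is unusable here, with
 * made-up values: under TransactionIdPrecedes(), taking A = 1,
 * B = 1431655766 and C = 2863311531 (roughly a third of the XID space
 * apart), A precedes B, B precedes C, yet C precedes A.  The relation is
 * cyclic, so qsort() could misbehave; the plain unsigned comparison below
 * is a total order, which is all we need.
 */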
1588 | | static int |
1589 | | mxactMemberComparator(const void *arg1, const void *arg2) |
1590 | 0 | { |
1591 | 0 | MultiXactMember member1 = *(const MultiXactMember *) arg1; |
1592 | 0 | MultiXactMember member2 = *(const MultiXactMember *) arg2; |
1593 | |
|
1594 | 0 | if (member1.xid > member2.xid) |
1595 | 0 | return 1; |
1596 | 0 | if (member1.xid < member2.xid) |
1597 | 0 | return -1; |
1598 | 0 | if (member1.status > member2.status) |
1599 | 0 | return 1; |
1600 | 0 | if (member1.status < member2.status) |
1601 | 0 | return -1; |
1602 | 0 | return 0; |
1603 | 0 | } |
1604 | | |
1605 | | /* |
1606 | | * mXactCacheGetBySet |
1607 | | * returns a MultiXactId from the cache based on the set of |
1608 | | * TransactionIds that compose it, or InvalidMultiXactId if |
1609 | | * none matches. |
1610 | | * |
1611 | | * This is helpful, for example, if two transactions want to lock a huge |
1612 | | * table. By using the cache, the second will use the same MultiXactId |
1613 | | * for the majority of tuples, thus keeping MultiXactId usage low (saving |
1614 | | * I/O and avoiding wraparound issues). |
1615 | | * |
1616 | | * NB: the passed members array will be sorted in-place. |
1617 | | */ |
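/*
 * A sketch of the caller pattern this cache enables, assuming a creation
 * path such as MultiXactIdCreateFromMembers (which is outside this excerpt):
 *
 *     multi = mXactCacheGetBySet(nmembers, members);
 *     if (MultiXactIdIsValid(multi))
 *         return multi;           (reuse: no new SLRU entry is needed)
 *
 * Otherwise the caller allocates a fresh MultiXactId, records it in WAL and
 * the SLRU, and finally calls mXactCachePut(multi, nmembers, members) so
 * that the next identical member set hits the cache.
 */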
1618 | | static MultiXactId |
1619 | | mXactCacheGetBySet(int nmembers, MultiXactMember *members) |
1620 | 0 | { |
1621 | 0 | dlist_iter iter; |
1622 | |
|
1623 | 0 | debug_elog3(DEBUG2, "CacheGet: looking for %s", |
1624 | 0 | mxid_to_string(InvalidMultiXactId, nmembers, members)); |
1625 | | |
1626 | | /* sort the array so comparison is easy */ |
1627 | 0 | qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
1628 | |
|
1629 | 0 | dclist_foreach(iter, &MXactCache) |
1630 | 0 | { |
1631 | 0 | mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node, |
1632 | 0 | iter.cur); |
1633 | |
|
1634 | 0 | if (entry->nmembers != nmembers) |
1635 | 0 | continue; |
1636 | | |
1637 | | /* |
1638 | | * We assume the cache entries are sorted, and that the unused bits in |
1639 | | * "status" are zeroed. |
1640 | | */ |
1641 | 0 | if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0) |
1642 | 0 | { |
1643 | 0 | debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi); |
1644 | 0 | dclist_move_head(&MXactCache, iter.cur); |
1645 | 0 | return entry->multi; |
1646 | 0 | } |
1647 | 0 | } |
1648 | | |
1649 | 0 | debug_elog2(DEBUG2, "CacheGet: not found :-("); |
1650 | 0 | return InvalidMultiXactId; |
1651 | 0 | } |
1652 | | |
1653 | | /* |
1654 | | * mXactCacheGetById |
1655 | | * returns the composing MultiXactMember set from the cache for a |
1656 | | * given MultiXactId, if present. |
1657 | | * |
1658 | | * If successful, *members is set to the address of a palloc'd copy of the |
1659 | | * MultiXactMember set. Return value is number of members, or -1 on failure. |
1660 | | */ |
1661 | | static int |
1662 | | mXactCacheGetById(MultiXactId multi, MultiXactMember **members) |
1663 | 0 | { |
1664 | 0 | dlist_iter iter; |
1665 | |
|
1666 | 0 | debug_elog3(DEBUG2, "CacheGet: looking for %u", multi); |
1667 | |
|
1668 | 0 | dclist_foreach(iter, &MXactCache) |
1669 | 0 | { |
1670 | 0 | mXactCacheEnt *entry = dclist_container(mXactCacheEnt, node, |
1671 | 0 | iter.cur); |
1672 | |
|
1673 | 0 | if (entry->multi == multi) |
1674 | 0 | { |
1675 | 0 | MultiXactMember *ptr; |
1676 | 0 | Size size; |
1677 | |
|
1678 | 0 | size = sizeof(MultiXactMember) * entry->nmembers; |
1679 | 0 | ptr = (MultiXactMember *) palloc(size); |
1680 | |
|
1681 | 0 | memcpy(ptr, entry->members, size); |
1682 | |
|
1683 | 0 | debug_elog3(DEBUG2, "CacheGet: found %s", |
1684 | 0 | mxid_to_string(multi, |
1685 | 0 | entry->nmembers, |
1686 | 0 | entry->members)); |
1687 | | |
1688 | | /* |
1689 | | * Note we modify the list while not using a modifiable iterator. |
1690 | | * This is acceptable only because we exit the iteration |
1691 | | * immediately afterwards. |
1692 | | */ |
1693 | 0 | dclist_move_head(&MXactCache, iter.cur); |
1694 | |
|
1695 | 0 | *members = ptr; |
1696 | 0 | return entry->nmembers; |
1697 | 0 | } |
1698 | 0 | } |
1699 | | |
1700 | 0 | debug_elog2(DEBUG2, "CacheGet: not found"); |
1701 | 0 | return -1; |
1702 | 0 | } |
1703 | | |
1704 | | /* |
1705 | | * mXactCachePut |
1706 | | * Add a new MultiXactId and its composing set into the local cache. |
1707 | | */ |
1708 | | static void |
1709 | | mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members) |
1710 | 0 | { |
1711 | 0 | mXactCacheEnt *entry; |
1712 | |
|
1713 | 0 | debug_elog3(DEBUG2, "CachePut: storing %s", |
1714 | 0 | mxid_to_string(multi, nmembers, members)); |
1715 | |
|
1716 | 0 | if (MXactContext == NULL) |
1717 | 0 | { |
1718 | | /* The cache only lives as long as the current transaction */ |
1719 | 0 | debug_elog2(DEBUG2, "CachePut: initializing memory context"); |
1720 | 0 | MXactContext = AllocSetContextCreate(TopTransactionContext, |
1721 | 0 | "MultiXact cache context", |
1722 | 0 | ALLOCSET_SMALL_SIZES); |
1723 | 0 | } |
1724 | |
|
1725 | 0 | entry = (mXactCacheEnt *) |
1726 | 0 | MemoryContextAlloc(MXactContext, |
1727 | 0 | offsetof(mXactCacheEnt, members) + |
1728 | 0 | nmembers * sizeof(MultiXactMember)); |
1729 | |
|
1730 | 0 | entry->multi = multi; |
1731 | 0 | entry->nmembers = nmembers; |
1732 | 0 | memcpy(entry->members, members, nmembers * sizeof(MultiXactMember)); |
1733 | | |
1734 | | /* mXactCacheGetBySet assumes the entries are sorted, so sort them */ |
1735 | 0 | qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator); |
1736 | |
|
1737 | 0 | dclist_push_head(&MXactCache, &entry->node); |
1738 | 0 | if (dclist_count(&MXactCache) > MAX_CACHE_ENTRIES) |
1739 | 0 | { |
1740 | 0 | dlist_node *node; |
1741 | |
|
1742 | 0 | node = dclist_tail_node(&MXactCache); |
1743 | 0 | dclist_delete_from(&MXactCache, node); |
1744 | |
|
1745 | 0 | entry = dclist_container(mXactCacheEnt, node, node); |
1746 | 0 | debug_elog3(DEBUG2, "CachePut: pruning cached multi %u", |
1747 | 0 | entry->multi); |
1748 | |
|
1749 | 0 | pfree(entry); |
1750 | 0 | } |
1751 | 0 | } |
1752 | | |
1753 | | static char * |
1754 | | mxstatus_to_string(MultiXactStatus status) |
1755 | 0 | { |
1756 | 0 | switch (status) |
1757 | 0 | { |
1758 | 0 | case MultiXactStatusForKeyShare: |
1759 | 0 | return "keysh"; |
1760 | 0 | case MultiXactStatusForShare: |
1761 | 0 | return "sh"; |
1762 | 0 | case MultiXactStatusForNoKeyUpdate: |
1763 | 0 | return "fornokeyupd"; |
1764 | 0 | case MultiXactStatusForUpdate: |
1765 | 0 | return "forupd"; |
1766 | 0 | case MultiXactStatusNoKeyUpdate: |
1767 | 0 | return "nokeyupd"; |
1768 | 0 | case MultiXactStatusUpdate: |
1769 | 0 | return "upd"; |
1770 | 0 | default: |
1771 | 0 | elog(ERROR, "unrecognized multixact status %d", status); |
1772 | 0 | return ""; |
1773 | 0 | } |
1774 | 0 | } |
1775 | | |
1776 | | char * |
1777 | | mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members) |
1778 | 0 | { |
1779 | 0 | static char *str = NULL; |
1780 | 0 | StringInfoData buf; |
1781 | 0 | int i; |
1782 | |
|
1783 | 0 | if (str != NULL) |
1784 | 0 | pfree(str); |
1785 | |
|
1786 | 0 | initStringInfo(&buf); |
1787 | |
|
1788 | 0 | appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid, |
1789 | 0 | mxstatus_to_string(members[0].status)); |
1790 | |
|
1791 | 0 | for (i = 1; i < nmembers; i++) |
1792 | 0 | appendStringInfo(&buf, ", %u (%s)", members[i].xid, |
1793 | 0 | mxstatus_to_string(members[i].status)); |
1794 | |
|
1795 | 0 | appendStringInfoChar(&buf, ']'); |
1796 | 0 | str = MemoryContextStrdup(TopMemoryContext, buf.data); |
1797 | 0 | pfree(buf.data); |
1798 | 0 | return str; |
1799 | 0 | } |
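/*
 * An example of the string built above, using made-up values: a multixact
 * 4711 whose members are xid 100 holding a key-share lock and xid 101
 * holding a share lock is rendered as
 *
 *     4711 2[100 (keysh), 101 (sh)]
 *
 * that is, "multi nmembers[xid (status), ...]".
 */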
1800 | | |
1801 | | /* |
1802 | | * AtEOXact_MultiXact |
1803 | | * Handle transaction end for MultiXact |
1804 | | * |
1805 | | * This is called at top transaction commit or abort (we don't care which). |
1806 | | */ |
1807 | | void |
1808 | | AtEOXact_MultiXact(void) |
1809 | 0 | { |
1810 | | /* |
1811 | | * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of |
1812 | | * which should only be valid while within a transaction. |
1813 | | * |
1814 | | * We assume that storing a MultiXactId is atomic and so we need not take |
1815 | | * MultiXactGenLock to do this. |
1816 | | */ |
1817 | 0 | OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId; |
1818 | 0 | OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId; |
1819 | | |
1820 | | /* |
1821 | | * Discard the local MultiXactId cache. Since MXactContext was created as |
1822 | | * a child of TopTransactionContext, we needn't delete it explicitly. |
1823 | | */ |
1824 | 0 | MXactContext = NULL; |
1825 | 0 | dclist_init(&MXactCache); |
1826 | 0 | } |
1827 | | |
1828 | | /* |
1829 | | * AtPrepare_MultiXact |
1830 | | * Save multixact state at 2PC transaction prepare |
1831 | | * |
1832 | | * In this phase, we only store our OldestMemberMXactId value in the two-phase |
1833 | | * state file. |
1834 | | */ |
1835 | | void |
1836 | | AtPrepare_MultiXact(void) |
1837 | 0 | { |
1838 | 0 | MultiXactId myOldestMember = OldestMemberMXactId[MyProcNumber]; |
1839 | |
|
1840 | 0 | if (MultiXactIdIsValid(myOldestMember)) |
1841 | 0 | RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0, |
1842 | 0 | &myOldestMember, sizeof(MultiXactId)); |
1843 | 0 | } |
1844 | | |
1845 | | /* |
1846 | | * PostPrepare_MultiXact |
1847 | | * Clean up after successful PREPARE TRANSACTION |
1848 | | */ |
1849 | | void |
1850 | | PostPrepare_MultiXact(TransactionId xid) |
1851 | 0 | { |
1852 | 0 | MultiXactId myOldestMember; |
1853 | | |
1854 | | /* |
1855 | | * Transfer our OldestMemberMXactId value to the slot reserved for the |
1856 | | * prepared transaction. |
1857 | | */ |
1858 | 0 | myOldestMember = OldestMemberMXactId[MyProcNumber]; |
1859 | 0 | if (MultiXactIdIsValid(myOldestMember)) |
1860 | 0 | { |
1861 | 0 | ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); |
1862 | | |
1863 | | /* |
1864 | | * Even though storing MultiXactId is atomic, acquire lock to make |
1865 | | * sure others see both changes, not just the reset of the slot of the |
1866 | | * current backend. Using a volatile pointer might suffice, but this |
1867 | | * isn't a hot spot. |
1868 | | */ |
1869 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
1870 | |
|
1871 | 0 | OldestMemberMXactId[dummyProcNumber] = myOldestMember; |
1872 | 0 | OldestMemberMXactId[MyProcNumber] = InvalidMultiXactId; |
1873 | |
|
1874 | 0 | LWLockRelease(MultiXactGenLock); |
1875 | 0 | } |
1876 | | |
1877 | | /* |
1878 | | * We don't need to transfer OldestVisibleMXactId value, because the |
1879 | | * transaction is not going to be looking at any more multixacts once it's |
1880 | | * prepared. |
1881 | | * |
1882 | | * We assume that storing a MultiXactId is atomic and so we need not take |
1883 | | * MultiXactGenLock to do this. |
1884 | | */ |
1885 | 0 | OldestVisibleMXactId[MyProcNumber] = InvalidMultiXactId; |
1886 | | |
1887 | | /* |
1888 | | * Discard the local MultiXactId cache like in AtEOXact_MultiXact. |
1889 | | */ |
1890 | 0 | MXactContext = NULL; |
1891 | 0 | dclist_init(&MXactCache); |
1892 | 0 | } |
1893 | | |
1894 | | /* |
1895 | | * multixact_twophase_recover |
1896 | | * Recover the state of a prepared transaction at startup |
1897 | | */ |
1898 | | void |
1899 | | multixact_twophase_recover(TransactionId xid, uint16 info, |
1900 | | void *recdata, uint32 len) |
1901 | 0 | { |
1902 | 0 | ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, false); |
1903 | 0 | MultiXactId oldestMember; |
1904 | | |
1905 | | /* |
1906 | | * Get the oldest member XID from the state file record, and set it in the |
1907 | | * OldestMemberMXactId slot reserved for this prepared transaction. |
1908 | | */ |
1909 | 0 | Assert(len == sizeof(MultiXactId)); |
1910 | 0 | oldestMember = *((MultiXactId *) recdata); |
1911 | |
|
1912 | 0 | OldestMemberMXactId[dummyProcNumber] = oldestMember; |
1913 | 0 | } |
1914 | | |
1915 | | /* |
1916 | | * multixact_twophase_postcommit |
1917 | | * Similar to AtEOXact_MultiXact but for COMMIT PREPARED |
1918 | | */ |
1919 | | void |
1920 | | multixact_twophase_postcommit(TransactionId xid, uint16 info, |
1921 | | void *recdata, uint32 len) |
1922 | 0 | { |
1923 | 0 | ProcNumber dummyProcNumber = TwoPhaseGetDummyProcNumber(xid, true); |
1924 | |
|
1925 | 0 | Assert(len == sizeof(MultiXactId)); |
1926 | |
|
1927 | 0 | OldestMemberMXactId[dummyProcNumber] = InvalidMultiXactId; |
1928 | 0 | } |
1929 | | |
1930 | | /* |
1931 | | * multixact_twophase_postabort |
1932 | | * This is actually just the same as the COMMIT case. |
1933 | | */ |
1934 | | void |
1935 | | multixact_twophase_postabort(TransactionId xid, uint16 info, |
1936 | | void *recdata, uint32 len) |
1937 | 0 | { |
1938 | 0 | multixact_twophase_postcommit(xid, info, recdata, len); |
1939 | 0 | } |
1940 | | |
1941 | | /* |
1942 | | * Initialization of shared memory for MultiXact. We use two SLRU areas, |
1943 | | * thus double memory. Also, reserve space for the shared MultiXactState |
1944 | | * struct and the per-backend MultiXactId arrays (two of those, too). |
1945 | | */ |
1946 | | Size |
1947 | | MultiXactShmemSize(void) |
1948 | 0 | { |
1949 | 0 | Size size; |
1950 | | |
1951 | | /* We need 2*MaxOldestSlot perBackendXactIds[] entries */ |
1952 | 0 | #define SHARED_MULTIXACT_STATE_SIZE \ |
1953 | 0 | add_size(offsetof(MultiXactStateData, perBackendXactIds), \ |
1954 | 0 | mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) |
1955 | |
|
1956 | 0 | size = SHARED_MULTIXACT_STATE_SIZE; |
1957 | 0 | size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0)); |
1958 | 0 | size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0)); |
1959 | |
|
1960 | 0 | return size; |
1961 | 0 | } |
1962 | | |
1963 | | void |
1964 | | MultiXactShmemInit(void) |
1965 | 0 | { |
1966 | 0 | bool found; |
1967 | |
|
1968 | 0 | debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); |
1969 | |
|
1970 | 0 | MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; |
1971 | 0 | MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; |
1972 | |
|
1973 | 0 | SimpleLruInit(MultiXactOffsetCtl, |
1974 | 0 | "multixact_offset", multixact_offset_buffers, 0, |
1975 | 0 | "pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER, |
1976 | 0 | LWTRANCHE_MULTIXACTOFFSET_SLRU, |
1977 | 0 | SYNC_HANDLER_MULTIXACT_OFFSET, |
1978 | 0 | false); |
1979 | 0 | SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); |
1980 | 0 | SimpleLruInit(MultiXactMemberCtl, |
1981 | 0 | "multixact_member", multixact_member_buffers, 0, |
1982 | 0 | "pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER, |
1983 | 0 | LWTRANCHE_MULTIXACTMEMBER_SLRU, |
1984 | 0 | SYNC_HANDLER_MULTIXACT_MEMBER, |
1985 | 0 | false); |
1986 | | /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ |
1987 | | |
1988 | | /* Initialize our shared state struct */ |
1989 | 0 | MultiXactState = ShmemInitStruct("Shared MultiXact State", |
1990 | 0 | SHARED_MULTIXACT_STATE_SIZE, |
1991 | 0 | &found); |
1992 | 0 | if (!IsUnderPostmaster) |
1993 | 0 | { |
1994 | 0 | Assert(!found); |
1995 | | |
1996 | | /* Make sure we zero out the per-backend state */ |
1997 | 0 | MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); |
1998 | 0 | ConditionVariableInit(&MultiXactState->nextoff_cv); |
1999 | 0 | } |
2000 | 0 | else |
2001 | 0 | Assert(found); |
2002 | | |
2003 | | /* |
2004 | | * Set up array pointers. |
2005 | | */ |
2006 | 0 | OldestMemberMXactId = MultiXactState->perBackendXactIds; |
2007 | 0 | OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot; |
2008 | 0 | } |
2009 | | |
2010 | | /* |
2011 | | * GUC check_hook for multixact_offset_buffers |
2012 | | */ |
2013 | | bool |
2014 | | check_multixact_offset_buffers(int *newval, void **extra, GucSource source) |
2015 | 0 | { |
2016 | 0 | return check_slru_buffers("multixact_offset_buffers", newval); |
2017 | 0 | } |
2018 | | |
2019 | | /* |
2020 | | * GUC check_hook for multixact_member_buffers |
2021 | | */ |
2022 | | bool |
2023 | | check_multixact_member_buffers(int *newval, void **extra, GucSource source) |
2024 | 0 | { |
2025 | 0 | return check_slru_buffers("multixact_member_buffers", newval); |
2026 | 0 | } |
2027 | | |
2028 | | /* |
2029 | | * This function must be called ONCE on system install. It creates the initial |
2030 | | * MultiXact segments. (The MultiXacts directories are assumed to have been |
2031 | | * created by initdb, and MultiXactShmemInit must have been called already.) |
2032 | | */ |
2033 | | void |
2034 | | BootStrapMultiXact(void) |
2035 | 0 | { |
2036 | 0 | int slotno; |
2037 | 0 | LWLock *lock; |
2038 | |
|
2039 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, 0); |
2040 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2041 | | |
2042 | | /* Create and zero the first page of the offsets log */ |
2043 | 0 | slotno = ZeroMultiXactOffsetPage(0, false); |
2044 | | |
2045 | | /* Make sure it's written out */ |
2046 | 0 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
2047 | 0 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
2048 | |
|
2049 | 0 | LWLockRelease(lock); |
2050 | |
|
2051 | 0 | lock = SimpleLruGetBankLock(MultiXactMemberCtl, 0); |
2052 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2053 | | |
2054 | | /* Create and zero the first page of the members log */ |
2055 | 0 | slotno = ZeroMultiXactMemberPage(0, false); |
2056 | | |
2057 | | /* Make sure it's written out */ |
2058 | 0 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
2059 | 0 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
2060 | |
|
2061 | 0 | LWLockRelease(lock); |
2062 | 0 | } |
2063 | | |
2064 | | /* |
2065 | | * Initialize (or reinitialize) a page of MultiXactOffset to zeroes. |
2066 | | * If writeXlog is true, also emit an XLOG record saying we did this. |
2067 | | * |
2068 | | * The page is not actually written, just set up in shared memory. |
2069 | | * The slot number of the new page is returned. |
2070 | | * |
2071 | | * Control lock must be held at entry, and will be held at exit. |
2072 | | */ |
2073 | | static int |
2074 | | ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog) |
2075 | 0 | { |
2076 | 0 | int slotno; |
2077 | |
|
2078 | 0 | slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); |
2079 | |
|
2080 | 0 | if (writeXlog) |
2081 | 0 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); |
2082 | |
|
2083 | 0 | return slotno; |
2084 | 0 | } |
2085 | | |
2086 | | /* |
2087 | | * Ditto, for MultiXactMember |
2088 | | */ |
2089 | | static int |
2090 | | ZeroMultiXactMemberPage(int64 pageno, bool writeXlog) |
2091 | 0 | { |
2092 | 0 | int slotno; |
2093 | |
|
2094 | 0 | slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); |
2095 | |
|
2096 | 0 | if (writeXlog) |
2097 | 0 | WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); |
2098 | |
|
2099 | 0 | return slotno; |
2100 | 0 | } |
2101 | | |
2102 | | /* |
2103 | | * MaybeExtendOffsetSlru |
2104 | | * Extend the offsets SLRU area, if necessary |
2105 | | * |
2106 | | * After a binary upgrade from <= 9.2, the pg_multixact/offsets SLRU area might |
2107 | | * contain files that are shorter than necessary; this would occur if the old |
2108 | | * installation had used multixacts beyond the first page (files cannot be |
2109 | | * copied, because the on-disk representation is different). pg_upgrade would |
2110 | | * update pg_control to set the next offset value to be at that position, so |
2111 | | * that tuples marked as locked by such MultiXacts would be seen as visible |
2112 | | * without having to consult multixact. However, trying to create and use a |
2113 | | * new MultiXactId would result in an error because the page on which the new |
2114 | | * value would reside does not exist. This routine is in charge of creating |
2115 | | * such pages. |
2116 | | */ |
2117 | | static void |
2118 | | MaybeExtendOffsetSlru(void) |
2119 | 0 | { |
2120 | 0 | int64 pageno; |
2121 | 0 | LWLock *lock; |
2122 | |
|
2123 | 0 | pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); |
2124 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
2125 | |
|
2126 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2127 | |
|
2128 | 0 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
2129 | 0 | { |
2130 | 0 | int slotno; |
2131 | | |
2132 | | /* |
2133 | | * Fortunately for us, SimpleLruWritePage is already prepared to deal |
2134 | | * with creating a new segment file even if the page we're writing is |
2135 | | * not the first in it, so this is enough. |
2136 | | */ |
2137 | 0 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
2138 | 0 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
2139 | 0 | } |
2140 | |
|
2141 | 0 | LWLockRelease(lock); |
2142 | 0 | } |
2143 | | |
2144 | | /* |
2145 | | * This must be called ONCE during postmaster or standalone-backend startup. |
2146 | | * |
2147 | | * StartupXLOG has already established nextMXact/nextOffset by calling |
2148 | | * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti |
2149 | | * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet |
2150 | | * replayed WAL. |
2151 | | */ |
2152 | | void |
2153 | | StartupMultiXact(void) |
2154 | 0 | { |
2155 | 0 | MultiXactId multi = MultiXactState->nextMXact; |
2156 | 0 | MultiXactOffset offset = MultiXactState->nextOffset; |
2157 | 0 | int64 pageno; |
2158 | | |
2159 | | /* |
2160 | | * Initialize offset's idea of the latest page number. |
2161 | | */ |
2162 | 0 | pageno = MultiXactIdToOffsetPage(multi); |
2163 | 0 | pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number, |
2164 | 0 | pageno); |
2165 | | |
2166 | | /* |
2167 | | * Initialize member's idea of the latest page number. |
2168 | | */ |
2169 | 0 | pageno = MXOffsetToMemberPage(offset); |
2170 | 0 | pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number, |
2171 | 0 | pageno); |
2172 | 0 | } |
2173 | | |
2174 | | /* |
2175 | | * This must be called ONCE at the end of startup/recovery. |
2176 | | */ |
2177 | | void |
2178 | | TrimMultiXact(void) |
2179 | 0 | { |
2180 | 0 | MultiXactId nextMXact; |
2181 | 0 | MultiXactOffset offset; |
2182 | 0 | MultiXactId oldestMXact; |
2183 | 0 | Oid oldestMXactDB; |
2184 | 0 | int64 pageno; |
2185 | 0 | int entryno; |
2186 | 0 | int flagsoff; |
2187 | |
|
2188 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2189 | 0 | nextMXact = MultiXactState->nextMXact; |
2190 | 0 | offset = MultiXactState->nextOffset; |
2191 | 0 | oldestMXact = MultiXactState->oldestMultiXactId; |
2192 | 0 | oldestMXactDB = MultiXactState->oldestMultiXactDB; |
2193 | 0 | LWLockRelease(MultiXactGenLock); |
2194 | | |
2195 | | /* Clean up offsets state */ |
2196 | | |
2197 | | /* |
2198 | | * (Re-)Initialize our idea of the latest page number for offsets. |
2199 | | */ |
2200 | 0 | pageno = MultiXactIdToOffsetPage(nextMXact); |
2201 | 0 | pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number, |
2202 | 0 | pageno); |
2203 | | |
2204 | | /* |
2205 | | * Zero out the remainder of the current offsets page. See notes in |
2206 | | * TrimCLOG() for background. Unlike CLOG, some WAL record covers every |
2207 | | * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL |
2208 | | * rule "write xlog before data," nextMXact successors may carry obsolete, |
2209 | | * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() |
2210 | | * operates normally. |
2211 | | */ |
2212 | 0 | entryno = MultiXactIdToOffsetEntry(nextMXact); |
2213 | 0 | if (entryno != 0) |
2214 | 0 | { |
2215 | 0 | int slotno; |
2216 | 0 | MultiXactOffset *offptr; |
2217 | 0 | LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
2218 | |
|
2219 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2220 | 0 | slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); |
2221 | 0 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
2222 | 0 | offptr += entryno; |
2223 | |
|
2224 | 0 | MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); |
2225 | |
|
2226 | 0 | MultiXactOffsetCtl->shared->page_dirty[slotno] = true; |
2227 | 0 | LWLockRelease(lock); |
2228 | 0 | } |
2229 | | |
2230 | | /* |
2231 | | * And the same for members. |
2232 | | * |
2233 | | * (Re-)Initialize our idea of the latest page number for members. |
2234 | | */ |
2235 | 0 | pageno = MXOffsetToMemberPage(offset); |
2236 | 0 | pg_atomic_write_u64(&MultiXactMemberCtl->shared->latest_page_number, |
2237 | 0 | pageno); |
2238 | | |
2239 | | /* |
2240 | | * Zero out the remainder of the current members page. See notes in |
2241 | | * TrimCLOG() for motivation. |
2242 | | */ |
2243 | 0 | flagsoff = MXOffsetToFlagsOffset(offset); |
2244 | 0 | if (flagsoff != 0) |
2245 | 0 | { |
2246 | 0 | int slotno; |
2247 | 0 | TransactionId *xidptr; |
2248 | 0 | int memberoff; |
2249 | 0 | LWLock *lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); |
2250 | |
|
2251 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2252 | 0 | memberoff = MXOffsetToMemberOffset(offset); |
2253 | 0 | slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); |
2254 | 0 | xidptr = (TransactionId *) |
2255 | 0 | (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); |
2256 | |
|
2257 | 0 | MemSet(xidptr, 0, BLCKSZ - memberoff); |
2258 | | |
2259 | | /* |
2260 | | * Note: we don't need to zero out the flag bits in the remaining |
2261 | | * members of the current group, because they are always reset before |
2262 | | * writing. |
2263 | | */ |
2264 | |
|
2265 | 0 | MultiXactMemberCtl->shared->page_dirty[slotno] = true; |
2266 | 0 | LWLockRelease(lock); |
2267 | 0 | } |
2268 | | |
2269 | | /* signal that we're officially up */ |
2270 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2271 | 0 | MultiXactState->finishedStartup = true; |
2272 | 0 | LWLockRelease(MultiXactGenLock); |
2273 | | |
2274 | | /* Now compute how far away the next members wraparound is. */ |
2275 | 0 | SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true); |
2276 | 0 | } |
2277 | | |
2278 | | /* |
2279 | | * Get the MultiXact data to save in a checkpoint record |
2280 | | */ |
2281 | | void |
2282 | | MultiXactGetCheckptMulti(bool is_shutdown, |
2283 | | MultiXactId *nextMulti, |
2284 | | MultiXactOffset *nextMultiOffset, |
2285 | | MultiXactId *oldestMulti, |
2286 | | Oid *oldestMultiDB) |
2287 | 0 | { |
2288 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2289 | 0 | *nextMulti = MultiXactState->nextMXact; |
2290 | 0 | *nextMultiOffset = MultiXactState->nextOffset; |
2291 | 0 | *oldestMulti = MultiXactState->oldestMultiXactId; |
2292 | 0 | *oldestMultiDB = MultiXactState->oldestMultiXactDB; |
2293 | 0 | LWLockRelease(MultiXactGenLock); |
2294 | |
|
2295 | 0 | debug_elog6(DEBUG2, |
2296 | 0 | "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u", |
2297 | 0 | *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); |
2298 | 0 | } |
2299 | | |
2300 | | /* |
2301 | | * Perform a checkpoint --- either during shutdown, or on-the-fly |
2302 | | */ |
2303 | | void |
2304 | | CheckPointMultiXact(void) |
2305 | 0 | { |
2306 | 0 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); |
2307 | | |
2308 | | /* |
2309 | | * Write dirty MultiXact pages to disk. This may result in sync requests |
2310 | | * queued for later handling by ProcessSyncRequests(), as part of the |
2311 | | * checkpoint. |
2312 | | */ |
2313 | 0 | SimpleLruWriteAll(MultiXactOffsetCtl, true); |
2314 | 0 | SimpleLruWriteAll(MultiXactMemberCtl, true); |
2315 | |
|
2316 | 0 | TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); |
2317 | 0 | } |
2318 | | |
2319 | | /* |
2320 | | * Set the next-to-be-assigned MultiXactId and offset |
2321 | | * |
2322 | | * This is used when we can determine the correct next ID/offset exactly |
2323 | | * from a checkpoint record. Although this is only called during bootstrap |
2324 | | * and XLog replay, we take the lock in case any hot-standby backends are |
2325 | | * examining the values. |
2326 | | */ |
2327 | | void |
2328 | | MultiXactSetNextMXact(MultiXactId nextMulti, |
2329 | | MultiXactOffset nextMultiOffset) |
2330 | 0 | { |
2331 | 0 | debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u", |
2332 | 0 | nextMulti, nextMultiOffset); |
2333 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2334 | 0 | MultiXactState->nextMXact = nextMulti; |
2335 | 0 | MultiXactState->nextOffset = nextMultiOffset; |
2336 | 0 | LWLockRelease(MultiXactGenLock); |
2337 | | |
2338 | | /* |
2339 | | * During a binary upgrade, make sure that the offsets SLRU is large |
2340 | | * enough to contain the next value that would be created. |
2341 | | * |
2342 | | * We need to do this pretty early during the first startup in binary |
2343 | | * upgrade mode: before StartupMultiXact() in fact, because this routine |
2344 | | * is called even before that by StartupXLOG(). And we can't do it |
2345 | | * earlier than at this point, because during that first call of this |
2346 | | * routine we determine the MultiXactState->nextMXact value that |
2347 | | * MaybeExtendOffsetSlru needs. |
2348 | | */ |
2349 | 0 | if (IsBinaryUpgrade) |
2350 | 0 | MaybeExtendOffsetSlru(); |
2351 | 0 | } |
2352 | | |
2353 | | /* |
2354 | | * Determine the last safe MultiXactId to allocate given the currently oldest |
2355 | | * datminmxid (ie, the oldest MultiXactId that might exist in any database |
2356 | | * of our cluster), and the OID of the (or a) database with that value. |
2357 | | * |
2358 | | * is_startup is true when we are just starting the cluster, false when we |
2359 | | * are updating state in a running cluster. This only affects log messages. |
2360 | | */ |
2361 | | void |
2362 | | SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid, |
2363 | | bool is_startup) |
2364 | 0 | { |
2365 | 0 | MultiXactId multiVacLimit; |
2366 | 0 | MultiXactId multiWarnLimit; |
2367 | 0 | MultiXactId multiStopLimit; |
2368 | 0 | MultiXactId multiWrapLimit; |
2369 | 0 | MultiXactId curMulti; |
2370 | 0 | bool needs_offset_vacuum; |
2371 | |
|
2372 | 0 | Assert(MultiXactIdIsValid(oldest_datminmxid)); |
2373 | | |
2374 | | /* |
2375 | | * We pretend that a wrap will happen halfway through the multixact ID |
2376 | | * space, but that's not really true, because multixacts wrap differently |
2377 | | * from transaction IDs. Note that, separately from any concern about |
2378 | | * multixact IDs wrapping, we must ensure that multixact members do not |
2379 | | * wrap. Limits for that are set in SetOffsetVacuumLimit, not here. |
2380 | | */ |
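/*
 * A worked example of the limits computed below, assuming the full 32-bit
 * MultiXactId space (so MaxMultiXactId >> 1 is 2^31 - 1): with
 * oldest_datminmxid = 1000, multiWrapLimit lands about 2.1 billion multis
 * ahead, multiStopLimit sits 3 million multis before the wrap point, and
 * multiWarnLimit 40 million before it.  The FirstMultiXactId adjustments
 * only matter when a computed limit wraps into the reserved range below
 * FirstMultiXactId.
 */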
2381 | 0 | multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1); |
2382 | 0 | if (multiWrapLimit < FirstMultiXactId) |
2383 | 0 | multiWrapLimit += FirstMultiXactId; |
2384 | | |
2385 | | /* |
2386 | | * We'll refuse to continue assigning MultiXactIds once we get within 3M |
2387 | | * multi of data loss. See SetTransactionIdLimit. |
2388 | | */ |
2389 | 0 | multiStopLimit = multiWrapLimit - 3000000; |
2390 | 0 | if (multiStopLimit < FirstMultiXactId) |
2391 | 0 | multiStopLimit -= FirstMultiXactId; |
2392 | | |
2393 | | /* |
2394 | | * We'll start complaining loudly when we get within 40M multis of data |
2395 | | * loss. This is kind of arbitrary, but if you let your gas gauge get |
2396 | | * down to 2% of full, would you be looking for the next gas station? We |
2397 | | * need to be fairly liberal about this number because there are lots of |
2398 | | * scenarios where most transactions are done by automatic clients that |
2399 | | * won't pay attention to warnings. (No, we're not gonna make this |
2400 | | * configurable. If you know enough to configure it, you know enough to |
2401 | | * not get in this kind of trouble in the first place.) |
2402 | | */ |
2403 | 0 | multiWarnLimit = multiWrapLimit - 40000000; |
2404 | 0 | if (multiWarnLimit < FirstMultiXactId) |
2405 | 0 | multiWarnLimit -= FirstMultiXactId; |
2406 | | |
2407 | | /* |
2408 | | * We'll start trying to force autovacuums when oldest_datminmxid gets to |
2409 | | * be more than autovacuum_multixact_freeze_max_age mxids old. |
2410 | | * |
2411 | | * Note: autovacuum_multixact_freeze_max_age is a PGC_POSTMASTER parameter |
2412 | | * so that we don't have to worry about dealing with on-the-fly changes in |
2413 | | * its value. See SetTransactionIdLimit. |
2414 | | */ |
2415 | 0 | multiVacLimit = oldest_datminmxid + autovacuum_multixact_freeze_max_age; |
2416 | 0 | if (multiVacLimit < FirstMultiXactId) |
2417 | 0 | multiVacLimit += FirstMultiXactId; |
2418 | | |
2419 | | /* Grab lock for just long enough to set the new limit values */ |
2420 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2421 | 0 | MultiXactState->oldestMultiXactId = oldest_datminmxid; |
2422 | 0 | MultiXactState->oldestMultiXactDB = oldest_datoid; |
2423 | 0 | MultiXactState->multiVacLimit = multiVacLimit; |
2424 | 0 | MultiXactState->multiWarnLimit = multiWarnLimit; |
2425 | 0 | MultiXactState->multiStopLimit = multiStopLimit; |
2426 | 0 | MultiXactState->multiWrapLimit = multiWrapLimit; |
2427 | 0 | curMulti = MultiXactState->nextMXact; |
2428 | 0 | LWLockRelease(MultiXactGenLock); |
2429 | | |
2430 | | /* Log the info */ |
2431 | 0 | ereport(DEBUG1, |
2432 | 0 | (errmsg_internal("MultiXactId wrap limit is %u, limited by database with OID %u", |
2433 | 0 | multiWrapLimit, oldest_datoid))); |
2434 | | |
2435 | | /* |
2436 | | * Computing the actual limits is only possible once the data directory is |
2437 | | * in a consistent state. There's no need to compute the limits while |
2438 | | * still replaying WAL - no decisions about new multis are made even |
2439 | | * though multixact creations might be replayed. So we'll only do further |
2440 | | * checks after TrimMultiXact() has been called. |
2441 | | */ |
2442 | 0 | if (!MultiXactState->finishedStartup) |
2443 | 0 | return; |
2444 | | |
2445 | 0 | Assert(!InRecovery); |
2446 | | |
2447 | | /* Set limits for offset vacuum. */ |
2448 | 0 | needs_offset_vacuum = SetOffsetVacuumLimit(is_startup); |
2449 | | |
2450 | | /* |
2451 | | * If past the autovacuum force point, immediately signal an autovac |
2452 | | * request. The reason for this is that autovac only processes one |
2453 | | * database per invocation. Once it's finished cleaning up the oldest |
2454 | | * database, it'll call here, and we'll signal the postmaster to start |
2455 | | * another iteration immediately if there are still any old databases. |
2456 | | */ |
2457 | 0 | if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || |
2458 | 0 | needs_offset_vacuum) && IsUnderPostmaster) |
2459 | 0 | SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); |
2460 | | |
2461 | | /* Give an immediate warning if past the wrap warn point */ |
2462 | 0 | if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) |
2463 | 0 | { |
2464 | 0 | char *oldest_datname; |
2465 | | |
2466 | | /* |
2467 | | * We can be called when not inside a transaction, for example during |
2468 | | * StartupXLOG(). In such a case we cannot do database access, so we |
2469 | | * must just report the oldest DB's OID. |
2470 | | * |
2471 | | * Note: it's also possible that get_database_name fails and returns |
2472 | | * NULL, for example because the database just got dropped. We'll |
2473 | | * still warn, even though the warning might now be unnecessary. |
2474 | | */ |
2475 | 0 | if (IsTransactionState()) |
2476 | 0 | oldest_datname = get_database_name(oldest_datoid); |
2477 | 0 | else |
2478 | 0 | oldest_datname = NULL; |
2479 | |
|
2480 | 0 | if (oldest_datname) |
2481 | 0 | ereport(WARNING, |
2482 | 0 | (errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used", |
2483 | 0 | "database \"%s\" must be vacuumed before %u more MultiXactIds are used", |
2484 | 0 | multiWrapLimit - curMulti, |
2485 | 0 | oldest_datname, |
2486 | 0 | multiWrapLimit - curMulti), |
2487 | 0 | errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n" |
2488 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
2489 | 0 | else |
2490 | 0 | ereport(WARNING, |
2491 | 0 | (errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used", |
2492 | 0 | "database with OID %u must be vacuumed before %u more MultiXactIds are used", |
2493 | 0 | multiWrapLimit - curMulti, |
2494 | 0 | oldest_datoid, |
2495 | 0 | multiWrapLimit - curMulti), |
2496 | 0 | errhint("To avoid MultiXactId assignment failures, execute a database-wide VACUUM in that database.\n" |
2497 | 0 | "You might also need to commit or roll back old prepared transactions, or drop stale replication slots."))); |
2498 | 0 | } |
2499 | 0 | } |
2500 | | |
2501 | | /* |
2502 | | * Ensure the next-to-be-assigned MultiXactId is at least minMulti, |
2503 | | * and similarly nextOffset is at least minMultiOffset. |
2504 | | * |
2505 | | * This is used when we can determine minimum safe values from an XLog |
2506 | | * record (either an on-line checkpoint or an mxact creation log entry). |
2507 | | * Although this is only called during XLog replay, we take the lock in case |
2508 | | * any hot-standby backends are examining the values. |
2509 | | */ |
2510 | | void |
2511 | | MultiXactAdvanceNextMXact(MultiXactId minMulti, |
2512 | | MultiXactOffset minMultiOffset) |
2513 | 0 | { |
2514 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2515 | 0 | if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti)) |
2516 | 0 | { |
2517 | 0 | debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti); |
2518 | 0 | MultiXactState->nextMXact = minMulti; |
2519 | 0 | } |
2520 | 0 | if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset)) |
2521 | 0 | { |
2522 | 0 | debug_elog3(DEBUG2, "MultiXact: setting next offset to %u", |
2523 | 0 | minMultiOffset); |
2524 | 0 | MultiXactState->nextOffset = minMultiOffset; |
2525 | 0 | } |
2526 | 0 | LWLockRelease(MultiXactGenLock); |
2527 | 0 | } |
2528 | | |
2529 | | /* |
2530 | | * Update our oldestMultiXactId value, but only if it's more recent than what |
2531 | | * we had. |
2532 | | * |
2533 | | * This may only be called during WAL replay. |
2534 | | */ |
2535 | | void |
2536 | | MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) |
2537 | 0 | { |
2538 | 0 | Assert(InRecovery); |
2539 | |
|
2540 | 0 | if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) |
2541 | 0 | SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false); |
2542 | 0 | } |
2543 | | |
2544 | | /* |
2545 | | * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. |
2546 | | * |
2547 | | * NB: this is called while holding MultiXactGenLock. We want it to be very |
2548 | | * fast most of the time; even when it's not so fast, no actual I/O need |
2549 | | * happen unless we're forced to write out a dirty log or xlog page to make |
2550 | | * room in shared memory. |
2551 | | */ |
2552 | | static void |
2553 | | ExtendMultiXactOffset(MultiXactId multi) |
2554 | 0 | { |
2555 | 0 | int64 pageno; |
2556 | 0 | LWLock *lock; |
2557 | | |
2558 | | /* |
2559 | | * No work except at first MultiXactId of a page. But beware: just after |
2560 | | * wraparound, the first MultiXactId of page zero is FirstMultiXactId. |
2561 | | */ |
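/*
 * A sketch of the boundary test below, assuming the usual layout of 2048
 * four-byte offsets per 8 kB page (BLCKSZ / sizeof(MultiXactOffset)):
 * MultiXactIdToOffsetEntry(multi) is zero for multi = 0, 2048, 4096, ...,
 * so a fresh offsets page is zeroed once per 2048 multixacts.  Right after
 * MultiXactId wraparound, the counter skips the invalid value 0 and
 * restarts at FirstMultiXactId, whose entry number is nonzero, hence the
 * extra check on FirstMultiXactId.
 */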
2562 | 0 | if (MultiXactIdToOffsetEntry(multi) != 0 && |
2563 | 0 | multi != FirstMultiXactId) |
2564 | 0 | return; |
2565 | | |
2566 | 0 | pageno = MultiXactIdToOffsetPage(multi); |
2567 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
2568 | |
|
2569 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2570 | | |
2571 | | /* Zero the page and make an XLOG entry about it */ |
2572 | 0 | ZeroMultiXactOffsetPage(pageno, true); |
2573 | |
|
2574 | 0 | LWLockRelease(lock); |
2575 | 0 | } |
2576 | | |
2577 | | /* |
2578 | | * Make sure that MultiXactMember has room for the members of a newly- |
2579 | | * allocated MultiXactId. |
2580 | | * |
2581 | | * Like the above routine, this is called while holding MultiXactGenLock; |
2582 | | * same comments apply. |
2583 | | */ |
2584 | | static void |
2585 | | ExtendMultiXactMember(MultiXactOffset offset, int nmembers) |
2586 | 0 | { |
2587 | | /* |
2588 | | * It's possible that the members span more than one page of the members |
2589 | | * file, so we loop to ensure we consider each page. The coding is not |
2590 | | * optimal if the members span several pages, but that seems unusual |
2591 | | * enough to not worry much about. |
2592 | | */ |
2593 | 0 | while (nmembers > 0) |
2594 | 0 | { |
2595 | 0 | int flagsoff; |
2596 | 0 | int flagsbit; |
2597 | 0 | uint32 difference; |
2598 | | |
2599 | | /* |
2600 | | * Only zero when at first entry of a page. |
2601 | | */ |
2602 | 0 | flagsoff = MXOffsetToFlagsOffset(offset); |
2603 | 0 | flagsbit = MXOffsetToFlagsBitShift(offset); |
2604 | 0 | if (flagsoff == 0 && flagsbit == 0) |
2605 | 0 | { |
2606 | 0 | int64 pageno; |
2607 | 0 | LWLock *lock; |
2608 | |
|
2609 | 0 | pageno = MXOffsetToMemberPage(offset); |
2610 | 0 | lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); |
2611 | |
|
2612 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
2613 | | |
2614 | | /* Zero the page and make an XLOG entry about it */ |
2615 | 0 | ZeroMultiXactMemberPage(pageno, true); |
2616 | |
|
2617 | 0 | LWLockRelease(lock); |
2618 | 0 | } |
2619 | | |
2620 | | /* |
2621 | | * Compute the number of items till end of current page. Careful: if |
2622 | | * addition of unsigned ints wraps around, we're at the last page of |
2623 | | * the last segment; since that page holds a different number of items |
2624 | | * than other pages, we need to do it differently. |
2625 | | */ |
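/*
 * An illustration of the overflow test below, with made-up numbers: if
 * offset is 4294967000, only 296 more offsets exist before the 32-bit
 * counter wraps.  Adding roughly a page's worth of members (an assumption
 * about MAX_MEMBERS_IN_LAST_MEMBERS_PAGE, which is defined elsewhere)
 * overflows, the unsigned sum compares lower than offset itself, and that
 * is how the code recognizes the final, shorter page of the last segment.
 */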
2626 | 0 | if (offset + MAX_MEMBERS_IN_LAST_MEMBERS_PAGE < offset) |
2627 | 0 | { |
2628 | | /* |
2629 | | * This is the last page of the last segment; we can compute the |
2630 | | * number of items left to allocate in it without modulo |
2631 | | * arithmetic. |
2632 | | */ |
2633 | 0 | difference = MaxMultiXactOffset - offset + 1; |
2634 | 0 | } |
2635 | 0 | else |
2636 | 0 | difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; |
2637 | | |
2638 | | /* |
2639 | | * Advance to next page, taking care to properly handle the wraparound |
2640 | | * case. OK if nmembers goes negative. |
2641 | | */ |
2642 | 0 | nmembers -= difference; |
2643 | 0 | offset += difference; |
2644 | 0 | } |
2645 | 0 | } |
2646 | | |
2647 | | /* |
2648 | | * GetOldestMultiXactId |
2649 | | * |
2650 | | * Return the oldest MultiXactId that's possibly still seen as live by |
2651 | | * any running transaction. Older ones might still exist on disk, but they no |
2652 | | * longer have any running member transaction. |
2653 | | * |
2654 | | * It's not safe to truncate MultiXact SLRU segments on the value returned by |
2655 | | * this function; however, it can be set as the new relminmxid for any table |
2656 | | * that VACUUM knows has no remaining MXIDs < the same value. It is only safe |
2657 | | * to truncate SLRUs when no table can possibly still have a referencing MXID. |
2658 | | */ |
2659 | | MultiXactId |
2660 | | GetOldestMultiXactId(void) |
2661 | 0 | { |
2662 | 0 | MultiXactId oldestMXact; |
2663 | 0 | MultiXactId nextMXact; |
2664 | 0 | int i; |
2665 | | |
2666 | | /* |
2667 | | * This is the oldest valid value among all the OldestMemberMXactId[] and |
2668 | | * OldestVisibleMXactId[] entries, or nextMXact if none are valid. |
2669 | | */ |
2670 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2671 | | |
2672 | | /* |
2673 | | * We have to beware of the possibility that nextMXact is in the |
2674 | | * wrapped-around state. We don't fix the counter itself here, but we |
2675 | | * must be sure to use a valid value in our calculation. |
2676 | | */ |
2677 | 0 | nextMXact = MultiXactState->nextMXact; |
2678 | 0 | if (nextMXact < FirstMultiXactId) |
2679 | 0 | nextMXact = FirstMultiXactId; |
2680 | |
|
2681 | 0 | oldestMXact = nextMXact; |
2682 | 0 | for (i = 0; i < MaxOldestSlot; i++) |
2683 | 0 | { |
2684 | 0 | MultiXactId thisoldest; |
2685 | |
|
2686 | 0 | thisoldest = OldestMemberMXactId[i]; |
2687 | 0 | if (MultiXactIdIsValid(thisoldest) && |
2688 | 0 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
2689 | 0 | oldestMXact = thisoldest; |
2690 | 0 | thisoldest = OldestVisibleMXactId[i]; |
2691 | 0 | if (MultiXactIdIsValid(thisoldest) && |
2692 | 0 | MultiXactIdPrecedes(thisoldest, oldestMXact)) |
2693 | 0 | oldestMXact = thisoldest; |
2694 | 0 | } |
2695 | |
|
2696 | 0 | LWLockRelease(MultiXactGenLock); |
2697 | |
|
2698 | 0 | return oldestMXact; |
2699 | 0 | } |
2700 | | |
2701 | | /* |
2702 | | * Determine how aggressively we need to vacuum in order to prevent member |
2703 | | * wraparound. |
2704 | | * |
2705 | | * To do so, determine the oldest member offset and install the limit |
2706 | | * info in MultiXactState, where it can be used to prevent overrun of old data |
2707 | | * in the members SLRU area. |
2708 | | * |
2709 | | * The return value is true if emergency autovacuum is required and false |
2710 | | * otherwise. |
2711 | | */ |
2712 | | static bool |
2713 | | SetOffsetVacuumLimit(bool is_startup) |
2714 | 0 | { |
2715 | 0 | MultiXactId oldestMultiXactId; |
2716 | 0 | MultiXactId nextMXact; |
2717 | 0 | MultiXactOffset oldestOffset = 0; /* placate compiler */ |
2718 | 0 | MultiXactOffset prevOldestOffset; |
2719 | 0 | MultiXactOffset nextOffset; |
2720 | 0 | bool oldestOffsetKnown = false; |
2721 | 0 | bool prevOldestOffsetKnown; |
2722 | 0 | MultiXactOffset offsetStopLimit = 0; |
2723 | 0 | MultiXactOffset prevOffsetStopLimit; |
2724 | | |
2725 | | /* |
2726 | | * NB: Have to prevent concurrent truncation; we might otherwise try to |
2727 | | * look up an oldestMulti that's concurrently getting truncated away. |
2728 | | */ |
2729 | 0 | LWLockAcquire(MultiXactTruncationLock, LW_SHARED); |
2730 | | |
2731 | | /* Read relevant fields from shared memory. */ |
2732 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2733 | 0 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
2734 | 0 | nextMXact = MultiXactState->nextMXact; |
2735 | 0 | nextOffset = MultiXactState->nextOffset; |
2736 | 0 | prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
2737 | 0 | prevOldestOffset = MultiXactState->oldestOffset; |
2738 | 0 | prevOffsetStopLimit = MultiXactState->offsetStopLimit; |
2739 | 0 | Assert(MultiXactState->finishedStartup); |
2740 | 0 | LWLockRelease(MultiXactGenLock); |
2741 | | |
2742 | | /* |
2743 | | * Determine the offset of the oldest multixact. Normally, we can read |
2744 | | * the offset from the multixact itself, but there's an important special |
2745 | | * case: if there are no multixacts in existence at all, oldestMXact |
2746 | | * obviously can't point to one. It will instead point to the multixact |
2747 | | * ID that will be assigned the next time one is needed. |
2748 | | */ |
2749 | 0 | if (oldestMultiXactId == nextMXact) |
2750 | 0 | { |
2751 | | /* |
2752 | | * When the next multixact gets created, it will be stored at the next |
2753 | | * offset. |
2754 | | */ |
2755 | 0 | oldestOffset = nextOffset; |
2756 | 0 | oldestOffsetKnown = true; |
2757 | 0 | } |
2758 | 0 | else |
2759 | 0 | { |
2760 | | /* |
2761 | | * Figure out where the oldest existing multixact's offsets are |
2762 | | * stored. Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X, |
2763 | | * the supposedly-earliest multixact might not really exist. We are |
2764 | | * careful not to fail in that case. |
2765 | | */ |
2766 | 0 | oldestOffsetKnown = |
2767 | 0 | find_multixact_start(oldestMultiXactId, &oldestOffset); |
2768 | |
|
2769 | 0 | if (oldestOffsetKnown) |
2770 | 0 | ereport(DEBUG1, |
2771 | 0 | (errmsg_internal("oldest MultiXactId member is at offset %u", |
2772 | 0 | oldestOffset))); |
2773 | 0 | else |
2774 | 0 | ereport(LOG, |
2775 | 0 | (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", |
2776 | 0 | oldestMultiXactId))); |
2777 | 0 | } |
2778 | | |
2779 | 0 | LWLockRelease(MultiXactTruncationLock); |
2780 | | |
2781 | | /* |
2782 | | * If we can, compute limits (and install them in MultiXactState) to prevent |
2783 | | * overrun of old data in the members SLRU area. We can only do so if the |
2784 | | * oldest offset is known though. |
2785 | | */ |
2786 | 0 | if (oldestOffsetKnown) |
2787 | 0 | { |
2788 | | /* move back to start of the corresponding segment */ |
2789 | 0 | offsetStopLimit = oldestOffset - (oldestOffset % |
2790 | 0 | (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); |
2791 | | |
2792 | | /* always leave one segment before the wraparound point */ |
2793 | 0 | offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); |
2794 | |
|
2795 | 0 | if (!prevOldestOffsetKnown && !is_startup) |
2796 | 0 | ereport(LOG, |
2797 | 0 | (errmsg("MultiXact member wraparound protections are now enabled"))); |
2798 | | |
2799 | 0 | ereport(DEBUG1, |
2800 | 0 | (errmsg_internal("MultiXact member stop limit is now %u based on MultiXact %u", |
2801 | 0 | offsetStopLimit, oldestMultiXactId))); |
2802 | 0 | } |
2803 | 0 | else if (prevOldestOffsetKnown) |
2804 | 0 | { |
2805 | | /* |
2806 | | * If we failed to get the oldest offset this time, but we have a |
2807 | | * value from a previous pass through this function, use the old |
2808 | | * values rather than automatically forcing an emergency autovacuum |
2809 | | * cycle again. |
2810 | | */ |
2811 | 0 | oldestOffset = prevOldestOffset; |
2812 | 0 | oldestOffsetKnown = true; |
2813 | 0 | offsetStopLimit = prevOffsetStopLimit; |
2814 | 0 | } |
2815 | | |
2816 | | /* Install the computed values */ |
2817 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
2818 | 0 | MultiXactState->oldestOffset = oldestOffset; |
2819 | 0 | MultiXactState->oldestOffsetKnown = oldestOffsetKnown; |
2820 | 0 | MultiXactState->offsetStopLimit = offsetStopLimit; |
2821 | 0 | LWLockRelease(MultiXactGenLock); |
2822 | | |
2823 | | /* |
2824 | | * Do we need an emergency autovacuum? If we're not sure, assume yes. |
2825 | | */ |
2826 | 0 | return !oldestOffsetKnown || |
2827 | 0 | (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); |
2828 | 0 | } |
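A minimal sketch of the stop-limit arithmetic above (not part of multixact.c), using made-up, unrealistically small SLRU geometry so the rounding is easy to check by hand:

#include <stdint.h>
#include <stdio.h>

#define MEMBERS_PER_PAGE	8u	/* assumed */
#define PAGES_PER_SEGMENT	4u	/* assumed */
#define MEMBERS_PER_SEGMENT	(MEMBERS_PER_PAGE * PAGES_PER_SEGMENT)

int
main(void)
{
	uint32_t	oldestOffset = 77;	/* oldest still-needed member offset (assumed) */
	uint32_t	stopLimit;

	/* round down to the start of the segment holding the oldest offset */
	stopLimit = oldestOffset - oldestOffset % MEMBERS_PER_SEGMENT;

	/* back off one more whole segment of slack */
	stopLimit -= MEMBERS_PER_SEGMENT;

	/* 77 lies in the segment covering offsets 64..95, so the limit becomes 32 */
	printf("oldest=%u -> stop limit=%u\n",
		   (unsigned) oldestOffset, (unsigned) stopLimit);
	return 0;
}

Keeping a full segment of slack means member allocation always stops at a segment boundary one whole segment short of the oldest data that is still needed.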
2829 | | |
2830 | | /* |
2831 | | * Return whether adding "distance" to "start" would move past "boundary". |
2832 | | * |
2833 | | * We use this to determine whether the addition is "wrapping around" the |
2834 | | * boundary point, hence the name. The reason we don't want to use the regular |
2835 | | * 2^31-modulo arithmetic here is that we want to be able to use the whole of |
2836 | | * the 2^32-1 space, allowing for more multixacts than would fit |
2837 | | * otherwise. |
2838 | | */ |
2839 | | static bool |
2840 | | MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, |
2841 | | uint32 distance) |
2842 | 0 | { |
2843 | 0 | MultiXactOffset finish; |
2844 | | |
2845 | | /* |
2846 | | * Note that offset number 0 is not used (see GetMultiXactIdMembers), so |
2847 | | * if the addition wraps around the UINT_MAX boundary, skip that value. |
2848 | | */ |
2849 | 0 | finish = start + distance; |
2850 | 0 | if (finish < start) |
2851 | 0 | finish++; |
2852 | | |
2853 | | /*----------------------------------------------------------------------- |
2854 | | * When the boundary is numerically greater than the starting point, any |
2855 | | * value numerically between the two is not wrapped: |
2856 | | * |
2857 | | * <----S----B----> |
2858 | | * [---) = F wrapped past B (and UINT_MAX) |
2859 | | * [---) = F not wrapped |
2860 | | * [----] = F wrapped past B |
2861 | | * |
2862 | | * When the boundary is numerically less than the starting point (i.e. the |
2863 | | * UINT_MAX wraparound occurs somewhere in between) then all values in |
2864 | | * between are wrapped: |
2865 | | * |
2866 | | * <----B----S----> |
2867 | | * [---) = F not wrapped past B (but wrapped past UINT_MAX) |
2868 | | * [---) = F wrapped past B (and UINT_MAX) |
2869 | | * [----] = F not wrapped |
2870 | | *----------------------------------------------------------------------- |
2871 | | */ |
2872 | 0 | if (start < boundary) |
2873 | 0 | return finish >= boundary || finish < start; |
2874 | 0 | else |
2875 | 0 | return finish >= boundary && finish < start; |
2876 | 0 | } |
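To make the diagram above concrete, here is a standalone copy of the predicate (not part of multixact.c) exercised with a few arbitrary sample values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
would_wrap(uint32_t boundary, uint32_t start, uint32_t distance)
{
	uint32_t	finish = start + distance;

	/* offset 0 is never used, so skip it if the addition wraps UINT32_MAX */
	if (finish < start)
		finish++;

	if (start < boundary)
		return finish >= boundary || finish < start;
	else
		return finish >= boundary && finish < start;
}

int
main(void)
{
	printf("%d\n", would_wrap(1000, 100, 500));			/* 0: stays short of the boundary */
	printf("%d\n", would_wrap(1000, 100, 950));			/* 1: crosses the boundary */
	printf("%d\n", would_wrap(50, 4294967000u, 300));	/* 0: crosses UINT32_MAX only */
	printf("%d\n", would_wrap(50, 4294967000u, 500));	/* 1: crosses UINT32_MAX and the boundary */
	return 0;
}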
2877 | | |
2878 | | /* |
2879 | | * Find the starting offset of the given MultiXactId. |
2880 | | * |
2881 | | * Returns false if the file containing the multi does not exist on disk. |
2882 | | * Otherwise, returns true and sets *result to the starting member offset. |
2883 | | * |
2884 | | * This function does not prevent concurrent truncation, so if that's |
2885 | | * required, the caller has to protect against that. |
2886 | | */ |
2887 | | static bool |
2888 | | find_multixact_start(MultiXactId multi, MultiXactOffset *result) |
2889 | 0 | { |
2890 | 0 | MultiXactOffset offset; |
2891 | 0 | int64 pageno; |
2892 | 0 | int entryno; |
2893 | 0 | int slotno; |
2894 | 0 | MultiXactOffset *offptr; |
2895 | |
|
2896 | 0 | Assert(MultiXactState->finishedStartup); |
2897 | |
|
2898 | 0 | pageno = MultiXactIdToOffsetPage(multi); |
2899 | 0 | entryno = MultiXactIdToOffsetEntry(multi); |
2900 | | |
2901 | | /* |
2902 | | * Write out dirty data, so PhysicalPageExists can work correctly. |
2903 | | */ |
2904 | 0 | SimpleLruWriteAll(MultiXactOffsetCtl, true); |
2905 | 0 | SimpleLruWriteAll(MultiXactMemberCtl, true); |
2906 | |
|
2907 | 0 | if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) |
2908 | 0 | return false; |
2909 | | |
2910 | | /* lock is acquired by SimpleLruReadPage_ReadOnly */ |
2911 | 0 | slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); |
2912 | 0 | offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; |
2913 | 0 | offptr += entryno; |
2914 | 0 | offset = *offptr; |
2915 | 0 | LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno)); |
2916 | |
|
2917 | 0 | *result = offset; |
2918 | 0 | return true; |
2919 | 0 | } |
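For reference, MultiXactIdToOffsetPage/MultiXactIdToOffsetEntry reduce to simple division and remainder over the page capacity; a toy sketch (not part of multixact.c, with an assumed capacity) of the indexing used above:

#include <stdint.h>
#include <stdio.h>

#define OFFSETS_PER_PAGE 2048u	/* assumed; the real value depends on BLCKSZ */

int
main(void)
{
	uint32_t	multi = 100000;	/* some arbitrary MultiXactId */

	/* each offsets page is just an array of MultiXactOffset values */
	printf("multi %u -> page %u, entry %u\n",
		   (unsigned) multi,
		   (unsigned) (multi / OFFSETS_PER_PAGE),
		   (unsigned) (multi % OFFSETS_PER_PAGE));
	return 0;
}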
2920 | | |
2921 | | /* |
2922 | | * Determine how many multixacts, and how many multixact members, currently |
2923 | | * exist. Return false if unable to determine. |
2924 | | */ |
2925 | | static bool |
2926 | | ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) |
2927 | 0 | { |
2928 | 0 | MultiXactOffset nextOffset; |
2929 | 0 | MultiXactOffset oldestOffset; |
2930 | 0 | MultiXactId oldestMultiXactId; |
2931 | 0 | MultiXactId nextMultiXactId; |
2932 | 0 | bool oldestOffsetKnown; |
2933 | |
|
2934 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
2935 | 0 | nextOffset = MultiXactState->nextOffset; |
2936 | 0 | oldestMultiXactId = MultiXactState->oldestMultiXactId; |
2937 | 0 | nextMultiXactId = MultiXactState->nextMXact; |
2938 | 0 | oldestOffset = MultiXactState->oldestOffset; |
2939 | 0 | oldestOffsetKnown = MultiXactState->oldestOffsetKnown; |
2940 | 0 | LWLockRelease(MultiXactGenLock); |
2941 | |
|
2942 | 0 | if (!oldestOffsetKnown) |
2943 | 0 | return false; |
2944 | | |
2945 | 0 | *members = nextOffset - oldestOffset; |
2946 | 0 | *multixacts = nextMultiXactId - oldestMultiXactId; |
2947 | 0 | return true; |
2948 | 0 | } |
2949 | | |
2950 | | /* |
2951 | | * Multixact members can be removed once the multixacts that refer to them |
2952 | | * are older than every datminmxid. autovacuum_multixact_freeze_max_age and |
2953 | | * vacuum_multixact_freeze_table_age work together to make sure we never have |
2954 | | * too many multixacts; we hope that, at least under normal circumstances, |
2955 | | * this will also be sufficient to keep us from using too many offsets. |
2956 | | * However, if the average multixact has many members, we might exhaust the |
2957 | | * members space while still using few enough members that these limits fail |
2958 | | * to trigger relminmxid advancement by VACUUM. At that point, we'd have no |
2959 | | * choice but to start failing multixact-creating operations with an error. |
2960 | | * |
2961 | | * To prevent that, if more than a threshold portion of the members space is |
2962 | | * used, we effectively reduce autovacuum_multixact_freeze_max_age |
2963 | | * to a value just less than the number of multixacts in use. We hope that |
2964 | | * this will quickly trigger autovacuuming on the table or tables with the |
2965 | | * oldest relminmxid, thus allowing datminmxid values to advance and removing |
2966 | | * some members. |
2967 | | * |
2968 | | * As the fraction of the member space currently in use grows, we become |
2969 | | * more aggressive in clamping this value. That not only causes autovacuum |
2970 | | * to ramp up, but also makes any manual vacuums the user issues more |
2971 | | * aggressive. This happens because vacuum_get_cutoffs() will clamp the |
2972 | | * freeze table and the minimum freeze age cutoffs based on the effective |
2973 | | * autovacuum_multixact_freeze_max_age this function returns. In the worst |
2974 | | * case, we'll clamp the freeze_max_age to zero, and every vacuum of any |
2975 | | * table will freeze every multixact. |
2976 | | */ |
2977 | | int |
2978 | | MultiXactMemberFreezeThreshold(void) |
2979 | 0 | { |
2980 | 0 | MultiXactOffset members; |
2981 | 0 | uint32 multixacts; |
2982 | 0 | uint32 victim_multixacts; |
2983 | 0 | double fraction; |
2984 | 0 | int result; |
2985 | | |
2986 | | /* If we can't determine member space utilization, assume the worst. */ |
2987 | 0 | if (!ReadMultiXactCounts(&multixacts, &members)) |
2988 | 0 | return 0; |
2989 | | |
2990 | | /* If member space utilization is low, no special action is required. */ |
2991 | 0 | if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) |
2992 | 0 | return autovacuum_multixact_freeze_max_age; |
2993 | | |
2994 | | /* |
2995 | | * Compute a target for relminmxid advancement. The number of multixacts |
2996 | | * we try to eliminate from the system is based on how far we are past |
2997 | | * MULTIXACT_MEMBER_SAFE_THRESHOLD. |
2998 | | */ |
2999 | 0 | fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / |
3000 | 0 | (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); |
3001 | 0 | victim_multixacts = multixacts * fraction; |
3002 | | |
3003 | | /* fraction could be > 1.0, but lowest possible freeze age is zero */ |
3004 | 0 | if (victim_multixacts > multixacts) |
3005 | 0 | return 0; |
3006 | 0 | result = multixacts - victim_multixacts; |
3007 | | |
3008 | | /* |
3009 | | * Clamp to autovacuum_multixact_freeze_max_age, so that we never make |
3010 | | * autovacuum less aggressive than it would otherwise be. |
3011 | | */ |
3012 | 0 | return Min(result, autovacuum_multixact_freeze_max_age); |
3013 | 0 | } |
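A self-contained sketch of the clamping arithmetic above (not part of multixact.c); the thresholds and the freeze-age setting are assumed small values purely so the results are easy to verify:

#include <stdint.h>
#include <stdio.h>

#define SAFE_THRESHOLD		1000u	/* assumed; really about half the offset space */
#define DANGER_THRESHOLD	2000u	/* assumed; really about 3/4 of the offset space */
#define FREEZE_MAX_AGE		400		/* stands in for autovacuum_multixact_freeze_max_age */

static int
effective_freeze_max_age(uint32_t multixacts, uint32_t members)
{
	double		fraction;
	uint32_t	victims;
	uint32_t	result;

	if (members <= SAFE_THRESHOLD)
		return FREEZE_MAX_AGE;

	/* how far past the safe threshold are we, as a fraction of the danger zone? */
	fraction = (double) (members - SAFE_THRESHOLD) /
		(DANGER_THRESHOLD - SAFE_THRESHOLD);
	victims = (uint32_t) (multixacts * fraction);

	if (victims > multixacts)
		return 0;
	result = multixacts - victims;

	/* never return a larger (less aggressive) value than the setting itself */
	return result < FREEZE_MAX_AGE ? (int) result : FREEZE_MAX_AGE;
}

int
main(void)
{
	printf("%d\n", effective_freeze_max_age(500, 800));	/* 400: below the safe threshold */
	printf("%d\n", effective_freeze_max_age(500, 1500));	/* 250: half the multixacts become victims */
	printf("%d\n", effective_freeze_max_age(500, 2200));	/* 0: past the danger threshold */
	return 0;
}

Past the assumed safe threshold the effective freeze age shrinks linearly, reaching zero at the danger threshold.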
3014 | | |
3015 | | typedef struct mxtruncinfo |
3016 | | { |
3017 | | int64 earliestExistingPage; |
3018 | | } mxtruncinfo; |
3019 | | |
3020 | | /* |
3021 | | * SlruScanDirectory callback |
3022 | | * This callback determines the earliest existing page number. |
3023 | | */ |
3024 | | static bool |
3025 | | SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int64 segpage, void *data) |
3026 | 0 | { |
3027 | 0 | mxtruncinfo *trunc = (mxtruncinfo *) data; |
3028 | |
|
3029 | 0 | if (trunc->earliestExistingPage == -1 || |
3030 | 0 | ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) |
3031 | 0 | { |
3032 | 0 | trunc->earliestExistingPage = segpage; |
3033 | 0 | } |
3034 | |
|
3035 | 0 | return false; /* keep going */ |
3036 | 0 | } |
3037 | | |
3038 | | |
3039 | | /* |
3040 | | * Delete members segments [oldest, newOldest) |
3041 | | * |
3042 | | * The members SLRU can, in contrast to the offsets one, be filled to almost |
3043 | | * the full range at once. This means SimpleLruTruncate() can't trivially be |
3044 | | * used - instead the to-be-deleted range is computed using the offsets |
3045 | | * SLRU. C.f. TruncateMultiXact(). |
3046 | | */ |
3047 | | static void |
3048 | | PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) |
3049 | 0 | { |
3050 | 0 | const int64 maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); |
3051 | 0 | int64 startsegment = MXOffsetToMemberSegment(oldestOffset); |
3052 | 0 | int64 endsegment = MXOffsetToMemberSegment(newOldestOffset); |
3053 | 0 | int64 segment = startsegment; |
3054 | | |
3055 | | /* |
3056 | | * Delete all the segments but the last one. The last segment can still |
3057 | | * contain, possibly partially, valid data. |
3058 | | */ |
3059 | 0 | while (segment != endsegment) |
3060 | 0 | { |
3061 | 0 | elog(DEBUG2, "truncating multixact members segment %" PRIx64, |
3062 | 0 | segment); |
3063 | 0 | SlruDeleteSegment(MultiXactMemberCtl, segment); |
3064 | | |
3065 | | /* move to next segment, handling wraparound correctly */ |
3066 | 0 | if (segment == maxsegment) |
3067 | 0 | segment = 0; |
3068 | 0 | else |
3069 | 0 | segment += 1; |
3070 | 0 | } |
3071 | 0 | } |
3072 | | |
3073 | | /* |
3074 | | * Delete offsets segments [oldest, newOldest) |
3075 | | */ |
3076 | | static void |
3077 | | PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) |
3078 | 0 | { |
3079 | | /* |
3080 | | * We step back one multixact to avoid passing a cutoff page that hasn't |
3081 | | * been created yet in the rare case that oldestMulti would be the first |
3082 | | * item on a page and oldestMulti == nextMulti. In that case, if we |
3083 | | * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound |
3084 | | * detection. |
3085 | | */ |
3086 | 0 | SimpleLruTruncate(MultiXactOffsetCtl, |
3087 | 0 | MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); |
3088 | 0 | } |
3089 | | |
3090 | | /* |
3091 | | * Remove all MultiXactOffset and MultiXactMember segments before the oldest |
3092 | | * ones still of interest. |
3093 | | * |
3094 | | * This is only called on a primary as part of vacuum (via |
3095 | | * vac_truncate_clog()). During recovery truncation is done by replaying |
3096 | | * truncation WAL records logged here. |
3097 | | * |
3098 | | * newOldestMulti is the oldest currently required multixact, newOldestMultiDB |
3099 | | * is one of the databases preventing newOldestMulti from increasing. |
3100 | | */ |
3101 | | void |
3102 | | TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) |
3103 | 0 | { |
3104 | 0 | MultiXactId oldestMulti; |
3105 | 0 | MultiXactId nextMulti; |
3106 | 0 | MultiXactOffset newOldestOffset; |
3107 | 0 | MultiXactOffset oldestOffset; |
3108 | 0 | MultiXactOffset nextOffset; |
3109 | 0 | mxtruncinfo trunc; |
3110 | 0 | MultiXactId earliest; |
3111 | |
|
3112 | 0 | Assert(!RecoveryInProgress()); |
3113 | 0 | Assert(MultiXactState->finishedStartup); |
3114 | | |
3115 | | /* |
3116 | | * We can only allow one truncation to happen at once. Otherwise parts of |
3117 | | * members might vanish while we're doing lookups or similar. There's no |
3118 | | * need to have an interlock with creating new multis or such, since those |
3119 | | * are constrained by the limits (which only grow, never shrink). |
3120 | | */ |
3121 | 0 | LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); |
3122 | |
|
3123 | 0 | LWLockAcquire(MultiXactGenLock, LW_SHARED); |
3124 | 0 | nextMulti = MultiXactState->nextMXact; |
3125 | 0 | nextOffset = MultiXactState->nextOffset; |
3126 | 0 | oldestMulti = MultiXactState->oldestMultiXactId; |
3127 | 0 | LWLockRelease(MultiXactGenLock); |
3128 | 0 | Assert(MultiXactIdIsValid(oldestMulti)); |
3129 | | |
3130 | | /* |
3131 | | * Make sure to only attempt truncation if there are values to truncate |
3132 | | * away. In normal processing values shouldn't go backwards, but there's |
3133 | | * some corner cases (due to bugs) where that's possible. |
3134 | | */ |
3135 | 0 | if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) |
3136 | 0 | { |
3137 | 0 | LWLockRelease(MultiXactTruncationLock); |
3138 | 0 | return; |
3139 | 0 | } |
3140 | | |
3141 | | /* |
3142 | | * Note we can't just plow ahead with the truncation; it's possible that |
3143 | | * there are no segments to truncate, which is a problem because we are |
3144 | | * going to attempt to read the offsets page to determine where to |
3145 | | * truncate the members SLRU. So we first scan the directory to determine |
3146 | | * the earliest offsets page number that we can read without error. |
3147 | | * |
3148 | | * When nextMXact is less than one segment away from multiWrapLimit, |
3149 | | * SlruScanDirCbFindEarliest can find some early segment other than the |
3150 | | * actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST) |
3151 | | * returns false, because not all pairs of entries have the same answer.) |
3152 | | * That can also arise when an earlier truncation attempt failed unlink() |
3153 | | * or returned early from this function. The only consequence is |
3154 | | * returning early, which wastes space that we could have liberated. |
3155 | | * |
3156 | | * NB: It's also possible that the page that oldestMulti is on has already |
3157 | | * been truncated away, and we crashed before updating oldestMulti. |
3158 | | */ |
3159 | 0 | trunc.earliestExistingPage = -1; |
3160 | 0 | SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); |
3161 | 0 | earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; |
3162 | 0 | if (earliest < FirstMultiXactId) |
3163 | 0 | earliest = FirstMultiXactId; |
3164 | | |
3165 | | /* If there's nothing to remove, we can bail out early. */ |
3166 | 0 | if (MultiXactIdPrecedes(oldestMulti, earliest)) |
3167 | 0 | { |
3168 | 0 | LWLockRelease(MultiXactTruncationLock); |
3169 | 0 | return; |
3170 | 0 | } |
3171 | | |
3172 | | /* |
3173 | | * First, compute the safe truncation point for MultiXactMember. This is |
3174 | | * the starting offset of the oldest multixact. |
3175 | | * |
3176 | | * Hopefully, find_multixact_start will always work here, because we've |
3177 | | * already checked that it doesn't precede the earliest MultiXact on disk. |
3178 | | * But if it fails, don't truncate anything, and log a message. |
3179 | | */ |
3180 | 0 | if (oldestMulti == nextMulti) |
3181 | 0 | { |
3182 | | /* there are NO MultiXacts */ |
3183 | 0 | oldestOffset = nextOffset; |
3184 | 0 | } |
3185 | 0 | else if (!find_multixact_start(oldestMulti, &oldestOffset)) |
3186 | 0 | { |
3187 | 0 | ereport(LOG, |
3188 | 0 | (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", |
3189 | 0 | oldestMulti, earliest))); |
3190 | 0 | LWLockRelease(MultiXactTruncationLock); |
3191 | 0 | return; |
3192 | 0 | } |
3193 | | |
3194 | | /* |
3195 | | * Second, compute up to where to truncate: look up the corresponding |
3196 | | * member offset for newOldestMulti. |
3197 | | */ |
3198 | 0 | if (newOldestMulti == nextMulti) |
3199 | 0 | { |
3200 | | /* there are NO MultiXacts */ |
3201 | 0 | newOldestOffset = nextOffset; |
3202 | 0 | } |
3203 | 0 | else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) |
3204 | 0 | { |
3205 | 0 | ereport(LOG, |
3206 | 0 | (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", |
3207 | 0 | newOldestMulti))); |
3208 | 0 | LWLockRelease(MultiXactTruncationLock); |
3209 | 0 | return; |
3210 | 0 | } |
3211 | | |
3212 | 0 | elog(DEBUG1, "performing multixact truncation: " |
3213 | 0 | "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " |
3214 | 0 | "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", |
3215 | 0 | oldestMulti, newOldestMulti, |
3216 | 0 | MultiXactIdToOffsetSegment(oldestMulti), |
3217 | 0 | MultiXactIdToOffsetSegment(newOldestMulti), |
3218 | 0 | oldestOffset, newOldestOffset, |
3219 | 0 | MXOffsetToMemberSegment(oldestOffset), |
3220 | 0 | MXOffsetToMemberSegment(newOldestOffset)); |
3221 | | |
3222 | | /* |
3223 | | * Do truncation, and the WAL logging of the truncation, in a critical |
3224 | | * section. That way offsets/members cannot get out of sync anymore, i.e. |
3225 | | * once consistent the newOldestMulti will always exist in members, even |
3226 | | * if we crashed in the wrong moment. |
3227 | | */ |
3228 | 0 | START_CRIT_SECTION(); |
3229 | | |
3230 | | /* |
3231 | | * Prevent checkpoints from being scheduled concurrently. This is critical |
3232 | | * because otherwise a truncation record might not be replayed after a |
3233 | | * crash/basebackup, even though the state of the data directory would |
3234 | | * require it. |
3235 | | */ |
3236 | 0 | Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); |
3237 | 0 | MyProc->delayChkptFlags |= DELAY_CHKPT_START; |
3238 | | |
3239 | | /* WAL log truncation */ |
3240 | 0 | WriteMTruncateXlogRec(newOldestMultiDB, |
3241 | 0 | oldestMulti, newOldestMulti, |
3242 | 0 | oldestOffset, newOldestOffset); |
3243 | | |
3244 | | /* |
3245 | | * Update in-memory limits before performing the truncation, while inside |
3246 | | * the critical section: Have to do it before truncation, to prevent |
3247 | | * concurrent lookups of those values. Has to be inside the critical |
3248 | | * section as otherwise a future call to this function would error out, |
3249 | | * while looking up the oldest member in offsets, if our caller crashes |
3250 | | * before updating the limits. |
3251 | | */ |
3252 | 0 | LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); |
3253 | 0 | MultiXactState->oldestMultiXactId = newOldestMulti; |
3254 | 0 | MultiXactState->oldestMultiXactDB = newOldestMultiDB; |
3255 | 0 | LWLockRelease(MultiXactGenLock); |
3256 | | |
3257 | | /* First truncate members */ |
3258 | 0 | PerformMembersTruncation(oldestOffset, newOldestOffset); |
3259 | | |
3260 | | /* Then offsets */ |
3261 | 0 | PerformOffsetsTruncation(oldestMulti, newOldestMulti); |
3262 | |
|
3263 | 0 | MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; |
3264 | |
|
3265 | 0 | END_CRIT_SECTION(); |
3266 | 0 | LWLockRelease(MultiXactTruncationLock); |
3267 | 0 | } |
3268 | | |
3269 | | /* |
3270 | | * Decide whether a MultiXactOffset page number is "older" for truncation |
3271 | | * purposes. Analogous to CLOGPagePrecedes(). |
3272 | | * |
3273 | | * Offsetting the values is optional, because MultiXactIdPrecedes() has |
3274 | | * translational symmetry. |
3275 | | */ |
3276 | | static bool |
3277 | | MultiXactOffsetPagePrecedes(int64 page1, int64 page2) |
3278 | 0 | { |
3279 | 0 | MultiXactId multi1; |
3280 | 0 | MultiXactId multi2; |
3281 | |
|
3282 | 0 | multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE; |
3283 | 0 | multi1 += FirstMultiXactId + 1; |
3284 | 0 | multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE; |
3285 | 0 | multi2 += FirstMultiXactId + 1; |
3286 | |
|
3287 | 0 | return (MultiXactIdPrecedes(multi1, multi2) && |
3288 | 0 | MultiXactIdPrecedes(multi1, |
3289 | 0 | multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1)); |
3290 | 0 | } |
3291 | | |
3292 | | /* |
3293 | | * Decide whether a MultiXactMember page number is "older" for truncation |
3294 | | * purposes. There is no "invalid offset number" so use the numbers verbatim. |
3295 | | */ |
3296 | | static bool |
3297 | | MultiXactMemberPagePrecedes(int64 page1, int64 page2) |
3298 | 0 | { |
3299 | 0 | MultiXactOffset offset1; |
3300 | 0 | MultiXactOffset offset2; |
3301 | |
|
3302 | 0 | offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; |
3303 | 0 | offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; |
3304 | |
|
3305 | 0 | return (MultiXactOffsetPrecedes(offset1, offset2) && |
3306 | 0 | MultiXactOffsetPrecedes(offset1, |
3307 | 0 | offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); |
3308 | 0 | } |
3309 | | |
3310 | | /* |
3311 | | * Decide which of two MultiXactIds is earlier. |
3312 | | * |
3313 | | * XXX do we need to do something special for InvalidMultiXactId? |
3314 | | * (Doesn't look like it.) |
3315 | | */ |
3316 | | bool |
3317 | | MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2) |
3318 | 0 | { |
3319 | 0 | int32 diff = (int32) (multi1 - multi2); |
3320 | |
|
3321 | 0 | return (diff < 0); |
3322 | 0 | } |
3323 | | |
3324 | | /* |
3325 | | * MultiXactIdPrecedesOrEquals -- is multi1 logically <= multi2? |
3326 | | * |
3327 | | * XXX do we need to do something special for InvalidMultiXactId? |
3328 | | * (Doesn't look like it.) |
3329 | | */ |
3330 | | bool |
3331 | | MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2) |
3332 | 0 | { |
3333 | 0 | int32 diff = (int32) (multi1 - multi2); |
3334 | |
|
3335 | 0 | return (diff <= 0); |
3336 | 0 | } |
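Both predicates above rely on the standard circular-comparison trick: subtract as unsigned and reinterpret the 32-bit difference as signed, so an ID that is fewer than 2^31 steps "behind" compares as earlier even across wraparound. A standalone sketch (not part of multixact.c):

#include <stdint.h>
#include <stdio.h>

static int
precedes(uint32_t a, uint32_t b)
{
	/* two's-complement reinterpretation of the unsigned difference */
	return (int32_t) (a - b) < 0;
}

int
main(void)
{
	printf("%d\n", precedes(100, 200));			/* 1 */
	printf("%d\n", precedes(200, 100));			/* 0 */
	/* 4294967290 counts as "just before" 10 once the counter has wrapped */
	printf("%d\n", precedes(4294967290u, 10));	/* 1 */
	return 0;
}

MultiXactOffsetPrecedes below uses the same pattern for member offsets.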
3337 | | |
3338 | | |
3339 | | /* |
3340 | | * Decide which of two offsets is earlier. |
3341 | | */ |
3342 | | static bool |
3343 | | MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) |
3344 | 0 | { |
3345 | 0 | int32 diff = (int32) (offset1 - offset2); |
3346 | |
|
3347 | 0 | return (diff < 0); |
3348 | 0 | } |
3349 | | |
3350 | | /* |
3351 | | * Write an xlog record reflecting the zeroing of either a MEMBERs or |
3352 | | * OFFSETs page (info shows which) |
3353 | | */ |
3354 | | static void |
3355 | | WriteMZeroPageXlogRec(int64 pageno, uint8 info) |
3356 | 0 | { |
3357 | 0 | XLogBeginInsert(); |
3358 | 0 | XLogRegisterData(&pageno, sizeof(pageno)); |
3359 | 0 | (void) XLogInsert(RM_MULTIXACT_ID, info); |
3360 | 0 | } |
3361 | | |
3362 | | /* |
3363 | | * Write a TRUNCATE xlog record |
3364 | | * |
3365 | | * We must flush the xlog record to disk before returning --- see notes in |
3366 | | * TruncateCLOG(). |
3367 | | */ |
3368 | | static void |
3369 | | WriteMTruncateXlogRec(Oid oldestMultiDB, |
3370 | | MultiXactId startTruncOff, MultiXactId endTruncOff, |
3371 | | MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) |
3372 | 0 | { |
3373 | 0 | XLogRecPtr recptr; |
3374 | 0 | xl_multixact_truncate xlrec; |
3375 | |
|
3376 | 0 | xlrec.oldestMultiDB = oldestMultiDB; |
3377 | |
|
3378 | 0 | xlrec.startTruncOff = startTruncOff; |
3379 | 0 | xlrec.endTruncOff = endTruncOff; |
3380 | |
|
3381 | 0 | xlrec.startTruncMemb = startTruncMemb; |
3382 | 0 | xlrec.endTruncMemb = endTruncMemb; |
3383 | |
|
3384 | 0 | XLogBeginInsert(); |
3385 | 0 | XLogRegisterData(&xlrec, SizeOfMultiXactTruncate); |
3386 | 0 | recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); |
3387 | 0 | XLogFlush(recptr); |
3388 | 0 | } |
3389 | | |
3390 | | /* |
3391 | | * MULTIXACT resource manager's routines |
3392 | | */ |
3393 | | void |
3394 | | multixact_redo(XLogReaderState *record) |
3395 | 0 | { |
3396 | 0 | uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
3397 | | |
3398 | | /* Backup blocks are not used in multixact records */ |
3399 | 0 | Assert(!XLogRecHasAnyBlockRefs(record)); |
3400 | |
|
3401 | 0 | if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) |
3402 | 0 | { |
3403 | 0 | int64 pageno; |
3404 | 0 | int slotno; |
3405 | 0 | LWLock *lock; |
3406 | |
|
3407 | 0 | memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); |
3408 | |
|
3409 | 0 | lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); |
3410 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
3411 | |
|
3412 | 0 | slotno = ZeroMultiXactOffsetPage(pageno, false); |
3413 | 0 | SimpleLruWritePage(MultiXactOffsetCtl, slotno); |
3414 | 0 | Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); |
3415 | |
|
3416 | 0 | LWLockRelease(lock); |
3417 | 0 | } |
3418 | 0 | else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) |
3419 | 0 | { |
3420 | 0 | int64 pageno; |
3421 | 0 | int slotno; |
3422 | 0 | LWLock *lock; |
3423 | |
|
3424 | 0 | memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); |
3425 | |
|
3426 | 0 | lock = SimpleLruGetBankLock(MultiXactMemberCtl, pageno); |
3427 | 0 | LWLockAcquire(lock, LW_EXCLUSIVE); |
3428 | |
|
3429 | 0 | slotno = ZeroMultiXactMemberPage(pageno, false); |
3430 | 0 | SimpleLruWritePage(MultiXactMemberCtl, slotno); |
3431 | 0 | Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); |
3432 | |
|
3433 | 0 | LWLockRelease(lock); |
3434 | 0 | } |
3435 | 0 | else if (info == XLOG_MULTIXACT_CREATE_ID) |
3436 | 0 | { |
3437 | 0 | xl_multixact_create *xlrec = |
3438 | 0 | (xl_multixact_create *) XLogRecGetData(record); |
3439 | 0 | TransactionId max_xid; |
3440 | 0 | int i; |
3441 | | |
3442 | | /* Store the data back into the SLRU files */ |
3443 | 0 | RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, |
3444 | 0 | xlrec->members); |
3445 | | |
3446 | | /* Make sure nextMXact/nextOffset are beyond what this record has */ |
3447 | 0 | MultiXactAdvanceNextMXact(xlrec->mid + 1, |
3448 | 0 | xlrec->moff + xlrec->nmembers); |
3449 | | |
3450 | | /* |
3451 | | * Make sure nextXid is beyond any XID mentioned in the record. This |
3452 | | * should be unnecessary, since any XID found here ought to have other |
3453 | | * evidence in the XLOG, but let's be safe. |
3454 | | */ |
3455 | 0 | max_xid = XLogRecGetXid(record); |
3456 | 0 | for (i = 0; i < xlrec->nmembers; i++) |
3457 | 0 | { |
3458 | 0 | if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid)) |
3459 | 0 | max_xid = xlrec->members[i].xid; |
3460 | 0 | } |
3461 | |
|
3462 | 0 | AdvanceNextFullTransactionIdPastXid(max_xid); |
3463 | 0 | } |
3464 | 0 | else if (info == XLOG_MULTIXACT_TRUNCATE_ID) |
3465 | 0 | { |
3466 | 0 | xl_multixact_truncate xlrec; |
3467 | 0 | int64 pageno; |
3468 | |
|
3469 | 0 | memcpy(&xlrec, XLogRecGetData(record), |
3470 | 0 | SizeOfMultiXactTruncate); |
3471 | |
|
3472 | 0 | elog(DEBUG1, "replaying multixact truncation: " |
3473 | 0 | "offsets [%u, %u), offsets segments [%" PRIx64 ", %" PRIx64 "), " |
3474 | 0 | "members [%u, %u), members segments [%" PRIx64 ", %" PRIx64 ")", |
3475 | 0 | xlrec.startTruncOff, xlrec.endTruncOff, |
3476 | 0 | MultiXactIdToOffsetSegment(xlrec.startTruncOff), |
3477 | 0 | MultiXactIdToOffsetSegment(xlrec.endTruncOff), |
3478 | 0 | xlrec.startTruncMemb, xlrec.endTruncMemb, |
3479 | 0 | MXOffsetToMemberSegment(xlrec.startTruncMemb), |
3480 | 0 | MXOffsetToMemberSegment(xlrec.endTruncMemb)); |
3481 | | |
3482 | | /* should not be required, but more than cheap enough */ |
3483 | 0 | LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); |
3484 | | |
3485 | | /* |
3486 | | * Advance the horizon values, so they're current at the end of |
3487 | | * recovery. |
3488 | | */ |
3489 | 0 | SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false); |
3490 | |
|
3491 | 0 | PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); |
3492 | | |
3493 | | /* |
3494 | | * During XLOG replay, latest_page_number isn't necessarily set up |
3495 | | * yet; insert a suitable value to bypass the sanity test in |
3496 | | * SimpleLruTruncate. |
3497 | | */ |
3498 | 0 | pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); |
3499 | 0 | pg_atomic_write_u64(&MultiXactOffsetCtl->shared->latest_page_number, |
3500 | 0 | pageno); |
3501 | 0 | PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); |
3502 | |
|
3503 | 0 | LWLockRelease(MultiXactTruncationLock); |
3504 | 0 | } |
3505 | 0 | else |
3506 | 0 | elog(PANIC, "multixact_redo: unknown op code %u", info); |
3507 | 0 | } |
3508 | | |
3509 | | Datum |
3510 | | pg_get_multixact_members(PG_FUNCTION_ARGS) |
3511 | 0 | { |
3512 | 0 | typedef struct |
3513 | 0 | { |
3514 | 0 | MultiXactMember *members; |
3515 | 0 | int nmembers; |
3516 | 0 | int iter; |
3517 | 0 | } mxact; |
3518 | 0 | MultiXactId mxid = PG_GETARG_TRANSACTIONID(0); |
3519 | 0 | mxact *multi; |
3520 | 0 | FuncCallContext *funccxt; |
3521 | |
|
3522 | 0 | if (mxid < FirstMultiXactId) |
3523 | 0 | ereport(ERROR, |
3524 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
3525 | 0 | errmsg("invalid MultiXactId: %u", mxid))); |
3526 | | |
3527 | 0 | if (SRF_IS_FIRSTCALL()) |
3528 | 0 | { |
3529 | 0 | MemoryContext oldcxt; |
3530 | 0 | TupleDesc tupdesc; |
3531 | |
|
3532 | 0 | funccxt = SRF_FIRSTCALL_INIT(); |
3533 | 0 | oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx); |
3534 | |
|
3535 | 0 | multi = palloc(sizeof(mxact)); |
3536 | | /* no need to allow for old values here */ |
3537 | 0 | multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false, |
3538 | 0 | false); |
3539 | 0 | multi->iter = 0; |
3540 | |
|
3541 | 0 | if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) |
3542 | 0 | elog(ERROR, "return type must be a row type"); |
3543 | 0 | funccxt->tuple_desc = tupdesc; |
3544 | 0 | funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc); |
3545 | 0 | funccxt->user_fctx = multi; |
3546 | |
|
3547 | 0 | MemoryContextSwitchTo(oldcxt); |
3548 | 0 | } |
3549 | | |
3550 | 0 | funccxt = SRF_PERCALL_SETUP(); |
3551 | 0 | multi = (mxact *) funccxt->user_fctx; |
3552 | |
|
3553 | 0 | while (multi->iter < multi->nmembers) |
3554 | 0 | { |
3555 | 0 | HeapTuple tuple; |
3556 | 0 | char *values[2]; |
3557 | |
|
3558 | 0 | values[0] = psprintf("%u", multi->members[multi->iter].xid); |
3559 | 0 | values[1] = mxstatus_to_string(multi->members[multi->iter].status); |
3560 | |
|
3561 | 0 | tuple = BuildTupleFromCStrings(funccxt->attinmeta, values); |
3562 | |
|
3563 | 0 | multi->iter++; |
3564 | 0 | pfree(values[0]); |
3565 | 0 | SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple)); |
3566 | 0 | } |
3567 | | |
3568 | 0 | SRF_RETURN_DONE(funccxt); |
3569 | 0 | } |
3570 | | |
3571 | | /* |
3572 | | * Entrypoint for sync.c to sync offsets files. |
3573 | | */ |
3574 | | int |
3575 | | multixactoffsetssyncfiletag(const FileTag *ftag, char *path) |
3576 | 0 | { |
3577 | 0 | return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); |
3578 | 0 | } |
3579 | | |
3580 | | /* |
3581 | | * Entrypoint for sync.c to sync members files. |
3582 | | */ |
3583 | | int |
3584 | | multixactmemberssyncfiletag(const FileTag *ftag, char *path) |
3585 | 0 | { |
3586 | 0 | return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); |
3587 | 0 | } |