/src/samba/lib/tdb/common/transaction.c
Line | Count | Source |
1 | | /* |
2 | | Unix SMB/CIFS implementation. |
3 | | |
4 | | trivial database library |
5 | | |
6 | | Copyright (C) Andrew Tridgell 2005 |
7 | | |
8 | | ** NOTE! The following LGPL license applies to the tdb |
9 | | ** library. This does NOT imply that all of Samba is released |
10 | | ** under the LGPL |
11 | | |
12 | | This library is free software; you can redistribute it and/or |
13 | | modify it under the terms of the GNU Lesser General Public |
14 | | License as published by the Free Software Foundation; either |
15 | | version 3 of the License, or (at your option) any later version. |
16 | | |
17 | | This library is distributed in the hope that it will be useful, |
18 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
19 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
20 | | Lesser General Public License for more details. |
21 | | |
22 | | You should have received a copy of the GNU Lesser General Public |
23 | | License along with this library; if not, see <http://www.gnu.org/licenses/>. |
24 | | */ |
25 | | |
26 | | #include "tdb_private.h" |
27 | | |
28 | | /* |
29 | | transaction design: |
30 | | |
31 | | - only allow a single transaction at a time per database. This makes |
32 | | using the transaction API simpler, as otherwise the caller would |
33 | | have to cope with temporary failures in transactions that conflict |
34 | | with other current transactions |
35 | | |
36 | | - keep the transaction recovery information in the same file as the |
37 | | database, using a special 'transaction recovery' record pointed at |
38 | | by the header. This removes the need for extra journal files as |
39 | | used by some other databases |
40 | | |
41 | | - dynamically allocate the transaction recovery record, re-using it |
42 | | for subsequent transactions. If a larger record is needed then |
43 | | tdb_free() the old record to place it on the normal tdb freelist |
44 | | before allocating the new record |
45 | | |
46 | | - during transactions, keep a linked list of all writes that have |
47 | | been performed by intercepting all tdb_write() calls. The hooked |
48 | | transaction versions of tdb_read() and tdb_write() check this |
49 | | linked list and try to use the elements of the list in preference |
50 | | to the real database. |
51 | | |
52 | | - don't allow any locks to be held when a transaction starts, |
53 | | otherwise we can end up with deadlock (plus lack of lock nesting |
54 | | in posix locks would mean the lock is lost) |
55 | | |
56 | | - if the caller gains a lock during the transaction but doesn't |
57 | | release it then fail the commit |
58 | | |
59 | | - allow for nested calls to tdb_transaction_start(), re-using the |
60 | | existing transaction record. If the inner transaction is cancelled |
61 | | then a subsequent commit will fail |
62 | | |
63 | | - keep a mirrored copy of the tdb hash chain heads to allow for the |
64 | | fast hash heads scan on traverse, updating the mirrored copy in |
65 | | the transaction version of tdb_write |
66 | | |
67 | | - allow callers to mix transaction and non-transaction use of tdb, |
68 | | although once a transaction is started then an exclusive lock is |
69 | | gained until the transaction is committed or cancelled |
70 | | |
71 | | - the commit strategy involves first saving away all modified data |
72 | | into a linearised buffer in the transaction recovery area, then |
73 | | marking the transaction recovery area with a magic value to |
74 | | indicate a valid recovery record. In total 4 fsync/msync calls are |
75 | | needed per commit to prevent race conditions. It might be possible |
76 | | to reduce this to 3 or even 2 with some more work. |
77 | | |
78 | | - check for a valid recovery record on open of the tdb, while the |
79 | | open lock is held. Automatically recover from the transaction |
80 | | recovery area if needed, then continue with the open as |
81 | | usual. This allows for smooth crash recovery with no administrator |
82 | | intervention. |
83 | | |
84 | | - if TDB_NOSYNC is passed to flags in tdb_open then transactions are |
85 | | still available, but no fsync/msync calls are made. This means we |
86 | | are still proof against a process dying during transaction commit, |
87 | | but not against machine reboot. |
88 | | |
89 | | - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using |
90 | | tdb_add_flags() transaction nesting is enabled. |
91 | | It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together. |
92 | | The default is that transaction nesting is allowed. |
93 | | Note: this default may change in future versions of tdb. |
94 | | |
95 | | Beware: when transactions are nested a transaction successfully |
96 | | completed with tdb_transaction_commit() can be silently unrolled later. |
97 | | |
98 | | - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using |
99 | | tdb_add_flags() transaction nesting is disabled. |
100 | | It resets the TDB_ALLOW_NESTING flag, as both cannot be used together. |
101 | | An attempt to create a nested transaction will fail with TDB_ERR_NESTING. |
102 | | The default is that transaction nesting is allowed. |
103 | | Note: this default may change in future versions of tdb. |
104 | | */ |
105 | | |
106 | | |
107 | | /* |
108 | | hold the context of any current transaction |
109 | | */ |
110 | | struct tdb_transaction { |
111 | | /* we keep a mirrored copy of the tdb hash heads here so |
112 | | tdb_next_hash_chain() can operate efficiently */ |
113 | | uint32_t *hash_heads; |
114 | | |
115 | | /* the original io methods - used to do IOs to the real db */ |
116 | | const struct tdb_methods *io_methods; |
117 | | |
118 | | /* the list of transaction blocks. When a block is first |
119 | | written to, it gets created in this list */ |
120 | | uint8_t **blocks; |
121 | | uint32_t num_blocks; |
122 | | uint32_t block_size; /* bytes in each block */ |
123 | | uint32_t last_block_size; /* number of valid bytes in the last block */ |
124 | | |
125 | | /* non-zero when an internal transaction error has |
126 | | occurred. All write operations will then fail until the |
127 | | transaction is ended */ |
128 | | int transaction_error; |
129 | | |
130 | | /* when inside a transaction we need to keep track of any |
131 | | nested tdb_transaction_start() calls, as these are allowed, |
132 | | but don't create a new transaction */ |
133 | | int nesting; |
134 | | |
135 | | /* set when a prepare has already occurred */ |
136 | | bool prepared; |
137 | | tdb_off_t magic_offset; |
138 | | |
139 | | /* old file size before transaction */ |
140 | | tdb_len_t old_map_size; |
141 | | |
142 | | /* did we expand in this transaction */ |
143 | | bool expanded; |
144 | | }; |
145 | | |
146 | | |
147 | | /* |
148 | | read while in a transaction. We need to check first if the data is in our list |
149 | | of transaction elements, then if not do a real read |
150 | | */ |
151 | | static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf, |
152 | | tdb_len_t len, int cv) |
153 | 0 | { |
154 | 0 | uint32_t blk; |
155 | | |
156 | | /* break it down into block sized ops */ |
157 | 0 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { |
158 | 0 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); |
159 | 0 | if (transaction_read(tdb, off, buf, len2, cv) != 0) { |
160 | 0 | return -1; |
161 | 0 | } |
162 | 0 | len -= len2; |
163 | 0 | off += len2; |
164 | 0 | buf = (void *)(len2 + (char *)buf); |
165 | 0 | } |
166 | | |
167 | 0 | if (len == 0) { |
168 | 0 | return 0; |
169 | 0 | } |
170 | | |
171 | 0 | blk = off / tdb->transaction->block_size; |
172 | | |
173 | | /* see if we have it in the block list */ |
174 | 0 | if (tdb->transaction->num_blocks <= blk || |
175 | 0 | tdb->transaction->blocks[blk] == NULL) { |
176 | | /* nope, do a real read */ |
177 | 0 | if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) { |
178 | 0 | goto fail; |
179 | 0 | } |
180 | 0 | return 0; |
181 | 0 | } |
182 | | |
183 | | /* it is in the block list. Now check for the last block */ |
184 | 0 | if (blk == tdb->transaction->num_blocks-1) { |
185 | 0 | if (len > tdb->transaction->last_block_size) { |
186 | 0 | goto fail; |
187 | 0 | } |
188 | 0 | } |
189 | | |
190 | | /* now copy it out of this block */ |
191 | 0 | memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len); |
192 | 0 | if (cv) { |
193 | 0 | tdb_convert(buf, len); |
194 | 0 | } |
195 | 0 | return 0; |
196 | | |
197 | 0 | fail: |
198 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%u len=%u\n", off, len)); |
199 | 0 | tdb->ecode = TDB_ERR_IO; |
200 | 0 | tdb->transaction->transaction_error = 1; |
201 | 0 | return -1; |
202 | 0 | } |
203 | | |
204 | | |
205 | | /* |
206 | | write while in a transaction |
207 | | */ |
208 | | static int transaction_write(struct tdb_context *tdb, tdb_off_t off, |
209 | | const void *buf, tdb_len_t len) |
210 | 0 | { |
211 | 0 | uint32_t blk; |
212 | |
|
213 | 0 | if (buf == NULL) { |
214 | 0 | return -1; |
215 | 0 | } |
216 | | |
217 | | /* Only a commit is allowed on a prepared transaction */ |
218 | 0 | if (tdb->transaction->prepared) { |
219 | 0 | tdb->ecode = TDB_ERR_EINVAL; |
220 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n")); |
221 | 0 | tdb->transaction->transaction_error = 1; |
222 | 0 | return -1; |
223 | 0 | } |
224 | | |
225 | | /* if the write is to a hash head, then update the transaction |
226 | | hash heads */ |
227 | 0 | if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP && |
228 | 0 | off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) { |
229 | 0 | uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t); |
230 | 0 | memcpy(&tdb->transaction->hash_heads[chain], buf, len); |
231 | 0 | } |
232 | | |
233 | | /* break it up into block sized chunks */ |
234 | 0 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { |
235 | 0 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); |
236 | 0 | if (transaction_write(tdb, off, buf, len2) != 0) { |
237 | 0 | return -1; |
238 | 0 | } |
239 | 0 | len -= len2; |
240 | 0 | off += len2; |
241 | 0 | buf = (const void *)(len2 + (const char *)buf); |
242 | 0 | } |
243 | | |
244 | 0 | if (len == 0) { |
245 | 0 | return 0; |
246 | 0 | } |
247 | | |
248 | 0 | blk = off / tdb->transaction->block_size; |
249 | 0 | off = off % tdb->transaction->block_size; |
250 | |
|
251 | 0 | if (tdb->transaction->num_blocks <= blk) { |
252 | 0 | uint8_t **new_blocks; |
253 | | /* expand the blocks array */ |
254 | 0 | new_blocks = (uint8_t **)realloc(tdb->transaction->blocks, |
255 | 0 | (blk+1)*sizeof(uint8_t *)); |
256 | 0 | if (new_blocks == NULL) { |
257 | 0 | tdb->ecode = TDB_ERR_OOM; |
258 | 0 | goto fail; |
259 | 0 | } |
260 | 0 | memset(&new_blocks[tdb->transaction->num_blocks], 0, |
261 | 0 | (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *)); |
262 | 0 | tdb->transaction->blocks = new_blocks; |
263 | 0 | tdb->transaction->num_blocks = blk+1; |
264 | 0 | tdb->transaction->last_block_size = 0; |
265 | 0 | } |
266 | | |
267 | | /* allocate and fill a block? */ |
268 | 0 | if (tdb->transaction->blocks[blk] == NULL) { |
269 | 0 | tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1); |
270 | 0 | if (tdb->transaction->blocks[blk] == NULL) { |
271 | 0 | tdb->ecode = TDB_ERR_OOM; |
272 | 0 | tdb->transaction->transaction_error = 1; |
273 | 0 | return -1; |
274 | 0 | } |
275 | 0 | if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) { |
276 | 0 | tdb_len_t len2 = tdb->transaction->block_size; |
277 | 0 | if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) { |
278 | 0 | len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size); |
279 | 0 | } |
280 | 0 | if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size, |
281 | 0 | tdb->transaction->blocks[blk], |
282 | 0 | len2, 0) != 0) { |
283 | 0 | SAFE_FREE(tdb->transaction->blocks[blk]); |
284 | 0 | tdb->ecode = TDB_ERR_IO; |
285 | 0 | goto fail; |
286 | 0 | } |
287 | 0 | if (blk == tdb->transaction->num_blocks-1) { |
288 | 0 | tdb->transaction->last_block_size = len2; |
289 | 0 | } |
290 | 0 | } |
291 | 0 | } |
292 | | |
293 | | /* overwrite part of an existing block */ |
294 | 0 | memcpy(tdb->transaction->blocks[blk] + off, buf, len); |
295 | 0 | if (blk == tdb->transaction->num_blocks-1) { |
296 | 0 | if (len + off > tdb->transaction->last_block_size) { |
297 | 0 | tdb->transaction->last_block_size = len + off; |
298 | 0 | } |
299 | 0 | } |
300 | |
|
301 | 0 | return 0; |
302 | | |
303 | 0 | fail: |
304 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%u len=%u\n", |
305 | 0 | (blk*tdb->transaction->block_size) + off, len)); |
306 | 0 | tdb->transaction->transaction_error = 1; |
307 | 0 | return -1; |
308 | 0 | } |
309 | | |
310 | | |
311 | | /* |
312 | | write while in a transaction - this variant never expands the transaction blocks, it only |
313 | | updates existing blocks. This means it cannot change the recovery size |
314 | | */ |
315 | | static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, |
316 | | const void *buf, tdb_len_t len) |
317 | 0 | { |
318 | 0 | uint32_t blk; |
319 | | |
320 | | /* break it up into block sized chunks */ |
321 | 0 | while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) { |
322 | 0 | tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size); |
323 | 0 | if (transaction_write_existing(tdb, off, buf, len2) != 0) { |
324 | 0 | return -1; |
325 | 0 | } |
326 | 0 | len -= len2; |
327 | 0 | off += len2; |
328 | 0 | if (buf != NULL) { |
329 | 0 | buf = (const void *)(len2 + (const char *)buf); |
330 | 0 | } |
331 | 0 | } |
332 | | |
333 | 0 | if (len == 0 || buf == NULL) { |
334 | 0 | return 0; |
335 | 0 | } |
336 | | |
337 | 0 | blk = off / tdb->transaction->block_size; |
338 | 0 | off = off % tdb->transaction->block_size; |
339 | |
|
340 | 0 | if (tdb->transaction->num_blocks <= blk || |
341 | 0 | tdb->transaction->blocks[blk] == NULL) { |
342 | 0 | return 0; |
343 | 0 | } |
344 | | |
345 | 0 | if (blk == tdb->transaction->num_blocks-1 && |
346 | 0 | off + len > tdb->transaction->last_block_size) { |
347 | 0 | if (off >= tdb->transaction->last_block_size) { |
348 | 0 | return 0; |
349 | 0 | } |
350 | 0 | len = tdb->transaction->last_block_size - off; |
351 | 0 | } |
352 | | |
353 | | /* overwrite part of an existing block */ |
354 | 0 | memcpy(tdb->transaction->blocks[blk] + off, buf, len); |
355 | |
|
356 | 0 | return 0; |
357 | 0 | } |
358 | | |
359 | | |
360 | | /* |
361 | | accelerated hash chain head search, using the cached hash heads |
362 | | */ |
363 | | static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain) |
364 | 0 | { |
365 | 0 | uint32_t h = *chain; |
366 | 0 | for (;h < tdb->hash_size;h++) { |
367 | | /* the +1 takes account of the freelist */ |
368 | 0 | if (0 != tdb->transaction->hash_heads[h+1]) { |
369 | 0 | break; |
370 | 0 | } |
371 | 0 | } |
372 | 0 | (*chain) = h; |
373 | 0 | } |
374 | | |
375 | | /* |
376 | | out of bounds check during a transaction |
377 | | */ |
378 | | static int transaction_oob(struct tdb_context *tdb, tdb_off_t off, |
379 | | tdb_len_t len, int probe) |
380 | 0 | { |
381 | | /* |
382 | | * This duplicates functionality from tdb_oob(). Don't remove: |
383 | | * we still have direct callers of tdb->methods->tdb_oob() |
384 | | * inside transaction.c. |
385 | | */ |
386 | 0 | if (off + len >= off && off + len <= tdb->map_size) { |
387 | 0 | return 0; |
388 | 0 | } |
389 | 0 | tdb->ecode = TDB_ERR_IO; |
390 | 0 | return -1; |
391 | 0 | } |
392 | | |
393 | | /* |
394 | | transaction version of tdb_expand(). |
395 | | */ |
396 | | static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size, |
397 | | tdb_off_t addition) |
398 | 0 | { |
399 | 0 | const char buf_zero[8192] = {0}; |
400 | 0 | size_t buf_len = sizeof(buf_zero); |
401 | |
|
402 | 0 | while (addition > 0) { |
403 | 0 | size_t n = MIN(addition, buf_len); |
404 | 0 | int ret; |
405 | |
|
406 | 0 | ret = transaction_write(tdb, size, buf_zero, n); |
407 | 0 | if (ret != 0) { |
408 | 0 | return ret; |
409 | 0 | } |
410 | | |
411 | 0 | addition -= n; |
412 | 0 | size += n; |
413 | 0 | } |
414 | | |
415 | 0 | tdb->transaction->expanded = true; |
416 | |
|
417 | 0 | return 0; |
418 | 0 | } |
419 | | |
420 | | static const struct tdb_methods transaction_methods = { |
421 | | transaction_read, |
422 | | transaction_write, |
423 | | transaction_next_hash_chain, |
424 | | transaction_oob, |
425 | | transaction_expand_file, |
426 | | }; |
427 | | |
428 | | /* |
429 | | * Is a transaction currently active on this context? |
430 | | * |
431 | | */ |
432 | | _PUBLIC_ bool tdb_transaction_active(struct tdb_context *tdb) |
433 | 0 | { |
434 | 0 | return (tdb->transaction != NULL); |
435 | 0 | } |
436 | | |
437 | | /* |
438 | | start a tdb transaction. No token is returned, as only a single |
439 | | transaction is allowed to be pending per tdb_context |
440 | | */ |
441 | | static int _tdb_transaction_start(struct tdb_context *tdb, |
442 | | enum tdb_lock_flags lockflags) |
443 | 0 | { |
444 | | /* some sanity checks */ |
445 | 0 | if (tdb->read_only || (tdb->flags & TDB_INTERNAL) |
446 | 0 | || tdb->traverse_read) { |
447 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n")); |
448 | 0 | tdb->ecode = TDB_ERR_EINVAL; |
449 | 0 | return -1; |
450 | 0 | } |
451 | | |
452 | | /* cope with nested tdb_transaction_start() calls */ |
453 | 0 | if (tdb->transaction != NULL) { |
454 | 0 | if (!(tdb->flags & TDB_ALLOW_NESTING)) { |
455 | 0 | tdb->ecode = TDB_ERR_NESTING; |
456 | 0 | return -1; |
457 | 0 | } |
458 | 0 | tdb->transaction->nesting++; |
459 | 0 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n", |
460 | 0 | tdb->transaction->nesting)); |
461 | 0 | return 0; |
462 | 0 | } |
463 | | |
464 | 0 | if (tdb_have_extra_locks(tdb)) { |
465 | | /* the caller must not have any locks when starting a |
466 | | transaction as otherwise we'll be screwed by lack |
467 | | of nested locks in posix */ |
468 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n")); |
469 | 0 | tdb->ecode = TDB_ERR_LOCK; |
470 | 0 | return -1; |
471 | 0 | } |
472 | | |
473 | 0 | if (tdb->travlocks.next != NULL) { |
474 | | /* you cannot use transactions inside a traverse (although you can use |
475 | | traverse inside a transaction) as otherwise you can end up with |
476 | | deadlock */ |
477 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n")); |
478 | 0 | tdb->ecode = TDB_ERR_LOCK; |
479 | 0 | return -1; |
480 | 0 | } |
481 | | |
482 | 0 | tdb->transaction = (struct tdb_transaction *) |
483 | 0 | calloc(sizeof(struct tdb_transaction), 1); |
484 | 0 | if (tdb->transaction == NULL) { |
485 | 0 | tdb->ecode = TDB_ERR_OOM; |
486 | 0 | return -1; |
487 | 0 | } |
488 | | |
489 | | /* a page at a time seems like a reasonable compromise between compactness and efficiency */ |
490 | 0 | tdb->transaction->block_size = tdb->page_size; |
491 | | |
492 | | /* get the transaction write lock. This is a blocking lock. As |
493 | | discussed with Volker, there are a number of ways we could |
494 | | make this async, which we will probably do in the future */ |
495 | 0 | if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) { |
496 | 0 | SAFE_FREE(tdb->transaction->blocks); |
497 | 0 | SAFE_FREE(tdb->transaction); |
498 | 0 | if ((lockflags & TDB_LOCK_WAIT) == 0) { |
499 | 0 | tdb->ecode = TDB_ERR_NOLOCK; |
500 | 0 | } else { |
501 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
502 | 0 | "tdb_transaction_start: " |
503 | 0 | "failed to get transaction lock\n")); |
504 | 0 | } |
505 | 0 | return -1; |
506 | 0 | } |
507 | | |
508 | | /* get a read lock from the freelist to the end of file. This |
509 | | is upgraded to a write lock during the commit */ |
510 | 0 | if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) { |
511 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n")); |
512 | 0 | goto fail_allrecord_lock; |
513 | 0 | } |
514 | | |
515 | | /* setup a copy of the hash table heads so the hash scan in |
516 | | traverse can be fast */ |
517 | 0 | tdb->transaction->hash_heads = (uint32_t *) |
518 | 0 | calloc(tdb->hash_size+1, sizeof(uint32_t)); |
519 | 0 | if (tdb->transaction->hash_heads == NULL) { |
520 | 0 | tdb->ecode = TDB_ERR_OOM; |
521 | 0 | goto fail; |
522 | 0 | } |
523 | 0 | if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads, |
524 | 0 | TDB_HASHTABLE_SIZE(tdb), 0) != 0) { |
525 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n")); |
526 | 0 | tdb->ecode = TDB_ERR_IO; |
527 | 0 | goto fail; |
528 | 0 | } |
529 | | |
530 | | /* make sure we know about any file expansions already done by |
531 | | anyone else */ |
532 | 0 | tdb_oob(tdb, tdb->map_size, 1, 1); |
533 | 0 | tdb->transaction->old_map_size = tdb->map_size; |
534 | | |
535 | | /* finally hook the io methods, replacing them with |
536 | | transaction specific methods */ |
537 | 0 | tdb->transaction->io_methods = tdb->methods; |
538 | 0 | tdb->methods = &transaction_methods; |
539 | | |
540 | | /* Trace at the end, so we get sequence number correct. */ |
541 | 0 | tdb_trace(tdb, "tdb_transaction_start"); |
542 | 0 | return 0; |
543 | | |
544 | 0 | fail: |
545 | 0 | tdb_allrecord_unlock(tdb, F_RDLCK, false); |
546 | 0 | fail_allrecord_lock: |
547 | 0 | tdb_transaction_unlock(tdb, F_WRLCK); |
548 | 0 | SAFE_FREE(tdb->transaction->blocks); |
549 | 0 | SAFE_FREE(tdb->transaction->hash_heads); |
550 | 0 | SAFE_FREE(tdb->transaction); |
551 | 0 | return -1; |
552 | 0 | } |
553 | | |
554 | | _PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb) |
555 | 0 | { |
556 | 0 | return _tdb_transaction_start(tdb, TDB_LOCK_WAIT); |
557 | 0 | } |
558 | | |
559 | | _PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb) |
560 | 0 | { |
561 | 0 | return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); |
562 | 0 | } |
563 | | |
564 | | /* |
565 | | sync to disk |
566 | | */ |
567 | | static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length) |
568 | 0 | { |
569 | 0 | if (tdb->flags & TDB_NOSYNC) { |
570 | 0 | return 0; |
571 | 0 | } |
572 | | |
573 | 0 | #ifdef HAVE_FDATASYNC |
574 | 0 | if (fdatasync(tdb->fd) != 0) { |
575 | | #else |
576 | | if (fsync(tdb->fd) != 0) { |
577 | | #endif |
578 | 0 | tdb->ecode = TDB_ERR_IO; |
579 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n")); |
580 | 0 | return -1; |
581 | 0 | } |
582 | 0 | #ifdef HAVE_MMAP |
583 | 0 | if (tdb->map_ptr) { |
584 | 0 | tdb_off_t moffset = offset & ~(tdb->page_size-1); |
585 | 0 | if (msync(moffset + (char *)tdb->map_ptr, |
586 | 0 | length + (offset - moffset), MS_SYNC) != 0) { |
587 | 0 | tdb->ecode = TDB_ERR_IO; |
588 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n", |
589 | 0 | strerror(errno))); |
590 | 0 | return -1; |
591 | 0 | } |
592 | 0 | } |
593 | 0 | #endif |
594 | 0 | return 0; |
595 | 0 | } |
596 | | |
597 | | |
598 | | static int _tdb_transaction_cancel(struct tdb_context *tdb) |
599 | 0 | { |
600 | 0 | uint32_t i; |
601 | 0 | int ret = 0; |
602 | |
|
603 | 0 | if (tdb->transaction == NULL) { |
604 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n")); |
605 | 0 | return -1; |
606 | 0 | } |
607 | | |
608 | 0 | if (tdb->transaction->nesting != 0) { |
609 | 0 | tdb->transaction->transaction_error = 1; |
610 | 0 | tdb->transaction->nesting--; |
611 | 0 | return 0; |
612 | 0 | } |
613 | | |
614 | 0 | tdb->map_size = tdb->transaction->old_map_size; |
615 | | |
616 | | /* free all the transaction blocks */ |
617 | 0 | for (i=0;i<tdb->transaction->num_blocks;i++) { |
618 | 0 | if ((tdb->transaction->blocks != NULL) && |
619 | 0 | tdb->transaction->blocks[i] != NULL) { |
620 | 0 | free(tdb->transaction->blocks[i]); |
621 | 0 | } |
622 | 0 | } |
623 | 0 | SAFE_FREE(tdb->transaction->blocks); |
624 | |
|
625 | 0 | if (tdb->transaction->magic_offset) { |
626 | 0 | const struct tdb_methods *methods = tdb->transaction->io_methods; |
627 | 0 | const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC; |
628 | | |
629 | | /* remove the recovery marker */ |
630 | 0 | if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 || |
631 | 0 | transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) { |
632 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n")); |
633 | 0 | ret = -1; |
634 | 0 | } |
635 | 0 | } |
636 | | |
637 | | /* This also removes the OPEN_LOCK, if we have it. */ |
638 | 0 | tdb_release_transaction_locks(tdb); |
639 | | |
640 | | /* restore the normal io methods */ |
641 | 0 | tdb->methods = tdb->transaction->io_methods; |
642 | |
|
643 | 0 | SAFE_FREE(tdb->transaction->hash_heads); |
644 | 0 | SAFE_FREE(tdb->transaction); |
645 | |
|
646 | 0 | return ret; |
647 | 0 | } |
648 | | |
649 | | /* |
650 | | cancel the current transaction |
651 | | */ |
652 | | _PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb) |
653 | 0 | { |
654 | 0 | tdb_trace(tdb, "tdb_transaction_cancel"); |
655 | 0 | return _tdb_transaction_cancel(tdb); |
656 | 0 | } |
657 | | |
658 | | /* |
659 | | work out how much space the linearised recovery data will consume |
660 | | */ |
661 | | static bool tdb_recovery_size(struct tdb_context *tdb, tdb_len_t *result) |
662 | 0 | { |
663 | 0 | tdb_len_t recovery_size = 0; |
664 | 0 | uint32_t i; |
665 | |
|
666 | 0 | recovery_size = sizeof(uint32_t); |
667 | 0 | for (i=0;i<tdb->transaction->num_blocks;i++) { |
668 | 0 | tdb_len_t block_size; |
669 | 0 | if (i * tdb->transaction->block_size >= tdb->transaction->old_map_size) { |
670 | 0 | break; |
671 | 0 | } |
672 | 0 | if (tdb->transaction->blocks[i] == NULL) { |
673 | 0 | continue; |
674 | 0 | } |
675 | 0 | if (!tdb_add_len_t(recovery_size, 2*sizeof(tdb_off_t), |
676 | 0 | &recovery_size)) { |
677 | 0 | return false; |
678 | 0 | } |
679 | 0 | if (i == tdb->transaction->num_blocks-1) { |
680 | 0 | block_size = tdb->transaction->last_block_size; |
681 | 0 | } else { |
682 | 0 | block_size = tdb->transaction->block_size; |
683 | 0 | } |
684 | 0 | if (!tdb_add_len_t(recovery_size, block_size, |
685 | 0 | &recovery_size)) { |
686 | 0 | return false; |
687 | 0 | } |
688 | 0 | } |
689 | | |
690 | 0 | *result = recovery_size; |
691 | 0 | return true; |
692 | 0 | } |
693 | | |
694 | | int tdb_recovery_area(struct tdb_context *tdb, |
695 | | const struct tdb_methods *methods, |
696 | | tdb_off_t *recovery_offset, |
697 | | struct tdb_record *rec) |
698 | 0 | { |
699 | 0 | int ret; |
700 | |
|
701 | 0 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) { |
702 | 0 | return -1; |
703 | 0 | } |
704 | | |
705 | 0 | if (*recovery_offset == 0) { |
706 | 0 | rec->rec_len = 0; |
707 | 0 | return 0; |
708 | 0 | } |
709 | | |
710 | 0 | if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec), |
711 | 0 | DOCONV()) == -1) { |
712 | 0 | return -1; |
713 | 0 | } |
714 | | |
715 | | /* ignore invalid recovery regions: can happen in crash */ |
716 | 0 | if (rec->magic != TDB_RECOVERY_MAGIC && |
717 | 0 | rec->magic != TDB_RECOVERY_INVALID_MAGIC) { |
718 | 0 | *recovery_offset = 0; |
719 | 0 | rec->rec_len = 0; |
720 | 0 | } |
721 | |
|
722 | 0 | ret = methods->tdb_oob(tdb, *recovery_offset, rec->rec_len, 1); |
723 | 0 | if (ret == -1) { |
724 | 0 | *recovery_offset = 0; |
725 | 0 | rec->rec_len = 0; |
726 | 0 | } |
727 | |
|
728 | 0 | return 0; |
729 | 0 | } |
730 | | |
731 | | /* |
732 | | allocate the recovery area, or use an existing recovery area if it is |
733 | | large enough |
734 | | */ |
735 | | static int tdb_recovery_allocate(struct tdb_context *tdb, |
736 | | tdb_len_t *recovery_size, |
737 | | tdb_off_t *recovery_offset, |
738 | | tdb_len_t *recovery_max_size) |
739 | 0 | { |
740 | 0 | struct tdb_record rec; |
741 | 0 | const struct tdb_methods *methods = tdb->transaction->io_methods; |
742 | 0 | tdb_off_t recovery_head, new_end; |
743 | |
|
744 | 0 | if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) { |
745 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n")); |
746 | 0 | return -1; |
747 | 0 | } |
748 | | |
749 | 0 | if (!tdb_recovery_size(tdb, recovery_size)) { |
750 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: " |
751 | 0 | "overflow recovery size\n")); |
752 | 0 | return -1; |
753 | 0 | } |
754 | | |
755 | | /* Existing recovery area? */ |
756 | 0 | if (recovery_head != 0 && *recovery_size <= rec.rec_len) { |
757 | | /* it fits in the existing area */ |
758 | 0 | *recovery_max_size = rec.rec_len; |
759 | 0 | *recovery_offset = recovery_head; |
760 | 0 | return 0; |
761 | 0 | } |
762 | | |
763 | | /* If recovery area in middle of file, we need a new one. */ |
764 | 0 | if (recovery_head == 0 |
765 | 0 | || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) { |
766 | | /* we need to free up the old recovery area, then allocate a |
767 | | new one at the end of the file. Note that we cannot use |
768 | | tdb_allocate() to allocate the new one as that might return |
769 | | us an area that is being currently used (as of the start of |
770 | | the transaction) */ |
771 | 0 | if (recovery_head) { |
772 | 0 | if (tdb_free(tdb, recovery_head, &rec) == -1) { |
773 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, |
774 | 0 | "tdb_recovery_allocate: failed to" |
775 | 0 | " free previous recovery area\n")); |
776 | 0 | return -1; |
777 | 0 | } |
778 | | |
779 | | /* the tdb_free() call might have increased |
780 | | * the recovery size */ |
781 | 0 | if (!tdb_recovery_size(tdb, recovery_size)) { |
782 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, |
783 | 0 | "tdb_recovery_allocate: " |
784 | 0 | "overflow recovery size\n")); |
785 | 0 | return -1; |
786 | 0 | } |
787 | 0 | } |
788 | | |
789 | | /* New head will be at end of file. */ |
790 | 0 | recovery_head = tdb->map_size; |
791 | 0 | } |
792 | | |
793 | | /* Now we know where it will be. */ |
794 | 0 | *recovery_offset = recovery_head; |
795 | | |
796 | | /* Expand by more than we need, so we don't do it often. */ |
797 | 0 | *recovery_max_size = tdb_expand_adjust(tdb->map_size, |
798 | 0 | *recovery_size, |
799 | 0 | tdb->page_size) |
800 | 0 | - sizeof(rec); |
801 | |
|
802 | 0 | if (!tdb_add_off_t(recovery_head, sizeof(rec), &new_end) || |
803 | 0 | !tdb_add_off_t(new_end, *recovery_max_size, &new_end)) { |
804 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: " |
805 | 0 | "overflow recovery area\n")); |
806 | 0 | return -1; |
807 | 0 | } |
808 | | |
809 | 0 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, |
810 | 0 | new_end - tdb->transaction->old_map_size) |
811 | 0 | == -1) { |
812 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n")); |
813 | 0 | return -1; |
814 | 0 | } |
815 | | |
816 | | /* remap the file (if using mmap) */ |
817 | 0 | methods->tdb_oob(tdb, tdb->map_size, 1, 1); |
818 | | |
819 | | /* we have to reset the old map size so that we don't try to expand the file |
820 | | again in the transaction commit, which would destroy the recovery area */ |
821 | 0 | tdb->transaction->old_map_size = tdb->map_size; |
822 | | |
823 | | /* write the recovery header offset and sync - we can sync without a race here |
824 | | as the magic ptr in the recovery record has not been set */ |
825 | 0 | CONVERT(recovery_head); |
826 | 0 | if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD, |
827 | 0 | &recovery_head, sizeof(tdb_off_t)) == -1) { |
828 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n")); |
829 | 0 | return -1; |
830 | 0 | } |
831 | 0 | if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) { |
832 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n")); |
833 | 0 | return -1; |
834 | 0 | } |
835 | | |
836 | 0 | return 0; |
837 | 0 | } |
838 | | |
839 | | |
840 | | /* |
841 | | setup the recovery data that will be used on a crash during commit |
842 | | */ |
843 | | static int transaction_setup_recovery(struct tdb_context *tdb, |
844 | | tdb_off_t *magic_offset) |
845 | 0 | { |
846 | 0 | tdb_len_t recovery_size; |
847 | 0 | unsigned char *data, *p; |
848 | 0 | const struct tdb_methods *methods = tdb->transaction->io_methods; |
849 | 0 | struct tdb_record *rec; |
850 | 0 | tdb_off_t recovery_offset, recovery_max_size; |
851 | 0 | tdb_off_t old_map_size = tdb->transaction->old_map_size; |
852 | 0 | uint32_t magic, tailer; |
853 | 0 | uint32_t i; |
854 | | |
855 | | /* |
856 | | check that the recovery area has enough space |
857 | | */ |
858 | 0 | if (tdb_recovery_allocate(tdb, &recovery_size, |
859 | 0 | &recovery_offset, &recovery_max_size) == -1) { |
860 | 0 | return -1; |
861 | 0 | } |
862 | | |
863 | 0 | rec = malloc(recovery_size + sizeof(*rec)); |
864 | 0 | if (rec == NULL) { |
865 | 0 | tdb->ecode = TDB_ERR_OOM; |
866 | 0 | return -1; |
867 | 0 | } |
868 | | |
869 | 0 | memset(rec, 0, sizeof(*rec)); |
870 | |
|
871 | 0 | rec->magic = TDB_RECOVERY_INVALID_MAGIC; |
872 | 0 | rec->data_len = recovery_size; |
873 | 0 | rec->rec_len = recovery_max_size; |
874 | 0 | rec->key_len = old_map_size; |
875 | 0 | CONVERT(*rec); |
876 | |
|
877 | 0 | data = (unsigned char *)rec; |
878 | | |
879 | | /* build the recovery data into a single blob to allow us to do a single |
880 | | large write, which should be more efficient */ |
881 | 0 | p = data + sizeof(*rec); |
882 | 0 | for (i=0;i<tdb->transaction->num_blocks;i++) { |
883 | 0 | tdb_off_t offset; |
884 | 0 | tdb_len_t length; |
885 | |
|
886 | 0 | if (tdb->transaction->blocks[i] == NULL) { |
887 | 0 | continue; |
888 | 0 | } |
889 | | |
890 | 0 | offset = i * tdb->transaction->block_size; |
891 | 0 | length = tdb->transaction->block_size; |
892 | 0 | if (i == tdb->transaction->num_blocks-1) { |
893 | 0 | length = tdb->transaction->last_block_size; |
894 | 0 | } |
895 | |
|
896 | 0 | if (offset >= old_map_size) { |
897 | 0 | continue; |
898 | 0 | } |
899 | 0 | if (offset + length > tdb->transaction->old_map_size) { |
900 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n")); |
901 | 0 | free(data); |
902 | 0 | tdb->ecode = TDB_ERR_CORRUPT; |
903 | 0 | return -1; |
904 | 0 | } |
905 | 0 | memcpy(p, &offset, 4); |
906 | 0 | memcpy(p+4, &length, 4); |
907 | 0 | if (DOCONV()) { |
908 | 0 | tdb_convert(p, 8); |
909 | 0 | } |
910 | | /* the recovery area contains the old data, not the |
911 | | new data, so we have to call the original tdb_read |
912 | | method to get it */ |
913 | 0 | if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) { |
914 | 0 | free(data); |
915 | 0 | tdb->ecode = TDB_ERR_IO; |
916 | 0 | return -1; |
917 | 0 | } |
918 | 0 | p += 8 + length; |
919 | 0 | } |
920 | | |
921 | | /* and the tailer */ |
922 | 0 | tailer = sizeof(*rec) + recovery_max_size; |
923 | 0 | memcpy(p, &tailer, 4); |
924 | 0 | if (DOCONV()) { |
925 | 0 | tdb_convert(p, 4); |
926 | 0 | } |
927 | | |
928 | | /* write the recovery data to the recovery area */ |
929 | 0 | if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) { |
930 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n")); |
931 | 0 | free(data); |
932 | 0 | tdb->ecode = TDB_ERR_IO; |
933 | 0 | return -1; |
934 | 0 | } |
935 | 0 | if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) { |
936 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n")); |
937 | 0 | free(data); |
938 | 0 | tdb->ecode = TDB_ERR_IO; |
939 | 0 | return -1; |
940 | 0 | } |
941 | | |
942 | | /* as we don't have ordered writes, we have to sync the recovery |
943 | | data before we update the magic to indicate that the recovery |
944 | | data is present */ |
945 | 0 | if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) { |
946 | 0 | free(data); |
947 | 0 | return -1; |
948 | 0 | } |
949 | | |
950 | 0 | free(data); |
951 | |
|
952 | 0 | magic = TDB_RECOVERY_MAGIC; |
953 | 0 | CONVERT(magic); |
954 | |
|
955 | 0 | *magic_offset = recovery_offset + offsetof(struct tdb_record, magic); |
956 | |
|
957 | 0 | if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { |
958 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n")); |
959 | 0 | tdb->ecode = TDB_ERR_IO; |
960 | 0 | return -1; |
961 | 0 | } |
962 | 0 | if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) { |
963 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n")); |
964 | 0 | tdb->ecode = TDB_ERR_IO; |
965 | 0 | return -1; |
966 | 0 | } |
967 | | |
968 | | /* ensure the recovery magic marker is on disk */ |
969 | 0 | if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) { |
970 | 0 | return -1; |
971 | 0 | } |
972 | | |
973 | 0 | return 0; |
974 | 0 | } |
975 | | |
976 | | static int _tdb_transaction_prepare_commit(struct tdb_context *tdb) |
977 | 0 | { |
978 | 0 | const struct tdb_methods *methods; |
979 | |
|
980 | 0 | if (tdb->transaction == NULL) { |
981 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n")); |
982 | 0 | return -1; |
983 | 0 | } |
984 | | |
985 | 0 | if (tdb->transaction->prepared) { |
986 | 0 | tdb->ecode = TDB_ERR_EINVAL; |
987 | 0 | _tdb_transaction_cancel(tdb); |
988 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n")); |
989 | 0 | return -1; |
990 | 0 | } |
991 | | |
992 | 0 | if (tdb->transaction->transaction_error) { |
993 | 0 | tdb->ecode = TDB_ERR_IO; |
994 | 0 | _tdb_transaction_cancel(tdb); |
995 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n")); |
996 | 0 | return -1; |
997 | 0 | } |
998 | | |
999 | | |
1000 | 0 | if (tdb->transaction->nesting != 0) { |
1001 | 0 | return 0; |
1002 | 0 | } |
1003 | | |
1004 | | /* check for a null transaction */ |
1005 | 0 | if (tdb->transaction->blocks == NULL) { |
1006 | 0 | return 0; |
1007 | 0 | } |
1008 | | |
1009 | 0 | methods = tdb->transaction->io_methods; |
1010 | | |
1011 | | /* if there are any locks pending then the caller has not |
1012 | | nested their locks properly, so fail the transaction */ |
1013 | 0 | if (tdb_have_extra_locks(tdb)) { |
1014 | 0 | tdb->ecode = TDB_ERR_LOCK; |
1015 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n")); |
1016 | 0 | _tdb_transaction_cancel(tdb); |
1017 | 0 | return -1; |
1018 | 0 | } |
1019 | | |
1020 | | /* upgrade the main transaction lock region to a write lock */ |
1021 | 0 | if (tdb_allrecord_upgrade(tdb) == -1) { |
1022 | 0 | if (tdb->ecode == TDB_ERR_RDONLY && tdb->read_only) { |
1023 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
1024 | 0 | "tdb_transaction_prepare_commit: " |
1025 | 0 | "failed to upgrade hash locks: " |
1026 | 0 | "database is read only\n")); |
1027 | 0 | } else if (tdb->ecode == TDB_ERR_RDONLY |
1028 | 0 | && tdb->traverse_read) { |
1029 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
1030 | 0 | "tdb_transaction_prepare_commit: " |
1031 | 0 | "failed to upgrade hash locks: " |
1032 | 0 | "a database traverse is in progress\n")); |
1033 | 0 | } else { |
1034 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
1035 | 0 | "tdb_transaction_prepare_commit: " |
1036 | 0 | "failed to upgrade hash locks: %s\n", |
1037 | 0 | tdb_errorstr(tdb))); |
1038 | 0 | } |
1039 | 0 | _tdb_transaction_cancel(tdb); |
1040 | 0 | return -1; |
1041 | 0 | } |
1042 | | |
1043 | | /* get the open lock - this prevents new users attaching to the database |
1044 | | during the commit */ |
1045 | 0 | if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) { |
1046 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n")); |
1047 | 0 | _tdb_transaction_cancel(tdb); |
1048 | 0 | return -1; |
1049 | 0 | } |
1050 | | |
1051 | | /* write the recovery data to the end of the file */ |
1052 | 0 | if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) { |
1053 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n")); |
1054 | 0 | _tdb_transaction_cancel(tdb); |
1055 | 0 | return -1; |
1056 | 0 | } |
1057 | | |
1058 | 0 | tdb->transaction->prepared = true; |
1059 | | |
1060 | | /* expand the file to the new size if needed */ |
1061 | 0 | if (tdb->map_size != tdb->transaction->old_map_size) { |
1062 | 0 | if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, |
1063 | 0 | tdb->map_size - |
1064 | 0 | tdb->transaction->old_map_size) == -1) { |
1065 | 0 | tdb->ecode = TDB_ERR_IO; |
1066 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n")); |
1067 | 0 | _tdb_transaction_cancel(tdb); |
1068 | 0 | return -1; |
1069 | 0 | } |
1070 | 0 | tdb->map_size = tdb->transaction->old_map_size; |
1071 | 0 | methods->tdb_oob(tdb, tdb->map_size, 1, 1); |
1072 | 0 | } |
1073 | | |
1074 | | /* Keep the open lock until the actual commit */ |
1075 | | |
1076 | 0 | return 0; |
1077 | 0 | } |
1078 | | |
1079 | | /* |
1080 | | prepare to commit the current transaction |
1081 | | */ |
1082 | | _PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb) |
1083 | 0 | { |
1084 | 0 | tdb_trace(tdb, "tdb_transaction_prepare_commit"); |
1085 | 0 | return _tdb_transaction_prepare_commit(tdb); |
1086 | 0 | } |
1087 | | |
1088 | | /* A repack is worthwhile if the largest is less than half total free. */ |
1089 | | static bool repack_worthwhile(struct tdb_context *tdb) |
1090 | 0 | { |
1091 | 0 | tdb_off_t ptr; |
1092 | 0 | struct tdb_record rec; |
1093 | 0 | tdb_len_t total = 0, largest = 0; |
1094 | |
|
1095 | 0 | if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) { |
1096 | 0 | return false; |
1097 | 0 | } |
1098 | | |
1099 | 0 | while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) { |
1100 | 0 | total += rec.rec_len; |
1101 | 0 | if (rec.rec_len > largest) { |
1102 | 0 | largest = rec.rec_len; |
1103 | 0 | } |
1104 | 0 | ptr = rec.next; |
1105 | 0 | } |
1106 | |
|
1107 | 0 | return total > largest * 2; |
1108 | 0 | } |
1109 | | |
1110 | | /* |
1111 | | commit the current transaction |
1112 | | */ |
1113 | | _PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb) |
1114 | 0 | { |
1115 | 0 | const struct tdb_methods *methods; |
1116 | 0 | uint32_t i; |
1117 | 0 | bool need_repack = false; |
1118 | |
|
1119 | 0 | if (tdb->transaction == NULL) { |
1120 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n")); |
1121 | 0 | return -1; |
1122 | 0 | } |
1123 | | |
1124 | 0 | tdb_trace(tdb, "tdb_transaction_commit"); |
1125 | |
|
1126 | 0 | if (tdb->transaction->transaction_error) { |
1127 | 0 | tdb->ecode = TDB_ERR_IO; |
1128 | 0 | _tdb_transaction_cancel(tdb); |
1129 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n")); |
1130 | 0 | return -1; |
1131 | 0 | } |
1132 | | |
1133 | | |
1134 | 0 | if (tdb->transaction->nesting != 0) { |
1135 | 0 | tdb->transaction->nesting--; |
1136 | 0 | return 0; |
1137 | 0 | } |
1138 | | |
1139 | | /* check for a null transaction */ |
1140 | 0 | if (tdb->transaction->blocks == NULL) { |
1141 | 0 | _tdb_transaction_cancel(tdb); |
1142 | 0 | return 0; |
1143 | 0 | } |
1144 | | |
1145 | 0 | if (!tdb->transaction->prepared) { |
1146 | 0 | int ret = _tdb_transaction_prepare_commit(tdb); |
1147 | 0 | if (ret) |
1148 | 0 | return ret; |
1149 | 0 | } |
1150 | | |
1151 | 0 | methods = tdb->transaction->io_methods; |
1152 | | |
1153 | | /* perform all the writes */ |
1154 | 0 | for (i=0;i<tdb->transaction->num_blocks;i++) { |
1155 | 0 | tdb_off_t offset; |
1156 | 0 | tdb_len_t length; |
1157 | |
|
1158 | 0 | if (tdb->transaction->blocks[i] == NULL) { |
1159 | 0 | continue; |
1160 | 0 | } |
1161 | | |
1162 | 0 | offset = i * tdb->transaction->block_size; |
1163 | 0 | length = tdb->transaction->block_size; |
1164 | 0 | if (i == tdb->transaction->num_blocks-1) { |
1165 | 0 | length = tdb->transaction->last_block_size; |
1166 | 0 | } |
1167 | |
|
1168 | 0 | if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) { |
1169 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n")); |
1170 | | |
1171 | | /* we've overwritten part of the data and |
1172 | | possibly expanded the file, so we need to |
1173 | | run the crash recovery code */ |
1174 | 0 | tdb->methods = methods; |
1175 | 0 | tdb_transaction_recover(tdb); |
1176 | |
|
1177 | 0 | _tdb_transaction_cancel(tdb); |
1178 | |
|
1179 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n")); |
1180 | 0 | return -1; |
1181 | 0 | } |
1182 | 0 | SAFE_FREE(tdb->transaction->blocks[i]); |
1183 | 0 | } |
1184 | | |
1185 | | /* Do this before we drop lock or blocks. */ |
1186 | 0 | if (tdb->transaction->expanded) { |
1187 | 0 | need_repack = repack_worthwhile(tdb); |
1188 | 0 | } |
1189 | |
|
1190 | 0 | SAFE_FREE(tdb->transaction->blocks); |
1191 | 0 | tdb->transaction->num_blocks = 0; |
1192 | | |
1193 | | /* ensure the new data is on disk */ |
1194 | 0 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) { |
1195 | 0 | return -1; |
1196 | 0 | } |
1197 | | |
1198 | | /* |
1199 | | TODO: maybe write to some dummy hdr field, or write to magic |
1200 | | offset without mmap, before the last sync, instead of the |
1201 | | utime() call |
1202 | | */ |
1203 | | |
1204 | | /* on some systems (like Linux 2.6.x) changes via mmap/msync |
1205 | | don't change the mtime of the file, this means the file may |
1206 | | not be backed up (as tdb rounding to block sizes means that |
1207 | | file size changes are quite rare too). The following forces |
1208 | | mtime changes when a transaction completes */ |
1209 | 0 | futimens(tdb->fd, NULL); |
1210 | | |
1211 | | /* use a transaction cancel to free memory and remove the |
1212 | | transaction locks */ |
1213 | 0 | _tdb_transaction_cancel(tdb); |
1214 | |
|
1215 | 0 | if (need_repack) { |
1216 | 0 | int ret = tdb_repack(tdb); |
1217 | 0 | if (ret != 0) { |
1218 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, |
1219 | 0 | __location__ " Failed to repack database (not fatal)\n")); |
1220 | 0 | } |
1221 | | /* |
1222 | | * Ignore the error. |
1223 | | * |
1224 | | * Why? |
1225 | | * |
1226 | | * We just committed to the DB above, so anything |
1227 | | * written during the transaction is committed, the |
1228 | | * caller needs to know that the long-term state was |
1229 | | * successfully modified. |
1230 | | * |
1231 | | * tdb_repack is an optimization that can fail for |
1232 | | * reasons like lock ordering and we cannot recover |
1233 | | * the transaction lock at this point, having released |
1234 | | * it above. |
1235 | | * |
1236 | | * If we return a failure the caller thinks the |
1237 | | * transaction was rolled back. |
1238 | | */ |
1239 | 0 | } |
1240 | |
|
1241 | 0 | return 0; |
1242 | 0 | } |
1243 | | |
1244 | | |
1245 | | /* |
1246 | | recover from an aborted transaction. Must be called with exclusive |
1247 | | database write access already established (including the open |
1248 | | lock to prevent new processes attaching) |
1249 | | */ |
1250 | | int tdb_transaction_recover(struct tdb_context *tdb) |
1251 | 0 | { |
1252 | 0 | tdb_off_t recovery_head, recovery_eof; |
1253 | 0 | unsigned char *data, *p; |
1254 | 0 | uint32_t zero = 0; |
1255 | 0 | struct tdb_record rec; |
1256 | | |
1257 | | /* find the recovery area */ |
1258 | 0 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) { |
1259 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n")); |
1260 | 0 | tdb->ecode = TDB_ERR_IO; |
1261 | 0 | return -1; |
1262 | 0 | } |
1263 | | |
1264 | 0 | if (recovery_head == 0) { |
1265 | | /* we have never allocated a recovery record */ |
1266 | 0 | return 0; |
1267 | 0 | } |
1268 | | |
1269 | | /* read the recovery record */ |
1270 | 0 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec, |
1271 | 0 | sizeof(rec), DOCONV()) == -1) { |
1272 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n")); |
1273 | 0 | tdb->ecode = TDB_ERR_IO; |
1274 | 0 | return -1; |
1275 | 0 | } |
1276 | | |
1277 | 0 | if (rec.magic != TDB_RECOVERY_MAGIC) { |
1278 | | /* there is no valid recovery data */ |
1279 | 0 | return 0; |
1280 | 0 | } |
1281 | | |
1282 | 0 | if (tdb->read_only) { |
1283 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n")); |
1284 | 0 | tdb->ecode = TDB_ERR_CORRUPT; |
1285 | 0 | return -1; |
1286 | 0 | } |
1287 | | |
1288 | 0 | recovery_eof = rec.key_len; |
1289 | |
|
1290 | 0 | data = (unsigned char *)malloc(rec.data_len); |
1291 | 0 | if (data == NULL) { |
1292 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n")); |
1293 | 0 | tdb->ecode = TDB_ERR_OOM; |
1294 | 0 | return -1; |
1295 | 0 | } |
1296 | | |
1297 | | /* read the full recovery data */ |
1298 | 0 | if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data, |
1299 | 0 | rec.data_len, 0) == -1) { |
1300 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n")); |
1301 | 0 | tdb->ecode = TDB_ERR_IO; |
1302 | 0 | free(data); |
1303 | 0 | return -1; |
1304 | 0 | } |
1305 | | |
1306 | | /* recover the file data */ |
1307 | 0 | p = data; |
1308 | 0 | while (p+8 < data + rec.data_len) { |
1309 | 0 | uint32_t ofs, len; |
1310 | 0 | if (DOCONV()) { |
1311 | 0 | tdb_convert(p, 8); |
1312 | 0 | } |
1313 | 0 | memcpy(&ofs, p, 4); |
1314 | 0 | memcpy(&len, p+4, 4); |
1315 | |
|
1316 | 0 | if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) { |
1317 | 0 | free(data); |
1318 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %u bytes at offset %u\n", len, ofs)); |
1319 | 0 | tdb->ecode = TDB_ERR_IO; |
1320 | 0 | return -1; |
1321 | 0 | } |
1322 | 0 | p += 8 + len; |
1323 | 0 | } |
1324 | | |
1325 | 0 | free(data); |
1326 | |
|
1327 | 0 | if (transaction_sync(tdb, 0, tdb->map_size) == -1) { |
1328 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n")); |
1329 | 0 | tdb->ecode = TDB_ERR_IO; |
1330 | 0 | return -1; |
1331 | 0 | } |
1332 | | |
1333 | | /* if the recovery area is after the recovered eof then remove it */ |
1334 | 0 | if (recovery_eof <= recovery_head) { |
1335 | 0 | if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) { |
1336 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n")); |
1337 | 0 | tdb->ecode = TDB_ERR_IO; |
1338 | 0 | return -1; |
1339 | 0 | } |
1340 | 0 | } |
1341 | | |
1342 | | /* remove the recovery magic */ |
1343 | 0 | if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic), |
1344 | 0 | &zero) == -1) { |
1345 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n")); |
1346 | 0 | tdb->ecode = TDB_ERR_IO; |
1347 | 0 | return -1; |
1348 | 0 | } |
1349 | | |
1350 | 0 | if (transaction_sync(tdb, 0, recovery_eof) == -1) { |
1351 | 0 | TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n")); |
1352 | 0 | tdb->ecode = TDB_ERR_IO; |
1353 | 0 | return -1; |
1354 | 0 | } |
1355 | | |
1356 | 0 | TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %u byte database\n", |
1357 | 0 | recovery_eof)); |
1358 | | |
1359 | | /* all done */ |
1360 | 0 | return 0; |
1361 | 0 | } |
1362 | | |
1363 | | /* Any I/O failures we say "needs recovery". */ |
1364 | | bool tdb_needs_recovery(struct tdb_context *tdb) |
1365 | 0 | { |
1366 | 0 | tdb_off_t recovery_head; |
1367 | 0 | struct tdb_record rec; |
1368 | | |
1369 | | /* find the recovery area */ |
1370 | 0 | if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) { |
1371 | 0 | return true; |
1372 | 0 | } |
1373 | | |
1374 | 0 | if (recovery_head == 0) { |
1375 | | /* we have never allocated a recovery record */ |
1376 | 0 | return false; |
1377 | 0 | } |
1378 | | |
1379 | | /* read the recovery record */ |
1380 | 0 | if (tdb->methods->tdb_read(tdb, recovery_head, &rec, |
1381 | 0 | sizeof(rec), DOCONV()) == -1) { |
1382 | 0 | return true; |
1383 | 0 | } |
1384 | | |
1385 | 0 | return (rec.magic == TDB_RECOVERY_MAGIC); |
1386 | 0 | } |