Coverage Report

Created: 2025-12-31 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/samba/lib/tdb/common/transaction.c
Line
Count
Source
1
 /*
2
   Unix SMB/CIFS implementation.
3
4
   trivial database library
5
6
   Copyright (C) Andrew Tridgell              2005
7
8
     ** NOTE! The following LGPL license applies to the tdb
9
     ** library. This does NOT imply that all of Samba is released
10
     ** under the LGPL
11
12
   This library is free software; you can redistribute it and/or
13
   modify it under the terms of the GNU Lesser General Public
14
   License as published by the Free Software Foundation; either
15
   version 3 of the License, or (at your option) any later version.
16
17
   This library is distributed in the hope that it will be useful,
18
   but WITHOUT ANY WARRANTY; without even the implied warranty of
19
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
   Lesser General Public License for more details.
21
22
   You should have received a copy of the GNU Lesser General Public
23
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
24
*/
25
26
#include "tdb_private.h"
27
28
/*
29
  transaction design:
30
31
  - only allow a single transaction at a time per database. This makes
32
    using the transaction API simpler, as otherwise the caller would
33
    have to cope with temporary failures in transactions that conflict
34
    with other current transactions
35
36
  - keep the transaction recovery information in the same file as the
37
    database, using a special 'transaction recovery' record pointed at
38
    by the header. This removes the need for extra journal files as
39
    used by some other databases
40
41
  - dynamically allocate the transaction recovery record, re-using it
42
    for subsequent transactions. If a larger record is needed then
43
    tdb_free() the old record to place it on the normal tdb freelist
44
    before allocating the new record
45
46
  - during transactions, keep a linked list of all writes that have
47
    been performed by intercepting all tdb_write() calls. The hooked
48
    transaction versions of tdb_read() and tdb_write() check this
49
    linked list and try to use the elements of the list in preference
50
    to the real database.
51
52
  - don't allow any locks to be held when a transaction starts,
53
    otherwise we can end up with deadlock (plus lack of lock nesting
54
    in posix locks would mean the lock is lost)
55
56
  - if the caller gains a lock during the transaction but doesn't
57
    release it then fail the commit
58
59
  - allow for nested calls to tdb_transaction_start(), re-using the
60
    existing transaction record. If the inner transaction is cancelled
61
    then a subsequent commit will fail
62
63
  - keep a mirrored copy of the tdb hash chain heads to allow for the
64
    fast hash heads scan on traverse, updating the mirrored copy in
65
    the transaction version of tdb_write
66
67
  - allow callers to mix transaction and non-transaction use of tdb,
68
    although once a transaction is started then an exclusive lock is
69
    gained until the transaction is committed or cancelled
70
71
  - the commit strategy involves first saving away all modified data
72
    into a linearised buffer in the transaction recovery area, then
73
    marking the transaction recovery area with a magic value to
74
    indicate a valid recovery record. In total 4 fsync/msync calls are
75
    needed per commit to prevent race conditions. It might be possible
76
    to reduce this to 3 or even 2 with some more work.
77
78
  - check for a valid recovery record on open of the tdb, while the
79
    open lock is held. Automatically recover from the transaction
80
    recovery area if needed, then continue with the open as
81
    usual. This allows for smooth crash recovery with no administrator
82
    intervention.
83
84
  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
85
    still available, but no fsync/msync calls are made.  This means we
86
    are still proof against a process dying during transaction commit,
87
    but not against machine reboot.
88
89
  - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
90
    tdb_add_flags() transaction nesting is enabled.
91
    It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
92
    The default is that transaction nesting is allowed.
93
    Note: this default may change in future versions of tdb.
94
95
    Beware. when transactions are nested a transaction successfully
96
    completed with tdb_transaction_commit() can be silently unrolled later.
97
98
  - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
99
    tdb_add_flags() transaction nesting is disabled.
100
    It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
101
    An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
102
    The default is that transaction nesting is allowed.
103
    Note: this default may change in future versions of tdb.
104
*/
105
106
107
/*
108
  hold the context of any current transaction
109
*/
110
struct tdb_transaction {
111
  /* we keep a mirrored copy of the tdb hash heads here so
112
     tdb_next_hash_chain() can operate efficiently */
113
  uint32_t *hash_heads;
114
115
  /* the original io methods - used to do IOs to the real db */
116
  const struct tdb_methods *io_methods;
117
118
  /* the list of transaction blocks. When a block is first
119
     written to, it gets created in this list */
120
  uint8_t **blocks;
121
  uint32_t num_blocks;
122
  uint32_t block_size;      /* bytes in each block */
123
  uint32_t last_block_size; /* number of valid bytes in the last block */
124
125
  /* non-zero when an internal transaction error has
126
     occurred. All write operations will then fail until the
127
     transaction is ended */
128
  int transaction_error;
129
130
  /* when inside a transaction we need to keep track of any
131
     nested tdb_transaction_start() calls, as these are allowed,
132
     but don't create a new transaction */
133
  int nesting;
134
135
  /* set when a prepare has already occurred */
136
  bool prepared;
137
  tdb_off_t magic_offset;
138
139
  /* old file size before transaction */
140
  tdb_len_t old_map_size;
141
142
  /* did we expand in this transaction */
143
  bool expanded;
144
};
145
146
147
/*
148
  read while in a transaction. We need to check first if the data is in our list
149
  of transaction elements, then if not do a real read
150
*/
151
static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
152
          tdb_len_t len, int cv)
153
0
{
154
0
  uint32_t blk;
155
156
  /* break it down into block sized ops */
157
0
  while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
158
0
    tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
159
0
    if (transaction_read(tdb, off, buf, len2, cv) != 0) {
160
0
      return -1;
161
0
    }
162
0
    len -= len2;
163
0
    off += len2;
164
0
    buf = (void *)(len2 + (char *)buf);
165
0
  }
166
167
0
  if (len == 0) {
168
0
    return 0;
169
0
  }
170
171
0
  blk = off / tdb->transaction->block_size;
172
173
  /* see if we have it in the block list */
174
0
  if (tdb->transaction->num_blocks <= blk ||
175
0
      tdb->transaction->blocks[blk] == NULL) {
176
    /* nope, do a real read */
177
0
    if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
178
0
      goto fail;
179
0
    }
180
0
    return 0;
181
0
  }
182
183
  /* it is in the block list. Now check for the last block */
184
0
  if (blk == tdb->transaction->num_blocks-1) {
185
0
    if (len > tdb->transaction->last_block_size) {
186
0
      goto fail;
187
0
    }
188
0
  }
189
190
  /* now copy it out of this block */
191
0
  memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
192
0
  if (cv) {
193
0
    tdb_convert(buf, len);
194
0
  }
195
0
  return 0;
196
197
0
fail:
198
0
  TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%u len=%u\n", off, len));
199
0
  tdb->ecode = TDB_ERR_IO;
200
0
  tdb->transaction->transaction_error = 1;
201
0
  return -1;
202
0
}
203
204
205
/*
206
  write while in a transaction
207
*/
208
static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
           const void *buf, tdb_len_t len)
{
  struct tdb_transaction *tx;
  uint32_t blk;

  if (buf == NULL) {
    return -1;
  }

  tx = tdb->transaction;

  /* once prepared, the only thing left to do is commit */
  if (tx->prepared) {
    tdb->ecode = TDB_ERR_EINVAL;
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
    tx->transaction_error = 1;
    return -1;
  }

  /* a single-offset write inside the hash table region also has to
     be reflected in the mirrored hash heads */
  if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
      off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
    uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
    memcpy(&tx->hash_heads[chain], buf, len);
  }

  /* writes spanning several blocks are handled one block at a time;
     the loop leaves only the final piece */
  while (len + (off % tx->block_size) > tx->block_size) {
    tdb_len_t chunk = tx->block_size - (off % tx->block_size);

    if (transaction_write(tdb, off, buf, chunk) != 0) {
      return -1;
    }
    len -= chunk;
    off += chunk;
    buf = (const void *)((const char *)buf + chunk);
  }

  if (len == 0) {
    return 0;
  }

  /* from here on 'off' is the offset within block 'blk' */
  blk = off / tx->block_size;
  off = off % tx->block_size;

  if (blk >= tx->num_blocks) {
    /* grow the block pointer array so that slot 'blk' exists and
       zero the newly added slots */
    uint8_t **grown = (uint8_t **)realloc(tx->blocks,
                  (blk+1)*sizeof(uint8_t *));
    if (grown == NULL) {
      tdb->ecode = TDB_ERR_OOM;
      goto fail;
    }
    memset(&grown[tx->num_blocks], 0,
           (1+(blk - tx->num_blocks))*sizeof(uint8_t *));
    tx->blocks = grown;
    tx->num_blocks = blk+1;
    tx->last_block_size = 0;
  }

  if (tx->blocks[blk] == NULL) {
    /* first write to this block: allocate a zeroed buffer and, if
       the block overlaps the pre-transaction file, preload it */
    uint32_t blk_start = blk * tx->block_size;

    tx->blocks[blk] = (uint8_t *)calloc(tx->block_size, 1);
    if (tx->blocks[blk] == NULL) {
      tdb->ecode = TDB_ERR_OOM;
      tx->transaction_error = 1;
      return -1;
    }

    if (tx->old_map_size > blk_start) {
      tdb_len_t fill = tx->block_size;

      /* do not read past the old end of the file */
      if (fill + blk_start > tx->old_map_size) {
        fill = tx->old_map_size - blk_start;
      }
      if (tx->io_methods->tdb_read(tdb, blk_start,
                 tx->blocks[blk],
                 fill, 0) != 0) {
        SAFE_FREE(tx->blocks[blk]);
        tdb->ecode = TDB_ERR_IO;
        goto fail;
      }
      if (blk == tx->num_blocks-1) {
        tx->last_block_size = fill;
      }
    }
  }

  /* apply the write to our copy of the block, extending the valid
     length if this is the last block */
  memcpy(tx->blocks[blk] + off, buf, len);
  if (blk == tx->num_blocks-1 && len + off > tx->last_block_size) {
    tx->last_block_size = len + off;
  }

  return 0;

fail:
  TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%u len=%u\n",
     (blk*tx->block_size) + off, len));
  tx->transaction_error = 1;
  return -1;
}
309
310
311
/*
312
  write while in a transaction - this variant never expands the transaction blocks, it only
313
  updates existing blocks. This means it cannot change the recovery size
314
*/
315
static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off,
              const void *buf, tdb_len_t len)
{
  struct tdb_transaction *tx = tdb->transaction;
  uint32_t blk;

  /* handle multi-block writes one block at a time; the loop leaves
     only the final piece */
  while (len + (off % tx->block_size) > tx->block_size) {
    tdb_len_t chunk = tx->block_size - (off % tx->block_size);

    if (transaction_write_existing(tdb, off, buf, chunk) != 0) {
      return -1;
    }
    len -= chunk;
    off += chunk;
    if (buf != NULL) {
      buf = (const void *)((const char *)buf + chunk);
    }
  }

  if (buf == NULL || len == 0) {
    return 0;
  }

  /* from here on 'off' is the offset within block 'blk' */
  blk = off / tx->block_size;
  off = off % tx->block_size;

  /* blocks the transaction has not touched are left alone */
  if (blk >= tx->num_blocks || tx->blocks[blk] == NULL) {
    return 0;
  }

  /* never grow the valid part of the last block: clip the write to
     it, or drop the write if it starts beyond it */
  if (blk == tx->num_blocks-1 &&
      off + len > tx->last_block_size) {
    if (off >= tx->last_block_size) {
      return 0;
    }
    len = tx->last_block_size - off;
  }

  memcpy(tx->blocks[blk] + off, buf, len);

  return 0;
}
358
359
360
/*
361
  accelerated hash chain head search, using the cached hash heads
362
*/
363
static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
364
0
{
365
0
  uint32_t h = *chain;
366
0
  for (;h < tdb->hash_size;h++) {
367
    /* the +1 takes account of the freelist */
368
0
    if (0 != tdb->transaction->hash_heads[h+1]) {
369
0
      break;
370
0
    }
371
0
  }
372
0
  (*chain) = h;
373
0
}
374
375
/*
376
  out of bounds check during a transaction
377
*/
378
static int transaction_oob(struct tdb_context *tdb, tdb_off_t off,
379
         tdb_len_t len, int probe)
380
0
{
381
  /*
382
   * This duplicates functionality from tdb_oob(). Don't remove:
383
   * we still have direct callers of tdb->methods->tdb_oob()
384
   * inside transaction.c.
385
   */
386
0
  if (off + len >= off && off + len <= tdb->map_size) {
387
0
    return 0;
388
0
  }
389
0
  tdb->ecode = TDB_ERR_IO;
390
0
  return -1;
391
0
}
392
393
/*
394
  transaction version of tdb_expand().
395
*/
396
static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
           tdb_off_t addition)
{
  const char zeros[8192] = {0};

  /* the expansion is recorded as ordinary transaction writes of
     zero bytes starting at 'size', one buffer's worth at a time */
  while (addition > 0) {
    size_t n = MIN(addition, sizeof(zeros));
    int ret = transaction_write(tdb, size, zeros, n);

    if (ret != 0) {
      return ret;
    }

    size += n;
    addition -= n;
  }

  tdb->transaction->expanded = true;

  return 0;
}
419
420
static const struct tdb_methods transaction_methods = {
421
  transaction_read,
422
  transaction_write,
423
  transaction_next_hash_chain,
424
  transaction_oob,
425
  transaction_expand_file,
426
};
427
428
/*
429
 * Is a transaction currently active on this context?
430
 *
431
 */
432
_PUBLIC_ bool tdb_transaction_active(struct tdb_context *tdb)
433
0
{
434
0
  return (tdb->transaction != NULL);
435
0
}
436
437
/*
438
  start a tdb transaction. No token is returned, as only a single
439
  transaction is allowed to be pending per tdb_context
440
*/
441
static int _tdb_transaction_start(struct tdb_context *tdb,
442
          enum tdb_lock_flags lockflags)
443
0
{
444
  /* some sanity checks */
445
0
  if (tdb->read_only || (tdb->flags & TDB_INTERNAL)
446
0
      || tdb->traverse_read) {
447
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
448
0
    tdb->ecode = TDB_ERR_EINVAL;
449
0
    return -1;
450
0
  }
451
452
  /* cope with nested tdb_transaction_start() calls */
453
0
  if (tdb->transaction != NULL) {
454
0
    if (!(tdb->flags & TDB_ALLOW_NESTING)) {
455
0
      tdb->ecode = TDB_ERR_NESTING;
456
0
      return -1;
457
0
    }
458
0
    tdb->transaction->nesting++;
459
0
    TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
460
0
       tdb->transaction->nesting));
461
0
    return 0;
462
0
  }
463
464
0
  if (tdb_have_extra_locks(tdb)) {
465
    /* the caller must not have any locks when starting a
466
       transaction as otherwise we'll be screwed by lack
467
       of nested locks in posix */
468
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
469
0
    tdb->ecode = TDB_ERR_LOCK;
470
0
    return -1;
471
0
  }
472
473
0
  if (tdb->travlocks.next != NULL) {
474
    /* you cannot use transactions inside a traverse (although you can use
475
       traverse inside a transaction) as otherwise you can end up with
476
       deadlock */
477
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
478
0
    tdb->ecode = TDB_ERR_LOCK;
479
0
    return -1;
480
0
  }
481
482
0
  tdb->transaction = (struct tdb_transaction *)
483
0
    calloc(sizeof(struct tdb_transaction), 1);
484
0
  if (tdb->transaction == NULL) {
485
0
    tdb->ecode = TDB_ERR_OOM;
486
0
    return -1;
487
0
  }
488
489
  /* a page at a time seems like a reasonable compromise between compactness and efficiency */
490
0
  tdb->transaction->block_size = tdb->page_size;
491
492
  /* get the transaction write lock. This is a blocking lock. As
493
     discussed with Volker, there are a number of ways we could
494
     make this async, which we will probably do in the future */
495
0
  if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
496
0
    SAFE_FREE(tdb->transaction->blocks);
497
0
    SAFE_FREE(tdb->transaction);
498
0
    if ((lockflags & TDB_LOCK_WAIT) == 0) {
499
0
      tdb->ecode = TDB_ERR_NOLOCK;
500
0
    } else {
501
0
      TDB_LOG((tdb, TDB_DEBUG_ERROR,
502
0
         "tdb_transaction_start: "
503
0
         "failed to get transaction lock\n"));
504
0
    }
505
0
    return -1;
506
0
  }
507
508
  /* get a read lock from the freelist to the end of file. This
509
     is upgraded to a write lock during the commit */
510
0
  if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
511
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
512
0
    goto fail_allrecord_lock;
513
0
  }
514
515
  /* setup a copy of the hash table heads so the hash scan in
516
     traverse can be fast */
517
0
  tdb->transaction->hash_heads = (uint32_t *)
518
0
    calloc(tdb->hash_size+1, sizeof(uint32_t));
519
0
  if (tdb->transaction->hash_heads == NULL) {
520
0
    tdb->ecode = TDB_ERR_OOM;
521
0
    goto fail;
522
0
  }
523
0
  if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
524
0
           TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
525
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
526
0
    tdb->ecode = TDB_ERR_IO;
527
0
    goto fail;
528
0
  }
529
530
  /* make sure we know about any file expansions already done by
531
     anyone else */
532
0
  tdb_oob(tdb, tdb->map_size, 1, 1);
533
0
  tdb->transaction->old_map_size = tdb->map_size;
534
535
  /* finally hook the io methods, replacing them with
536
     transaction specific methods */
537
0
  tdb->transaction->io_methods = tdb->methods;
538
0
  tdb->methods = &transaction_methods;
539
540
  /* Trace at the end, so we get sequence number correct. */
541
0
  tdb_trace(tdb, "tdb_transaction_start");
542
0
  return 0;
543
544
0
fail:
545
0
  tdb_allrecord_unlock(tdb, F_RDLCK, false);
546
0
fail_allrecord_lock:
547
0
  tdb_transaction_unlock(tdb, F_WRLCK);
548
0
  SAFE_FREE(tdb->transaction->blocks);
549
0
  SAFE_FREE(tdb->transaction->hash_heads);
550
0
  SAFE_FREE(tdb->transaction);
551
0
  return -1;
552
0
}
553
554
_PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
555
0
{
556
0
  return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
557
0
}
558
559
_PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
560
0
{
561
0
  return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
562
0
}
563
564
/*
565
  sync to disk
566
*/
567
static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
{
  int rc;

  /* syncing is skipped entirely when TDB_NOSYNC is set */
  if (tdb->flags & TDB_NOSYNC) {
    return 0;
  }

  /* flush the file descriptor, using fdatasync() when available */
#ifdef HAVE_FDATASYNC
  rc = fdatasync(tdb->fd);
#else
  rc = fsync(tdb->fd);
#endif
  if (rc != 0) {
    tdb->ecode = TDB_ERR_IO;
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
    return -1;
  }

#ifdef HAVE_MMAP
  if (tdb->map_ptr) {
    /* msync() is given a page-aligned start, so round the offset
       down and lengthen the range by the same amount */
    tdb_off_t moffset = offset & ~(tdb->page_size-1);

    if (msync(moffset + (char *)tdb->map_ptr,
        length + (offset - moffset), MS_SYNC) != 0) {
      tdb->ecode = TDB_ERR_IO;
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
         strerror(errno)));
      return -1;
    }
  }
#endif
  return 0;
}
596
597
598
static int _tdb_transaction_cancel(struct tdb_context *tdb)
599
0
{
600
0
  uint32_t i;
601
0
  int ret = 0;
602
603
0
  if (tdb->transaction == NULL) {
604
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
605
0
    return -1;
606
0
  }
607
608
0
  if (tdb->transaction->nesting != 0) {
609
0
    tdb->transaction->transaction_error = 1;
610
0
    tdb->transaction->nesting--;
611
0
    return 0;
612
0
  }
613
614
0
  tdb->map_size = tdb->transaction->old_map_size;
615
616
  /* free all the transaction blocks */
617
0
  for (i=0;i<tdb->transaction->num_blocks;i++) {
618
0
    if ((tdb->transaction->blocks != NULL) &&
619
0
        tdb->transaction->blocks[i] != NULL) {
620
0
      free(tdb->transaction->blocks[i]);
621
0
    }
622
0
  }
623
0
  SAFE_FREE(tdb->transaction->blocks);
624
625
0
  if (tdb->transaction->magic_offset) {
626
0
    const struct tdb_methods *methods = tdb->transaction->io_methods;
627
0
    const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
628
629
    /* remove the recovery marker */
630
0
    if (methods->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4) == -1 ||
631
0
    transaction_sync(tdb, tdb->transaction->magic_offset, 4) == -1) {
632
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
633
0
      ret = -1;
634
0
    }
635
0
  }
636
637
  /* This also removes the OPEN_LOCK, if we have it. */
638
0
  tdb_release_transaction_locks(tdb);
639
640
  /* restore the normal io methods */
641
0
  tdb->methods = tdb->transaction->io_methods;
642
643
0
  SAFE_FREE(tdb->transaction->hash_heads);
644
0
  SAFE_FREE(tdb->transaction);
645
646
0
  return ret;
647
0
}
648
649
/*
650
  cancel the current transaction
651
*/
652
_PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
{
  int ret;

  /* emit the trace record first, then do the actual cancel */
  tdb_trace(tdb, "tdb_transaction_cancel");
  ret = _tdb_transaction_cancel(tdb);

  return ret;
}
657
658
/*
659
  work out how much space the linearised recovery data will consume
660
*/
661
static bool tdb_recovery_size(struct tdb_context *tdb, tdb_len_t *result)
{
  const struct tdb_transaction *tx = tdb->transaction;
  tdb_len_t total = sizeof(uint32_t);
  uint32_t i;

  for (i = 0; i < tx->num_blocks; i++) {
    tdb_len_t payload;

    /* blocks at or beyond the old end of file are not counted */
    if (i * tx->block_size >= tx->old_map_size) {
      break;
    }
    if (tx->blocks[i] == NULL) {
      continue;
    }

    /* only the valid part of the last block counts */
    payload = (i == tx->num_blocks-1) ? tx->last_block_size
              : tx->block_size;

    /* each counted block costs two tdb_off_t values plus its
       payload; give up if the running total overflows */
    if (!tdb_add_len_t(total, 2*sizeof(tdb_off_t), &total) ||
        !tdb_add_len_t(total, payload, &total)) {
      return false;
    }
  }

  *result = total;
  return true;
}
693
694
int tdb_recovery_area(struct tdb_context *tdb,
695
          const struct tdb_methods *methods,
696
          tdb_off_t *recovery_offset,
697
          struct tdb_record *rec)
698
0
{
699
0
  int ret;
700
701
0
  if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
702
0
    return -1;
703
0
  }
704
705
0
  if (*recovery_offset == 0) {
706
0
    rec->rec_len = 0;
707
0
    return 0;
708
0
  }
709
710
0
  if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
711
0
            DOCONV()) == -1) {
712
0
    return -1;
713
0
  }
714
715
  /* ignore invalid recovery regions: can happen in crash */
716
0
  if (rec->magic != TDB_RECOVERY_MAGIC &&
717
0
      rec->magic != TDB_RECOVERY_INVALID_MAGIC) {
718
0
    *recovery_offset = 0;
719
0
    rec->rec_len = 0;
720
0
  }
721
722
0
  ret = methods->tdb_oob(tdb, *recovery_offset, rec->rec_len, 1);
723
0
  if (ret == -1) {
724
0
    *recovery_offset = 0;
725
0
    rec->rec_len = 0;
726
0
  }
727
728
0
  return 0;
729
0
}
730
731
/*
732
  allocate the recovery area, or use an existing recovery area if it is
733
  large enough
734
*/
735
static int tdb_recovery_allocate(struct tdb_context *tdb,
736
         tdb_len_t *recovery_size,
737
         tdb_off_t *recovery_offset,
738
         tdb_len_t *recovery_max_size)
739
0
{
740
0
  struct tdb_record rec;
741
0
  const struct tdb_methods *methods = tdb->transaction->io_methods;
742
0
  tdb_off_t recovery_head, new_end;
743
744
0
  if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
745
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
746
0
    return -1;
747
0
  }
748
749
0
  if (!tdb_recovery_size(tdb, recovery_size)) {
750
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
751
0
       "overflow recovery size\n"));
752
0
    return -1;
753
0
  }
754
755
  /* Existing recovery area? */
756
0
  if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
757
    /* it fits in the existing area */
758
0
    *recovery_max_size = rec.rec_len;
759
0
    *recovery_offset = recovery_head;
760
0
    return 0;
761
0
  }
762
763
  /* If recovery area in middle of file, we need a new one. */
764
0
  if (recovery_head == 0
765
0
      || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) {
766
    /* we need to free up the old recovery area, then allocate a
767
       new one at the end of the file. Note that we cannot use
768
       tdb_allocate() to allocate the new one as that might return
769
       us an area that is being currently used (as of the start of
770
       the transaction) */
771
0
    if (recovery_head) {
772
0
      if (tdb_free(tdb, recovery_head, &rec) == -1) {
773
0
        TDB_LOG((tdb, TDB_DEBUG_FATAL,
774
0
           "tdb_recovery_allocate: failed to"
775
0
           " free previous recovery area\n"));
776
0
        return -1;
777
0
      }
778
779
      /* the tdb_free() call might have increased
780
       * the recovery size */
781
0
      if (!tdb_recovery_size(tdb, recovery_size)) {
782
0
        TDB_LOG((tdb, TDB_DEBUG_FATAL,
783
0
           "tdb_recovery_allocate: "
784
0
           "overflow recovery size\n"));
785
0
        return -1;
786
0
      }
787
0
    }
788
789
    /* New head will be at end of file. */
790
0
    recovery_head = tdb->map_size;
791
0
  }
792
793
  /* Now we know where it will be. */
794
0
  *recovery_offset = recovery_head;
795
796
  /* Expand by more than we need, so we don't do it often. */
797
0
  *recovery_max_size = tdb_expand_adjust(tdb->map_size,
798
0
                 *recovery_size,
799
0
                 tdb->page_size)
800
0
    - sizeof(rec);
801
802
0
  if (!tdb_add_off_t(recovery_head, sizeof(rec), &new_end) ||
803
0
      !tdb_add_off_t(new_end, *recovery_max_size, &new_end)) {
804
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: "
805
0
       "overflow recovery area\n"));
806
0
    return -1;
807
0
  }
808
809
0
  if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
810
0
             new_end - tdb->transaction->old_map_size)
811
0
      == -1) {
812
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
813
0
    return -1;
814
0
  }
815
816
  /* remap the file (if using mmap) */
817
0
  methods->tdb_oob(tdb, tdb->map_size, 1, 1);
818
819
  /* we have to reset the old map size so that we don't try to expand the file
820
     again in the transaction commit, which would destroy the recovery area */
821
0
  tdb->transaction->old_map_size = tdb->map_size;
822
823
  /* write the recovery header offset and sync - we can sync without a race here
824
     as the magic ptr in the recovery record has not been set */
825
0
  CONVERT(recovery_head);
826
0
  if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
827
0
             &recovery_head, sizeof(tdb_off_t)) == -1) {
828
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
829
0
    return -1;
830
0
  }
831
0
  if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
832
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
833
0
    return -1;
834
0
  }
835
836
0
  return 0;
837
0
}
838
839
840
/*
841
  setup the recovery data that will be used on a crash during commit
842
*/
843
static int transaction_setup_recovery(struct tdb_context *tdb,
844
              tdb_off_t *magic_offset)
845
0
{
846
0
  tdb_len_t recovery_size;
847
0
  unsigned char *data, *p;
848
0
  const struct tdb_methods *methods = tdb->transaction->io_methods;
849
0
  struct tdb_record *rec;
850
0
  tdb_off_t recovery_offset, recovery_max_size;
851
0
  tdb_off_t old_map_size = tdb->transaction->old_map_size;
852
0
  uint32_t magic, tailer;
853
0
  uint32_t i;
854
855
  /*
856
    check that the recovery area has enough space
857
  */
858
0
  if (tdb_recovery_allocate(tdb, &recovery_size,
859
0
          &recovery_offset, &recovery_max_size) == -1) {
860
0
    return -1;
861
0
  }
862
863
0
  rec = malloc(recovery_size + sizeof(*rec));
864
0
  if (rec == NULL) {
865
0
    tdb->ecode = TDB_ERR_OOM;
866
0
    return -1;
867
0
  }
868
869
0
  memset(rec, 0, sizeof(*rec));
870
871
0
  rec->magic    = TDB_RECOVERY_INVALID_MAGIC;
872
0
  rec->data_len = recovery_size;
873
0
  rec->rec_len  = recovery_max_size;
874
0
  rec->key_len  = old_map_size;
875
0
  CONVERT(*rec);
876
877
0
  data = (unsigned char *)rec;
878
879
  /* build the recovery data into a single blob to allow us to do a single
880
     large write, which should be more efficient */
881
0
  p = data + sizeof(*rec);
882
0
  for (i=0;i<tdb->transaction->num_blocks;i++) {
883
0
    tdb_off_t offset;
884
0
    tdb_len_t length;
885
886
0
    if (tdb->transaction->blocks[i] == NULL) {
887
0
      continue;
888
0
    }
889
890
0
    offset = i * tdb->transaction->block_size;
891
0
    length = tdb->transaction->block_size;
892
0
    if (i == tdb->transaction->num_blocks-1) {
893
0
      length = tdb->transaction->last_block_size;
894
0
    }
895
896
0
    if (offset >= old_map_size) {
897
0
      continue;
898
0
    }
899
0
    if (offset + length > tdb->transaction->old_map_size) {
900
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
901
0
      free(data);
902
0
      tdb->ecode = TDB_ERR_CORRUPT;
903
0
      return -1;
904
0
    }
905
0
    memcpy(p, &offset, 4);
906
0
    memcpy(p+4, &length, 4);
907
0
    if (DOCONV()) {
908
0
      tdb_convert(p, 8);
909
0
    }
910
    /* the recovery area contains the old data, not the
911
       new data, so we have to call the original tdb_read
912
       method to get it */
913
0
    if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
914
0
      free(data);
915
0
      tdb->ecode = TDB_ERR_IO;
916
0
      return -1;
917
0
    }
918
0
    p += 8 + length;
919
0
  }
920
921
  /* and the tailer */
922
0
  tailer = sizeof(*rec) + recovery_max_size;
923
0
  memcpy(p, &tailer, 4);
924
0
  if (DOCONV()) {
925
0
    tdb_convert(p, 4);
926
0
  }
927
928
  /* write the recovery data to the recovery area */
929
0
  if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
930
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
931
0
    free(data);
932
0
    tdb->ecode = TDB_ERR_IO;
933
0
    return -1;
934
0
  }
935
0
  if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
936
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
937
0
    free(data);
938
0
    tdb->ecode = TDB_ERR_IO;
939
0
    return -1;
940
0
  }
941
942
  /* as we don't have ordered writes, we have to sync the recovery
943
     data before we update the magic to indicate that the recovery
944
     data is present */
945
0
  if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
946
0
    free(data);
947
0
    return -1;
948
0
  }
949
950
0
  free(data);
951
952
0
  magic = TDB_RECOVERY_MAGIC;
953
0
  CONVERT(magic);
954
955
0
  *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
956
957
0
  if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
958
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
959
0
    tdb->ecode = TDB_ERR_IO;
960
0
    return -1;
961
0
  }
962
0
  if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
963
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
964
0
    tdb->ecode = TDB_ERR_IO;
965
0
    return -1;
966
0
  }
967
968
  /* ensure the recovery magic marker is on disk */
969
0
  if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
970
0
    return -1;
971
0
  }
972
973
0
  return 0;
974
0
}
975
976
static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
977
0
{
978
0
  const struct tdb_methods *methods;
979
980
0
  if (tdb->transaction == NULL) {
981
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
982
0
    return -1;
983
0
  }
984
985
0
  if (tdb->transaction->prepared) {
986
0
    tdb->ecode = TDB_ERR_EINVAL;
987
0
    _tdb_transaction_cancel(tdb);
988
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
989
0
    return -1;
990
0
  }
991
992
0
  if (tdb->transaction->transaction_error) {
993
0
    tdb->ecode = TDB_ERR_IO;
994
0
    _tdb_transaction_cancel(tdb);
995
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
996
0
    return -1;
997
0
  }
998
999
1000
0
  if (tdb->transaction->nesting != 0) {
1001
0
    return 0;
1002
0
  }
1003
1004
  /* check for a null transaction */
1005
0
  if (tdb->transaction->blocks == NULL) {
1006
0
    return 0;
1007
0
  }
1008
1009
0
  methods = tdb->transaction->io_methods;
1010
1011
  /* if there are any locks pending then the caller has not
1012
     nested their locks properly, so fail the transaction */
1013
0
  if (tdb_have_extra_locks(tdb)) {
1014
0
    tdb->ecode = TDB_ERR_LOCK;
1015
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
1016
0
    _tdb_transaction_cancel(tdb);
1017
0
    return -1;
1018
0
  }
1019
1020
  /* upgrade the main transaction lock region to a write lock */
1021
0
  if (tdb_allrecord_upgrade(tdb) == -1) {
1022
0
    if (tdb->ecode == TDB_ERR_RDONLY && tdb->read_only) {
1023
0
      TDB_LOG((tdb, TDB_DEBUG_ERROR,
1024
0
         "tdb_transaction_prepare_commit: "
1025
0
         "failed to upgrade hash locks: "
1026
0
         "database is read only\n"));
1027
0
    } else if (tdb->ecode == TDB_ERR_RDONLY
1028
0
         && tdb->traverse_read) {
1029
0
      TDB_LOG((tdb, TDB_DEBUG_ERROR,
1030
0
         "tdb_transaction_prepare_commit: "
1031
0
         "failed to upgrade hash locks: "
1032
0
         "a database traverse is in progress\n"));
1033
0
    } else {
1034
0
      TDB_LOG((tdb, TDB_DEBUG_ERROR,
1035
0
         "tdb_transaction_prepare_commit: "
1036
0
         "failed to upgrade hash locks: %s\n",
1037
0
         tdb_errorstr(tdb)));
1038
0
    }
1039
0
    _tdb_transaction_cancel(tdb);
1040
0
    return -1;
1041
0
  }
1042
1043
  /* get the open lock - this prevents new users attaching to the database
1044
     during the commit */
1045
0
  if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
1046
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
1047
0
    _tdb_transaction_cancel(tdb);
1048
0
    return -1;
1049
0
  }
1050
1051
  /* write the recovery data to the end of the file */
1052
0
  if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
1053
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
1054
0
    _tdb_transaction_cancel(tdb);
1055
0
    return -1;
1056
0
  }
1057
1058
0
  tdb->transaction->prepared = true;
1059
1060
  /* expand the file to the new size if needed */
1061
0
  if (tdb->map_size != tdb->transaction->old_map_size) {
1062
0
    if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
1063
0
               tdb->map_size -
1064
0
               tdb->transaction->old_map_size) == -1) {
1065
0
      tdb->ecode = TDB_ERR_IO;
1066
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
1067
0
      _tdb_transaction_cancel(tdb);
1068
0
      return -1;
1069
0
    }
1070
0
    tdb->map_size = tdb->transaction->old_map_size;
1071
0
    methods->tdb_oob(tdb, tdb->map_size, 1, 1);
1072
0
  }
1073
1074
  /* Keep the open lock until the actual commit */
1075
1076
0
  return 0;
1077
0
}
1078
1079
/*
1080
   prepare to commit the current transaction
1081
*/
1082
_PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
1083
0
{
1084
0
  tdb_trace(tdb, "tdb_transaction_prepare_commit");
1085
0
  return _tdb_transaction_prepare_commit(tdb);
1086
0
}
1087
1088
/* A repack is worthwhile if the largest is less than half total free. */
1089
static bool repack_worthwhile(struct tdb_context *tdb)
1090
0
{
1091
0
  tdb_off_t ptr;
1092
0
  struct tdb_record rec;
1093
0
  tdb_len_t total = 0, largest = 0;
1094
1095
0
  if (tdb_ofs_read(tdb, FREELIST_TOP, &ptr) == -1) {
1096
0
    return false;
1097
0
  }
1098
1099
0
  while (ptr != 0 && tdb_rec_free_read(tdb, ptr, &rec) == 0) {
1100
0
    total += rec.rec_len;
1101
0
    if (rec.rec_len > largest) {
1102
0
      largest = rec.rec_len;
1103
0
    }
1104
0
    ptr = rec.next;
1105
0
  }
1106
1107
0
  return total > largest * 2;
1108
0
}
1109
1110
/*
1111
  commit the current transaction
1112
*/
1113
_PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
1114
0
{
1115
0
  const struct tdb_methods *methods;
1116
0
  uint32_t i;
1117
0
  bool need_repack = false;
1118
1119
0
  if (tdb->transaction == NULL) {
1120
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1121
0
    return -1;
1122
0
  }
1123
1124
0
  tdb_trace(tdb, "tdb_transaction_commit");
1125
1126
0
  if (tdb->transaction->transaction_error) {
1127
0
    tdb->ecode = TDB_ERR_IO;
1128
0
    _tdb_transaction_cancel(tdb);
1129
0
    TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1130
0
    return -1;
1131
0
  }
1132
1133
1134
0
  if (tdb->transaction->nesting != 0) {
1135
0
    tdb->transaction->nesting--;
1136
0
    return 0;
1137
0
  }
1138
1139
  /* check for a null transaction */
1140
0
  if (tdb->transaction->blocks == NULL) {
1141
0
    _tdb_transaction_cancel(tdb);
1142
0
    return 0;
1143
0
  }
1144
1145
0
  if (!tdb->transaction->prepared) {
1146
0
    int ret = _tdb_transaction_prepare_commit(tdb);
1147
0
    if (ret)
1148
0
      return ret;
1149
0
  }
1150
1151
0
  methods = tdb->transaction->io_methods;
1152
1153
  /* perform all the writes */
1154
0
  for (i=0;i<tdb->transaction->num_blocks;i++) {
1155
0
    tdb_off_t offset;
1156
0
    tdb_len_t length;
1157
1158
0
    if (tdb->transaction->blocks[i] == NULL) {
1159
0
      continue;
1160
0
    }
1161
1162
0
    offset = i * tdb->transaction->block_size;
1163
0
    length = tdb->transaction->block_size;
1164
0
    if (i == tdb->transaction->num_blocks-1) {
1165
0
      length = tdb->transaction->last_block_size;
1166
0
    }
1167
1168
0
    if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
1169
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
1170
1171
      /* we've overwritten part of the data and
1172
         possibly expanded the file, so we need to
1173
         run the crash recovery code */
1174
0
      tdb->methods = methods;
1175
0
      tdb_transaction_recover(tdb);
1176
1177
0
      _tdb_transaction_cancel(tdb);
1178
1179
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
1180
0
      return -1;
1181
0
    }
1182
0
    SAFE_FREE(tdb->transaction->blocks[i]);
1183
0
  }
1184
1185
  /* Do this before we drop lock or blocks. */
1186
0
  if (tdb->transaction->expanded) {
1187
0
    need_repack = repack_worthwhile(tdb);
1188
0
  }
1189
1190
0
  SAFE_FREE(tdb->transaction->blocks);
1191
0
  tdb->transaction->num_blocks = 0;
1192
1193
  /* ensure the new data is on disk */
1194
0
  if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1195
0
    return -1;
1196
0
  }
1197
1198
  /*
1199
    TODO: maybe write to some dummy hdr field, or write to magic
1200
    offset without mmap, before the last sync, instead of the
1201
    utime() call
1202
  */
1203
1204
  /* on some systems (like Linux 2.6.x) changes via mmap/msync
1205
     don't change the mtime of the file, this means the file may
1206
     not be backed up (as tdb rounding to block sizes means that
1207
     file size changes are quite rare too). The following forces
1208
     mtime changes when a transaction completes */
1209
0
  futimens(tdb->fd, NULL);
1210
1211
  /* use a transaction cancel to free memory and remove the
1212
     transaction locks */
1213
0
  _tdb_transaction_cancel(tdb);
1214
1215
0
  if (need_repack) {
1216
0
    int ret = tdb_repack(tdb);
1217
0
    if (ret != 0) {
1218
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL,
1219
0
         __location__ " Failed to repack database (not fatal)\n"));
1220
0
    }
1221
    /*
1222
     * Ignore the error.
1223
     *
1224
     * Why?
1225
     *
1226
     * We just committed to the DB above, so anything
1227
     * written during the transaction is committed, the
1228
     * caller needs to know that the long-term state was
1229
     * successfully modified.
1230
     *
1231
     * tdb_repack is an optimization that can fail for
1232
     * reasons like lock ordering and we cannot recover
1233
     * the transaction lock at this point, having released
1234
     * it above.
1235
     *
1236
     * If we return a failure the caller thinks the
1237
     * transaction was rolled back.
1238
     */
1239
0
  }
1240
1241
0
  return 0;
1242
0
}
1243
1244
1245
/*
1246
  recover from an aborted transaction. Must be called with exclusive
1247
  database write access already established (including the open
1248
  lock to prevent new processes attaching)
1249
*/
1250
int tdb_transaction_recover(struct tdb_context *tdb)
1251
0
{
1252
0
  tdb_off_t recovery_head, recovery_eof;
1253
0
  unsigned char *data, *p;
1254
0
  uint32_t zero = 0;
1255
0
  struct tdb_record rec;
1256
1257
  /* find the recovery area */
1258
0
  if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1259
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
1260
0
    tdb->ecode = TDB_ERR_IO;
1261
0
    return -1;
1262
0
  }
1263
1264
0
  if (recovery_head == 0) {
1265
    /* we have never allocated a recovery record */
1266
0
    return 0;
1267
0
  }
1268
1269
  /* read the recovery record */
1270
0
  if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1271
0
           sizeof(rec), DOCONV()) == -1) {
1272
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
1273
0
    tdb->ecode = TDB_ERR_IO;
1274
0
    return -1;
1275
0
  }
1276
1277
0
  if (rec.magic != TDB_RECOVERY_MAGIC) {
1278
    /* there is no valid recovery data */
1279
0
    return 0;
1280
0
  }
1281
1282
0
  if (tdb->read_only) {
1283
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
1284
0
    tdb->ecode = TDB_ERR_CORRUPT;
1285
0
    return -1;
1286
0
  }
1287
1288
0
  recovery_eof = rec.key_len;
1289
1290
0
  data = (unsigned char *)malloc(rec.data_len);
1291
0
  if (data == NULL) {
1292
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
1293
0
    tdb->ecode = TDB_ERR_OOM;
1294
0
    return -1;
1295
0
  }
1296
1297
  /* read the full recovery data */
1298
0
  if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
1299
0
           rec.data_len, 0) == -1) {
1300
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
1301
0
    tdb->ecode = TDB_ERR_IO;
1302
0
    free(data);
1303
0
    return -1;
1304
0
  }
1305
1306
  /* recover the file data */
1307
0
  p = data;
1308
0
  while (p+8 < data + rec.data_len) {
1309
0
    uint32_t ofs, len;
1310
0
    if (DOCONV()) {
1311
0
      tdb_convert(p, 8);
1312
0
    }
1313
0
    memcpy(&ofs, p, 4);
1314
0
    memcpy(&len, p+4, 4);
1315
1316
0
    if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
1317
0
      free(data);
1318
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %u bytes at offset %u\n", len, ofs));
1319
0
      tdb->ecode = TDB_ERR_IO;
1320
0
      return -1;
1321
0
    }
1322
0
    p += 8 + len;
1323
0
  }
1324
1325
0
  free(data);
1326
1327
0
  if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
1328
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
1329
0
    tdb->ecode = TDB_ERR_IO;
1330
0
    return -1;
1331
0
  }
1332
1333
  /* if the recovery area is after the recovered eof then remove it */
1334
0
  if (recovery_eof <= recovery_head) {
1335
0
    if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
1336
0
      TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
1337
0
      tdb->ecode = TDB_ERR_IO;
1338
0
      return -1;
1339
0
    }
1340
0
  }
1341
1342
  /* remove the recovery magic */
1343
0
  if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
1344
0
        &zero) == -1) {
1345
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
1346
0
    tdb->ecode = TDB_ERR_IO;
1347
0
    return -1;
1348
0
  }
1349
1350
0
  if (transaction_sync(tdb, 0, recovery_eof) == -1) {
1351
0
    TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
1352
0
    tdb->ecode = TDB_ERR_IO;
1353
0
    return -1;
1354
0
  }
1355
1356
0
  TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %u byte database\n",
1357
0
     recovery_eof));
1358
1359
  /* all done */
1360
0
  return 0;
1361
0
}
1362
1363
/* Any I/O failures we say "needs recovery". */
1364
bool tdb_needs_recovery(struct tdb_context *tdb)
1365
0
{
1366
0
  tdb_off_t recovery_head;
1367
0
  struct tdb_record rec;
1368
1369
  /* find the recovery area */
1370
0
  if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1371
0
    return true;
1372
0
  }
1373
1374
0
  if (recovery_head == 0) {
1375
    /* we have never allocated a recovery record */
1376
0
    return false;
1377
0
  }
1378
1379
  /* read the recovery record */
1380
0
  if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
1381
0
           sizeof(rec), DOCONV()) == -1) {
1382
0
    return true;
1383
0
  }
1384
1385
0
  return (rec.magic == TDB_RECOVERY_MAGIC);
1386
0
}