/src/samba/lib/tdb/common/lock.c
Line | Count | Source |
1 | | /* |
2 | | Unix SMB/CIFS implementation. |
3 | | |
4 | | trivial database library |
5 | | |
6 | | Copyright (C) Andrew Tridgell 1999-2005 |
7 | | Copyright (C) Paul `Rusty' Russell 2000 |
8 | | Copyright (C) Jeremy Allison 2000-2003 |
9 | | |
10 | | ** NOTE! The following LGPL license applies to the tdb |
11 | | ** library. This does NOT imply that all of Samba is released |
12 | | ** under the LGPL |
13 | | |
14 | | This library is free software; you can redistribute it and/or |
15 | | modify it under the terms of the GNU Lesser General Public |
16 | | License as published by the Free Software Foundation; either |
17 | | version 3 of the License, or (at your option) any later version. |
18 | | |
19 | | This library is distributed in the hope that it will be useful, |
20 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
21 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
22 | | Lesser General Public License for more details. |
23 | | |
24 | | You should have received a copy of the GNU Lesser General Public |
25 | | License along with this library; if not, see <http://www.gnu.org/licenses/>. |
26 | | */ |
27 | | |
28 | | #include "tdb_private.h" |
29 | | |
30 | | _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr) |
31 | 0 | { |
32 | 0 | tdb->interrupt_sig_ptr = ptr; |
33 | 0 | } |
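
tdb_setalarm_sigptr() lets a SIGALRM handler abort a blocked lock wait: tdb_brlock() below re-checks *interrupt_sig_ptr whenever fcntl() fails with EINTR. A minimal usage sketch (assuming the handler is installed without SA_RESTART so the blocked fcntl() really does return EINTR; error handling elided):

    #include <signal.h>
    #include <unistd.h>

    static volatile sig_atomic_t lock_timed_out;

    static void lock_alarm(int signum)
    {
            (void)signum;
            lock_timed_out = 1;     /* async-signal-safe: only set the flag */
    }

    /* after tdb_open(): */
    struct sigaction sa = { .sa_handler = lock_alarm };  /* sa_flags == 0: no SA_RESTART */
    sigaction(SIGALRM, &sa, NULL);
    tdb_setalarm_sigptr(tdb, &lock_timed_out);
    alarm(5);       /* a blocked lock wait now gives up once the alarm fires */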
34 | | |
35 | | static int fcntl_lock(struct tdb_context *tdb, |
36 | | int rw, off_t off, off_t len, bool waitflag) |
37 | 0 | { |
38 | 0 | struct flock fl; |
39 | 0 | int cmd; |
40 | |
41 | 0 | #ifdef USE_TDB_MUTEX_LOCKING |
42 | 0 | { |
43 | 0 | int ret; |
44 | 0 | if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) { |
45 | 0 | return ret; |
46 | 0 | } |
47 | 0 | } |
48 | 0 | #endif |
49 | | |
50 | 0 | fl.l_type = rw; |
51 | 0 | fl.l_whence = SEEK_SET; |
52 | 0 | fl.l_start = off; |
53 | 0 | fl.l_len = len; |
54 | 0 | fl.l_pid = 0; |
55 | |
56 | 0 | cmd = waitflag ? F_SETLKW : F_SETLK; |
57 | |
58 | 0 | return fcntl(tdb->fd, cmd, &fl); |
59 | 0 | } |
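
For reference, the underlying POSIX primitive standalone: a non-blocking attempt to write-lock a single byte, which is what fcntl_lock() issues in the F_SETLK case (a sketch; the fd and offset are illustrative):

    #include <fcntl.h>
    #include <errno.h>

    static int try_write_lock_byte(int fd, off_t off)
    {
            struct flock fl = {
                    .l_type   = F_WRLCK,
                    .l_whence = SEEK_SET,
                    .l_start  = off,
                    .l_len    = 1,          /* l_len == 0 would mean "to EOF" */
            };

            if (fcntl(fd, F_SETLK, &fl) == -1) {
                    /* EAGAIN or EACCES: another process holds a conflicting lock */
                    return -1;
            }
            return 0;
    }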
60 | | |
61 | | static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len) |
62 | 0 | { |
63 | 0 | struct flock fl; |
64 | | #if 0 /* Check they matched up locks and unlocks correctly. */ |
65 | | char line[80]; |
66 | | FILE *locks; |
67 | | bool found = false; |
68 | | |
69 | | locks = fopen("/proc/locks", "r"); |
70 | | |
71 | | while (fgets(line, 80, locks)) { |
72 | | char *p; |
73 | | int type, start, l; |
74 | | |
75 | | /* e.g. 1: POSIX ADVISORY WRITE 2440 08:01:2180826 0 EOF */ |
76 | | p = strchr(line, ':') + 1; |
77 | | if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY "))) |
78 | | continue; |
79 | | p += strlen(" POSIX ADVISORY "); |
80 | | if (strncmp(p, "READ ", strlen("READ ")) == 0) |
81 | | type = F_RDLCK; |
82 | | else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0) |
83 | | type = F_WRLCK; |
84 | | else |
85 | | abort(); |
86 | | p += 6; |
87 | | if (atoi(p) != getpid()) |
88 | | continue; |
89 | | p = strchr(strchr(p, ' ') + 1, ' ') + 1; |
90 | | start = atoi(p); |
91 | | p = strchr(p, ' ') + 1; |
92 | | if (strncmp(p, "EOF", 3) == 0) |
93 | | l = 0; |
94 | | else |
95 | | l = atoi(p) - start + 1; |
96 | | |
97 | | if (off == start) { |
98 | | if (len != l) { |
99 | | fprintf(stderr, "Len %u should be %u: %s", |
100 | | (int)len, l, line); |
101 | | abort(); |
102 | | } |
103 | | if (type != rw) { |
104 | | fprintf(stderr, "Type %s wrong: %s", |
105 | | rw == F_RDLCK ? "READ" : "WRITE", line); |
106 | | abort(); |
107 | | } |
108 | | found = true; |
109 | | break; |
110 | | } |
111 | | } |
112 | | |
113 | | if (!found) { |
114 | | fprintf(stderr, "Unlock on %u@%u not found!\n", |
115 | | (int)off, (int)len); |
116 | | abort(); |
117 | | } |
118 | | |
119 | | fclose(locks); |
120 | | #endif |
121 | |
122 | 0 | #ifdef USE_TDB_MUTEX_LOCKING |
123 | 0 | { |
124 | 0 | int ret; |
125 | 0 | if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) { |
126 | 0 | return ret; |
127 | 0 | } |
128 | 0 | } |
129 | 0 | #endif |
130 | | |
131 | 0 | fl.l_type = F_UNLCK; |
132 | 0 | fl.l_whence = SEEK_SET; |
133 | 0 | fl.l_start = off; |
134 | 0 | fl.l_len = len; |
135 | 0 | fl.l_pid = 0; |
136 | |
137 | 0 | return fcntl(tdb->fd, F_SETLKW, &fl); |
138 | 0 | } |
139 | | |
140 | | /* |
141 | | * Calculate the lock offset for a list |
142 | | * |
143 | | * list -1 is the freelist, otherwise a hash chain. |
144 | | * |
145 | | * Note that we consistently (but without real reason) lock hash chains at an |
146 | | * offset that is 4 bytes below the real offset of the corresponding list head |
147 | | * in the db. |
148 | | * |
149 | | * This is the memory layout of the hashchain array: |
150 | | * |
151 | | * FREELIST_TOP + 0 = freelist |
152 | | * FREELIST_TOP + 4 = hashtable list 0 |
153 | | * FREELIST_TOP + 8 = hashtable list 1 |
154 | | * ... |
155 | | * |
156 | | * On the other hand, lock_offset() computes: |
157 | | * |
158 | | * freelist = FREELIST_TOP - 4 |
159 | | * list 0 = FREELIST_TOP + 0 |
160 | | * list 1 = FREELIST_TOP + 4 |
161 | | * ... |
162 | | * |
163 | | * Unfortunately we can't change this calculation in order to align the locking |
164 | | * offset with the memory layout, as that would make the locking incompatible |
165 | | * between different tdb versions. |
166 | | */ |
167 | | static tdb_off_t lock_offset(int list) |
168 | 0 | { |
169 | 0 | return FREELIST_TOP + 4*list; |
170 | 0 | } |
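
To make the off-by-4 concrete, a standalone sketch (the FREELIST_TOP value here is made up for illustration; in tdb it derives from the on-disk header size):

    #include <stdio.h>

    #define FREELIST_TOP 0xA8       /* hypothetical header size */

    static unsigned int lock_offset(int list)
    {
            return FREELIST_TOP + 4*list;
    }

    int main(void)
    {
            /* every lock offset trails the real list head by 4 bytes */
            printf("freelist: lock at %#x, head at %#x\n", lock_offset(-1), FREELIST_TOP);
            printf("chain 0:  lock at %#x, head at %#x\n", lock_offset(0),  FREELIST_TOP + 4);
            printf("chain 1:  lock at %#x, head at %#x\n", lock_offset(1),  FREELIST_TOP + 8);
            return 0;
    }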
171 | | |
172 | | /* a byte-range locking function - return 0 on success. |
173 | | This function locks/unlocks "len" bytes at the specified offset. |
174 | | |
175 | | On error, errno is also set so that errors are passed back properly |
176 | | through tdb_open(). |
177 | | |
178 | | note that a len of zero means lock to end of file |
179 | | */ |
180 | | int tdb_brlock(struct tdb_context *tdb, |
181 | | int rw_type, tdb_off_t offset, size_t len, |
182 | | enum tdb_lock_flags flags) |
183 | 0 | { |
184 | 0 | int ret; |
185 | |
186 | 0 | if (tdb->flags & TDB_NOLOCK) { |
187 | 0 | return 0; |
188 | 0 | } |
189 | | |
190 | 0 | if (flags & TDB_LOCK_MARK_ONLY) { |
191 | 0 | return 0; |
192 | 0 | } |
193 | | |
194 | 0 | if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) { |
195 | 0 | tdb->ecode = TDB_ERR_RDONLY; |
196 | 0 | return -1; |
197 | 0 | } |
198 | | |
199 | 0 | do { |
200 | 0 | ret = fcntl_lock(tdb, rw_type, offset, len, |
201 | 0 | flags & TDB_LOCK_WAIT); |
202 | | /* Check for a sigalarm break. */ |
203 | 0 | if (ret == -1 && errno == EINTR && |
204 | 0 | tdb->interrupt_sig_ptr && |
205 | 0 | *tdb->interrupt_sig_ptr) { |
206 | 0 | break; |
207 | 0 | } |
208 | 0 | } while (ret == -1 && errno == EINTR); |
209 | |
210 | 0 | if (ret == -1) { |
211 | 0 | tdb->ecode = TDB_ERR_LOCK; |
212 | | /* Generic lock error. errno set by fcntl. |
213 | | * EAGAIN is an expected return from non-blocking |
214 | | * locks. */ |
215 | 0 | if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) { |
216 | 0 | TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n", |
217 | 0 | tdb->fd, offset, rw_type, flags, len)); |
218 | 0 | } |
219 | 0 | return -1; |
220 | 0 | } |
221 | 0 | return 0; |
222 | 0 | } |
223 | | |
224 | | int tdb_brunlock(struct tdb_context *tdb, |
225 | | int rw_type, tdb_off_t offset, size_t len) |
226 | 0 | { |
227 | 0 | int ret; |
228 | |
229 | 0 | if (tdb->flags & TDB_NOLOCK) { |
230 | 0 | return 0; |
231 | 0 | } |
232 | | |
233 | 0 | do { |
234 | 0 | ret = fcntl_unlock(tdb, rw_type, offset, len); |
235 | 0 | } while (ret == -1 && errno == EINTR); |
236 | |
237 | 0 | if (ret == -1) { |
238 | 0 | TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n", |
239 | 0 | tdb->fd, offset, rw_type, len)); |
240 | 0 | } |
241 | 0 | return ret; |
242 | 0 | } |
243 | | |
244 | | /* |
245 | | * Do a tdb_brlock in a loop. Some OSes (such as Solaris) have overly |
246 | | * conservative deadlock detection and claim a deadlock even when progress |
247 | | * can be made. For those OSes we may loop for a while. |
248 | | */ |
249 | | |
250 | | static int tdb_brlock_retry(struct tdb_context *tdb, |
251 | | int rw_type, tdb_off_t offset, size_t len, |
252 | | enum tdb_lock_flags flags) |
253 | 0 | { |
254 | 0 | int count = 1000; |
255 | |
256 | 0 | while (count--) { |
257 | 0 | struct timeval tv; |
258 | 0 | int ret; |
259 | |
260 | 0 | ret = tdb_brlock(tdb, rw_type, offset, len, flags); |
261 | 0 | if (ret == 0) { |
262 | 0 | return 0; |
263 | 0 | } |
264 | 0 | if (errno != EDEADLK) { |
265 | 0 | break; |
266 | 0 | } |
267 | | /* sleep for as short a time as we can - more portable than usleep() */ |
268 | 0 | tv.tv_sec = 0; |
269 | 0 | tv.tv_usec = 1; |
270 | 0 | select(0, NULL, NULL, NULL, &tv); |
271 | 0 | } |
272 | 0 | return -1; |
273 | 0 | } |
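
The select() call above doubles as a portable sub-millisecond sleep (usleep() was historically less widely available). Isolated, the idiom is:

    #include <sys/select.h>

    /* sleep for roughly a microsecond without usleep()/nanosleep() */
    static void tiny_sleep(void)
    {
            struct timeval tv = { .tv_sec = 0, .tv_usec = 1 };

            select(0, NULL, NULL, NULL, &tv);   /* no fds watched: pure timeout */
    }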
274 | | |
275 | | /* |
276 | | upgrade a read lock to a write lock. |
277 | | */ |
278 | | int tdb_allrecord_upgrade(struct tdb_context *tdb) |
279 | 0 | { |
280 | 0 | int ret; |
281 | |
282 | 0 | if (tdb->allrecord_lock.count != 1) { |
283 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
284 | 0 | "tdb_allrecord_upgrade failed: count %u too high\n", |
285 | 0 | tdb->allrecord_lock.count)); |
286 | 0 | tdb->ecode = TDB_ERR_LOCK; |
287 | 0 | return -1; |
288 | 0 | } |
289 | | |
290 | 0 | if (tdb->allrecord_lock.off != 1) { |
291 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
292 | 0 | "tdb_allrecord_upgrade failed: already upgraded?\n")); |
293 | 0 | tdb->ecode = TDB_ERR_LOCK; |
294 | 0 | return -1; |
295 | 0 | } |
296 | | |
297 | 0 | if (tdb_have_mutexes(tdb)) { |
298 | 0 | ret = tdb_mutex_allrecord_upgrade(tdb); |
299 | 0 | if (ret == -1) { |
300 | 0 | goto fail; |
301 | 0 | } |
302 | 0 | ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size), |
303 | 0 | 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE); |
304 | 0 | if (ret == -1) { |
305 | 0 | tdb_mutex_allrecord_downgrade(tdb); |
306 | 0 | } |
307 | 0 | } else { |
308 | 0 | ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0, |
309 | 0 | TDB_LOCK_WAIT|TDB_LOCK_PROBE); |
310 | 0 | } |
311 | | |
312 | 0 | if (ret == 0) { |
313 | 0 | tdb->allrecord_lock.ltype = F_WRLCK; |
314 | 0 | tdb->allrecord_lock.off = 0; |
315 | 0 | return 0; |
316 | 0 | } |
317 | 0 | fail: |
318 | 0 | TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n")); |
319 | 0 | return -1; |
320 | 0 | } |
321 | | |
322 | | static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb, |
323 | | tdb_off_t offset) |
324 | 0 | { |
325 | 0 | int i; |
326 | |
327 | 0 | for (i=0; i<tdb->num_lockrecs; i++) { |
328 | 0 | if (tdb->lockrecs[i].off == offset) { |
329 | 0 | return &tdb->lockrecs[i]; |
330 | 0 | } |
331 | 0 | } |
332 | 0 | return NULL; |
333 | 0 | } |
334 | | |
335 | | /* lock an offset in the database. */ |
336 | | int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype, |
337 | | enum tdb_lock_flags flags) |
338 | 0 | { |
339 | 0 | struct tdb_lock_type *new_lck; |
340 | |
341 | 0 | if (offset >= lock_offset(tdb->hash_size)) { |
342 | 0 | tdb->ecode = TDB_ERR_LOCK; |
343 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n", |
344 | 0 | offset, ltype)); |
345 | 0 | return -1; |
346 | 0 | } |
347 | 0 | if (tdb->flags & TDB_NOLOCK) |
348 | 0 | return 0; |
349 | | |
350 | 0 | new_lck = find_nestlock(tdb, offset); |
351 | 0 | if (new_lck) { |
352 | 0 | if ((new_lck->ltype == F_RDLCK) && (ltype == F_WRLCK)) { |
353 | 0 | if (!tdb_have_mutexes(tdb)) { |
354 | 0 | int ret; |
355 | | /* |
356 | | * Upgrade the underlying fcntl |
357 | | * lock. Mutexes don't do readlocks, |
358 | | * so this only applies to fcntl |
359 | | * locking. |
360 | | */ |
361 | 0 | ret = tdb_brlock(tdb, ltype, offset, 1, flags); |
362 | 0 | if (ret != 0) { |
363 | 0 | return ret; |
364 | 0 | } |
365 | 0 | } |
366 | 0 | new_lck->ltype = F_WRLCK; |
367 | 0 | } |
368 | | /* |
369 | | * Just increment the in-memory struct, posix locks |
370 | | * don't stack. |
371 | | */ |
372 | 0 | new_lck->count++; |
373 | 0 | return 0; |
374 | 0 | } |
375 | | |
376 | 0 | if (tdb->num_lockrecs == tdb->lockrecs_array_length) { |
377 | 0 | new_lck = (struct tdb_lock_type *)realloc( |
378 | 0 | tdb->lockrecs, |
379 | 0 | sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1)); |
380 | 0 | if (new_lck == NULL) { |
381 | 0 | errno = ENOMEM; |
382 | 0 | return -1; |
383 | 0 | } |
384 | 0 | tdb->lockrecs_array_length = tdb->num_lockrecs+1; |
385 | 0 | tdb->lockrecs = new_lck; |
386 | 0 | } |
387 | | |
388 | | /* Since fcntl locks don't nest, we do a lock for the first one, |
389 | | and simply bump the count for future ones */ |
390 | 0 | if (tdb_brlock(tdb, ltype, offset, 1, flags)) { |
391 | 0 | return -1; |
392 | 0 | } |
393 | | |
394 | 0 | new_lck = &tdb->lockrecs[tdb->num_lockrecs]; |
395 | |
396 | 0 | new_lck->off = offset; |
397 | 0 | new_lck->count = 1; |
398 | 0 | new_lck->ltype = ltype; |
399 | 0 | tdb->num_lockrecs++; |
400 | |
401 | 0 | return 0; |
402 | 0 | } |
403 | | |
404 | | static int tdb_lock_and_recover(struct tdb_context *tdb) |
405 | 0 | { |
406 | 0 | int ret; |
407 | | |
408 | | /* We need to match locking order in transaction commit. */ |
409 | 0 | if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) { |
410 | 0 | return -1; |
411 | 0 | } |
412 | | |
413 | 0 | if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) { |
414 | 0 | tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0); |
415 | 0 | return -1; |
416 | 0 | } |
417 | | |
418 | 0 | ret = tdb_transaction_recover(tdb); |
419 | |
420 | 0 | tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1); |
421 | 0 | tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0); |
422 | |
423 | 0 | return ret; |
424 | 0 | } |
425 | | |
426 | | static bool have_data_locks(const struct tdb_context *tdb) |
427 | 0 | { |
428 | 0 | int i; |
429 | |
430 | 0 | for (i = 0; i < tdb->num_lockrecs; i++) { |
431 | 0 | if (tdb->lockrecs[i].off >= lock_offset(-1)) |
432 | 0 | return true; |
433 | 0 | } |
434 | 0 | return false; |
435 | 0 | } |
436 | | |
437 | | /* |
438 | | * An allrecord lock allows us to avoid per-chain locks. Check if the allrecord |
439 | | * lock is strong enough. |
440 | | */ |
441 | | static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb, |
442 | | int ltype) |
443 | 0 | { |
444 | 0 | if (ltype == F_RDLCK) { |
445 | | /* |
446 | | * The allrecord_lock is equal (F_RDLCK) or stronger |
447 | | * (F_WRLCK). Pass. |
448 | | */ |
449 | 0 | return 0; |
450 | 0 | } |
451 | | |
452 | 0 | if (tdb->allrecord_lock.ltype == F_RDLCK) { |
453 | | /* |
454 | | * We ask for ltype==F_WRLCK, but the allrecord_lock |
455 | | * is too weak. We can't upgrade here, so fail. |
456 | | */ |
457 | 0 | tdb->ecode = TDB_ERR_LOCK; |
458 | 0 | return -1; |
459 | 0 | } |
460 | | |
461 | | /* |
462 | | * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass. |
463 | | */ |
464 | 0 | return 0; |
465 | 0 | } |
466 | | |
467 | | static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype, |
468 | | enum tdb_lock_flags waitflag) |
469 | 0 | { |
470 | 0 | int ret; |
471 | 0 | bool check = false; |
472 | |
473 | 0 | if (tdb->allrecord_lock.count) { |
474 | 0 | return tdb_lock_covered_by_allrecord_lock(tdb, ltype); |
475 | 0 | } |
476 | | |
477 | | /* |
478 | | * Check for recoveries: Someone might have kill -9'ed a process |
479 | | * during a commit. |
480 | | */ |
481 | 0 | check = !have_data_locks(tdb); |
482 | 0 | ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag); |
483 | |
484 | 0 | if (ret == 0 && check && tdb_needs_recovery(tdb)) { |
485 | 0 | tdb_nest_unlock(tdb, lock_offset(list), ltype, false); |
486 | |
487 | 0 | if (tdb_lock_and_recover(tdb) == -1) { |
488 | 0 | return -1; |
489 | 0 | } |
490 | 0 | return tdb_lock_list(tdb, list, ltype, waitflag); |
491 | 0 | } |
492 | 0 | return ret; |
493 | 0 | } |
494 | | |
495 | | /* lock a list in the database. list -1 is the alloc list */ |
496 | | int tdb_lock(struct tdb_context *tdb, int list, int ltype) |
497 | 0 | { |
498 | 0 | int ret; |
499 | |
500 | 0 | ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT); |
501 | 0 | if (ret) { |
502 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d " |
503 | 0 | "ltype=%d (%s)\n", list, ltype, strerror(errno))); |
504 | 0 | } |
505 | 0 | return ret; |
506 | 0 | } |
507 | | |
508 | | /* lock a list in the database. list -1 is the alloc list. non-blocking lock */ |
509 | | _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype); |
510 | | _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype) |
511 | 0 | { |
512 | 0 | return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT); |
513 | 0 | } |
514 | | |
515 | | |
516 | | int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype, |
517 | | bool mark_lock) |
518 | 0 | { |
519 | 0 | int ret = -1; |
520 | 0 | struct tdb_lock_type *lck; |
521 | |
522 | 0 | if (tdb->flags & TDB_NOLOCK) |
523 | 0 | return 0; |
524 | | |
525 | | /* Sanity checks */ |
526 | 0 | if (offset >= lock_offset(tdb->hash_size)) { |
527 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size)); |
528 | 0 | return ret; |
529 | 0 | } |
530 | | |
531 | 0 | lck = find_nestlock(tdb, offset); |
532 | 0 | if ((lck == NULL) || (lck->count == 0)) { |
533 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n")); |
534 | 0 | return -1; |
535 | 0 | } |
536 | | |
537 | 0 | if (lck->count > 1) { |
538 | 0 | lck->count--; |
539 | 0 | return 0; |
540 | 0 | } |
541 | | |
542 | | /* |
543 | | * This lock has count==1 left, so we need to unlock it in the |
544 | | * kernel. We don't bother with decrementing the in-memory array |
545 | | * element, we're about to overwrite it with the last array element |
546 | | * anyway. |
547 | | */ |
548 | | |
549 | 0 | if (mark_lock) { |
550 | 0 | ret = 0; |
551 | 0 | } else { |
552 | 0 | ret = tdb_brunlock(tdb, ltype, offset, 1); |
553 | 0 | } |
554 | | |
555 | | /* |
556 | | * Shrink the array by overwriting the element just unlocked with the |
557 | | * last array element. |
558 | | */ |
559 | 0 | *lck = tdb->lockrecs[--tdb->num_lockrecs]; |
560 | | |
561 | | /* |
562 | | * We don't bother with realloc when the array shrinks, but if we have |
563 | | * a completely idle tdb we should get rid of the locked array. |
564 | | */ |
565 | |
566 | 0 | if (ret) |
567 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n")); |
568 | 0 | return ret; |
569 | 0 | } |
570 | | |
571 | | _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype); |
572 | | _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype) |
573 | 0 | { |
574 | | /* a global lock allows us to avoid per chain locks */ |
575 | 0 | if (tdb->allrecord_lock.count) { |
576 | 0 | return tdb_lock_covered_by_allrecord_lock(tdb, ltype); |
577 | 0 | } |
578 | | |
579 | 0 | return tdb_nest_unlock(tdb, lock_offset(list), ltype, false); |
580 | 0 | } |
581 | | |
582 | | /* |
583 | | get the transaction lock |
584 | | */ |
585 | | int tdb_transaction_lock(struct tdb_context *tdb, int ltype, |
586 | | enum tdb_lock_flags lockflags) |
587 | 0 | { |
588 | 0 | return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags); |
589 | 0 | } |
590 | | |
591 | | /* |
592 | | release the transaction lock |
593 | | */ |
594 | | int tdb_transaction_unlock(struct tdb_context *tdb, int ltype) |
595 | 0 | { |
596 | 0 | return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false); |
597 | 0 | } |
598 | | |
599 | | /* Returns 0 if all done, -1 if error, 1 if ok. */ |
600 | | static int tdb_allrecord_check(struct tdb_context *tdb, int ltype, |
601 | | enum tdb_lock_flags flags, bool upgradable) |
602 | 0 | { |
603 | | /* There are no locks on read-only dbs */ |
604 | 0 | if (tdb->read_only || tdb->traverse_read) { |
605 | 0 | tdb->ecode = TDB_ERR_LOCK; |
606 | 0 | return -1; |
607 | 0 | } |
608 | | |
609 | 0 | if (tdb->allrecord_lock.count && |
610 | 0 | tdb->allrecord_lock.ltype == (uint32_t)ltype) { |
611 | 0 | tdb->allrecord_lock.count++; |
612 | 0 | return 0; |
613 | 0 | } |
614 | | |
615 | 0 | if (tdb->allrecord_lock.count) { |
616 | | /* a global lock of a different type exists */ |
617 | 0 | tdb->ecode = TDB_ERR_LOCK; |
618 | 0 | return -1; |
619 | 0 | } |
620 | | |
621 | 0 | if (tdb_have_extra_locks(tdb)) { |
622 | | /* can't combine global and chain locks */ |
623 | 0 | tdb->ecode = TDB_ERR_LOCK; |
624 | 0 | return -1; |
625 | 0 | } |
626 | | |
627 | 0 | if (upgradable && ltype != F_RDLCK) { |
628 | | /* tdb error: you can't upgrade a write lock! */ |
629 | 0 | tdb->ecode = TDB_ERR_LOCK; |
630 | 0 | return -1; |
631 | 0 | } |
632 | 0 | return 1; |
633 | 0 | } |
634 | | |
635 | | /* We only need to lock individual bytes, but Linux merges consecutive locks |
636 | | * so we lock in contiguous ranges. */ |
637 | | static int tdb_chainlock_gradual(struct tdb_context *tdb, |
638 | | int ltype, enum tdb_lock_flags flags, |
639 | | size_t off, size_t len) |
640 | 0 | { |
641 | 0 | int ret; |
642 | 0 | enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT); |
643 | |
644 | 0 | if (len <= 4) { |
645 | | /* Single record. Just do blocking lock. */ |
646 | 0 | return tdb_brlock(tdb, ltype, off, len, flags); |
647 | 0 | } |
648 | | |
649 | | /* First we try non-blocking. */ |
650 | 0 | ret = tdb_brlock(tdb, ltype, off, len, nb_flags); |
651 | 0 | if (ret == 0) { |
652 | 0 | return 0; |
653 | 0 | } |
654 | | |
655 | | /* Try locking first half, then second. */ |
656 | 0 | ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2); |
657 | 0 | if (ret == -1) |
658 | 0 | return -1; |
659 | |
661 | 0 | off + len / 2, len - len / 2); |
662 | 0 | if (ret == -1) { |
663 | 0 | tdb_brunlock(tdb, ltype, off, len / 2); |
664 | 0 | return -1; |
665 | 0 | } |
666 | 0 | return 0; |
667 | 0 | } |
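
The same divide-and-conquer idea stripped of tdb specifics (a sketch over hypothetical lock callbacks, not part of the tdb API): try the whole range without blocking; on contention, recurse into the two halves so one large waiter is not starved forever by a stream of small lock holders:

    #include <stddef.h>

    typedef int  (*try_lock_fn)(size_t off, size_t len);   /* 0 on success */
    typedef int  (*lock_fn)(size_t off, size_t len);       /* blocking     */
    typedef void (*unlock_fn)(size_t off, size_t len);

    static int lock_gradual(size_t off, size_t len,
                            try_lock_fn try_lock, lock_fn lock, unlock_fn unlock)
    {
            if (len <= 4)                   /* single entry: just block */
                    return lock(off, len);

            if (try_lock(off, len) == 0)    /* fast path: whole range was free */
                    return 0;

            if (lock_gradual(off, len / 2, try_lock, lock, unlock) == -1)
                    return -1;

            if (lock_gradual(off + len / 2, len - len / 2,
                             try_lock, lock, unlock) == -1) {
                    unlock(off, len / 2);   /* roll back the first half */
                    return -1;
            }
            return 0;
    }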
668 | | |
669 | | /* lock/unlock entire database. It can only be upgradable if you have some |
670 | | * other way of guaranteeing exclusivity (i.e. a transaction write lock). |
671 | | * We do the locking gradually to avoid being starved by smaller locks. */ |
672 | | int tdb_allrecord_lock(struct tdb_context *tdb, int ltype, |
673 | | enum tdb_lock_flags flags, bool upgradable) |
674 | 0 | { |
675 | 0 | int ret; |
676 | |
677 | 0 | switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) { |
678 | 0 | case -1: |
679 | 0 | return -1; |
680 | 0 | case 0: |
681 | 0 | return 0; |
682 | 0 | } |
683 | | |
684 | | /* We cover two kinds of locks: |
685 | | * 1) Normal chain locks. Taken for almost all operations. |
686 | | * 2) Individual records locks. Taken after normal or free |
687 | | * chain locks. |
688 | | * |
689 | | * It is (1) which causes the starvation problem, so we're only |
690 | | * gradual for that. */ |
691 | | |
692 | 0 | if (tdb_have_mutexes(tdb)) { |
693 | 0 | ret = tdb_mutex_allrecord_lock(tdb, ltype, flags); |
694 | 0 | } else { |
695 | 0 | ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP, |
696 | 0 | tdb->hash_size * 4); |
697 | 0 | } |
698 | |
699 | 0 | if (ret == -1) { |
700 | 0 | return -1; |
701 | 0 | } |
702 | | |
703 | | /* Grab individual record locks. */ |
704 | 0 | if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0, |
705 | 0 | flags) == -1) { |
706 | 0 | if (tdb_have_mutexes(tdb)) { |
707 | 0 | tdb_mutex_allrecord_unlock(tdb); |
708 | 0 | } else { |
709 | 0 | tdb_brunlock(tdb, ltype, FREELIST_TOP, |
710 | 0 | tdb->hash_size * 4); |
711 | 0 | } |
712 | 0 | return -1; |
713 | 0 | } |
714 | | |
715 | 0 | tdb->allrecord_lock.count = 1; |
716 | | /* If it's upgradable, it's actually exclusive so we can treat |
717 | | * it as a write lock. */ |
718 | 0 | tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype; |
719 | 0 | tdb->allrecord_lock.off = upgradable; |
720 | |
721 | 0 | if (tdb_needs_recovery(tdb)) { |
722 | 0 | bool mark = flags & TDB_LOCK_MARK_ONLY; |
723 | 0 | tdb_allrecord_unlock(tdb, ltype, mark); |
724 | 0 | if (mark) { |
725 | 0 | tdb->ecode = TDB_ERR_LOCK; |
726 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, |
727 | 0 | "tdb_lockall_mark cannot do recovery\n")); |
728 | 0 | return -1; |
729 | 0 | } |
730 | 0 | if (tdb_lock_and_recover(tdb) == -1) { |
731 | 0 | return -1; |
732 | 0 | } |
733 | 0 | return tdb_allrecord_lock(tdb, ltype, flags, upgradable); |
734 | 0 | } |
735 | | |
736 | 0 | return 0; |
737 | 0 | } |
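
In the public API this is what backs tdb_lockall()/tdb_unlockall() (defined further down). A usage sketch for a multi-record update that must appear atomic to other processes (TDB_DATA setup and error handling elided):

    if (tdb_lockall(tdb) == 0) {
            tdb_store(tdb, key1, val1, TDB_REPLACE);
            tdb_store(tdb, key2, val2, TDB_REPLACE);
            tdb_unlockall(tdb);
    }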
738 | | |
739 | | |
740 | | |
741 | | /* unlock entire db */ |
742 | | int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock) |
743 | 0 | { |
744 | | /* There are no locks on read-only dbs */ |
745 | 0 | if (tdb->read_only || tdb->traverse_read) { |
746 | 0 | tdb->ecode = TDB_ERR_LOCK; |
747 | 0 | return -1; |
748 | 0 | } |
749 | | |
750 | 0 | if (tdb->allrecord_lock.count == 0) { |
751 | 0 | tdb->ecode = TDB_ERR_LOCK; |
752 | 0 | return -1; |
753 | 0 | } |
754 | | |
755 | | /* Upgradable locks are marked as write locks. */ |
756 | 0 | if (tdb->allrecord_lock.ltype != (uint32_t)ltype |
757 | 0 | && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) { |
758 | 0 | tdb->ecode = TDB_ERR_LOCK; |
759 | 0 | return -1; |
760 | 0 | } |
761 | | |
762 | 0 | if (tdb->allrecord_lock.count > 1) { |
763 | 0 | tdb->allrecord_lock.count--; |
764 | 0 | return 0; |
765 | 0 | } |
766 | | |
767 | 0 | if (!mark_lock) { |
768 | 0 | int ret; |
769 | |
770 | 0 | if (tdb_have_mutexes(tdb)) { |
771 | 0 | ret = tdb_mutex_allrecord_unlock(tdb); |
772 | 0 | if (ret == 0) { |
773 | 0 | ret = tdb_brunlock(tdb, ltype, |
774 | 0 | lock_offset(tdb->hash_size), |
775 | 0 | 0); |
776 | 0 | } |
777 | 0 | } else { |
778 | 0 | ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0); |
779 | 0 | } |
780 | |
781 | 0 | if (ret != 0) { |
782 | 0 | TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed " |
783 | 0 | "(%s)\n", strerror(errno))); |
784 | 0 | return -1; |
785 | 0 | } |
786 | 0 | } |
787 | | |
788 | 0 | tdb->allrecord_lock.count = 0; |
789 | 0 | tdb->allrecord_lock.ltype = 0; |
790 | |
791 | 0 | return 0; |
792 | 0 | } |
793 | | |
794 | | /* lock entire database with write lock */ |
795 | | _PUBLIC_ int tdb_lockall(struct tdb_context *tdb) |
796 | 0 | { |
797 | 0 | tdb_trace(tdb, "tdb_lockall"); |
798 | 0 | return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false); |
799 | 0 | } |
800 | | |
801 | | /* lock entire database with write lock - mark only */ |
802 | | _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb) |
803 | 0 | { |
804 | 0 | tdb_trace(tdb, "tdb_lockall_mark"); |
805 | 0 | return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false); |
806 | 0 | } |
807 | | |
808 | | /* unlock entire database with write lock - unmark only */ |
809 | | _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb) |
810 | 0 | { |
811 | 0 | tdb_trace(tdb, "tdb_lockall_unmark"); |
812 | 0 | return tdb_allrecord_unlock(tdb, F_WRLCK, true); |
813 | 0 | } |
814 | | |
815 | | /* lock entire database with write lock - nonblocking variant */ |
816 | | _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb) |
817 | 0 | { |
818 | 0 | int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false); |
819 | 0 | tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret); |
820 | 0 | return ret; |
821 | 0 | } |
822 | | |
823 | | /* unlock entire database with write lock */ |
824 | | _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb) |
825 | 0 | { |
826 | 0 | tdb_trace(tdb, "tdb_unlockall"); |
827 | 0 | return tdb_allrecord_unlock(tdb, F_WRLCK, false); |
828 | 0 | } |
829 | | |
830 | | /* lock entire database with read lock */ |
831 | | _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb) |
832 | 0 | { |
833 | 0 | tdb_trace(tdb, "tdb_lockall_read"); |
834 | 0 | return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false); |
835 | 0 | } |
836 | | |
837 | | /* lock entire database with read lock - nonblock variant */ |
838 | | _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb) |
839 | 0 | { |
840 | 0 | int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false); |
841 | 0 | tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret); |
842 | 0 | return ret; |
843 | 0 | } |
844 | | |
845 | | /* unlock entire database with read lock */ |
846 | | _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb) |
847 | 0 | { |
848 | 0 | tdb_trace(tdb, "tdb_unlockall_read"); |
849 | 0 | return tdb_allrecord_unlock(tdb, F_RDLCK, false); |
850 | 0 | } |
851 | | |
852 | | /* lock/unlock one hash chain. This is meant to be used to reduce |
853 | | contention - it cannot guarantee how many records will be locked */ |
854 | | _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key) |
855 | 0 | { |
856 | 0 | int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK); |
857 | 0 | tdb_trace_1rec(tdb, "tdb_chainlock", key); |
858 | 0 | return ret; |
859 | 0 | } |
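
tdb_chainlock() is the usual building block for an atomic read-modify-write of one key: every record hashing to the same chain is held stable between the fetch and the store. A usage sketch (error handling elided; tdb_fetch() returns a buffer the caller must free):

    #include <stdlib.h>

    TDB_DATA key = { .dptr = (unsigned char *)"counter", .dsize = 7 };

    if (tdb_chainlock(tdb, key) == 0) {
            TDB_DATA val = tdb_fetch(tdb, key);
            /* ... inspect/modify val, or build a replacement value ... */
            tdb_store(tdb, key, val, TDB_REPLACE);
            free(val.dptr);
            tdb_chainunlock(tdb, key);
    }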
860 | | |
861 | | /* lock/unlock one hash chain, non-blocking. This is meant to be used |
862 | | to reduce contention - it cannot guarantee how many records will be |
863 | | locked */ |
864 | | _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key) |
865 | 0 | { |
866 | 0 | int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK); |
867 | 0 | tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret); |
868 | 0 | return ret; |
869 | 0 | } |
870 | | |
871 | | /* mark a chain as locked without actually locking it. Warning! use with great caution! */ |
872 | | _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key) |
873 | 0 | { |
874 | 0 | int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))), |
875 | 0 | F_WRLCK, TDB_LOCK_MARK_ONLY); |
876 | 0 | tdb_trace_1rec(tdb, "tdb_chainlock_mark", key); |
877 | 0 | return ret; |
878 | 0 | } |
879 | | |
880 | | /* unmark a chain as locked without actually locking it. Warning! use with great caution! */ |
881 | | _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key) |
882 | 0 | { |
883 | 0 | tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key); |
884 | 0 | return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))), |
885 | 0 | F_WRLCK, true); |
886 | 0 | } |
887 | | |
888 | | _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key) |
889 | 0 | { |
890 | 0 | tdb_trace_1rec(tdb, "tdb_chainunlock", key); |
891 | 0 | return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK); |
892 | 0 | } |
893 | | |
894 | | _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key) |
895 | 0 | { |
896 | 0 | int ret; |
897 | 0 | ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK); |
898 | 0 | tdb_trace_1rec(tdb, "tdb_chainlock_read", key); |
899 | 0 | return ret; |
900 | 0 | } |
901 | | |
902 | | _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key) |
903 | 0 | { |
904 | 0 | tdb_trace_1rec(tdb, "tdb_chainunlock_read", key); |
905 | 0 | return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK); |
906 | 0 | } |
907 | | |
908 | | _PUBLIC_ int tdb_chainlock_read_nonblock(struct tdb_context *tdb, TDB_DATA key) |
909 | 0 | { |
910 | 0 | int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK); |
911 | 0 | tdb_trace_1rec_ret(tdb, "tdb_chainlock_read_nonblock", key, ret); |
912 | 0 | return ret; |
913 | 0 | } |
914 | | |
915 | | /* record lock stops delete underneath */ |
916 | | int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off) |
917 | 0 | { |
918 | 0 | if (tdb->allrecord_lock.count) { |
919 | 0 | return 0; |
920 | 0 | } |
921 | 0 | return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0; |
922 | 0 | } |
923 | | |
924 | | /* |
925 | | Write locks override our own fcntl readlocks, so check it here. |
926 | | Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not |
927 | | an error to fail to get the lock here. |
928 | | */ |
929 | | int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off) |
930 | 0 | { |
931 | 0 | struct tdb_traverse_lock *i; |
932 | 0 | if (tdb == NULL) { |
933 | 0 | return -1; |
934 | 0 | } |
935 | 0 | for (i = &tdb->travlocks; i; i = i->next) |
936 | 0 | if (i->off == off) |
937 | 0 | return -1; |
938 | 0 | if (tdb->allrecord_lock.count) { |
939 | 0 | if (tdb->allrecord_lock.ltype == F_WRLCK) { |
940 | 0 | return 0; |
941 | 0 | } |
942 | 0 | return -1; |
943 | 0 | } |
944 | 0 | return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE); |
945 | 0 | } |
946 | | |
947 | | int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off) |
948 | 0 | { |
949 | 0 | if (tdb->allrecord_lock.count) { |
950 | 0 | return 0; |
951 | 0 | } |
952 | 0 | return tdb_brunlock(tdb, F_WRLCK, off, 1); |
953 | 0 | } |
954 | | |
955 | | /* fcntl locks don't stack: avoid unlocking someone else's */ |
956 | | int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off) |
957 | 0 | { |
958 | 0 | struct tdb_traverse_lock *i; |
959 | 0 | uint32_t count = 0; |
960 | |
961 | 0 | if (tdb->allrecord_lock.count) { |
962 | 0 | return 0; |
963 | 0 | } |
964 | | |
965 | 0 | if (off == 0) |
966 | 0 | return 0; |
967 | 0 | for (i = &tdb->travlocks; i; i = i->next) |
968 | 0 | if (i->off == off) |
969 | 0 | count++; |
970 | 0 | return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0); |
971 | 0 | } |
972 | | |
973 | | bool tdb_have_extra_locks(struct tdb_context *tdb) |
974 | 0 | { |
975 | 0 | unsigned int extra = tdb->num_lockrecs; |
976 | | |
977 | | /* A transaction holds the lock for all records. */ |
978 | 0 | if (!tdb->transaction && tdb->allrecord_lock.count) { |
979 | 0 | return true; |
980 | 0 | } |
981 | | |
982 | | /* We always hold the active lock if CLEAR_IF_FIRST. */ |
983 | 0 | if (find_nestlock(tdb, ACTIVE_LOCK)) { |
984 | 0 | extra--; |
985 | 0 | } |
986 | | |
987 | | /* In a transaction, we expect to hold the transaction lock */ |
988 | 0 | if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) { |
989 | 0 | extra--; |
990 | 0 | } |
991 | |
992 | 0 | return extra; |
993 | 0 | } |
994 | | |
995 | | /* The transaction code uses this to remove all locks. */ |
996 | | void tdb_release_transaction_locks(struct tdb_context *tdb) |
997 | 0 | { |
998 | 0 | int i; |
999 | 0 | unsigned int active = 0; |
1000 | |
1001 | 0 | if (tdb->allrecord_lock.count != 0) { |
1002 | 0 | tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false); |
1003 | 0 | tdb->allrecord_lock.count = 0; |
1004 | 0 | } |
1005 | |
1006 | 0 | for (i=0;i<tdb->num_lockrecs;i++) { |
1007 | 0 | struct tdb_lock_type *lck = &tdb->lockrecs[i]; |
1008 | | |
1009 | | /* Don't release the active lock! Copy it to first entry. */ |
1010 | 0 | if (lck->off == ACTIVE_LOCK) { |
1011 | 0 | tdb->lockrecs[active++] = *lck; |
1012 | 0 | } else { |
1013 | 0 | tdb_brunlock(tdb, lck->ltype, lck->off, 1); |
1014 | 0 | } |
1015 | 0 | } |
1016 | 0 | tdb->num_lockrecs = active; |
1017 | 0 | } |
1018 | | |
1019 | | /* The following functions were added specifically to support CTDB. */ |
1020 | | |
1021 | | /* Don't do actual fcntl locking, just mark tdb locked */ |
1022 | | _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb); |
1023 | | _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb) |
1024 | 0 | { |
1025 | 0 | return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY); |
1026 | 0 | } |
1027 | | |
1028 | | /* Don't do actual fcntl unlocking, just mark tdb unlocked */ |
1029 | | _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb); |
1030 | | _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb) |
1031 | 0 | { |
1032 | 0 | return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true); |
1033 | 0 | } |