1 /*
2 URL: svn://svnanon.samba.org/samba/branches/SAMBA_4_0/source/lib/tdb/common
3 Rev: 23590
4 Last Changed Date: 2007-06-22 13:36:10 -0400 (Fri, 22 Jun 2007)
5 */
6 /*
7 trivial database library - standalone version
8
9 Copyright (C) Andrew Tridgell 1999-2005
10 Copyright (C) Jeremy Allison 2000-2006
11 Copyright (C) Paul `Rusty' Russell 2000
12
13 ** NOTE! The following LGPL license applies to the tdb
14 ** library. This does NOT imply that all of Samba is released
15 ** under the LGPL
16
17 This library is free software; you can redistribute it and/or
18 modify it under the terms of the GNU Lesser General Public
19 License as published by the Free Software Foundation; either
20 version 2 of the License, or (at your option) any later version.
21
22 This library is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 Lesser General Public License for more details.
26
27 You should have received a copy of the GNU Lesser General Public
28 License along with this library; if not, write to the Free Software
29 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 */
31
32 #ifdef CONFIG_STAND_ALONE
33 #define HAVE_MMAP
34 #define HAVE_STRDUP
35 #define HAVE_SYS_MMAN_H
36 #define HAVE_UTIME_H
37 #define HAVE_UTIME
38 #endif
39 #define _XOPEN_SOURCE 600
40
41 #include <unistd.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <stdarg.h>
45 #include <stddef.h>
46 #include <errno.h>
47 #include <string.h>
48 #ifdef HAVE_SYS_SELECT_H
49 #include <sys/select.h>
50 #endif
51 #include <sys/time.h>
52 #include <sys/types.h>
53 #include <time.h>
54 #ifdef HAVE_UTIME_H
55 #include <utime.h>
56 #endif
57 #include <sys/stat.h>
58 #include <sys/file.h>
59 #include <fcntl.h>
60
61 #ifdef HAVE_SYS_MMAN_H
62 #include <sys/mman.h>
63 #endif
64
65 #ifndef MAP_FILE
66 #define MAP_FILE 0
67 #endif
68
69 #ifndef MAP_FAILED
70 #define MAP_FAILED ((void *)-1)
71 #endif
72
73 #ifndef HAVE_STRDUP
74 #define strdup rep_strdup
75 static char *rep_strdup(const char *s)
76 {
77 char *ret;
78 int length;
79 if (!s)
80 return NULL;
81
82
83 	length = strlen(s);
84
85 ret = malloc(length + 1);
86 if (ret) {
87 strncpy(ret, s, length);
88 ret[length] = '\0';
89 }
90 return ret;
91 }
92 #endif
93
94 #ifndef PRINTF_ATTRIBUTE
95 #if (__GNUC__ >= 3) && (__GNUC_MINOR__ >= 1 )
96 /** Use gcc attribute to check printf fns. a1 is the 1-based index of
97 * the parameter containing the format, and a2 the index of the first
98 * argument. Note that some gcc 2.x versions don't handle this
99 * properly **/
100 #define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
101 #else
102 #define PRINTF_ATTRIBUTE(a1, a2)
103 #endif
104 #endif
105
106 typedef int bool;
107
108 #include "tdb.h"
109
110 static TDB_DATA tdb_null;
111
112 #ifndef u32
113 #define u32 unsigned
114 #endif
115
116 typedef u32 tdb_len_t;
117 typedef u32 tdb_off_t;
118
119 #ifndef offsetof
120 #define offsetof(t,f) ((unsigned int)&((t *)0)->f)
121 #endif
122
123 #define TDB_MAGIC_FOOD "TDB file\n"
124 #define TDB_VERSION (0x26011967 + 6)
125 #define TDB_MAGIC (0x26011999U)
126 #define TDB_FREE_MAGIC (~TDB_MAGIC)
127 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
128 #define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
129 #define TDB_ALIGNMENT 4
130 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
131 #define DEFAULT_HASH_SIZE 131
132 #define FREELIST_TOP (sizeof(struct tdb_header))
133 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
134 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
135 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
136 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
137 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off_t))
138 #define TDB_HASHTABLE_SIZE(tdb) ((tdb->header.hash_size+1)*sizeof(tdb_off_t))
139 #define TDB_DATA_START(hash_size) TDB_HASH_TOP(hash_size-1)
140 #define TDB_RECOVERY_HEAD offsetof(struct tdb_header, recovery_start)
141 #define TDB_SEQNUM_OFS offsetof(struct tdb_header, sequence_number)
142 #define TDB_PAD_BYTE 0x42
143 #define TDB_PAD_U32 0x42424242
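/* For example, TDB_BYTEREV(0x11223344) evaluates to 0x44332211 - the 4-byte
 * endian swap applied when the TDB_CONVERT flag is set. */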
144
145 /* NB assumes there is a local variable called "tdb" that is the
146 * current context, also takes doubly-parenthesized print-style
147 * argument. */
148 #define TDB_LOG(x) tdb->log.log_fn x
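/* Illustrative use only, mirroring calls later in this file: the double
 * parentheses pass the whole printf-style argument list through the single
 * macro parameter, e.g.
 *
 *	TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed (%s)\n", strerror(errno)));
 */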
149
150 /* lock offsets */
151 #define GLOBAL_LOCK 0
152 #define ACTIVE_LOCK 4
153 #define TRANSACTION_LOCK 8
154
155 /* free memory if the pointer is valid and zero the pointer */
156 #ifndef SAFE_FREE
157 #define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
158 #endif
159
160 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
161
162 #define DOCONV() (tdb->flags & TDB_CONVERT)
163 #define CONVERT(x) (DOCONV() ? tdb_convert(&x, sizeof(x)) : &x)
164
165
166 /* the body of the database is made of one list_struct for the free space
167 plus a separate data list for each hash value */
168 struct list_struct {
169 tdb_off_t next; /* offset of the next record in the list */
170 tdb_len_t rec_len; /* total byte length of record */
171 tdb_len_t key_len; /* byte length of key */
172 tdb_len_t data_len; /* byte length of data */
173 u32 full_hash; /* the full 32 bit hash of the key */
174 u32 magic; /* try to catch errors */
175 /* the following union is implied:
176 union {
177 char record[rec_len];
178 struct {
179 char key[key_len];
180 char data[data_len];
181 }
182 u32 totalsize; (tailer)
183 }
184 */
185 };
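/* A sketch (not library code) of how the implied layout above is addressed:
 * the key and data bytes sit back to back immediately after the record
 * header, so for a record located at rec_ptr:
 *
 *	tdb_off_t key_off  = rec_ptr + sizeof(struct list_struct);
 *	tdb_off_t data_off = key_off + rec->key_len;
 */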
186
187
188 /* this is stored at the front of every database */
189 struct tdb_header {
190 char magic_food[32]; /* for /etc/magic */
191 u32 version; /* version of the code */
192 u32 hash_size; /* number of hash entries */
193 tdb_off_t rwlocks; /* obsolete - kept to detect old formats */
194 tdb_off_t recovery_start; /* offset of transaction recovery region */
195 tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */
196 tdb_off_t reserved[29];
197 };
198
199 struct tdb_lock_type {
200 int list;
201 u32 count;
202 u32 ltype;
203 };
204
205 struct tdb_traverse_lock {
206 struct tdb_traverse_lock *next;
207 u32 off;
208 u32 hash;
209 int lock_rw;
210 };
211
212
213 struct tdb_methods {
214 int (*tdb_read)(struct tdb_context *, tdb_off_t , void *, tdb_len_t , int );
215 int (*tdb_write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
216 void (*next_hash_chain)(struct tdb_context *, u32 *);
217 int (*tdb_oob)(struct tdb_context *, tdb_off_t , int );
218 int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
219 int (*tdb_brlock)(struct tdb_context *, tdb_off_t , int, int, int, size_t);
220 };
221
222 struct tdb_context {
223 char *name; /* the name of the database */
224 void *map_ptr; /* where it is currently mapped */
225 int fd; /* open file descriptor for the database */
226 tdb_len_t map_size; /* how much space has been mapped */
227 int read_only; /* opened read-only */
228 int traverse_read; /* read-only traversal */
229 struct tdb_lock_type global_lock;
230 int num_lockrecs;
231 struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
232 enum TDB_ERROR ecode; /* error code for last tdb error */
233 struct tdb_header header; /* a cached copy of the header */
234 u32 flags; /* the flags passed to tdb_open */
235 struct tdb_traverse_lock travlocks; /* current traversal locks */
236 struct tdb_context *next; /* all tdbs to avoid multiple opens */
237 dev_t device; /* uniquely identifies this tdb */
238 ino_t inode; /* uniquely identifies this tdb */
239 struct tdb_logging_context log;
240 unsigned int (*hash_fn)(TDB_DATA *key);
241 int open_flags; /* flags used in the open - needed by reopen */
242 unsigned int num_locks; /* number of chain locks held */
243 const struct tdb_methods *methods;
244 struct tdb_transaction *transaction;
245 int page_size;
246 int max_dead_records;
247 bool have_transaction_lock;
248 };
249
250
251 /*
252 internal prototypes
253 */
254 static int tdb_munmap(struct tdb_context *tdb);
255 static void tdb_mmap(struct tdb_context *tdb);
256 static int tdb_lock(struct tdb_context *tdb, int list, int ltype);
257 static int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
258 static int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset, int rw_type, int lck_type, int probe, size_t len);
259 static int tdb_transaction_lock(struct tdb_context *tdb, int ltype);
260 static int tdb_transaction_unlock(struct tdb_context *tdb);
261 static int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len);
262 static int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
263 static int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
264 static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
265 static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
266 static void *tdb_convert(void *buf, u32 size);
267 static int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
268 static tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec);
269 static int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
270 static int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
271 static int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
272 static int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
273 static int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
274 static int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec);
275 static int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec);
276 static unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
277 static int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
278 tdb_off_t offset, tdb_len_t len,
279 int (*parser)(TDB_DATA key, TDB_DATA data,
280 void *private_data),
281 void *private_data);
282 static tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
283 struct list_struct *rec);
284 static void tdb_io_init(struct tdb_context *tdb);
285 static int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
286 static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
287 struct list_struct *rec);
288
289
290 /* file: error.c */
291
292 enum TDB_ERROR tdb_error(struct tdb_context *tdb)
293 {
294 return tdb->ecode;
295 }
296
297 static struct tdb_errname {
298 enum TDB_ERROR ecode; const char *estring;
299 } emap[] = { {TDB_SUCCESS, "Success"},
300 {TDB_ERR_CORRUPT, "Corrupt database"},
301 {TDB_ERR_IO, "IO Error"},
302 {TDB_ERR_LOCK, "Locking error"},
303 {TDB_ERR_OOM, "Out of memory"},
304 {TDB_ERR_EXISTS, "Record exists"},
305 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
306 {TDB_ERR_EINVAL, "Invalid parameter"},
307 {TDB_ERR_NOEXIST, "Record does not exist"},
308 {TDB_ERR_RDONLY, "write not permitted"} };
309
310 /* Error string for the last tdb error */
311 const char *tdb_errorstr(struct tdb_context *tdb)
312 {
313 u32 i;
314 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
315 if (tdb->ecode == emap[i].ecode)
316 return emap[i].estring;
317 return "Invalid error code";
318 }
319
320 /* file: lock.c */
321
322 #define TDB_MARK_LOCK 0x80000000
323
324 /* a byte range locking function - return 0 on success
325 this functions locks/unlocks 1 byte at the specified offset.
326
327 On error, errno is also set so that errors are passed back properly
328 through tdb_open().
329
330 note that a len of zero means lock to end of file
331 */
332 int tdb_brlock(struct tdb_context *tdb, tdb_off_t offset,
333 int rw_type, int lck_type, int probe, size_t len)
334 {
335 struct flock fl;
336 int ret;
337
338 if (tdb->flags & TDB_NOLOCK) {
339 return 0;
340 }
341
342 if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
343 tdb->ecode = TDB_ERR_RDONLY;
344 return -1;
345 }
346
347 fl.l_type = rw_type;
348 fl.l_whence = SEEK_SET;
349 fl.l_start = offset;
350 fl.l_len = len;
351 fl.l_pid = 0;
352
353 do {
354 ret = fcntl(tdb->fd,lck_type,&fl);
355 } while (ret == -1 && errno == EINTR);
356
357 if (ret == -1) {
358 /* Generic lock error. errno set by fcntl.
359 * EAGAIN is an expected return from non-blocking
360 * locks. */
361 if (!probe && lck_type != F_SETLK) {
362 /* Ensure error code is set for log fun to examine. */
363 tdb->ecode = TDB_ERR_LOCK;
364 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d len=%d\n",
365 tdb->fd, offset, rw_type, lck_type, (int)len));
366 }
367 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
368 }
369 return 0;
370 }
371
372
373 /*
374 upgrade a read lock to a write lock. This needs to be handled in a
375 special way as some OSes (such as solaris) have too conservative
376 deadlock detection and claim a deadlock when progress can be
377 made. For those OSes we may loop for a while.
378 */
379 int tdb_brlock_upgrade(struct tdb_context *tdb, tdb_off_t offset, size_t len)
380 {
381 int count = 1000;
382 while (count--) {
383 struct timeval tv;
384 if (tdb_brlock(tdb, offset, F_WRLCK, F_SETLKW, 1, len) == 0) {
385 return 0;
386 }
387 if (errno != EDEADLK) {
388 break;
389 }
390 /* sleep for as short a time as we can - more portable than usleep() */
391 tv.tv_sec = 0;
392 tv.tv_usec = 1;
393 select(0, NULL, NULL, NULL, &tv);
394 }
395 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock_upgrade failed at offset %d\n", offset));
396 return -1;
397 }
398
399
400 /* lock a list in the database. list -1 is the alloc list */
401 static int _tdb_lock(struct tdb_context *tdb, int list, int ltype, int op)
402 {
403 struct tdb_lock_type *new_lck;
404 int i;
405 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
406
407 ltype &= ~TDB_MARK_LOCK;
408
409 /* a global lock allows us to avoid per chain locks */
410 if (tdb->global_lock.count &&
411 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
412 return 0;
413 }
414
415 if (tdb->global_lock.count) {
416 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
417 }
418
419 if (list < -1 || list >= (int)tdb->header.hash_size) {
420 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid list %d for ltype=%d\n",
421 list, ltype));
422 return -1;
423 }
424 if (tdb->flags & TDB_NOLOCK)
425 return 0;
426
427 for (i=0; i<tdb->num_lockrecs; i++) {
428 if (tdb->lockrecs[i].list == list) {
429 if (tdb->lockrecs[i].count == 0) {
430 /*
431 * Can't happen, see tdb_unlock(). It should
432 * be an assert.
433 */
434 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock: "
435 "lck->count == 0 for list %d", list));
436 }
437 /*
438 * Just increment the in-memory struct, posix locks
439 * don't stack.
440 */
441 tdb->lockrecs[i].count++;
442 return 0;
443 }
444 }
445
446 new_lck = (struct tdb_lock_type *)realloc(
447 tdb->lockrecs,
448 sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
449 if (new_lck == NULL) {
450 errno = ENOMEM;
451 return -1;
452 }
453 tdb->lockrecs = new_lck;
454
455 /* Since fcntl locks don't nest, we do a lock for the first one,
456 and simply bump the count for future ones */
457 if (!mark_lock &&
458 tdb->methods->tdb_brlock(tdb,FREELIST_TOP+4*list, ltype, op,
459 0, 1)) {
460 return -1;
461 }
462
463 tdb->num_locks++;
464
465 tdb->lockrecs[tdb->num_lockrecs].list = list;
466 tdb->lockrecs[tdb->num_lockrecs].count = 1;
467 tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
468 tdb->num_lockrecs += 1;
469
470 return 0;
471 }
472
473 /* lock a list in the database. list -1 is the alloc list */
474 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
475 {
476 int ret;
477 ret = _tdb_lock(tdb, list, ltype, F_SETLKW);
478 if (ret) {
479 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
480 "ltype=%d (%s)\n", list, ltype, strerror(errno)));
481 }
482 return ret;
483 }
484
485 /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
486 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
487 {
488 return _tdb_lock(tdb, list, ltype, F_SETLK);
489 }
490
491
492 /* unlock the database: returns void because it's too late for errors. */
493 /* changed to return int: it may be interesting to know whether there
494 has been an error --simo */
495 int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
496 {
497 int ret = -1;
498 int i;
499 struct tdb_lock_type *lck = NULL;
500 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
501
502 ltype &= ~TDB_MARK_LOCK;
503
504 /* a global lock allows us to avoid per chain locks */
505 if (tdb->global_lock.count &&
506 (ltype == tdb->global_lock.ltype || ltype == F_RDLCK)) {
507 return 0;
508 }
509
510 if (tdb->global_lock.count) {
511 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
512 }
513
514 if (tdb->flags & TDB_NOLOCK)
515 return 0;
516
517 /* Sanity checks */
518 if (list < -1 || list >= (int)tdb->header.hash_size) {
519 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
520 return ret;
521 }
522
523 for (i=0; i<tdb->num_lockrecs; i++) {
524 if (tdb->lockrecs[i].list == list) {
525 lck = &tdb->lockrecs[i];
526 break;
527 }
528 }
529
530 if ((lck == NULL) || (lck->count == 0)) {
531 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
532 return -1;
533 }
534
535 if (lck->count > 1) {
536 lck->count--;
537 return 0;
538 }
539
540 /*
541 * This lock has count==1 left, so we need to unlock it in the
542 * kernel. We don't bother with decrementing the in-memory array
543 * element, we're about to overwrite it with the last array element
544 * anyway.
545 */
546
547 if (mark_lock) {
548 ret = 0;
549 } else {
550 ret = tdb->methods->tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK,
551 F_SETLKW, 0, 1);
552 }
553 tdb->num_locks--;
554
555 /*
556 * Shrink the array by overwriting the element just unlocked with the
557 * last array element.
558 */
559
560 if (tdb->num_lockrecs > 1) {
561 *lck = tdb->lockrecs[tdb->num_lockrecs-1];
562 }
563 tdb->num_lockrecs -= 1;
564
565 /*
566 * We don't bother with realloc when the array shrinks, but if we have
567 * a completely idle tdb we should get rid of the locked array.
568 */
569
570 if (tdb->num_lockrecs == 0) {
571 SAFE_FREE(tdb->lockrecs);
572 }
573
574 if (ret)
575 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
576 return ret;
577 }
578
579 /*
580 get the transaction lock
581 */
582 int tdb_transaction_lock(struct tdb_context *tdb, int ltype)
583 {
584 if (tdb->have_transaction_lock || tdb->global_lock.count) {
585 return 0;
586 }
587 if (tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, ltype,
588 F_SETLKW, 0, 1) == -1) {
589 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_lock: failed to get transaction lock\n"));
590 tdb->ecode = TDB_ERR_LOCK;
591 return -1;
592 }
593 tdb->have_transaction_lock = 1;
594 return 0;
595 }
596
597 /*
598 release the transaction lock
599 */
600 int tdb_transaction_unlock(struct tdb_context *tdb)
601 {
602 int ret;
603 if (!tdb->have_transaction_lock) {
604 return 0;
605 }
606 ret = tdb->methods->tdb_brlock(tdb, TRANSACTION_LOCK, F_UNLCK, F_SETLKW, 0, 1);
607 if (ret == 0) {
608 tdb->have_transaction_lock = 0;
609 }
610 return ret;
611 }
612
613
614
615
616 /* lock/unlock entire database */
617 static int _tdb_lockall(struct tdb_context *tdb, int ltype, int op)
618 {
619 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
620
621 ltype &= ~TDB_MARK_LOCK;
622
623 /* There are no locks on read-only dbs */
624 if (tdb->read_only || tdb->traverse_read)
625 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
626
627 if (tdb->global_lock.count && tdb->global_lock.ltype == ltype) {
628 tdb->global_lock.count++;
629 return 0;
630 }
631
632 if (tdb->global_lock.count) {
633 /* a global lock of a different type exists */
634 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
635 }
636
637 if (tdb->num_locks != 0) {
638 /* can't combine global and chain locks */
639 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
640 }
641
642 if (!mark_lock &&
643 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, ltype, op,
644 0, 4*tdb->header.hash_size)) {
645 if (op == F_SETLKW) {
646 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lockall failed (%s)\n", strerror(errno)));
647 }
648 return -1;
649 }
650
651 tdb->global_lock.count = 1;
652 tdb->global_lock.ltype = ltype;
653
654 return 0;
655 }
656
657
658
659 /* unlock entire db */
660 static int _tdb_unlockall(struct tdb_context *tdb, int ltype)
661 {
662 bool mark_lock = ((ltype & TDB_MARK_LOCK) == TDB_MARK_LOCK);
663
664 ltype &= ~TDB_MARK_LOCK;
665
666 /* There are no locks on read-only dbs */
667 if (tdb->read_only || tdb->traverse_read) {
668 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
669 }
670
671 if (tdb->global_lock.ltype != ltype || tdb->global_lock.count == 0) {
672 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
673 }
674
675 if (tdb->global_lock.count > 1) {
676 tdb->global_lock.count--;
677 return 0;
678 }
679
680 if (!mark_lock &&
681 tdb->methods->tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW,
682 0, 4*tdb->header.hash_size)) {
683 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
684 return -1;
685 }
686
687 tdb->global_lock.count = 0;
688 tdb->global_lock.ltype = 0;
689
690 return 0;
691 }
692
693 /* lock entire database with write lock */
694 int tdb_lockall(struct tdb_context *tdb)
695 {
696 return _tdb_lockall(tdb, F_WRLCK, F_SETLKW);
697 }
698
699 /* lock entire database with write lock - mark only */
700 int tdb_lockall_mark(struct tdb_context *tdb)
701 {
702 return _tdb_lockall(tdb, F_WRLCK | TDB_MARK_LOCK, F_SETLKW);
703 }
704
705 /* unlock entire database with write lock - unmark only */
706 int tdb_lockall_unmark(struct tdb_context *tdb)
707 {
708 return _tdb_unlockall(tdb, F_WRLCK | TDB_MARK_LOCK);
709 }
710
711 /* lock entire database with write lock - non-blocking variant */
712 int tdb_lockall_nonblock(struct tdb_context *tdb)
713 {
714 return _tdb_lockall(tdb, F_WRLCK, F_SETLK);
715 }
716
717 /* unlock entire database with write lock */
718 int tdb_unlockall(struct tdb_context *tdb)
719 {
720 return _tdb_unlockall(tdb, F_WRLCK);
721 }
722
723 /* lock entire database with read lock */
724 int tdb_lockall_read(struct tdb_context *tdb)
725 {
726 return _tdb_lockall(tdb, F_RDLCK, F_SETLKW);
727 }
728
729 /* lock entire database with read lock - non-blocking variant */
730 int tdb_lockall_read_nonblock(struct tdb_context *tdb)
731 {
732 return _tdb_lockall(tdb, F_RDLCK, F_SETLK);
733 }
734
735 /* unlock entire database with read lock */
736 int tdb_unlockall_read(struct tdb_context *tdb)
737 {
738 return _tdb_unlockall(tdb, F_RDLCK);
739 }
740
741 /* lock/unlock one hash chain. This is meant to be used to reduce
742 contention - it cannot guarantee how many records will be locked */
743 int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
744 {
745 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
746 }
747
748 /* lock/unlock one hash chain, non-blocking. This is meant to be used
749 to reduce contention - it cannot guarantee how many records will be
750 locked */
751 int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
752 {
753 return tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
754 }
755
756 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
757 int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
758 {
759 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
760 }
761
762 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
763 int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
764 {
765 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK | TDB_MARK_LOCK);
766 }
767
768 int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
769 {
770 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
771 }
772
773 int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
774 {
775 return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
776 }
777
778 int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
779 {
780 return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
781 }
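/* Illustrative caller pattern only (not part of the library): a chain lock is
 * typically used to bracket a read-modify-write of one key so that concurrent
 * writers to the same hash chain are serialised. tdb_fetch() and tdb_store()
 * are part of the public tdb API declared in tdb.h; compute_new_value() is a
 * hypothetical helper.
 *
 *	if (tdb_chainlock(tdb, key) == 0) {
 *		TDB_DATA old = tdb_fetch(tdb, key);
 *		TDB_DATA val = compute_new_value(old);
 *		tdb_store(tdb, key, val, TDB_REPLACE);
 *		free(old.dptr);
 *		tdb_chainunlock(tdb, key);
 *	}
 */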
782
783
784
785 /* record lock stops delete underneath */
786 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
787 {
788 return off ? tdb->methods->tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0, 1) : 0;
789 }
790
791 /*
792 Write locks override our own fcntl readlocks, so check it here.
793 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
794 an error to fail to get the lock here.
795 */
796 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
797 {
798 struct tdb_traverse_lock *i;
799 for (i = &tdb->travlocks; i; i = i->next)
800 if (i->off == off)
801 return -1;
802 return tdb->methods->tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1, 1);
803 }
804
805 /*
806 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
807 an error to fail to get the lock here.
808 */
809 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
810 {
811 return tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0, 1);
812 }
813
814 /* fcntl locks don't stack: avoid unlocking someone else's */
815 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
816 {
817 struct tdb_traverse_lock *i;
818 u32 count = 0;
819
820 if (off == 0)
821 return 0;
822 for (i = &tdb->travlocks; i; i = i->next)
823 if (i->off == off)
824 count++;
825 return (count == 1 ? tdb->methods->tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0, 1) : 0);
826 }
827
828 /* file: io.c */
829
830 /* check for an out of bounds access - if it is out of bounds then
831 see if the database has been expanded by someone else and expand
832 if necessary
833 note that "len" is the minimum length needed for the db
834 */
835 static int tdb_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
836 {
837 struct stat st;
838 if (len <= tdb->map_size)
839 return 0;
840 if (tdb->flags & TDB_INTERNAL) {
841 if (!probe) {
842 /* Ensure ecode is set for log fn. */
843 tdb->ecode = TDB_ERR_IO;
844 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond internal malloc size %d\n",
845 (int)len, (int)tdb->map_size));
846 }
847 return TDB_ERRCODE(TDB_ERR_IO, -1);
848 }
849
850 if (fstat(tdb->fd, &st) == -1) {
851 return TDB_ERRCODE(TDB_ERR_IO, -1);
852 }
853
854 if (st.st_size < (size_t)len) {
855 if (!probe) {
856 /* Ensure ecode is set for log fn. */
857 tdb->ecode = TDB_ERR_IO;
858 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %d beyond eof at %d\n",
859 (int)len, (int)st.st_size));
860 }
861 return TDB_ERRCODE(TDB_ERR_IO, -1);
862 }
863
864 /* Unmap, update size, remap */
865 if (tdb_munmap(tdb) == -1)
866 return TDB_ERRCODE(TDB_ERR_IO, -1);
867 tdb->map_size = st.st_size;
868 tdb_mmap(tdb);
869 return 0;
870 }
871
872 /* write a lump of data at a specified offset */
873 static int tdb_write(struct tdb_context *tdb, tdb_off_t off,
874 const void *buf, tdb_len_t len)
875 {
876 if (len == 0) {
877 return 0;
878 }
879
880 if (tdb->read_only || tdb->traverse_read) {
881 tdb->ecode = TDB_ERR_RDONLY;
882 return -1;
883 }
884
885 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0)
886 return -1;
887
888 if (tdb->map_ptr) {
889 memcpy(off + (char *)tdb->map_ptr, buf, len);
890 } else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
891 /* Ensure ecode is set for log fn. */
892 tdb->ecode = TDB_ERR_IO;
893 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d len=%d (%s)\n",
894 off, len, strerror(errno)));
895 return TDB_ERRCODE(TDB_ERR_IO, -1);
896 }
897 return 0;
898 }
899
900 /* Endian conversion: we only ever deal with 4 byte quantities */
901 void *tdb_convert(void *buf, u32 size)
902 {
903 u32 i, *p = (u32 *)buf;
904 for (i = 0; i < size / 4; i++)
905 p[i] = TDB_BYTEREV(p[i]);
906 return buf;
907 }
908
909
910 /* read a lump of data at a specified offset, maybe convert */
911 static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
912 tdb_len_t len, int cv)
913 {
914 if (tdb->methods->tdb_oob(tdb, off + len, 0) != 0) {
915 return -1;
916 }
917
918 if (tdb->map_ptr) {
919 memcpy(buf, off + (char *)tdb->map_ptr, len);
920 } else {
921 ssize_t ret = pread(tdb->fd, buf, len, off);
922 if (ret != (ssize_t)len) {
923 /* Ensure ecode is set for log fn. */
924 tdb->ecode = TDB_ERR_IO;
925 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
926 "len=%d ret=%d (%s) map_size=%d\n",
927 (int)off, (int)len, (int)ret, strerror(errno),
928 (int)tdb->map_size));
929 return TDB_ERRCODE(TDB_ERR_IO, -1);
930 }
931 }
932 if (cv) {
933 tdb_convert(buf, len);
934 }
935 return 0;
936 }
937
938
939
940 /*
941 do an unlocked scan of the hash table heads to find the next non-zero head. The value
942 will then be confirmed with the lock held
943 */
944 static void tdb_next_hash_chain(struct tdb_context *tdb, u32 *chain)
945 {
946 u32 h = *chain;
947 if (tdb->map_ptr) {
948 for (;h < tdb->header.hash_size;h++) {
949 if (0 != *(u32 *)(TDB_HASH_TOP(h) + (unsigned char *)tdb->map_ptr)) {
950 break;
951 }
952 }
953 } else {
954 u32 off=0;
955 for (;h < tdb->header.hash_size;h++) {
956 if (tdb_ofs_read(tdb, TDB_HASH_TOP(h), &off) != 0 || off != 0) {
957 break;
958 }
959 }
960 }
961 (*chain) = h;
962 }
963
964
965 int tdb_munmap(struct tdb_context *tdb)
966 {
967 if (tdb->flags & TDB_INTERNAL)
968 return 0;
969
970 #ifdef HAVE_MMAP
971 if (tdb->map_ptr) {
972 int ret = munmap(tdb->map_ptr, tdb->map_size);
973 if (ret != 0)
974 return ret;
975 }
976 #endif
977 tdb->map_ptr = NULL;
978 return 0;
979 }
980
981 void tdb_mmap(struct tdb_context *tdb)
982 {
983 if (tdb->flags & TDB_INTERNAL)
984 return;
985
986 #ifdef HAVE_MMAP
987 if (!(tdb->flags & TDB_NOMMAP)) {
988 tdb->map_ptr = mmap(NULL, tdb->map_size,
989 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
990 MAP_SHARED|MAP_FILE, tdb->fd, 0);
991
992 /*
993 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
994 */
995
996 if (tdb->map_ptr == MAP_FAILED) {
997 tdb->map_ptr = NULL;
998 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
999 tdb->map_size, strerror(errno)));
1000 }
1001 } else {
1002 tdb->map_ptr = NULL;
1003 }
1004 #else
1005 tdb->map_ptr = NULL;
1006 #endif
1007 }
1008
1009 /* expand a file. we prefer to use ftruncate, as that is what posix
1010 says to use for mmap expansion */
1011 static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
1012 {
1013 char buf[1024];
1014
1015 if (tdb->read_only || tdb->traverse_read) {
1016 tdb->ecode = TDB_ERR_RDONLY;
1017 return -1;
1018 }
1019
1020 if (ftruncate(tdb->fd, size+addition) == -1) {
1021 char b = 0;
1022 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
1023 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n",
1024 size+addition, strerror(errno)));
1025 return -1;
1026 }
1027 }
1028
1029 /* now fill the file with something. This ensures that the
1030 file isn't sparse, which would be very bad if we ran out of
1031 disk. This must be done with write, not via mmap */
1032 memset(buf, TDB_PAD_BYTE, sizeof(buf));
1033 while (addition) {
1034 int n = addition>sizeof(buf)?sizeof(buf):addition;
1035 int ret = pwrite(tdb->fd, buf, n, size);
1036 if (ret != n) {
1037 TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of %d failed (%s)\n",
1038 n, strerror(errno)));
1039 return -1;
1040 }
1041 addition -= n;
1042 size += n;
1043 }
1044 return 0;
1045 }
1046
1047
1048 /* expand the database at least size bytes by expanding the underlying
1049 file and doing the mmap again if necessary */
1050 int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
1051 {
1052 struct list_struct rec;
1053 tdb_off_t offset;
1054
1055 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
1056 TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
1057 return -1;
1058 }
1059
1060 /* must know about any previous expansions by another process */
1061 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1062
1063 /* always make room for at least 10 more records, and round
1064 the database up to a multiple of the page size */
1065 size = TDB_ALIGN(tdb->map_size + size*10, tdb->page_size) - tdb->map_size;
1066
1067 if (!(tdb->flags & TDB_INTERNAL))
1068 tdb_munmap(tdb);
1069
1070 /*
1071 * We must ensure the file is unmapped before doing this
1072 * to ensure consistency with systems like OpenBSD where
1073 * writes and mmaps are not consistent.
1074 */
1075
1076 /* expand the file itself */
1077 if (!(tdb->flags & TDB_INTERNAL)) {
1078 if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
1079 goto fail;
1080 }
1081
1082 tdb->map_size += size;
1083
1084 if (tdb->flags & TDB_INTERNAL) {
1085 char *new_map_ptr = (char *)realloc(tdb->map_ptr,
1086 tdb->map_size);
1087 if (!new_map_ptr) {
1088 tdb->map_size -= size;
1089 goto fail;
1090 }
1091 tdb->map_ptr = new_map_ptr;
1092 } else {
1093 /*
1094 * We must ensure the file is remapped before adding the space
1095 * to ensure consistency with systems like OpenBSD where
1096 * writes and mmaps are not consistent.
1097 */
1098
1099 /* We're ok if the mmap fails as we'll fallback to read/write */
1100 tdb_mmap(tdb);
1101 }
1102
1103 /* form a new freelist record */
1104 memset(&rec,'\0',sizeof(rec));
1105 rec.rec_len = size - sizeof(rec);
1106
1107 /* link it into the free list */
1108 offset = tdb->map_size - size;
1109 if (tdb_free(tdb, offset, &rec) == -1)
1110 goto fail;
1111
1112 tdb_unlock(tdb, -1, F_WRLCK);
1113 return 0;
1114 fail:
1115 tdb_unlock(tdb, -1, F_WRLCK);
1116 return -1;
1117 }
1118
1119 /* read/write a tdb_off_t */
1120 int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1121 {
1122 return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
1123 }
1124
1125 int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
1126 {
1127 tdb_off_t off = *d;
1128 return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
1129 }
1130
1131
1132 /* read a lump of data, allocating the space for it */
1133 unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
1134 {
1135 unsigned char *buf;
1136
1137 /* some systems don't like zero length malloc */
1138 if (len == 0) {
1139 len = 1;
1140 }
1141
1142 if (!(buf = (unsigned char *)malloc(len))) {
1143 /* Ensure ecode is set for log fn. */
1144 tdb->ecode = TDB_ERR_OOM;
1145 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
1146 len, strerror(errno)));
1147 return TDB_ERRCODE(TDB_ERR_OOM, buf);
1148 }
1149 if (tdb->methods->tdb_read(tdb, offset, buf, len, 0) == -1) {
1150 SAFE_FREE(buf);
1151 return NULL;
1152 }
1153 return buf;
1154 }
1155
1156 /* Give a piece of tdb data to a parser */
1157
1158 int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
1159 tdb_off_t offset, tdb_len_t len,
1160 int (*parser)(TDB_DATA key, TDB_DATA data,
1161 void *private_data),
1162 void *private_data)
1163 {
1164 TDB_DATA data;
1165 int result;
1166
1167 data.dsize = len;
1168
1169 if ((tdb->transaction == NULL) && (tdb->map_ptr != NULL)) {
1170 /*
1171 * Optimize by avoiding the malloc/memcpy/free, point the
1172 * parser directly at the mmap area.
1173 */
1174 if (tdb->methods->tdb_oob(tdb, offset+len, 0) != 0) {
1175 return -1;
1176 }
1177 data.dptr = offset + (unsigned char *)tdb->map_ptr;
1178 return parser(key, data, private_data);
1179 }
1180
1181 if (!(data.dptr = tdb_alloc_read(tdb, offset, len))) {
1182 return -1;
1183 }
1184
1185 result = parser(key, data, private_data);
1186 free(data.dptr);
1187 return result;
1188 }
1189
1190 /* read/write a record */
1191 int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1192 {
1193 if (tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
1194 return -1;
1195 if (TDB_BAD_MAGIC(rec)) {
1196 /* Ensure ecode is set for log fn. */
1197 tdb->ecode = TDB_ERR_CORRUPT;
1198 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
1199 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
1200 }
1201 return tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0);
1202 }
1203
1204 int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
1205 {
1206 struct list_struct r = *rec;
1207 return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
1208 }
1209
1210 static const struct tdb_methods io_methods = {
1211 tdb_read,
1212 tdb_write,
1213 tdb_next_hash_chain,
1214 tdb_oob,
1215 tdb_expand_file,
1216 tdb_brlock
1217 };
1218
1219 /*
1220 initialise the default methods table
1221 */
1222 void tdb_io_init(struct tdb_context *tdb)
1223 {
1224 tdb->methods = &io_methods;
1225 }
1226
1227 /* file: transaction.c */
1228
1229 /*
1230 transaction design:
1231
1232 - only allow a single transaction at a time per database. This makes
1233 using the transaction API simpler, as otherwise the caller would
1234 have to cope with temporary failures in transactions that conflict
1235 with other current transactions
1236
1237 - keep the transaction recovery information in the same file as the
1238 database, using a special 'transaction recovery' record pointed at
1239 by the header. This removes the need for extra journal files as
1240 used by some other databases
1241
1242 - dynamically allocate the transaction recovery record, re-using it
1243 for subsequent transactions. If a larger record is needed then
1244 tdb_free() the old record to place it on the normal tdb freelist
1245 before allocating the new record
1246
1247 - during transactions, keep a linked list of all writes that have
1248 been performed by intercepting all tdb_write() calls. The hooked
1249 transaction versions of tdb_read() and tdb_write() check this
1250 linked list and try to use the elements of the list in preference
1251 to the real database.
1252
1253 - don't allow any locks to be held when a transaction starts,
1254 otherwise we can end up with deadlock (plus lack of lock nesting
1255 in posix locks would mean the lock is lost)
1256
1257 - if the caller gains a lock during the transaction but doesn't
1258 release it then fail the commit
1259
1260 - allow for nested calls to tdb_transaction_start(), re-using the
1261 existing transaction record. If the inner transaction is cancelled
1262 then a subsequent commit will fail
1263
1264 - keep a mirrored copy of the tdb hash chain heads to allow for the
1265 fast hash heads scan on traverse, updating the mirrored copy in
1266 the transaction version of tdb_write
1267
1268 - allow callers to mix transaction and non-transaction use of tdb,
1269 although once a transaction is started then an exclusive lock is
1270 gained until the transaction is committed or cancelled
1271
1272 - the commit strategy involves first saving away all modified data
1273 into a linearised buffer in the transaction recovery area, then
1274 marking the transaction recovery area with a magic value to
1275 indicate a valid recovery record. In total 4 fsync/msync calls are
1276 needed per commit to prevent race conditions. It might be possible
1277 to reduce this to 3 or even 2 with some more work.
1278
1279 - check for a valid recovery record on open of the tdb, while the
1280 global lock is held. Automatically recover from the transaction
1281 recovery area if needed, then continue with the open as
1282 usual. This allows for smooth crash recovery with no administrator
1283 intervention.
1284
1285 - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
1286 still available, but no transaction recovery area is used and no
1287 fsync/msync calls are made.
1288
1289 */
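/* Illustrative caller pattern only (not part of the library): how the
 * transaction API described above is meant to be used. tdb_store() and
 * tdb_transaction_commit() belong to the public tdb API declared in tdb.h;
 * on commit either both stores become visible or neither does.
 *
 *	if (tdb_transaction_start(tdb) != 0)
 *		return -1;
 *	if (tdb_store(tdb, key1, val1, TDB_REPLACE) != 0 ||
 *	    tdb_store(tdb, key2, val2, TDB_REPLACE) != 0) {
 *		tdb_transaction_cancel(tdb);
 *		return -1;
 *	}
 *	return tdb_transaction_commit(tdb);
 */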
1290
1291 struct tdb_transaction_el {
1292 struct tdb_transaction_el *next, *prev;
1293 tdb_off_t offset;
1294 tdb_len_t length;
1295 unsigned char *data;
1296 };
1297
1298 /*
1299 hold the context of any current transaction
1300 */
1301 struct tdb_transaction {
1302 /* we keep a mirrored copy of the tdb hash heads here so
1303 tdb_next_hash_chain() can operate efficiently */
1304 u32 *hash_heads;
1305
1306 /* the original io methods - used to do IOs to the real db */
1307 const struct tdb_methods *io_methods;
1308
1309 /* the list of transaction elements. We use a doubly linked
1310 list with a last pointer to allow us to keep the list
1311 ordered, with first element at the front of the list. It
1312 needs to be doubly linked as the read/write traversals need
1313 to be backwards, while the commit needs to be forwards */
1314 struct tdb_transaction_el *elements, *elements_last;
1315
1316 /* non-zero when an internal transaction error has
1317 occurred. All write operations will then fail until the
1318 transaction is ended */
1319 int transaction_error;
1320
1321 /* when inside a transaction we need to keep track of any
1322 nested tdb_transaction_start() calls, as these are allowed,
1323 but don't create a new transaction */
1324 int nesting;
1325
1326 /* old file size before transaction */
1327 tdb_len_t old_map_size;
1328 };
1329
1330
1331 /*
1332 read while in a transaction. We need to check first if the data is in our list
1333 of transaction elements, then if not do a real read
1334 */
1335 static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf,
1336 tdb_len_t len, int cv)
1337 {
1338 struct tdb_transaction_el *el;
1339
1340 /* we need to walk the list backwards to get the most recent data */
1341 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1342 tdb_len_t partial;
1343
1344 if (off+len <= el->offset) {
1345 continue;
1346 }
1347 if (off >= el->offset + el->length) {
1348 continue;
1349 }
1350
1351 /* an overlapping read - needs to be split into up to
1352 2 reads and a memcpy */
1353 if (off < el->offset) {
1354 partial = el->offset - off;
1355 if (transaction_read(tdb, off, buf, partial, cv) != 0) {
1356 goto fail;
1357 }
1358 len -= partial;
1359 off += partial;
1360 buf = (void *)(partial + (char *)buf);
1361 }
1362 if (off + len <= el->offset + el->length) {
1363 partial = len;
1364 } else {
1365 partial = el->offset + el->length - off;
1366 }
1367 memcpy(buf, el->data + (off - el->offset), partial);
1368 if (cv) {
1369 tdb_convert(buf, len);
1370 }
1371 len -= partial;
1372 off += partial;
1373 buf = (void *)(partial + (char *)buf);
1374
1375 if (len != 0 && transaction_read(tdb, off, buf, len, cv) != 0) {
1376 goto fail;
1377 }
1378
1379 return 0;
1380 }
1381
1382 /* it's not in the transaction elements - do a real read */
1383 return tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv);
1384
1385 fail:
1386 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%d len=%d\n", off, len));
1387 tdb->ecode = TDB_ERR_IO;
1388 tdb->transaction->transaction_error = 1;
1389 return -1;
1390 }
1391
1392
1393 /*
1394 write while in a transaction
1395 */
1396 static int transaction_write(struct tdb_context *tdb, tdb_off_t off,
1397 const void *buf, tdb_len_t len)
1398 {
1399 struct tdb_transaction_el *el, *best_el=NULL;
1400
1401 if (len == 0) {
1402 return 0;
1403 }
1404
1405 /* if the write is to a hash head, then update the transaction
1406 hash heads */
1407 if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
1408 off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
1409 u32 chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
1410 memcpy(&tdb->transaction->hash_heads[chain], buf, len);
1411 }
1412
1413 /* first see if we can replace an existing entry */
1414 for (el=tdb->transaction->elements_last;el;el=el->prev) {
1415 tdb_len_t partial;
1416
1417 if (best_el == NULL && off == el->offset+el->length) {
1418 best_el = el;
1419 }
1420
1421 if (off+len <= el->offset) {
1422 continue;
1423 }
1424 if (off >= el->offset + el->length) {
1425 continue;
1426 }
1427
1428 /* an overlapping write - needs to be split into up to
1429 2 writes and a memcpy */
1430 if (off < el->offset) {
1431 partial = el->offset - off;
1432 if (transaction_write(tdb, off, buf, partial) != 0) {
1433 goto fail;
1434 }
1435 len -= partial;
1436 off += partial;
1437 buf = (const void *)(partial + (const char *)buf);
1438 }
1439 if (off + len <= el->offset + el->length) {
1440 partial = len;
1441 } else {
1442 partial = el->offset + el->length - off;
1443 }
1444 memcpy(el->data + (off - el->offset), buf, partial);
1445 len -= partial;
1446 off += partial;
1447 buf = (const void *)(partial + (const char *)buf);
1448
1449 if (len != 0 && transaction_write(tdb, off, buf, len) != 0) {
1450 goto fail;
1451 }
1452
1453 return 0;
1454 }
1455
1456 /* see if we can append the new entry to an existing entry */
1457 if (best_el && best_el->offset + best_el->length == off &&
1458 (off+len < tdb->transaction->old_map_size ||
1459 off > tdb->transaction->old_map_size)) {
1460 unsigned char *data = best_el->data;
1461 el = best_el;
1462 el->data = (unsigned char *)realloc(el->data,
1463 el->length + len);
1464 if (el->data == NULL) {
1465 tdb->ecode = TDB_ERR_OOM;
1466 tdb->transaction->transaction_error = 1;
1467 el->data = data;
1468 return -1;
1469 }
1470 if (buf) {
1471 memcpy(el->data + el->length, buf, len);
1472 } else {
1473 memset(el->data + el->length, TDB_PAD_BYTE, len);
1474 }
1475 el->length += len;
1476 return 0;
1477 }
1478
1479 /* add a new entry at the end of the list */
1480 el = (struct tdb_transaction_el *)malloc(sizeof(*el));
1481 if (el == NULL) {
1482 tdb->ecode = TDB_ERR_OOM;
1483 tdb->transaction->transaction_error = 1;
1484 return -1;
1485 }
1486 el->next = NULL;
1487 el->prev = tdb->transaction->elements_last;
1488 el->offset = off;
1489 el->length = len;
1490 el->data = (unsigned char *)malloc(len);
1491 if (el->data == NULL) {
1492 free(el);
1493 tdb->ecode = TDB_ERR_OOM;
1494 tdb->transaction->transaction_error = 1;
1495 return -1;
1496 }
1497 if (buf) {
1498 memcpy(el->data, buf, len);
1499 } else {
1500 memset(el->data, TDB_PAD_BYTE, len);
1501 }
1502 if (el->prev) {
1503 el->prev->next = el;
1504 } else {
1505 tdb->transaction->elements = el;
1506 }
1507 tdb->transaction->elements_last = el;
1508 return 0;
1509
1510 fail:
1511 TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%d len=%d\n", off, len));
1512 tdb->ecode = TDB_ERR_IO;
1513 tdb->transaction->transaction_error = 1;
1514 return -1;
1515 }
1516
1517 /*
1518 accelerated hash chain head search, using the cached hash heads
1519 */
1520 static void transaction_next_hash_chain(struct tdb_context *tdb, u32 *chain)
1521 {
1522 u32 h = *chain;
1523 for (;h < tdb->header.hash_size;h++) {
1524 /* the +1 takes account of the freelist */
1525 if (0 != tdb->transaction->hash_heads[h+1]) {
1526 break;
1527 }
1528 }
1529 (*chain) = h;
1530 }
1531
1532 /*
1533 out of bounds check during a transaction
1534 */
1535 static int transaction_oob(struct tdb_context *tdb, tdb_off_t len, int probe)
1536 {
1537 if (len <= tdb->map_size) {
1538 return 0;
1539 }
1540 return TDB_ERRCODE(TDB_ERR_IO, -1);
1541 }
1542
1543 /*
1544 transaction version of tdb_expand().
1545 */
1546 static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size,
1547 tdb_off_t addition)
1548 {
1549 /* add a write to the transaction elements, so subsequent
1550 reads see the zero data */
1551 if (transaction_write(tdb, size, NULL, addition) != 0) {
1552 return -1;
1553 }
1554
1555 return 0;
1556 }
1557
1558 /*
1559 brlock during a transaction - ignore them
1560 */
1561 static int transaction_brlock(struct tdb_context *tdb, tdb_off_t offset,
1562 int rw_type, int lck_type, int probe, size_t len)
1563 {
1564 return 0;
1565 }
1566
1567 static const struct tdb_methods transaction_methods = {
1568 transaction_read,
1569 transaction_write,
1570 transaction_next_hash_chain,
1571 transaction_oob,
1572 transaction_expand_file,
1573 transaction_brlock
1574 };
1575
1576
1577 /*
1578 start a tdb transaction. No token is returned, as only a single
1579 transaction is allowed to be pending per tdb_context
1580 */
1581 int tdb_transaction_start(struct tdb_context *tdb)
1582 {
1583 /* some sanity checks */
1584 if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
1585 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
1586 tdb->ecode = TDB_ERR_EINVAL;
1587 return -1;
1588 }
1589
1590 /* cope with nested tdb_transaction_start() calls */
1591 if (tdb->transaction != NULL) {
1592 tdb->transaction->nesting++;
1593 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n",
1594 tdb->transaction->nesting));
1595 return 0;
1596 }
1597
1598 if (tdb->num_locks != 0 || tdb->global_lock.count) {
1599 /* the caller must not have any locks when starting a
1600 transaction as otherwise we'll be screwed by lack
1601 of nested locks in posix */
1602 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
1603 tdb->ecode = TDB_ERR_LOCK;
1604 return -1;
1605 }
1606
1607 if (tdb->travlocks.next != NULL) {
1608 /* you cannot use transactions inside a traverse (although you can use
1609 traverse inside a transaction) as otherwise you can end up with
1610 deadlock */
1611 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
1612 tdb->ecode = TDB_ERR_LOCK;
1613 return -1;
1614 }
1615
1616 tdb->transaction = (struct tdb_transaction *)
1617 calloc(sizeof(struct tdb_transaction), 1);
1618 if (tdb->transaction == NULL) {
1619 tdb->ecode = TDB_ERR_OOM;
1620 return -1;
1621 }
1622
1623 /* get the transaction write lock. This is a blocking lock. As
1624 discussed with Volker, there are a number of ways we could
1625 make this async, which we will probably do in the future */
1626 if (tdb_transaction_lock(tdb, F_WRLCK) == -1) {
1627 SAFE_FREE(tdb->transaction);
1628 return -1;
1629 }
1630
1631 /* get a read lock from the freelist to the end of file. This
1632 is upgraded to a write lock during the commit */
1633 if (tdb_brlock(tdb, FREELIST_TOP, F_RDLCK, F_SETLKW, 0, 0) == -1) {
1634 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
1635 tdb->ecode = TDB_ERR_LOCK;
1636 goto fail;
1637 }
1638
1639 /* setup a copy of the hash table heads so the hash scan in
1640 traverse can be fast */
1641 tdb->transaction->hash_heads = (u32 *)
1642 calloc(tdb->header.hash_size+1, sizeof(u32));
1643 if (tdb->transaction->hash_heads == NULL) {
1644 tdb->ecode = TDB_ERR_OOM;
1645 goto fail;
1646 }
1647 if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
1648 TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
1649 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
1650 tdb->ecode = TDB_ERR_IO;
1651 goto fail;
1652 }
1653
1654 /* make sure we know about any file expansions already done by
1655 anyone else */
1656 tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1657 tdb->transaction->old_map_size = tdb->map_size;
1658
1659 /* finally hook the io methods, replacing them with
1660 transaction specific methods */
1661 tdb->transaction->io_methods = tdb->methods;
1662 tdb->methods = &transaction_methods;
1663
1664 /* by calling this transaction write here, we ensure that we don't grow the
1665 transaction linked list due to hash table updates */
1666 if (transaction_write(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
1667 TDB_HASHTABLE_SIZE(tdb)) != 0) {
1668 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to prime hash table\n"));
1669 tdb->ecode = TDB_ERR_IO;
1670 tdb->methods = tdb->transaction->io_methods;
1671 goto fail;
1672 }
1673
1674 return 0;
1675
1676 fail:
1677 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
1678 tdb_transaction_unlock(tdb);
1679 SAFE_FREE(tdb->transaction->hash_heads);
1680 SAFE_FREE(tdb->transaction);
1681 return -1;
1682 }
1683
1684
1685 /*
1686 cancel the current transaction
1687 */
1688 int tdb_transaction_cancel(struct tdb_context *tdb)
1689 {
1690 if (tdb->transaction == NULL) {
1691 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
1692 return -1;
1693 }
1694
1695 if (tdb->transaction->nesting != 0) {
1696 tdb->transaction->transaction_error = 1;
1697 tdb->transaction->nesting--;
1698 return 0;
1699 }
1700
1701 tdb->map_size = tdb->transaction->old_map_size;
1702
1703 /* free all the transaction elements */
1704 while (tdb->transaction->elements) {
1705 struct tdb_transaction_el *el = tdb->transaction->elements;
1706 tdb->transaction->elements = el->next;
1707 free(el->data);
1708 free(el);
1709 }
1710
1711 /* remove any global lock created during the transaction */
1712 if (tdb->global_lock.count != 0) {
1713 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 4*tdb->header.hash_size);
1714 tdb->global_lock.count = 0;
1715 }
1716
1717 /* remove any locks created during the transaction */
1718 if (tdb->num_locks != 0) {
1719 int i;
1720 for (i=0;i<tdb->num_lockrecs;i++) {
1721 tdb_brlock(tdb,FREELIST_TOP+4*tdb->lockrecs[i].list,
1722 F_UNLCK,F_SETLKW, 0, 1);
1723 }
1724 tdb->num_locks = 0;
1725 tdb->num_lockrecs = 0;
1726 SAFE_FREE(tdb->lockrecs);
1727 }
1728
1729 /* restore the normal io methods */
1730 tdb->methods = tdb->transaction->io_methods;
1731
1732 tdb_brlock(tdb, FREELIST_TOP, F_UNLCK, F_SETLKW, 0, 0);
1733 tdb_transaction_unlock(tdb);
1734 SAFE_FREE(tdb->transaction->hash_heads);
1735 SAFE_FREE(tdb->transaction);
1736
1737 return 0;
1738 }
1739
1740 /*
1741 sync to disk
1742 */
1743 static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
1744 {
1745 if (fsync(tdb->fd) != 0) {
1746 tdb->ecode = TDB_ERR_IO;
1747 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
1748 return -1;
1749 }
1750 #if defined(HAVE_MSYNC) && defined(MS_SYNC)
1751 if (tdb->map_ptr) {
1752 tdb_off_t moffset = offset & ~(tdb->page_size-1);
1753 if (msync(moffset + (char *)tdb->map_ptr,
1754 length + (offset - moffset), MS_SYNC) != 0) {
1755 tdb->ecode = TDB_ERR_IO;
1756 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
1757 strerror(errno)));
1758 return -1;
1759 }
1760 }
1761 #endif
1762 return 0;
1763 }
1764
1765
1766 /*
1767 work out how much space the linearised recovery data will consume
1768 */
1769 static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
1770 {
1771 struct tdb_transaction_el *el;
1772 tdb_len_t recovery_size = 0;
1773
1774 recovery_size = sizeof(u32);
1775 for (el=tdb->transaction->elements;el;el=el->next) {
1776 if (el->offset >= tdb->transaction->old_map_size) {
1777 continue;
1778 }
1779 recovery_size += 2*sizeof(tdb_off_t) + el->length;
1780 }
1781
1782 return recovery_size;
1783 }
1784
1785 /*
1786 allocate the recovery area, or use an existing recovery area if it is
1787 large enough
1788 */
1789 static int tdb_recovery_allocate(struct tdb_context *tdb,
1790 tdb_len_t *recovery_size,
1791 tdb_off_t *recovery_offset,
1792 tdb_len_t *recovery_max_size)
1793 {
1794 struct list_struct rec;
1795 const struct tdb_methods *methods = tdb->transaction->io_methods;
1796 tdb_off_t recovery_head;
1797
1798 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
1799 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
1800 return -1;
1801 }
1802
1803 rec.rec_len = 0;
1804
1805 if (recovery_head != 0 &&
1806 methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
1807 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery record\n"));
1808 return -1;
1809 }
1810
1811 *recovery_size = tdb_recovery_size(tdb);
1812
1813 if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
1814 /* it fits in the existing area */
1815 *recovery_max_size = rec.rec_len;
1816 *recovery_offset = recovery_head;
1817 return 0;
1818 }
1819
1820 /* we need to free up the old recovery area, then allocate a
1821 new one at the end of the file. Note that we cannot use
1822 tdb_allocate() to allocate the new one as that might return
1823 us an area that is being currently used (as of the start of
1824 the transaction) */
1825 if (recovery_head != 0) {
1826 if (tdb_free(tdb, recovery_head, &rec) == -1) {
1827 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to free previous recovery area\n"));
1828 return -1;
1829 }
1830 }
1831
1832 /* the tdb_free() call might have increased the recovery size */
1833 *recovery_size = tdb_recovery_size(tdb);
1834
1835 /* round up to a multiple of page size */
1836 *recovery_max_size = TDB_ALIGN(sizeof(rec) + *recovery_size, tdb->page_size) - sizeof(rec);
1837 *recovery_offset = tdb->map_size;
1838 recovery_head = *recovery_offset;
1839
1840 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
1841 (tdb->map_size - tdb->transaction->old_map_size) +
1842 sizeof(rec) + *recovery_max_size) == -1) {
1843 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
1844 return -1;
1845 }
1846
1847 /* remap the file (if using mmap) */
1848 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
1849
1850 /* we have to reset the old map size so that we don't try to expand the file
1851 again in the transaction commit, which would destroy the recovery area */
1852 tdb->transaction->old_map_size = tdb->map_size;
1853
1854 /* write the recovery header offset and sync - we can sync without a race here
1855 as the magic ptr in the recovery record has not been set */
1856 CONVERT(recovery_head);
1857 if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD,
1858 &recovery_head, sizeof(tdb_off_t)) == -1) {
1859 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
1860 return -1;
1861 }
1862
1863 return 0;
1864 }
1865
1866
1867 /*
1868 setup the recovery data that will be used on a crash during commit
1869 */
1870 static int transaction_setup_recovery(struct tdb_context *tdb,
1871 tdb_off_t *magic_offset)
1872 {
1873 struct tdb_transaction_el *el;
1874 tdb_len_t recovery_size;
1875 unsigned char *data, *p;
1876 const struct tdb_methods *methods = tdb->transaction->io_methods;
1877 struct list_struct *rec;
1878 tdb_off_t recovery_offset, recovery_max_size;
1879 tdb_off_t old_map_size = tdb->transaction->old_map_size;
1880 u32 magic, tailer;
1881
1882 /*
1883 check that the recovery area has enough space
1884 */
1885 if (tdb_recovery_allocate(tdb, &recovery_size,
1886 &recovery_offset, &recovery_max_size) == -1) {
1887 return -1;
1888 }
1889
1890 data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
1891 if (data == NULL) {
1892 tdb->ecode = TDB_ERR_OOM;
1893 return -1;
1894 }
1895
1896 rec = (struct list_struct *)data;
1897 memset(rec, 0, sizeof(*rec));
1898
1899 rec->magic = 0;
1900 rec->data_len = recovery_size;
1901 rec->rec_len = recovery_max_size;
1902 rec->key_len = old_map_size;
1903 CONVERT(rec);
1904
1905 /* build the recovery data into a single blob to allow us to do a single
1906 large write, which should be more efficient */
1907 p = data + sizeof(*rec);
1908 for (el=tdb->transaction->elements;el;el=el->next) {
1909 if (el->offset >= old_map_size) {
1910 continue;
1911 }
1912 if (el->offset + el->length > tdb->transaction->old_map_size) {
1913 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
1914 free(data);
1915 tdb->ecode = TDB_ERR_CORRUPT;
1916 return -1;
1917 }
1918 memcpy(p, &el->offset, 4);
1919 memcpy(p+4, &el->length, 4);
1920 if (DOCONV()) {
1921 tdb_convert(p, 8);
1922 }
1923 /* the recovery area contains the old data, not the
1924 new data, so we have to call the original tdb_read
1925 method to get it */
1926 if (methods->tdb_read(tdb, el->offset, p + 8, el->length, 0) != 0) {
1927 free(data);
1928 tdb->ecode = TDB_ERR_IO;
1929 return -1;
1930 }
1931 p += 8 + el->length;
1932 }
1933
1934 /* and the tailer */
1935 tailer = sizeof(*rec) + recovery_max_size;
1936 memcpy(p, &tailer, 4);
1937 CONVERT(p);
1938
1939 /* write the recovery data to the recovery area */
1940 if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
1941 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
1942 free(data);
1943 tdb->ecode = TDB_ERR_IO;
1944 return -1;
1945 }
1946
1947 /* as we don't have ordered writes, we have to sync the recovery
1948 data before we update the magic to indicate that the recovery
1949 data is present */
1950 if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
1951 free(data);
1952 return -1;
1953 }
1954
1955 free(data);
1956
1957 magic = TDB_RECOVERY_MAGIC;
1958 CONVERT(magic);
1959
1960 *magic_offset = recovery_offset + offsetof(struct list_struct, magic);
1961
1962 if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
1963 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
1964 tdb->ecode = TDB_ERR_IO;
1965 return -1;
1966 }
1967
1968 /* ensure the recovery magic marker is on disk */
1969 if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
1970 return -1;
1971 }
1972
1973 return 0;
1974 }
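
/* Illustrative sketch, reconstructed from the code above rather than taken
   from the original comments: the on-disk recovery area written by
   transaction_setup_recovery() looks like

     [list_struct header]  magic    = 0, set to TDB_RECOVERY_MAGIC once synced
                           data_len = recovery_size
                           rec_len  = recovery_max_size
                           key_len  = old_map_size (used as the recovery eof)
     then, for each transaction element inside the old file size:
       [4 bytes]  offset of the region in the database
       [4 bytes]  length of the region
       [length]   the OLD contents of that region
     [4 bytes]    tailer = sizeof(header) + recovery_max_size

   tdb_transaction_recover() below replays the (offset, length, old data)
   triples to roll the file back to its pre-transaction state. */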
1975
1976 /*
1977 commit the current transaction
1978 */
1979 int tdb_transaction_commit(struct tdb_context *tdb)
1980 {
1981 const struct tdb_methods *methods;
1982 tdb_off_t magic_offset = 0;
1983 u32 zero = 0;
1984
1985 if (tdb->transaction == NULL) {
1986 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
1987 return -1;
1988 }
1989
1990 if (tdb->transaction->transaction_error) {
1991 tdb->ecode = TDB_ERR_IO;
1992 tdb_transaction_cancel(tdb);
1993 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
1994 return -1;
1995 }
1996
1997 if (tdb->transaction->nesting != 0) {
1998 tdb->transaction->nesting--;
1999 return 0;
2000 }
2001
2002 /* check for a null transaction */
2003 if (tdb->transaction->elements == NULL) {
2004 tdb_transaction_cancel(tdb);
2005 return 0;
2006 }
2007
2008 methods = tdb->transaction->io_methods;
2009
2010 /* if there are any locks pending then the caller has not
2011 nested their locks properly, so fail the transaction */
2012 if (tdb->num_locks || tdb->global_lock.count) {
2013 tdb->ecode = TDB_ERR_LOCK;
2014 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: locks pending on commit\n"));
2015 tdb_transaction_cancel(tdb);
2016 return -1;
2017 }
2018
2019 /* upgrade the main transaction lock region to a write lock */
2020 if (tdb_brlock_upgrade(tdb, FREELIST_TOP, 0) == -1) {
2021 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to upgrade hash locks\n"));
2022 tdb->ecode = TDB_ERR_LOCK;
2023 tdb_transaction_cancel(tdb);
2024 return -1;
2025 }
2026
2027 /* get the global lock - this prevents new users attaching to the database
2028 during the commit */
2029 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
2030 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: failed to get global lock\n"));
2031 tdb->ecode = TDB_ERR_LOCK;
2032 tdb_transaction_cancel(tdb);
2033 return -1;
2034 }
2035
2036 if (!(tdb->flags & TDB_NOSYNC)) {
2037 /* write the recovery data to the end of the file */
2038 if (transaction_setup_recovery(tdb, &magic_offset) == -1) {
2039 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to setup recovery data\n"));
2040 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2041 tdb_transaction_cancel(tdb);
2042 return -1;
2043 }
2044 }
2045
2046 /* expand the file to the new size if needed */
2047 if (tdb->map_size != tdb->transaction->old_map_size) {
2048 if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size,
2049 tdb->map_size -
2050 tdb->transaction->old_map_size) == -1) {
2051 tdb->ecode = TDB_ERR_IO;
2052 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: expansion failed\n"));
2053 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2054 tdb_transaction_cancel(tdb);
2055 return -1;
2056 }
2057 tdb->map_size = tdb->transaction->old_map_size;
2058 methods->tdb_oob(tdb, tdb->map_size + 1, 1);
2059 }
2060
2061 /* perform all the writes */
2062 while (tdb->transaction->elements) {
2063 struct tdb_transaction_el *el = tdb->transaction->elements;
2064
2065 if (methods->tdb_write(tdb, el->offset, el->data, el->length) == -1) {
2066 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
2067
2068 /* we've overwritten part of the data and
2069 possibly expanded the file, so we need to
2070 run the crash recovery code */
2071 tdb->methods = methods;
2072 tdb_transaction_recover(tdb);
2073
2074 tdb_transaction_cancel(tdb);
2075 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2076
2077 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
2078 return -1;
2079 }
2080 tdb->transaction->elements = el->next;
2081 free(el->data);
2082 free(el);
2083 }
2084
2085 if (!(tdb->flags & TDB_NOSYNC)) {
2086 /* ensure the new data is on disk */
2087 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2088 return -1;
2089 }
2090
2091 /* remove the recovery marker */
2092 if (methods->tdb_write(tdb, magic_offset, &zero, 4) == -1) {
2093 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: failed to remove recovery magic\n"));
2094 return -1;
2095 }
2096
2097 /* ensure the recovery marker has been removed on disk */
2098 if (transaction_sync(tdb, magic_offset, 4) == -1) {
2099 return -1;
2100 }
2101 }
2102
2103 tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1);
2104
2105 /*
2106 TODO: maybe write to some dummy hdr field, or write to magic
2107 offset without mmap, before the last sync, instead of the
2108 utime() call
2109 */
2110
2111 /* on some systems (like Linux 2.6.x) changes via mmap/msync
2112 don't change the mtime of the file, this means the file may
2113 not be backed up (as tdb rounding to block sizes means that
2114 file size changes are quite rare too). The following forces
2115 mtime changes when a transaction completes */
2116 #ifdef HAVE_UTIME
2117 utime(tdb->name, NULL);
2118 #endif
2119
2120 /* use a transaction cancel to free memory and remove the
2121 transaction locks */
2122 tdb_transaction_cancel(tdb);
2123 return 0;
2124 }
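
/* Illustrative sketch, not part of the library: a typical caller wraps a
   group of updates in a transaction roughly as below.  tdb_transaction_start()
   is declared in tdb.h; the keys and values are invented for the example. */
#if 0
static int example_store_two_keys(struct tdb_context *tdb)
{
	TDB_DATA key1, key2, val;

	if (tdb_transaction_start(tdb) == -1) {
		return -1;
	}

	key1.dptr = (unsigned char *)"key1";   key1.dsize = 4;
	key2.dptr = (unsigned char *)"key2";   key2.dsize = 4;
	val.dptr  = (unsigned char *)"value";  val.dsize = 5;

	/* both stores become visible atomically when the commit succeeds */
	if (tdb_store(tdb, key1, val, TDB_REPLACE) == -1 ||
	    tdb_store(tdb, key2, val, TDB_REPLACE) == -1) {
		tdb_transaction_cancel(tdb);
		return -1;
	}

	return tdb_transaction_commit(tdb);
}
#endif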
2125
2126
2127 /*
2128 recover from an aborted transaction. Must be called with exclusive
2129 database write access already established (including the global
2130 lock to prevent new processes attaching)
2131 */
2132 int tdb_transaction_recover(struct tdb_context *tdb)
2133 {
2134 tdb_off_t recovery_head, recovery_eof;
2135 unsigned char *data, *p;
2136 u32 zero = 0;
2137 struct list_struct rec;
2138
2139 /* find the recovery area */
2140 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
2141 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
2142 tdb->ecode = TDB_ERR_IO;
2143 return -1;
2144 }
2145
2146 if (recovery_head == 0) {
2147 /* we have never allocated a recovery record */
2148 return 0;
2149 }
2150
2151 /* read the recovery record */
2152 if (tdb->methods->tdb_read(tdb, recovery_head, &rec,
2153 sizeof(rec), DOCONV()) == -1) {
2154 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));
2155 tdb->ecode = TDB_ERR_IO;
2156 return -1;
2157 }
2158
2159 if (rec.magic != TDB_RECOVERY_MAGIC) {
2160 /* there is no valid recovery data */
2161 return 0;
2162 }
2163
2164 if (tdb->read_only) {
2165 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
2166 tdb->ecode = TDB_ERR_CORRUPT;
2167 return -1;
2168 }
2169
2170 recovery_eof = rec.key_len;
2171
2172 data = (unsigned char *)malloc(rec.data_len);
2173 if (data == NULL) {
2174 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));
2175 tdb->ecode = TDB_ERR_OOM;
2176 return -1;
2177 }
2178
2179 /* read the full recovery data */
2180 if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
2181 rec.data_len, 0) == -1) {
2182 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));
2183 tdb->ecode = TDB_ERR_IO;
2184 return -1;
2185 }
2186
2187 /* recover the file data */
2188 p = data;
2189 while (p+8 < data + rec.data_len) {
2190 u32 ofs, len;
2191 if (DOCONV()) {
2192 tdb_convert(p, 8);
2193 }
2194 memcpy(&ofs, p, 4);
2195 memcpy(&len, p+4, 4);
2196
2197 if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
2198 free(data);
2199 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
2200 tdb->ecode = TDB_ERR_IO;
2201 return -1;
2202 }
2203 p += 8 + len;
2204 }
2205
2206 free(data);
2207
2208 if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
2209 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
2210 tdb->ecode = TDB_ERR_IO;
2211 return -1;
2212 }
2213
2214 /* if the recovery area is after the recovered eof then remove it */
2215 if (recovery_eof <= recovery_head) {
2216 if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
2217 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
2218 tdb->ecode = TDB_ERR_IO;
2219 return -1;
2220 }
2221 }
2222
2223 /* remove the recovery magic */
2224 if (tdb_ofs_write(tdb, recovery_head + offsetof(struct list_struct, magic),
2225 &zero) == -1) {
2226 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
2227 tdb->ecode = TDB_ERR_IO;
2228 return -1;
2229 }
2230
2231 /* reduce the file size to the old size */
2232 tdb_munmap(tdb);
2233 if (ftruncate(tdb->fd, recovery_eof) != 0) {
2234 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to reduce to recovery size\n"));
2235 tdb->ecode = TDB_ERR_IO;
2236 return -1;
2237 }
2238 tdb->map_size = recovery_eof;
2239 tdb_mmap(tdb);
2240
2241 if (transaction_sync(tdb, 0, recovery_eof) == -1) {
2242 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
2243 tdb->ecode = TDB_ERR_IO;
2244 return -1;
2245 }
2246
2247 TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n",
2248 recovery_eof));
2249
2250 /* all done */
2251 return 0;
2252 }
2253
2254 /* file: freelist.c */
2255
2256 /* read a freelist record and check for simple errors */
2257 static int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct list_struct *rec)
2258 {
2259 if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
2260 return -1;
2261
2262 if (rec->magic == TDB_MAGIC) {
2263 /* this happens when an app is shut down while deleting a record - we should
2264 not completely fail when this happens */
2265 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
2266 rec->magic, off));
2267 rec->magic = TDB_FREE_MAGIC;
2268 if (tdb->methods->tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
2269 return -1;
2270 }
2271
2272 if (rec->magic != TDB_FREE_MAGIC) {
2273 /* Ensure ecode is set for log fn. */
2274 tdb->ecode = TDB_ERR_CORRUPT;
2275 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%d\n",
2276 rec->magic, off));
2277 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2278 }
2279 if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
2280 return -1;
2281 return 0;
2282 }
2283
2284
2285
2286 /* Remove an element from the freelist. Must have alloc lock. */
2287 static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
2288 {
2289 tdb_off_t last_ptr, i;
2290
2291 /* read in the freelist top */
2292 last_ptr = FREELIST_TOP;
2293 while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
2294 if (i == off) {
2295 /* We've found it! */
2296 return tdb_ofs_write(tdb, last_ptr, &next);
2297 }
2298 /* Follow chain (next offset is at start of record) */
2299 last_ptr = i;
2300 }
2301 TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off));
2302 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2303 }
2304
2305
2306 /* update a record tailer (must hold allocation lock) */
2307 static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
2308 const struct list_struct *rec)
2309 {
2310 tdb_off_t totalsize;
2311
2312 /* Offset of tailer from record header */
2313 totalsize = sizeof(*rec) + rec->rec_len;
2314 return tdb_ofs_write(tdb, offset + totalsize - sizeof(tdb_off_t),
2315 &totalsize);
2316 }
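
/* Illustrative sketch, reconstructed from the code above rather than taken
   from the original comments: a record on disk is laid out as

     offset                                  offset + sizeof(rec) + rec_len
     |                                                                    |
     [list_struct header][key][data][spare space...][tailer (tdb_off_t)]

   where the tailer holds sizeof(header) + rec_len.  tdb_free() below reads
   the tailer of the record immediately to its left to jump back to that
   record's header, which is what makes merging adjacent free records
   possible. */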
2317
2318 /* Add an element into the freelist. Merge adjacent records if
2319 necessary. */
2320 int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct list_struct *rec)
2321 {
2322 tdb_off_t right, left;
2323
2324 /* Allocation and tailer lock */
2325 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
2326 return -1;
2327
2328 /* set an initial tailer, so if we fail we don't leave a bogus record */
2329 if (update_tailer(tdb, offset, rec) != 0) {
2330 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
2331 goto fail;
2332 }
2333
2334 /* Look right first (I'm an Australian, dammit) */
2335 right = offset + sizeof(*rec) + rec->rec_len;
2336 if (right + sizeof(*rec) <= tdb->map_size) {
2337 struct list_struct r;
2338
2339 if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
2340 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
2341 goto left;
2342 }
2343
2344 /* If it's free, expand to include it. */
2345 if (r.magic == TDB_FREE_MAGIC) {
2346 if (remove_from_freelist(tdb, right, r.next) == -1) {
2347 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
2348 goto left;
2349 }
2350 rec->rec_len += sizeof(r) + r.rec_len;
2351 }
2352 }
2353
2354 left:
2355 /* Look left */
2356 left = offset - sizeof(tdb_off_t);
2357 if (left > TDB_DATA_START(tdb->header.hash_size)) {
2358 struct list_struct l;
2359 tdb_off_t leftsize;
2360
2361 /* Read in tailer and jump back to header */
2362 if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
2363 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left offset read failed at %u\n", left));
2364 goto update;
2365 }
2366
2367 /* it could be uninitialised data */
2368 if (leftsize == 0 || leftsize == TDB_PAD_U32) {
2369 goto update;
2370 }
2371
2372 left = offset - leftsize;
2373
2374 /* Now read in record */
2375 if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
2376 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
2377 goto update;
2378 }
2379
2380 /* If it's free, expand to include it. */
2381 if (l.magic == TDB_FREE_MAGIC) {
2382 if (remove_from_freelist(tdb, left, l.next) == -1) {
2383 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left free failed at %u\n", left));
2384 goto update;
2385 } else {
2386 offset = left;
2387 rec->rec_len += leftsize;
2388 }
2389 }
2390 }
2391
2392 update:
2393 if (update_tailer(tdb, offset, rec) == -1) {
2394 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
2395 goto fail;
2396 }
2397
2398 /* Now, prepend to free list */
2399 rec->magic = TDB_FREE_MAGIC;
2400
2401 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
2402 tdb_rec_write(tdb, offset, rec) == -1 ||
2403 tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
2404 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%d\n", offset));
2405 goto fail;
2406 }
2407
2408 /* And we're done. */
2409 tdb_unlock(tdb, -1, F_WRLCK);
2410 return 0;
2411
2412 fail:
2413 tdb_unlock(tdb, -1, F_WRLCK);
2414 return -1;
2415 }
2416
2417
2418 /*
2419 the core of tdb_allocate - called when we have decided which
2420 free list entry to use
2421 */
2422 static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, tdb_len_t length, tdb_off_t rec_ptr,
2423 struct list_struct *rec, tdb_off_t last_ptr)
2424 {
2425 struct list_struct newrec;
2426 tdb_off_t newrec_ptr;
2427
2428 memset(&newrec, '\0', sizeof(newrec));
2429
2430 /* found it - now possibly split it up */
2431 if (rec->rec_len > length + MIN_REC_SIZE) {
2432 /* Length of left piece */
2433 length = TDB_ALIGN(length, TDB_ALIGNMENT);
2434
2435 /* Right piece to go on free list */
2436 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
2437 newrec_ptr = rec_ptr + sizeof(*rec) + length;
2438
2439 /* And left record is shortened */
2440 rec->rec_len = length;
2441 } else {
2442 newrec_ptr = 0;
2443 }
2444
2445 /* Remove allocated record from the free list */
2446 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
2447 return 0;
2448 }
2449
2450 /* Update header: do this before we drop alloc
2451 lock, otherwise tdb_free() might try to
2452 merge with us, thinking we're free.
2453 (Thanks Jeremy Allison). */
2454 rec->magic = TDB_MAGIC;
2455 if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
2456 return 0;
2457 }
2458
2459 /* Did we create new block? */
2460 if (newrec_ptr) {
2461 /* Update allocated record tailer (we
2462 shortened it). */
2463 if (update_tailer(tdb, rec_ptr, rec) == -1) {
2464 return 0;
2465 }
2466
2467 /* Free new record */
2468 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
2469 return 0;
2470 }
2471 }
2472
2473 /* all done - return the new record offset */
2474 return rec_ptr;
2475 }
2476
2477 /* allocate some space from the free list. The offset returned points
2478 to an unconnected list_struct within the database with room for at
2479 least length bytes of total data
2480
2481 0 is returned if the space could not be allocated
2482 */
2483 tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct list_struct *rec)
2484 {
2485 tdb_off_t rec_ptr, last_ptr, newrec_ptr;
2486 struct {
2487 tdb_off_t rec_ptr, last_ptr;
2488 tdb_len_t rec_len;
2489 } bestfit;
2490
2491 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
2492 return 0;
2493
2494 /* Extra bytes required for tailer */
2495 length += sizeof(tdb_off_t);
2496
2497 again:
2498 last_ptr = FREELIST_TOP;
2499
2500 /* read in the freelist top */
2501 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
2502 goto fail;
2503
2504 bestfit.rec_ptr = 0;
2505 bestfit.last_ptr = 0;
2506 bestfit.rec_len = 0;
2507
2508 /*
2509 this is a best fit allocation strategy. Originally we used
2510 a first fit strategy, but it suffered from massive fragmentation
2511 issues when faced with a slowly increasing record size.
2512 */
2513 while (rec_ptr) {
2514 if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) {
2515 goto fail;
2516 }
2517
2518 if (rec->rec_len >= length) {
2519 if (bestfit.rec_ptr == 0 ||
2520 rec->rec_len < bestfit.rec_len) {
2521 bestfit.rec_len = rec->rec_len;
2522 bestfit.rec_ptr = rec_ptr;
2523 bestfit.last_ptr = last_ptr;
2524 /* consider a fit to be good enough if
2525 we aren't wasting more than half
2526 the space */
2527 if (bestfit.rec_len < 2*length) {
2528 break;
2529 }
2530 }
2531 }
2532
2533 /* move to the next record */
2534 last_ptr = rec_ptr;
2535 rec_ptr = rec->next;
2536 }
2537
2538 if (bestfit.rec_ptr != 0) {
2539 if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
2540 goto fail;
2541 }
2542
2543 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
2544 tdb_unlock(tdb, -1, F_WRLCK);
2545 return newrec_ptr;
2546 }
2547
2548 /* we didn't find enough space. See if we can expand the
2549 database and if we can then try again */
2550 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
2551 goto again;
2552 fail:
2553 tdb_unlock(tdb, -1, F_WRLCK);
2554 return 0;
2555 }
2556
2557 /* file: freelistcheck.c */
2558
2559 /* Check the freelist is good and contains no loops.
2560 Very memory intensive - only do this as a consistency
2561 checker. Heh heh - uses an in memory tdb as the storage
2562 for the "seen" record list. For some reason this strikes
2563 me as extremely clever as I don't have to write another tree
2564 data structure implementation :-).
2565 */
2566
2567 static int seen_insert(struct tdb_context *mem_tdb, tdb_off_t rec_ptr)
2568 {
2569 TDB_DATA key, data;
2570
2571 memset(&data, '\0', sizeof(data));
2572 key.dptr = (unsigned char *)&rec_ptr;
2573 key.dsize = sizeof(rec_ptr);
2574 return tdb_store(mem_tdb, key, data, TDB_INSERT);
2575 }
2576
2577 int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries)
2578 {
2579 struct tdb_context *mem_tdb = NULL;
2580 struct list_struct rec;
2581 tdb_off_t rec_ptr, last_ptr;
2582 int ret = -1;
2583
2584 *pnum_entries = 0;
2585
2586 mem_tdb = tdb_open("flval", tdb->header.hash_size,
2587 TDB_INTERNAL, O_RDWR, 0600);
2588 if (!mem_tdb) {
2589 return -1;
2590 }
2591
2592 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
2593 tdb_close(mem_tdb);
2594 return 0;
2595 }
2596
2597 last_ptr = FREELIST_TOP;
2598
2599 /* Store the FREELIST_TOP record. */
2600 if (seen_insert(mem_tdb, last_ptr) == -1) {
2601 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2602 goto fail;
2603 }
2604
2605 /* read in the freelist top */
2606 if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) {
2607 goto fail;
2608 }
2609
2610 while (rec_ptr) {
2611
2612 /* If we can't store this record (we've seen it
2613 before) then the free list has a loop and must
2614 be corrupt. */
2615
2616 if (seen_insert(mem_tdb, rec_ptr)) {
2617 ret = TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
2618 goto fail;
2619 }
2620
2621 if (tdb_rec_free_read(tdb, rec_ptr, &rec) == -1) {
2622 goto fail;
2623 }
2624
2625 /* move to the next record */
2626 last_ptr = rec_ptr;
2627 rec_ptr = rec.next;
2628 *pnum_entries += 1;
2629 }
2630
2631 ret = 0;
2632
2633 fail:
2634
2635 tdb_close(mem_tdb);
2636 tdb_unlock(tdb, -1, F_WRLCK);
2637 return ret;
2638 }
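
/* Illustrative sketch, not part of the library: how a consistency checker
   might call tdb_validate_freelist().  The function name used as a caller
   here is invented for the example. */
#if 0
static void example_check_freelist(struct tdb_context *tdb)
{
	int num_entries;

	if (tdb_validate_freelist(tdb, &num_entries) == -1) {
		printf("freelist is corrupt\n");
	} else {
		printf("freelist ok, %d entries\n", num_entries);
	}
}
#endif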
2639
2640 /* file: traverse.c */
2641
2642 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
2643 static int tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock *tlock,
2644 struct list_struct *rec)
2645 {
2646 int want_next = (tlock->off != 0);
2647
2648 /* Lock each chain from the start one. */
2649 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
2650 if (!tlock->off && tlock->hash != 0) {
2651 /* this is an optimisation for the common case where
2652 the hash chain is empty, which is particularly
2653 common for the use of tdb with ldb, where large
2654 hashes are used. In that case we spend most of our
2655 time in tdb_brlock(), locking empty hash chains.
2656
2657 To avoid this, we do an unlocked pre-check to see
2658 if the hash chain is empty before starting to look
2659 inside it. If it is empty then we can avoid that
2660 hash chain. If it isn't empty then we can't believe
2661 the value we get back, as we read it without a
2662 lock, so instead we get the lock and re-fetch the
2663 value below.
2664
2665 Notice that not doing this optimisation on the
2666 first hash chain is critical. We must guarantee
2667 that we have done at least one fcntl lock at the
2668 start of a search to guarantee that memory is
2669 coherent on SMP systems. If records are added by
2670 others during the search then that's OK, and we
2671 could possibly miss those with this trick, but we
2672 could miss them anyway without this trick, so the
2673 semantics don't change.
2674
2675 With a non-indexed ldb search this trick gains us a
2676 factor of around 80 in speed on a linux 2.6.x
2677 system (testing using ldbtest).
2678 */
2679 tdb->methods->next_hash_chain(tdb, &tlock->hash);
2680 if (tlock->hash == tdb->header.hash_size) {
2681 continue;
2682 }
2683 }
2684
2685 if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
2686 return -1;
2687
2688 /* No previous record? Start at top of chain. */
2689 if (!tlock->off) {
2690 if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
2691 &tlock->off) == -1)
2692 goto fail;
2693 } else {
2694 /* Otherwise unlock the previous record. */
2695 if (tdb_unlock_record(tdb, tlock->off) != 0)
2696 goto fail;
2697 }
2698
2699 if (want_next) {
2700 /* We have offset of old record: grab next */
2701 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2702 goto fail;
2703 tlock->off = rec->next;
2704 }
2705
2706 /* Iterate through chain */
2707 while( tlock->off) {
2708 tdb_off_t current;
2709 if (tdb_rec_read(tdb, tlock->off, rec) == -1)
2710 goto fail;
2711
2712 /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
2713 if (tlock->off == rec->next) {
2714 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: loop detected.\n"));
2715 goto fail;
2716 }
2717
2718 if (!TDB_DEAD(rec)) {
2719 /* Woohoo: we found one! */
2720 if (tdb_lock_record(tdb, tlock->off) != 0)
2721 goto fail;
2722 return tlock->off;
2723 }
2724
2725 /* Try to clean dead ones from old traverses */
2726 current = tlock->off;
2727 tlock->off = rec->next;
2728 if (!(tdb->read_only || tdb->traverse_read) &&
2729 tdb_do_delete(tdb, current, rec) != 0)
2730 goto fail;
2731 }
2732 tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
2733 want_next = 0;
2734 }
2735 /* We finished iteration without finding anything */
2736 return TDB_ERRCODE(TDB_SUCCESS, 0);
2737
2738 fail:
2739 tlock->off = 0;
2740 if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
2741 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
2742 return -1;
2743 }
2744
2745 /* traverse the entire database - calling fn(tdb, key, data) on each element.
2746 return -1 on error or the record count traversed
2747 if fn is NULL then it is not called
2748 a non-zero return value from fn() indicates that the traversal should stop
2749 */
2750 static int tdb_traverse_internal(struct tdb_context *tdb,
2751 tdb_traverse_func fn, void *private_data,
2752 struct tdb_traverse_lock *tl)
2753 {
2754 TDB_DATA key, dbuf;
2755 struct list_struct rec;
2756 int ret, count = 0;
2757
2758 /* This was in the initialization, above, but the IRIX compiler
2759 * did not like it. crh
2760 */
2761 tl->next = tdb->travlocks.next;
2762
2763 /* fcntl locks don't stack: beware traverse inside traverse */
2764 tdb->travlocks.next = tl;
2765
2766 /* tdb_next_lock places locks on the record returned, and its chain */
2767 while ((ret = tdb_next_lock(tdb, tl, &rec)) > 0) {
2768 count++;
2769 /* now read the full record */
2770 key.dptr = tdb_alloc_read(tdb, tl->off + sizeof(rec),
2771 rec.key_len + rec.data_len);
2772 if (!key.dptr) {
2773 ret = -1;
2774 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
2775 goto out;
2776 if (tdb_unlock_record(tdb, tl->off) != 0)
2777 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
2778 goto out;
2779 }
2780 key.dsize = rec.key_len;
2781 dbuf.dptr = key.dptr + rec.key_len;
2782 dbuf.dsize = rec.data_len;
2783
2784 /* Drop chain lock, call out */
2785 if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
2786 ret = -1;
2787 SAFE_FREE(key.dptr);
2788 goto out;
2789 }
2790 if (fn && fn(tdb, key, dbuf, private_data)) {
2791 /* They want us to terminate traversal */
2792 ret = count;
2793 if (tdb_unlock_record(tdb, tl->off) != 0) {
2794 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: unlock_record failed!\n"));
2795 ret = -1;
2796 }
2797 SAFE_FREE(key.dptr);
2798 goto out;
2799 }
2800 SAFE_FREE(key.dptr);
2801 }
2802 out:
2803 tdb->travlocks.next = tl->next;
2804 if (ret < 0)
2805 return -1;
2806 else
2807 return count;
2808 }
2809
2810
2811 /*
2812 a read style traverse - temporarily marks the db read only
2813 */
2814 int tdb_traverse_read(struct tdb_context *tdb,
2815 tdb_traverse_func fn, void *private_data)
2816 {
2817 struct tdb_traverse_lock tl = { NULL, 0, 0, F_RDLCK };
2818 int ret;
2819
2820 /* we need to get a read lock on the transaction lock here to
2821 cope with the lock ordering semantics of solaris10 */
2822 if (tdb_transaction_lock(tdb, F_RDLCK)) {
2823 return -1;
2824 }
2825
2826 tdb->traverse_read++;
2827 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2828 tdb->traverse_read--;
2829
2830 tdb_transaction_unlock(tdb);
2831
2832 return ret;
2833 }
2834
2835 /*
2836 a write style traverse - needs to get the transaction lock to
2837 prevent deadlocks
2838 */
2839 int tdb_traverse(struct tdb_context *tdb,
2840 tdb_traverse_func fn, void *private_data)
2841 {
2842 struct tdb_traverse_lock tl = { NULL, 0, 0, F_WRLCK };
2843 int ret;
2844
2845 if (tdb->read_only || tdb->traverse_read) {
2846 return tdb_traverse_read(tdb, fn, private_data);
2847 }
2848
2849 if (tdb_transaction_lock(tdb, F_WRLCK)) {
2850 return -1;
2851 }
2852
2853 ret = tdb_traverse_internal(tdb, fn, private_data, &tl);
2854
2855 tdb_transaction_unlock(tdb);
2856
2857 return ret;
2858 }
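
/* Illustrative sketch, not part of the library: a minimal traverse caller.
   The callback signature matches tdb_traverse_func from tdb.h; returning
   non-zero from the callback stops the traversal early.  The function names
   here are invented for the example. */
#if 0
static int example_print_record(struct tdb_context *tdb, TDB_DATA key,
				TDB_DATA dbuf, void *private_data)
{
	int *count = (int *)private_data;
	(*count)++;
	printf("key of %u bytes, data of %u bytes\n",
	       (unsigned int)key.dsize, (unsigned int)dbuf.dsize);
	return 0;	/* keep going */
}

static void example_traverse(struct tdb_context *tdb)
{
	int count = 0;
	if (tdb_traverse_read(tdb, example_print_record, &count) == -1) {
		printf("traverse failed\n");
	}
}
#endif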
2859
2860
2861 /* find the first entry in the database and return its key */
2862 TDB_DATA tdb_firstkey(struct tdb_context *tdb)
2863 {
2864 TDB_DATA key;
2865 struct list_struct rec;
2866
2867 /* release any old lock */
2868 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0)
2869 return tdb_null;
2870 tdb->travlocks.off = tdb->travlocks.hash = 0;
2871 tdb->travlocks.lock_rw = F_RDLCK;
2872
2873 /* Grab first record: locks chain and returned record. */
2874 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
2875 return tdb_null;
2876 /* now read the key */
2877 key.dsize = rec.key_len;
2878 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
2879
2880 /* Unlock the hash chain of the record we just read. */
2881 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
2882 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
2883 return key;
2884 }
2885
2886 /* find the next entry in the database, returning its key */
2887 TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
2888 {
2889 u32 oldhash;
2890 TDB_DATA key = tdb_null;
2891 struct list_struct rec;
2892 unsigned char *k = NULL;
2893
2894 /* Is locked key the old key? If so, traverse will be reliable. */
2895 if (tdb->travlocks.off) {
2896 if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
2897 return tdb_null;
2898 if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
2899 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
2900 rec.key_len))
2901 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
2902 /* No, it wasn't: unlock it and start from scratch */
2903 if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0) {
2904 SAFE_FREE(k);
2905 return tdb_null;
2906 }
2907 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
2908 SAFE_FREE(k);
2909 return tdb_null;
2910 }
2911 tdb->travlocks.off = 0;
2912 }
2913
2914 SAFE_FREE(k);
2915 }
2916
2917 if (!tdb->travlocks.off) {
2918 /* No previous element: do normal find, and lock record */
2919 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), tdb->travlocks.lock_rw, &rec);
2920 if (!tdb->travlocks.off)
2921 return tdb_null;
2922 tdb->travlocks.hash = BUCKET(rec.full_hash);
2923 if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
2924 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
2925 return tdb_null;
2926 }
2927 }
2928 oldhash = tdb->travlocks.hash;
2929
2930 /* Grab next record: locks chain and returned record,
2931 unlocks old record */
2932 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
2933 key.dsize = rec.key_len;
2934 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
2935 key.dsize);
2936 /* Unlock the chain of this new record */
2937 if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
2938 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2939 }
2940 /* Unlock the chain of old record */
2941 if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
2942 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
2943 return key;
2944 }
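
/* Illustrative sketch, not part of the library: iterating keys with
   tdb_firstkey()/tdb_nextkey().  Both return a malloc'd dptr (or tdb_null,
   with dptr == NULL, at the end), so the caller must free each key. */
#if 0
static void example_walk_keys(struct tdb_context *tdb)
{
	TDB_DATA key, next;

	for (key = tdb_firstkey(tdb); key.dptr != NULL; key = next) {
		next = tdb_nextkey(tdb, key);
		printf("key of %u bytes\n", (unsigned int)key.dsize);
		free(key.dptr);
	}
}
#endif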
2945
2946 /* file: dump.c */
2947
2948 static tdb_off_t tdb_dump_record(struct tdb_context *tdb, int hash,
2949 tdb_off_t offset)
2950 {
2951 struct list_struct rec;
2952 tdb_off_t tailer_ofs, tailer;
2953
2954 if (tdb->methods->tdb_read(tdb, offset, (char *)&rec,
2955 sizeof(rec), DOCONV()) == -1) {
2956 printf("ERROR: failed to read record at %u\n", offset);
2957 return 0;
2958 }
2959
2960 printf(" rec: hash=%d offset=0x%08x next=0x%08x rec_len=%d "
2961 "key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
2962 hash, offset, rec.next, rec.rec_len, rec.key_len, rec.data_len,
2963 rec.full_hash, rec.magic);
2964
2965 tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off_t);
2966
2967 if (tdb_ofs_read(tdb, tailer_ofs, &tailer) == -1) {
2968 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
2969 return rec.next;
2970 }
2971
2972 if (tailer != rec.rec_len + sizeof(rec)) {
2973 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
2974 (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
2975 }
2976 return rec.next;
2977 }
2978
2979 static int tdb_dump_chain(struct tdb_context *tdb, int i)
2980 {
2981 tdb_off_t rec_ptr, top;
2982
2983 top = TDB_HASH_TOP(i);
2984
2985 if (tdb_lock(tdb, i, F_WRLCK) != 0)
2986 return -1;
2987
2988 if (tdb_ofs_read(tdb, top, &rec_ptr) == -1)
2989 return tdb_unlock(tdb, i, F_WRLCK);
2990
2991 if (rec_ptr)
2992 printf("hash=%d\n", i);
2993
2994 while (rec_ptr) {
2995 rec_ptr = tdb_dump_record(tdb, i, rec_ptr);
2996 }
2997
2998 return tdb_unlock(tdb, i, F_WRLCK);
2999 }
3000
3001 void tdb_dump_all(struct tdb_context *tdb)
3002 {
3003 int i;
3004 for (i=0;i<tdb->header.hash_size;i++) {
3005 tdb_dump_chain(tdb, i);
3006 }
3007 printf("freelist:\n");
3008 tdb_dump_chain(tdb, -1);
3009 }
3010
3011 int tdb_printfreelist(struct tdb_context *tdb)
3012 {
3013 int ret;
3014 long total_free = 0;
3015 tdb_off_t offset, rec_ptr;
3016 struct list_struct rec;
3017
3018 if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
3019 return ret;
3020
3021 offset = FREELIST_TOP;
3022
3023 /* read in the freelist top */
3024 if (tdb_ofs_read(tdb, offset, &rec_ptr) == -1) {
3025 tdb_unlock(tdb, -1, F_WRLCK);
3026 return 0;
3027 }
3028
3029 printf("freelist top=[0x%08x]\n", rec_ptr );
3030 while (rec_ptr) {
3031 if (tdb->methods->tdb_read(tdb, rec_ptr, (char *)&rec,
3032 sizeof(rec), DOCONV()) == -1) {
3033 tdb_unlock(tdb, -1, F_WRLCK);
3034 return -1;
3035 }
3036
3037 if (rec.magic != TDB_FREE_MAGIC) {
3038 printf("bad magic 0x%08x in free list\n", rec.magic);
3039 tdb_unlock(tdb, -1, F_WRLCK);
3040 return -1;
3041 }
3042
3043 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n",
3044 rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
3045 total_free += rec.rec_len;
3046
3047 /* move to the next record */
3048 rec_ptr = rec.next;
3049 }
3050 printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
3051 (int)total_free);
3052
3053 return tdb_unlock(tdb, -1, F_WRLCK);
3054 }
3055
3056 /* file: tdb.c */
3057
3058 /*
3059 non-blocking increment of the tdb sequence number if the tdb has been opened using
3060 the TDB_SEQNUM flag
3061 */
3062 void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
3063 {
3064 tdb_off_t seqnum=0;
3065
3066 if (!(tdb->flags & TDB_SEQNUM)) {
3067 return;
3068 }
3069
3070 /* we ignore errors from this, as we have no sane way of
3071 dealing with them.
3072 */
3073 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3074 seqnum++;
3075 tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
3076 }
3077
3078 /*
3079 increment the tdb sequence number if the tdb has been opened using
3080 the TDB_SEQNUM flag
3081 */
3082 static void tdb_increment_seqnum(struct tdb_context *tdb)
3083 {
3084 if (!(tdb->flags & TDB_SEQNUM)) {
3085 return;
3086 }
3087
3088 if (tdb_brlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, F_SETLKW, 1, 1) != 0) {
3089 return;
3090 }
3091
3092 tdb_increment_seqnum_nonblock(tdb);
3093
3094 tdb_brlock(tdb, TDB_SEQNUM_OFS, F_UNLCK, F_SETLKW, 1, 1);
3095 }
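
/* Illustrative sketch, not part of the library: the sequence number is only
   maintained when the database was opened with TDB_SEQNUM, and
   tdb_get_seqnum() (declared in tdb.h) lets a caller notice that another
   process has changed the database since it last looked.  The file name and
   function name are invented for the example. */
#if 0
static void example_watch_changes(void)
{
	struct tdb_context *tdb;
	int old_seqnum, new_seqnum;

	tdb = tdb_open("example.tdb", 0, TDB_SEQNUM, O_RDWR | O_CREAT, 0600);
	if (tdb == NULL) {
		return;
	}

	old_seqnum = tdb_get_seqnum(tdb);
	/* ... some time later ... */
	new_seqnum = tdb_get_seqnum(tdb);
	if (new_seqnum != old_seqnum) {
		printf("database changed by another process\n");
	}
	tdb_close(tdb);
}
#endif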
3096
3097 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
3098 {
3099 return memcmp(data.dptr, key.dptr, data.dsize);
3100 }
3101
3102 /* Returns 0 on fail. On success, return offset of record, and fills
3103 in rec */
3104 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, u32 hash,
3105 struct list_struct *r)
3106 {
3107 tdb_off_t rec_ptr;
3108
3109 /* read in the hash top */
3110 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3111 return 0;
3112
3113 /* keep looking until we find the right record */
3114 while (rec_ptr) {
3115 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3116 return 0;
3117
3118 if (!TDB_DEAD(r) && hash==r->full_hash
3119 && key.dsize==r->key_len
3120 && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
3121 r->key_len, tdb_key_compare,
3122 NULL) == 0) {
3123 return rec_ptr;
3124 }
3125 rec_ptr = r->next;
3126 }
3127 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3128 }
3129
3130 /* As tdb_find, but if you succeed, keep the lock */
3131 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, int locktype,
3132 struct list_struct *rec)
3133 {
3134 u32 rec_ptr;
3135
3136 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
3137 return 0;
3138 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
3139 tdb_unlock(tdb, BUCKET(hash), locktype);
3140 return rec_ptr;
3141 }
3142
3143
3144 /* update an entry in place - this only works if the new data size
3145 is <= the old data size and the key exists.
3146 on failure return -1.
3147 */
3148 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
3149 {
3150 struct list_struct rec;
3151 tdb_off_t rec_ptr;
3152
3153 /* find entry */
3154 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
3155 return -1;
3156
3157 /* must be long enough for the key, data and tailer */
3158 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
3159 tdb->ecode = TDB_SUCCESS; /* Not really an error */
3160 return -1;
3161 }
3162
3163 if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3164 dbuf.dptr, dbuf.dsize) == -1)
3165 return -1;
3166
3167 if (dbuf.dsize != rec.data_len) {
3168 /* update size */
3169 rec.data_len = dbuf.dsize;
3170 return tdb_rec_write(tdb, rec_ptr, &rec);
3171 }
3172
3173 return 0;
3174 }
3175
3176 /* find an entry in the database given a key */
3177 /* If an entry doesn't exist the tdb error code will be set to
3178 * TDB_ERR_NOEXIST. If a key has no data attached
3179 * then the TDB_DATA will have zero length but
3180 * a non-zero pointer
3181 */
3182 TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
3183 {
3184 tdb_off_t rec_ptr;
3185 struct list_struct rec;
3186 TDB_DATA ret;
3187 u32 hash;
3188
3189 /* find which hash bucket it is in */
3190 hash = tdb->hash_fn(&key);
3191 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
3192 return tdb_null;
3193
3194 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
3195 rec.data_len);
3196 ret.dsize = rec.data_len;
3197 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3198 return ret;
3199 }
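
/* Illustrative sketch, not part of the library: tdb_fetch() returns a
   malloc'd copy of the data (dptr == NULL if the key was not found), so the
   caller owns and must free the buffer.  The key used here is invented. */
#if 0
static void example_fetch(struct tdb_context *tdb)
{
	TDB_DATA key, data;

	key.dptr = (unsigned char *)"example-key";
	key.dsize = strlen("example-key");

	data = tdb_fetch(tdb, key);
	if (data.dptr == NULL) {
		printf("key not found\n");
		return;
	}
	printf("got %u bytes of data\n", (unsigned int)data.dsize);
	free(data.dptr);
}
#endif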
3200
3201 /*
3202 * Find an entry in the database and hand the record's data to a parsing
3203 * function. The parsing function is executed under the chain read lock, so it
3204 * should be fast and should not block on other syscalls.
3205 *
3206 * DONT CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
3207 *
3208 * For mmapped tdb's that do not have a transaction open it points the parsing
3209 * function directly at the mmap area, it avoids the malloc/memcpy in this
3210 * case. If a transaction is open or no mmap is available, it has to do
3211 * malloc/read/parse/free.
3212 *
3213 * This is interesting for all readers of potentially large data structures in
3214 * the tdb records, ldb indexes being one example.
3215 */
3216
3217 int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
3218 int (*parser)(TDB_DATA key, TDB_DATA data,
3219 void *private_data),
3220 void *private_data)
3221 {
3222 tdb_off_t rec_ptr;
3223 struct list_struct rec;
3224 int ret;
3225 u32 hash;
3226
3227 /* find which hash bucket it is in */
3228 hash = tdb->hash_fn(&key);
3229
3230 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
3231 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
3232 }
3233
3234 ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
3235 rec.data_len, parser, private_data);
3236
3237 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3238
3239 return ret;
3240 }
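
/* Illustrative sketch, not part of the library: a tdb_parse_record() caller.
   The parser runs under the chain read lock, so it only copies out the small
   piece it needs and must not call back into tdb.  The function names are
   invented for the example. */
#if 0
static int example_parser(TDB_DATA key, TDB_DATA data, void *private_data)
{
	u32 *first_word = (u32 *)private_data;

	if (data.dsize < sizeof(u32)) {
		return -1;
	}
	memcpy(first_word, data.dptr, sizeof(u32));
	return 0;
}

static int example_read_first_word(struct tdb_context *tdb, TDB_DATA key,
				   u32 *first_word)
{
	return tdb_parse_record(tdb, key, example_parser, first_word);
}
#endif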
3241
3242 /* check if an entry in the database exists
3243
3244 note that 1 is returned if the key is found and 0 is returned if not found
3245 this doesn't match the conventions in the rest of this module, but is
3246 compatible with gdbm
3247 */
3248 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3249 {
3250 struct list_struct rec;
3251
3252 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
3253 return 0;
3254 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
3255 return 1;
3256 }
3257
3258 int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
3259 {
3260 u32 hash = tdb->hash_fn(&key);
3261 return tdb_exists_hash(tdb, key, hash);
3262 }
3263
3264 /* actually delete an entry in the database given the offset */
3265 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct list_struct *rec)
3266 {
3267 tdb_off_t last_ptr, i;
3268 struct list_struct lastrec;
3269
3270 if (tdb->read_only || tdb->traverse_read) return -1;
3271
3272 if (tdb_write_lock_record(tdb, rec_ptr) == -1) {
3273 /* Someone traversing here: mark it as dead */
3274 rec->magic = TDB_DEAD_MAGIC;
3275 return tdb_rec_write(tdb, rec_ptr, rec);
3276 }
3277 if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
3278 return -1;
3279
3280 /* find previous record in hash chain */
3281 if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
3282 return -1;
3283 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
3284 if (tdb_rec_read(tdb, i, &lastrec) == -1)
3285 return -1;
3286
3287 /* unlink it: next ptr is at start of record. */
3288 if (last_ptr == 0)
3289 last_ptr = TDB_HASH_TOP(rec->full_hash);
3290 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
3291 return -1;
3292
3293 /* recover the space */
3294 if (tdb_free(tdb, rec_ptr, rec) == -1)
3295 return -1;
3296 return 0;
3297 }
3298
3299 static int tdb_count_dead(struct tdb_context *tdb, u32 hash)
3300 {
3301 int res = 0;
3302 tdb_off_t rec_ptr;
3303 struct list_struct rec;
3304
3305 /* read in the hash top */
3306 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3307 return 0;
3308
3309 while (rec_ptr) {
3310 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
3311 return 0;
3312
3313 if (rec.magic == TDB_DEAD_MAGIC) {
3314 res += 1;
3315 }
3316 rec_ptr = rec.next;
3317 }
3318 return res;
3319 }
3320
3321 /*
3322 * Purge all DEAD records from a hash chain
3323 */
3324 static int tdb_purge_dead(struct tdb_context *tdb, u32 hash)
3325 {
3326 int res = -1;
3327 struct list_struct rec;
3328 tdb_off_t rec_ptr;
3329
3330 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3331 return -1;
3332 }
3333
3334 /* read in the hash top */
3335 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3336 goto fail;
3337
3338 while (rec_ptr) {
3339 tdb_off_t next;
3340
3341 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
3342 goto fail;
3343 }
3344
3345 next = rec.next;
3346
3347 if (rec.magic == TDB_DEAD_MAGIC
3348 && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
3349 goto fail;
3350 }
3351 rec_ptr = next;
3352 }
3353 res = 0;
3354 fail:
3355 tdb_unlock(tdb, -1, F_WRLCK);
3356 return res;
3357 }
3358
3359 /* delete an entry in the database given a key */
3360 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, u32 hash)
3361 {
3362 tdb_off_t rec_ptr;
3363 struct list_struct rec;
3364 int ret;
3365
3366 if (tdb->max_dead_records != 0) {
3367
3368 /*
3369 * Allow for some dead records per hash chain, mainly for
3370 * tdb's with a very high create/delete rate like locking.tdb.
3371 */
3372
3373 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3374 return -1;
3375
3376 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
3377 /*
3378 * Don't let the per-chain freelist grow too large,
3379 * delete all existing dead records
3380 */
3381 tdb_purge_dead(tdb, hash);
3382 }
3383
3384 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
3385 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3386 return -1;
3387 }
3388
3389 /*
3390 * Just mark the record as dead.
3391 */
3392 rec.magic = TDB_DEAD_MAGIC;
3393 ret = tdb_rec_write(tdb, rec_ptr, &rec);
3394 }
3395 else {
3396 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
3397 &rec)))
3398 return -1;
3399
3400 ret = tdb_do_delete(tdb, rec_ptr, &rec);
3401 }
3402
3403 if (ret == 0) {
3404 tdb_increment_seqnum(tdb);
3405 }
3406
3407 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
3408 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
3409 return ret;
3410 }
3411
3412 int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
3413 {
3414 u32 hash = tdb->hash_fn(&key);
3415 return tdb_delete_hash(tdb, key, hash);
3416 }
3417
3418 /*
3419 * See if we have a dead record around with enough space
3420 */
3421 static tdb_off_t tdb_find_dead(struct tdb_context *tdb, u32 hash,
3422 struct list_struct *r, tdb_len_t length)
3423 {
3424 tdb_off_t rec_ptr;
3425
3426 /* read in the hash top */
3427 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
3428 return 0;
3429
3430 /* keep looking until we find the right record */
3431 while (rec_ptr) {
3432 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
3433 return 0;
3434
3435 if (TDB_DEAD(r) && r->rec_len >= length) {
3436 /*
3437 * First fit for simple coding, TODO: change to best
3438 * fit
3439 */
3440 return rec_ptr;
3441 }
3442 rec_ptr = r->next;
3443 }
3444 return 0;
3445 }
3446
3447 /* store an element in the database, replacing any existing element
3448 with the same key
3449
3450 return 0 on success, -1 on failure
3451 */
3452 int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
3453 {
3454 struct list_struct rec;
3455 u32 hash;
3456 tdb_off_t rec_ptr;
3457 char *p = NULL;
3458 int ret = -1;
3459
3460 if (tdb->read_only || tdb->traverse_read) {
3461 tdb->ecode = TDB_ERR_RDONLY;
3462 return -1;
3463 }
3464
3465 /* find which hash bucket it is in */
3466 hash = tdb->hash_fn(&key);
3467 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3468 return -1;
3469
3470 /* check for it existing, on insert. */
3471 if (flag == TDB_INSERT) {
3472 if (tdb_exists_hash(tdb, key, hash)) {
3473 tdb->ecode = TDB_ERR_EXISTS;
3474 goto fail;
3475 }
3476 } else {
3477 /* first try in-place update, on modify or replace. */
3478 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
3479 goto done;
3480 }
3481 if (tdb->ecode == TDB_ERR_NOEXIST &&
3482 flag == TDB_MODIFY) {
3483 /* if the record doesn't exist and we are in TDB_MODIFY mode then
3484 we should fail the store */
3485 goto fail;
3486 }
3487 }
3488 /* reset the error code potentially set by tdb_update_hash() */
3489 tdb->ecode = TDB_SUCCESS;
3490
3491 /* delete any existing record - if it doesn't exist we don't
3492 care. Doing this first reduces fragmentation, and avoids
3493 coalescing with `allocated' block before it's updated. */
3494 if (flag != TDB_INSERT)
3495 tdb_delete_hash(tdb, key, hash);
3496
3497 /* Copy key+value *before* allocating free space in case malloc
3498 fails and we are left with a dead spot in the tdb. */
3499
3500 if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
3501 tdb->ecode = TDB_ERR_OOM;
3502 goto fail;
3503 }
3504
3505 memcpy(p, key.dptr, key.dsize);
3506 if (dbuf.dsize)
3507 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
3508
3509 if (tdb->max_dead_records != 0) {
3510 /*
3511 * Allow for some dead records per hash chain, look if we can
3512 * find one that can hold the new record. We need enough space
3513 * for key, data and tailer. If we find one, we don't have to
3514 * consult the central freelist.
3515 */
3516 rec_ptr = tdb_find_dead(
3517 tdb, hash, &rec,
3518 key.dsize + dbuf.dsize + sizeof(tdb_off_t));
3519
3520 if (rec_ptr != 0) {
3521 rec.key_len = key.dsize;
3522 rec.data_len = dbuf.dsize;
3523 rec.full_hash = hash;
3524 rec.magic = TDB_MAGIC;
3525 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3526 || tdb->methods->tdb_write(
3527 tdb, rec_ptr + sizeof(rec),
3528 p, key.dsize + dbuf.dsize) == -1) {
3529 goto fail;
3530 }
3531 goto done;
3532 }
3533 }
3534
3535 /*
3536 * We have to allocate some space from the freelist, so this means we
3537 * have to lock it. Use the chance to purge all the DEAD records from
3538 * the hash chain under the freelist lock.
3539 */
3540
3541 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
3542 goto fail;
3543 }
3544
3545 if ((tdb->max_dead_records != 0)
3546 && (tdb_purge_dead(tdb, hash) == -1)) {
3547 tdb_unlock(tdb, -1, F_WRLCK);
3548 goto fail;
3549 }
3550
3551 /* we have to allocate some space */
3552 rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
3553
3554 tdb_unlock(tdb, -1, F_WRLCK);
3555
3556 if (rec_ptr == 0) {
3557 goto fail;
3558 }
3559
3560 /* Read hash top into next ptr */
3561 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
3562 goto fail;
3563
3564 rec.key_len = key.dsize;
3565 rec.data_len = dbuf.dsize;
3566 rec.full_hash = hash;
3567 rec.magic = TDB_MAGIC;
3568
3569 /* write out and point the top of the hash chain at it */
3570 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
3571 || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
3572 || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
3573 /* Need to tdb_unallocate() here */
3574 goto fail;
3575 }
3576
3577 done:
3578 ret = 0;
3579 fail:
3580 if (ret == 0) {
3581 tdb_increment_seqnum(tdb);
3582 }
3583
3584 SAFE_FREE(p);
3585 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3586 return ret;
3587 }
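
/* Usage sketch (editorial addition, not part of the tdb source): how the
   flag argument to tdb_store() above selects the behaviour on an existing
   key.  The guard macro TDB_USAGE_EXAMPLES is hypothetical and is never
   defined by tdb itself; the key/data values are illustrative only. */
#ifdef TDB_USAGE_EXAMPLES
static int example_store(struct tdb_context *db)
{
	TDB_DATA key, data;

	key.dptr = (unsigned char *)"hello";
	key.dsize = 5;
	data.dptr = (unsigned char *)"world";
	data.dsize = 5;

	/* TDB_INSERT fails with TDB_ERR_EXISTS if the key is already present */
	if (tdb_store(db, key, data, TDB_INSERT) == -1)
		return -1;

	/* TDB_MODIFY fails with TDB_ERR_NOEXIST if the key is absent;
	   TDB_REPLACE (or 0) stores unconditionally */
	return tdb_store(db, key, data, TDB_REPLACE);
}
#endif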
3588
3589
3590 /* Append to an entry. Create it if it doesn't exist. */
3591 int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
3592 {
3593 u32 hash;
3594 TDB_DATA dbuf;
3595 int ret = -1;
3596
3597 /* find which hash bucket it is in */
3598 hash = tdb->hash_fn(&key);
3599 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
3600 return -1;
3601
3602 dbuf = tdb_fetch(tdb, key);
3603
3604 if (dbuf.dptr == NULL) {
3605 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
3606 } else {
3607 unsigned char *new_dptr = (unsigned char *)realloc(dbuf.dptr,
3608 dbuf.dsize + new_dbuf.dsize);
3609 if (new_dptr == NULL) {
3610 free(dbuf.dptr);
3611 }
3612 dbuf.dptr = new_dptr;
3613 }
3614
3615 if (dbuf.dptr == NULL) {
3616 tdb->ecode = TDB_ERR_OOM;
3617 goto failed;
3618 }
3619
3620 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
3621 dbuf.dsize += new_dbuf.dsize;
3622
3623 ret = tdb_store(tdb, key, dbuf, 0);
3624
3625 failed:
3626 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
3627 SAFE_FREE(dbuf.dptr);
3628 return ret;
3629 }
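
/* Usage sketch (editorial addition): tdb_append() above is the usual way to
   grow a record incrementally, e.g. a list of fixed-size entries.  The key
   name and the guard macro are illustrative, as in the earlier sketch. */
#ifdef TDB_USAGE_EXAMPLES
static int example_append(struct tdb_context *db, u32 value)
{
	TDB_DATA key, elem;

	key.dptr = (unsigned char *)"counters";
	key.dsize = 8;
	elem.dptr = (unsigned char *)&value;
	elem.dsize = sizeof(value);

	/* creates the record on the first call, extends it afterwards */
	return tdb_append(db, key, elem);
}
#endif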
3630
3631
3632 /*
3633 return the name of the current tdb file
3634 useful for external logging functions
3635 */
3636 const char *tdb_name(struct tdb_context *tdb)
3637 {
3638 return tdb->name;
3639 }
3640
3641 /*
3642 return the underlying file descriptor being used by tdb, or -1
3643 useful for external routines that want to check the device/inode
3644 of the fd
3645 */
3646 int tdb_fd(struct tdb_context *tdb)
3647 {
3648 return tdb->fd;
3649 }
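
/* Usage sketch (editorial addition): the device/inode check that the comment
   above alludes to, e.g. to detect that the file on disk has been replaced
   underneath an open handle.  Function name and guard are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static int example_same_file(struct tdb_context *db)
{
	struct stat st_fd, st_path;

	if (fstat(tdb_fd(db), &st_fd) != 0)
		return -1;
	if (stat(tdb_name(db), &st_path) != 0)
		return -1;
	/* non-zero if the open fd still refers to the named file */
	return (st_fd.st_dev == st_path.st_dev &&
		st_fd.st_ino == st_path.st_ino);
}
#endif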
3650
3651 /*
3652 return the current logging function
3653 useful for external tdb routines that wish to log tdb errors
3654 */
3655 tdb_log_func tdb_log_fn(struct tdb_context *tdb)
3656 {
3657 return tdb->log.log_fn;
3658 }
3659
3660
3661 /*
3662 get the tdb sequence number. Only makes sense if the writers opened
3663 with TDB_SEQNUM set. Note that this sequence number will wrap quite
3664 quickly, so it should only be used for a 'has something changed'
3665 test, not for code that relies on the count of the number of changes
3666 made. If you want a counter then use a tdb record.
3667
3668 The aim of this sequence number is to allow for a very lightweight
3669 test of a possible tdb change.
3670 */
3671 int tdb_get_seqnum(struct tdb_context *tdb)
3672 {
3673 tdb_off_t seqnum=0;
3674
3675 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
3676 return seqnum;
3677 }
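
/* Usage sketch (editorial addition): the lightweight "has something changed?"
   test described in the comment above.  The database must have been opened by
   the writers with TDB_SEQNUM, or had tdb_enable_seqnum() called on it. */
#ifdef TDB_USAGE_EXAMPLES
static int example_changed(struct tdb_context *db, int *cached_seqnum)
{
	int now = tdb_get_seqnum(db);

	if (now == *cached_seqnum)
		return 0;	/* nothing visibly changed */
	*cached_seqnum = now;
	return 1;		/* something may have changed; re-read */
}
#endif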
3678
3679 int tdb_hash_size(struct tdb_context *tdb)
3680 {
3681 return tdb->header.hash_size;
3682 }
3683
3684 size_t tdb_map_size(struct tdb_context *tdb)
3685 {
3686 return tdb->map_size;
3687 }
3688
3689 int tdb_get_flags(struct tdb_context *tdb)
3690 {
3691 return tdb->flags;
3692 }
3693
3694
3695 /*
3696 enable sequence number handling on an open tdb
3697 */
3698 void tdb_enable_seqnum(struct tdb_context *tdb)
3699 {
3700 tdb->flags |= TDB_SEQNUM;
3701 }
3702
3703 /* file: open.c */
3704
3705 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
3706 static struct tdb_context *tdbs = NULL;
3707
3708
3709 /* This is from a hash algorithm suggested by Rogier Wolff */
3710 static unsigned int default_tdb_hash(TDB_DATA *key)
3711 {
3712 u32 value; /* Used to compute the hash value. */
3713 u32 i; /* Used to cycle through random values. */
3714
3715 /* Set the initial value from the key size. */
3716 for (value = 0, i=0; i < key->dsize; i++)
3717 value = value * 256 + key->dptr[i] + (value >> 24) * 241;
3718
3719 return value;
3720 }
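
/* Illustrative sketch (editorial addition): default_tdb_hash() above is only
   used when tdb_open_ex() is passed a NULL hash_fn.  A caller may supply its
   own tdb_hash_func instead, but every opener of the same file must then use
   the identical function, since hash values are stored on disk.  The djb2-style
   function, path handling and guard macro here are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static unsigned int example_djb2_hash(TDB_DATA *key)
{
	u32 i, h = 5381;

	for (i = 0; i < key->dsize; i++)
		h = h * 33 + key->dptr[i];
	return h;
}

static struct tdb_context *example_open_custom_hash(const char *path)
{
	return tdb_open_ex(path, 0, TDB_DEFAULT, O_RDWR | O_CREAT, 0600,
			   NULL, example_djb2_hash);
}
#endif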
3721
3722
3723 /* initialise a new database with a specified hash size */
3724 static int tdb_new_database(struct tdb_context *tdb, int hash_size)
3725 {
3726 struct tdb_header *newdb;
3727 int size, ret = -1;
3728
3729 /* We make it up in memory, then write it out if not internal */
3730 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off_t);
3731 if (!(newdb = (struct tdb_header *)calloc(size, 1)))
3732 return TDB_ERRCODE(TDB_ERR_OOM, -1);
3733
3734 /* Fill in the header */
3735 newdb->version = TDB_VERSION;
3736 newdb->hash_size = hash_size;
3737 if (tdb->flags & TDB_INTERNAL) {
3738 tdb->map_size = size;
3739 tdb->map_ptr = (char *)newdb;
3740 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3741 /* Convert the `ondisk' version if asked. */
3742 CONVERT(*newdb);
3743 return 0;
3744 }
3745 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
3746 goto fail;
3747
3748 if (ftruncate(tdb->fd, 0) == -1)
3749 goto fail;
3750
3751 /* This creates an endian-converted header, as if read from disk */
3752 CONVERT(*newdb);
3753 memcpy(&tdb->header, newdb, sizeof(tdb->header));
3754 /* Don't endian-convert the magic food! */
3755 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
3756 if (write(tdb->fd, newdb, size) != size) {
3757 ret = -1;
3758 } else {
3759 ret = 0;
3760 }
3761
3762 fail:
3763 SAFE_FREE(newdb);
3764 return ret;
3765 }
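
/* Worked example (editorial note, assuming a 4-byte tdb_off_t): with the
   default hash size of 131 the buffer created above is
   sizeof(struct tdb_header) + (131 + 1) * 4 = sizeof(struct tdb_header) + 528
   bytes -- the extra slot is the freelist head at FREELIST_TOP, which sits
   just before the per-chain hash heads. */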
3766
3767
3768
3769 static int tdb_already_open(dev_t device,
3770 ino_t ino)
3771 {
3772 struct tdb_context *i;
3773
3774 for (i = tdbs; i; i = i->next) {
3775 if (i->device == device && i->inode == ino) {
3776 return 1;
3777 }
3778 }
3779
3780 return 0;
3781 }
3782
3783 /* open the database, creating it if necessary
3784
3785 The open_flags and mode are passed straight to the open call on the
3786 database file. A flags value of O_WRONLY is invalid. The hash size
3787 is advisory, use zero for a default value.
3788
3789 Return is NULL on error, in which case errno is also set. Don't
3790 try to call tdb_error or tdb_errname, just do strerror(errno).
3791
3792 @param name may be NULL for internal databases. */
3793 struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
3794 int open_flags, mode_t mode)
3795 {
3796 return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
3797 }
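
/* Usage sketch (editorial addition): a typical open/close pair using the
   wrapper above.  Errors are reported via errno, as documented in the comment
   preceding tdb_open(); the file name and guard macro are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static int example_open_close(void)
{
	struct tdb_context *db;

	/* hash_size 0 picks DEFAULT_HASH_SIZE; open_flags/mode go to open(2) */
	db = tdb_open("example.tdb", 0, TDB_CLEAR_IF_FIRST,
		      O_RDWR | O_CREAT, 0600);
	if (db == NULL) {
		perror("tdb_open");
		return -1;
	}

	/* ... use the database ... */

	return tdb_close(db);
}
#endif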
3798
3799 /* a default logging function */
3800 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
3801 static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
3802 {
3803 }
3804
3805
3806 struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
3807 int open_flags, mode_t mode,
3808 const struct tdb_logging_context *log_ctx,
3809 tdb_hash_func hash_fn)
3810 {
3811 struct tdb_context *tdb;
3812 struct stat st;
3813 int rev = 0, locked = 0;
3814 unsigned char *vp;
3815 u32 vertest;
3816
3817 if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
3818 /* Can't log this */
3819 errno = ENOMEM;
3820 goto fail;
3821 }
3822 tdb_io_init(tdb);
3823 tdb->fd = -1;
3824 tdb->name = NULL;
3825 tdb->map_ptr = NULL;
3826 tdb->flags = tdb_flags;
3827 tdb->open_flags = open_flags;
3828 if (log_ctx) {
3829 tdb->log = *log_ctx;
3830 } else {
3831 tdb->log.log_fn = null_log_fn;
3832 tdb->log.log_private = NULL;
3833 }
3834 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
3835
3836 /* cache the page size */
3837 tdb->page_size = sysconf(_SC_PAGESIZE);
3838 if (tdb->page_size <= 0) {
3839 tdb->page_size = 0x2000;
3840 }
3841
3842 if ((open_flags & O_ACCMODE) == O_WRONLY) {
3843 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
3844 name));
3845 errno = EINVAL;
3846 goto fail;
3847 }
3848
3849 if (hash_size == 0)
3850 hash_size = DEFAULT_HASH_SIZE;
3851 if ((open_flags & O_ACCMODE) == O_RDONLY) {
3852 tdb->read_only = 1;
3853 /* read only databases don't do locking or clear if first */
3854 tdb->flags |= TDB_NOLOCK;
3855 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3856 }
3857
3858 /* internal databases don't mmap or lock, and start off cleared */
3859 if (tdb->flags & TDB_INTERNAL) {
3860 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
3861 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
3862 if (tdb_new_database(tdb, hash_size) != 0) {
3863 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: tdb_new_database failed!"));
3864 goto fail;
3865 }
3866 goto internal;
3867 }
3868
3869 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
3870 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_open_ex: could not open file %s: %s\n",
3871 name, strerror(errno)));
3872 goto fail; /* errno set by open(2) */
3873 }
3874
3875 /* ensure there is only one process initialising at once */
3876 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0, 1) == -1) {
3877 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get global lock on %s: %s\n",
3878 name, strerror(errno)));
3879 goto fail; /* errno set by tdb_brlock */
3880 }
3881
3882 /* we need to zero database if we are the only one with it open */
3883 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
3884 (locked = (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0, 1) == 0))) {
3885 open_flags |= O_CREAT;
3886 if (ftruncate(tdb->fd, 0) == -1) {
3887 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
3888 "failed to truncate %s: %s\n",
3889 name, strerror(errno)));
3890 goto fail; /* errno set by ftruncate */
3891 }
3892 }
3893
3894 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
3895 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
3896 || (tdb->header.version != TDB_VERSION
3897 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
3898 		/* it's not a valid database - possibly initialise it */
3899 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
3900 errno = EIO; /* ie bad format or something */
3901 goto fail;
3902 }
3903 rev = (tdb->flags & TDB_CONVERT);
3904 }
3905 vp = (unsigned char *)&tdb->header.version;
3906 vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
3907 (((u32)vp[2]) << 8) | (u32)vp[3];
3908 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
3909 if (!rev)
3910 tdb->flags &= ~TDB_CONVERT;
3911 else {
3912 tdb->flags |= TDB_CONVERT;
3913 tdb_convert(&tdb->header, sizeof(tdb->header));
3914 }
3915 if (fstat(tdb->fd, &st) == -1)
3916 goto fail;
3917
3918 if (tdb->header.rwlocks != 0) {
3919 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
3920 goto fail;
3921 }
3922
3923 /* Is it already in the open list? If so, fail. */
3924 if (tdb_already_open(st.st_dev, st.st_ino)) {
3925 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3926 "%s (%d,%d) is already open in this process\n",
3927 name, (int)st.st_dev, (int)st.st_ino));
3928 errno = EBUSY;
3929 goto fail;
3930 }
3931
3932 if (!(tdb->name = (char *)strdup(name))) {
3933 errno = ENOMEM;
3934 goto fail;
3935 }
3936
3937 tdb->map_size = st.st_size;
3938 tdb->device = st.st_dev;
3939 tdb->inode = st.st_ino;
3940 tdb->max_dead_records = 0;
3941 tdb_mmap(tdb);
3942 if (locked) {
3943 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0, 1) == -1) {
3944 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
3945 "failed to take ACTIVE_LOCK on %s: %s\n",
3946 name, strerror(errno)));
3947 goto fail;
3948 }
3949
3950 }
3951
3952 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
3953 we didn't get the initial exclusive lock as we need to let all other
3954 users know we're using it. */
3955
3956 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
3957 /* leave this lock in place to indicate it's in use */
3958 if (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)
3959 goto fail;
3960 }
3961
3962 /* if needed, run recovery */
3963 if (tdb_transaction_recover(tdb) == -1) {
3964 goto fail;
3965 }
3966
3967 internal:
3968 /* Internal (memory-only) databases skip all the code above to
3969 * do with disk files, and resume here by releasing their
3970 * global lock and hooking into the active list. */
3971 if (tdb->methods->tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0, 1) == -1)
3972 goto fail;
3973 tdb->next = tdbs;
3974 tdbs = tdb;
3975 return tdb;
3976
3977 fail:
3978 { int save_errno = errno;
3979
3980 if (!tdb)
3981 return NULL;
3982
3983 if (tdb->map_ptr) {
3984 if (tdb->flags & TDB_INTERNAL)
3985 SAFE_FREE(tdb->map_ptr);
3986 else
3987 tdb_munmap(tdb);
3988 }
3989 SAFE_FREE(tdb->name);
3990 if (tdb->fd != -1)
3991 if (close(tdb->fd) != 0)
3992 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
3993 SAFE_FREE(tdb);
3994 errno = save_errno;
3995 return NULL;
3996 }
3997 }
3998
3999 /*
4000 * Set the maximum number of dead records per hash chain
4001 */
4002
4003 void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
4004 {
4005 tdb->max_dead_records = max_dead;
4006 }
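
/* Usage sketch (editorial addition): allowing a few dead records per hash
   chain trades some file-space slack for fewer trips to the central freelist
   in tdb_store()/tdb_delete() above, which helps frequently rewritten,
   volatile databases.  The value 5 and the guard macro are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static void example_tune_for_churn(struct tdb_context *db)
{
	tdb_set_max_dead(db, 5);
}
#endif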
4007
4008 /**
4009 * Close a database.
4010 *
4011 * @returns -1 for error; 0 for success.
4012 **/
4013 int tdb_close(struct tdb_context *tdb)
4014 {
4015 struct tdb_context **i;
4016 int ret = 0;
4017
4018 if (tdb->transaction) {
4019 tdb_transaction_cancel(tdb);
4020 }
4021
4022 if (tdb->map_ptr) {
4023 if (tdb->flags & TDB_INTERNAL)
4024 SAFE_FREE(tdb->map_ptr);
4025 else
4026 tdb_munmap(tdb);
4027 }
4028 SAFE_FREE(tdb->name);
4029 if (tdb->fd != -1)
4030 ret = close(tdb->fd);
4031 SAFE_FREE(tdb->lockrecs);
4032
4033 /* Remove from contexts list */
4034 for (i = &tdbs; *i; i = &(*i)->next) {
4035 if (*i == tdb) {
4036 *i = tdb->next;
4037 break;
4038 }
4039 }
4040
4041 memset(tdb, 0, sizeof(*tdb));
4042 SAFE_FREE(tdb);
4043
4044 return ret;
4045 }
4046
4047 /* register a logging function */
4048 void tdb_set_logging_function(struct tdb_context *tdb,
4049 const struct tdb_logging_context *log_ctx)
4050 {
4051 tdb->log = *log_ctx;
4052 }
4053
4054 void *tdb_get_logging_private(struct tdb_context *tdb)
4055 {
4056 return tdb->log.log_private;
4057 }
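
/* Usage sketch (editorial addition): registering a logging callback so that
   tdb's internal TDB_LOG() diagnostics reach the application.  The callback
   signature matches tdb_log_func as used by null_log_fn() above; the
   fprintf-based body, function names and guard macro are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static void example_log_fn(struct tdb_context *db, enum tdb_debug_level level,
			   const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
static void example_log_fn(struct tdb_context *db, enum tdb_debug_level level,
			   const char *fmt, ...)
{
	va_list ap;

	(void)level;	/* level filtering omitted in this sketch */
	fprintf(stderr, "tdb(%s): ", tdb_name(db) ? tdb_name(db) : "<internal>");
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

static void example_enable_logging(struct tdb_context *db)
{
	struct tdb_logging_context ctx;

	/* the context is copied by tdb_set_logging_function(), so a
	   stack-local struct is fine */
	ctx.log_fn = example_log_fn;
	ctx.log_private = NULL;
	tdb_set_logging_function(db, &ctx);
}
#endif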
4058
4059 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
4060 seek pointer from our parent and to re-establish locks */
4061 int tdb_reopen(struct tdb_context *tdb)
4062 {
4063 struct stat st;
4064
4065 if (tdb->flags & TDB_INTERNAL) {
4066 return 0; /* Nothing to do. */
4067 }
4068
4069 if (tdb->num_locks != 0 || tdb->global_lock.count) {
4070 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
4071 goto fail;
4072 }
4073
4074 if (tdb->transaction != 0) {
4075 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed inside a transaction\n"));
4076 goto fail;
4077 }
4078
4079 if (tdb_munmap(tdb) != 0) {
4080 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
4081 goto fail;
4082 }
4083 if (close(tdb->fd) != 0)
4084 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
4085 tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
4086 if (tdb->fd == -1) {
4087 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: open failed (%s)\n", strerror(errno)));
4088 goto fail;
4089 }
4090 if ((tdb->flags & TDB_CLEAR_IF_FIRST) &&
4091 (tdb->methods->tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0, 1) == -1)) {
4092 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
4093 goto fail;
4094 }
4095 if (fstat(tdb->fd, &st) != 0) {
4096 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
4097 goto fail;
4098 }
4099 if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
4100 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: file dev/inode has changed!\n"));
4101 goto fail;
4102 }
4103 tdb_mmap(tdb);
4104
4105 return 0;
4106
4107 fail:
4108 tdb_close(tdb);
4109 return -1;
4110 }
4111
4112 /* reopen all tdb's */
4113 int tdb_reopen_all(int parent_longlived)
4114 {
4115 struct tdb_context *tdb;
4116
4117 for (tdb=tdbs; tdb; tdb = tdb->next) {
4118 /*
4119 * If the parent is longlived (ie. a
4120 * parent daemon architecture), we know
4121 		 * it will keep its active lock on a
4122 * tdb opened with CLEAR_IF_FIRST. Thus
4123 * for child processes we don't have to
4124 * add an active lock. This is essential
4125 * to improve performance on systems that
4126 * keep POSIX locks as a non-scalable data
4127 * structure in the kernel.
4128 */
4129 if (parent_longlived) {
4130 /* Ensure no clear-if-first. */
4131 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
4132 }
4133
4134 if (tdb_reopen(tdb) != 0)
4135 return -1;
4136 }
4137
4138 return 0;
4139 }
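
/* Usage sketch (editorial addition): the intended call pattern after fork(),
   per the comments above -- the child re-opens every tdb it inherited, and
   passes 1 when the parent is a long-lived daemon that keeps its own
   CLEAR_IF_FIRST active locks.  Function name and guard are illustrative. */
#ifdef TDB_USAGE_EXAMPLES
static int example_fork_worker(void)
{
	pid_t pid = fork();

	if (pid == -1)
		return -1;
	if (pid == 0) {
		/* child: get fresh fds and locks before touching any tdb */
		if (tdb_reopen_all(1 /* parent stays alive */) != 0)
			_exit(1);
		/* ... child work ... */
		_exit(0);
	}
	/* parent continues to use its existing handles unchanged */
	return 0;
}
#endif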
4140